diff --git a/apps/client/src/features/ai-chat/components/ai-chat-window.tsx b/apps/client/src/features/ai-chat/components/ai-chat-window.tsx index 92c125ce..5c67e177 100644 --- a/apps/client/src/features/ai-chat/components/ai-chat-window.tsx +++ b/apps/client/src/features/ai-chat/components/ai-chat-window.tsx @@ -48,6 +48,7 @@ import { useAiRolesQuery, } from "@/features/ai-chat/queries/ai-chat-query.ts"; import { + shouldClearLatchOnQueryError, shouldClearStoppingLatch, shouldObserveRun, } from "@/features/ai-chat/utils/run-polling.ts"; @@ -320,14 +321,32 @@ export default function AiChatWindow() { // Safety net (#234 F4 review): after handleServerStop clears the run cache, // `run` is null until the current turn's run is fetched fresh, and the terminal // effect below holds the latch via `if (!run) return`. If that refetch instead - // ERRORS (a transient GET-run failure) while we are no longer the streamer, the - // run stays null, its status-keyed refetchInterval is off, and nothing would - // ever observe a terminal run — freezing the view with the observer merge - // suppressed. Release the latch on that error so the live view resumes rather - // than stays stuck (the local stopRun may already have succeeded independently). + // ERRORS PERMANENTLY (the GET-run keeps failing) while we are no longer the + // streamer, the run stays null, its status-keyed refetchInterval is off, and + // nothing would ever observe a terminal run — freezing the view with the + // observer merge suppressed. Release the latch on that error so the live view + // resumes rather than stays stuck (the local stopRun may already have succeeded + // independently). + // + // #234 F7: this must NOT fire on a TRANSIENT error while `run` is still an + // ACTIVE held run. In TanStack Query v5 (retry:false) the query's `data` is + // RETAINED on error, so `runQueryFailed` can be true while `run` is still + // pending/running — releasing then would re-open the observer merge and flash + // the growing detached run over the frozen row (the very flash F4 prevents). The + // decision is the pure, unit-tested `shouldClearLatchOnQueryError`, which gates + // on the run NOT being active: it cures only the genuine permanent-null-freeze + // (`run === null`) and never releases against an active run. useEffect(() => { - if (stoppingRun && !localStreaming && runQueryFailed) setStoppingRun(false); - }, [stoppingRun, localStreaming, runQueryFailed]); + if ( + shouldClearLatchOnQueryError({ + stoppingRun, + isLocalStreaming: localStreaming, + runQueryFailed, + run, + }) + ) + setStoppingRun(false); + }, [stoppingRun, localStreaming, runQueryFailed, run]); // The run's incrementally-persisted assistant message to merge into the thread, // but only while we are an observer (never when we are the streamer — guards // against a stale poll fighting the live stream). Includes a terminal run so the diff --git a/apps/client/src/features/ai-chat/utils/run-polling.test.ts b/apps/client/src/features/ai-chat/utils/run-polling.test.ts index 580f759f..b93f08fb 100644 --- a/apps/client/src/features/ai-chat/utils/run-polling.test.ts +++ b/apps/client/src/features/ai-chat/utils/run-polling.test.ts @@ -7,6 +7,7 @@ import { runPollInterval, shouldObserveRun, shouldClearStoppingLatch, + shouldClearLatchOnQueryError, mergeObservedMessage, } from "./run-polling.ts"; @@ -176,6 +177,105 @@ describe("shouldClearStoppingLatch (#234 latch-release decision)", () => { }); }); +describe("shouldClearLatchOnQueryError (#234 F7 error-safety-net decision)", () => { + // This guards the REAL anti-flash decision the component's run-query-error + // safety-net effect uses (ai-chat-window.tsx wires the effect to THIS helper, + // not a copy — so the test is non-vacuous vs the live code). + + // (b) The F7 hole: a TRANSIENT run-query error while `run` is STILL ACTIVE must + // NOT clear the latch. TanStack Query v5 retains `data` on error, so + // runQueryFailed can be true while the held run is still pending/running. + // Against the PRE-F7 condition (without `!isRunActive(run)`) this would return + // true — so this assertion fails on the buggy code (non-vacuous). + it("does NOT clear on a transient error while the run is still ACTIVE (F7)", () => { + expect( + shouldClearLatchOnQueryError({ + stoppingRun: true, + isLocalStreaming: false, + runQueryFailed: true, + run: makeRun("running"), + }), + ).toBe(false); + expect( + shouldClearLatchOnQueryError({ + stoppingRun: true, + isLocalStreaming: false, + runQueryFailed: true, + run: makeRun("pending"), + }), + ).toBe(false); + }); + + // (a) The genuine permanent-null-freeze: run cache cleared by removeQueries + + // the refetch keeps ERRORING, so `run === null`. This is the ONLY case the + // safety-net exists to cure — it MUST clear so the frozen view resumes. + it("clears on a permanent error when the run is null (permanent-null-freeze)", () => { + expect( + shouldClearLatchOnQueryError({ + stoppingRun: true, + isLocalStreaming: false, + runQueryFailed: true, + run: null, + }), + ).toBe(true); + expect( + shouldClearLatchOnQueryError({ + stoppingRun: true, + isLocalStreaming: false, + runQueryFailed: true, + run: undefined, + }), + ).toBe(true); + }); + + // A TERMINAL run also satisfies `!isRunActive`; clearing then is harmless — the + // terminal effect (shouldClearStoppingLatch) already clears for a terminal run, + // so this only ever agrees with it. Asserted so the (c) reasoning is pinned. + it("clears on an error when the run is terminal (harmless, agrees with terminal effect)", () => { + expect( + shouldClearLatchOnQueryError({ + stoppingRun: true, + isLocalStreaming: false, + runQueryFailed: true, + run: makeRun("aborted"), + }), + ).toBe(true); + }); + + it("does NOT clear without an actual query error", () => { + expect( + shouldClearLatchOnQueryError({ + stoppingRun: true, + isLocalStreaming: false, + runQueryFailed: false, + run: null, + }), + ).toBe(false); + }); + + it("does NOT clear while this tab is the local streamer", () => { + expect( + shouldClearLatchOnQueryError({ + stoppingRun: true, + isLocalStreaming: true, + runQueryFailed: true, + run: null, + }), + ).toBe(false); + }); + + it("does NOT clear when no stop was requested", () => { + expect( + shouldClearLatchOnQueryError({ + stoppingRun: false, + isLocalStreaming: false, + runQueryFailed: true, + run: null, + }), + ).toBe(false); + }); +}); + describe("mergeObservedMessage", () => { it("replaces the message with the same id in place (per-step growth)", () => { const prev = [makeMsg("u1", "hi"), makeMsg("a1", "step 1")]; diff --git a/apps/client/src/features/ai-chat/utils/run-polling.ts b/apps/client/src/features/ai-chat/utils/run-polling.ts index fa204f95..560b675f 100644 --- a/apps/client/src/features/ai-chat/utils/run-polling.ts +++ b/apps/client/src/features/ai-chat/utils/run-polling.ts @@ -82,6 +82,56 @@ export function shouldClearStoppingLatch(args: { return !!run && !isRunActive(run); } +/** + * Should the "stopping" latch be RELEASED by the run-query ERROR safety-net? + * (#234 F7 — a NEW path of the same re-stream flash the F4 latch exists to + * prevent.) After Stop, `handleServerStop` clears the run cache; the terminal + * effect then holds the latch via `if (!run) return` until the CURRENT turn's run + * is fetched fresh. If that refetch instead ERRORS permanently, `run` stays null, + * its status-keyed refetchInterval is off, and nothing would ever observe a + * terminal run — freezing the view with the observer merge suppressed. This + * safety-net cures ONLY that genuine permanent-null-freeze. + * + * All four must hold: + * - `stoppingRun`: we actually requested a stop (otherwise nothing to release); + * - `!isLocalStreaming`: this tab is NOT the local streamer (same reason as + * {@link shouldClearStoppingLatch}); + * - `runQueryFailed`: the run query is in its error state (TanStack Query v5 with + * retry:false — isError); + * - `!isRunActive(run)`: the observed `run` is NOT an active (pending/running) + * held run. This is the F7 gate. In TanStack Query v5 the query's `data` is + * RETAINED on error, so `runQueryFailed` can be true while `run` is STILL an + * ACTIVE run (a single transient GET-run failure in the window between Stop and + * settle). Without this gate a transient error would release the latch early — + * re-opening the observer merge and flashing the growing detached run over the + * frozen row (exactly the F4 flash). Gating on the run NOT being active means we + * only ever cure the permanent-null-freeze (`run === null`, so + * `isRunActive(null)` is false), never release against an active run. + * + * (A terminal `run` also satisfies `!isRunActive(run)`; clearing then is harmless + * — the terminal effect's {@link shouldClearStoppingLatch} already clears the + * latch for a terminal run, so this only ever agrees with it, never conflicts.) + * + * INVARIANT (do not break): clearing the latch on the `run === null` branch is safe + * ONLY because the run query's `refetchInterval` (see {@link runPollInterval}) stops + * polling when the data is empty — so after we clear on null+error there is no + * subsequent auto-poll that could return a still-active detached run and re-open the + * merge. If `refetchInterval` is ever changed to keep polling on `run === null`/on + * error, this null-branch clear would re-open the F7 flash through the null path. + * Do not change the run query's refetchInterval without re-checking this path. + */ +export function shouldClearLatchOnQueryError(args: { + stoppingRun: boolean; + isLocalStreaming: boolean; + runQueryFailed: boolean; + run: IAiChatRun | null | undefined; +}): boolean { + const { stoppingRun, isLocalStreaming, runQueryFailed, run } = args; + return ( + stoppingRun && !isLocalStreaming && runQueryFailed && !isRunActive(run) + ); +} + /** * Merge an observed assistant message into the rendered list: replace the message * with the same id in place (the in-progress assistant row is already seeded from