fix(ai-chat): harden run finalize + restore int-spec, cover terminal callbacks (#184 round-2)
Round-2 review fixes for PR #234 (#184 autonomous agent runs). F6 (stability): finalizeRun no longer drops the in-memory entry before the terminal write. It now UPDATEs first with a bounded retry; only on success does it arm the idempotency once-gate (a new `settled` set keyed on "row already terminal", not "entry deleted") and free the chat's active slot. If every attempt fails the entry is RETAINED and the run left unsettled so a later finalize / requestStop->onAbort / sweep can retry — a transient blip can no longer strand a run 'running' and 409 every future turn in the chat. Idempotency preserved (double-settle still collapses to a single write). F7 (regression from F2): int-spec constructs AiChatRunService with the 2nd EnvironmentService arg ({ isCloud: () => false }) so the file type-checks and all integration tests compile+run again. F8 (regression from F1): the windowed "stale but not fresh" case now calls sweepRunning({ staleMs: SWEEP_RUN_STALE_MS }); added an int-level variant-C case proving the no-arg boot sweep aborts even a FRESH running run. F9 (coverage): run-race spec now captures streamText's options and invokes onStepFinish/onFinish/onAbort/onError, asserting the #184 run hooks (onStep / onSettled completed|aborted|error) fire with the right args. F10 (docs): added an autonomousRuns single-instance-only note to .env.example so the warnIfMultiInstance JSDoc reference is accurate. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -91,6 +91,24 @@ export class AiChatRunService implements OnModuleInit {
|
||||
// restart) still records `stop_requested_at` on the row.
|
||||
private readonly active = new Map<string, ActiveRun>();
|
||||
|
||||
// runIds whose TERMINAL row write has SUCCEEDED — the idempotency once-gate
|
||||
// (F6). A finalize must short-circuit only AFTER the terminal write has landed,
|
||||
// NOT merely after the in-memory entry was dropped: a transient UPDATE failure
|
||||
// has to stay retryable, so "already settled" means "row already terminal", not
|
||||
// "entry already gone". Grows by one short UUID per finished run over process
|
||||
// uptime — negligible in phase 1's single process.
|
||||
private readonly settled = new Set<string>();
|
||||
|
||||
// Bounded retry for the terminal write (F6): a single PK UPDATE can fail
|
||||
// transiently under many fire-and-forget writes (pool exhaustion, deadlock, a
|
||||
// brief connection blip). Riding out that blip in-place matters because the
|
||||
// dominant success path (streamText onFinish) settles exactly ONCE — if that
|
||||
// write is dropped and never retried, the row is stranded 'running' and the
|
||||
// one-active-run gate 409s every future turn in the chat until a restart (no
|
||||
// periodic sweep in phase 1).
|
||||
private static readonly FINALIZE_MAX_ATTEMPTS = 3;
|
||||
private static readonly FINALIZE_RETRY_BASE_MS = 50;
|
||||
|
||||
constructor(
|
||||
private readonly runRepo: AiChatRunRepo,
|
||||
private readonly environment: EnvironmentService,
|
||||
@@ -244,19 +262,27 @@ export class AiChatRunService implements OnModuleInit {
|
||||
|
||||
/**
|
||||
* Finalize a run to its terminal status (succeeded / failed / aborted),
|
||||
* stamping finishedAt + any error, and DROP its in-memory entry. Best-effort.
|
||||
* stamping finishedAt + any error. Best-effort, but ROBUST against a transient
|
||||
* terminal-write failure (F6).
|
||||
*
|
||||
* IDEMPOTENT (#184 review): the terminal write happens AT MOST ONCE per run.
|
||||
* `this.active.delete(runId)` returns false when the run was already settled
|
||||
* (its in-memory entry already dropped); in that case we no-op. This collapses
|
||||
* a legitimate double-settle to a single write: AiChatService.stream wraps the
|
||||
* turn in a safety-net catch that settles the run to 'error' on any failure
|
||||
* BEFORE streamText's terminal callbacks own the lifecycle — and on the rare
|
||||
* path where streamText DID attach (so a callback also settles) the two would
|
||||
* otherwise both call onSettled. The first caller wins and writes the terminal
|
||||
* row; the second returns early, so a late settle can never clobber the real
|
||||
* terminal status or double-write. beginRun always registers the entry before
|
||||
* the turn can settle, so a legitimate first finalize always finds it.
|
||||
* ORDER MATTERS (F6): the terminal UPDATE happens FIRST; only once it SUCCEEDS
|
||||
* do we record the run as settled and drop its in-memory entry. If the UPDATE
|
||||
* fails on every bounded attempt we KEEP the in-memory entry and do NOT mark it
|
||||
* settled — so a later settle (a streamText callback, a requestStop -> onAbort,
|
||||
* a future sweep) can retry the terminal write. A run is therefore never
|
||||
* silently stranded 'running' (which would 409 every future turn in the chat
|
||||
* until a restart, since phase 1 has no periodic sweep).
|
||||
*
|
||||
* IDEMPOTENT on SUCCESS (#184 review): the terminal write happens AT MOST ONCE
|
||||
* per run. The once-gate keys off {@link settled} (the terminal row already
|
||||
* written), NOT off entry-deletion — so a dropped-then-retried write is still
|
||||
* allowed, while a genuine double-settle collapses to a single write.
|
||||
* AiChatService.stream wraps the turn in a safety-net catch that settles the run
|
||||
* to 'error' on any failure BEFORE streamText's terminal callbacks own the
|
||||
* lifecycle — and on the rare path where streamText DID attach (so a callback
|
||||
* also settles) both routes call onSettled. The FIRST to write the terminal row
|
||||
* wins; the second sees `settled` and returns early, so a late settle can never
|
||||
* clobber the real terminal status or double-write the row.
|
||||
*/
|
||||
async finalizeRun(
|
||||
runId: string,
|
||||
@@ -264,20 +290,45 @@ export class AiChatRunService implements OnModuleInit {
|
||||
turnStatus: TurnTerminalStatus,
|
||||
error?: string,
|
||||
): Promise<void> {
|
||||
if (!this.active.delete(runId)) return;
|
||||
try {
|
||||
await this.runRepo.update(runId, workspaceId, {
|
||||
status: mapTurnStatusToRun(turnStatus),
|
||||
finishedAt: new Date(),
|
||||
error: error ?? null,
|
||||
});
|
||||
} catch (err) {
|
||||
this.logger.warn(
|
||||
`Failed to finalize run ${runId}: ${
|
||||
err instanceof Error ? err.message : 'unknown error'
|
||||
}`,
|
||||
);
|
||||
// Already terminally written -> idempotent no-op.
|
||||
if (this.settled.has(runId)) return;
|
||||
|
||||
for (
|
||||
let attempt = 1;
|
||||
attempt <= AiChatRunService.FINALIZE_MAX_ATTEMPTS;
|
||||
attempt++
|
||||
) {
|
||||
try {
|
||||
await this.runRepo.update(runId, workspaceId, {
|
||||
status: mapTurnStatusToRun(turnStatus),
|
||||
finishedAt: new Date(),
|
||||
error: error ?? null,
|
||||
});
|
||||
// Terminal write landed: arm the once-gate, then (and only then) free the
|
||||
// chat's active slot by dropping the in-memory entry.
|
||||
this.settled.add(runId);
|
||||
this.active.delete(runId);
|
||||
return;
|
||||
} catch (err) {
|
||||
this.logger.warn(
|
||||
`Failed to finalize run ${runId} (attempt ${attempt}/${
|
||||
AiChatRunService.FINALIZE_MAX_ATTEMPTS
|
||||
}): ${err instanceof Error ? err.message : 'unknown error'}`,
|
||||
);
|
||||
if (attempt < AiChatRunService.FINALIZE_MAX_ATTEMPTS) {
|
||||
await this.delay(AiChatRunService.FINALIZE_RETRY_BASE_MS * attempt);
|
||||
}
|
||||
}
|
||||
}
|
||||
// Every attempt failed: deliberately KEEP the in-memory entry and leave the
|
||||
// run UNsettled so a later finalize/requestStop/sweep can retry — the run is
|
||||
// not stranded.
|
||||
}
|
||||
|
||||
/** Small async backoff between terminal-write retries (F6). Isolated so it is
|
||||
* trivial to stub/fake-time in tests. */
|
||||
private delay(ms: number): Promise<void> {
|
||||
return new Promise((resolve) => setTimeout(resolve, ms));
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
Reference in New Issue
Block a user