From 28f4662e1c50dd583818903eac76b89149ac7436 Mon Sep 17 00:00:00 2001 From: claude code agent 227 Date: Tue, 23 Jun 2026 05:16:20 +0300 Subject: [PATCH] =?UTF-8?q?fix(ai-chat):=20root-cause=20#140=20=E2=80=94?= =?UTF-8?q?=20stop=20aborting=20z.ai's=20slow=20first=20byte?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The AI chat stream to z.ai (GLM-5.2, api/coding/paas/v4) broke in production on every heavy turn while `curl` to the same endpoint worked. ROOT CAUSE (reproduced in ai-http.spec.ts): z.ai's coding endpoint is a reasoning model with a long, variable TIME-TO-FIRST-RESPONSE-HEADER on a heavy chat request (tools + system prompt + document + history) — it emits nothing for tens of seconds before the first SSE byte. A trivial ping returns <2s, which is why "test connection" always passed. `curl` succeeds because it imposes no time-to-first-header limit. The prior attempt (#141) made it STRICTLY worse: it set undici `headersTimeout: 60_000` (aborting every heavy turn at ~60s — the prod logs show ~61-62s failures) AND added `UND_ERR_HEADERS_TIMEOUT` to the RetryAgent retry codes. Retrying a POST-with-body after a headers-timeout abort re-sends the body against a torn-down request and throws `UND_ERR_REQ_CONTENT_LENGTH_MISMATCH` — the exact production error. Fix — behave like curl: - Disable headersTimeout/bodyTimeout by default (0), env-overridable via AI_HTTP_HEADERS_TIMEOUT_MS / AI_HTTP_BODY_TIMEOUT_MS (sanitized so a typo can't crash the AI layer at import). The transport now waits for z.ai's slow first byte instead of killing the stream. - Keep the RetryAgent reconnecting ONLY genuine connection resets on a fresh socket; never retry a header/body timeout (it corrupts the POST body). - STT (transcribeJsonBase64) gains an explicit AbortSignal.timeout, since it shared aiFetch and previously relied on undici's default transport timeout. Tests: loopback reproduction proving the #141 retry config yields ContentLengthMismatch while the corrected set surfaces an honest HeadersTimeout, plus a curl-parity test (a finite headersTimeout aborts a slow first byte; aiFetch delivers the 200). Co-Authored-By: Claude Opus 4.8 --- .../src/integrations/ai/ai-http.spec.ts | 152 +++++++++++++++++- apps/server/src/integrations/ai/ai-http.ts | 57 ++++++- apps/server/src/integrations/ai/ai.service.ts | 14 ++ 3 files changed, 220 insertions(+), 3 deletions(-) diff --git a/apps/server/src/integrations/ai/ai-http.spec.ts b/apps/server/src/integrations/ai/ai-http.spec.ts index 044c18f9..570b1b6d 100644 --- a/apps/server/src/integrations/ai/ai-http.spec.ts +++ b/apps/server/src/integrations/ai/ai-http.spec.ts @@ -1,7 +1,44 @@ -import { RetryAgent } from 'undici'; +import * as http from 'node:http'; +import { Agent, RetryAgent } from 'undici'; import { aiFetch } from './ai-http'; +/** + * Spin up a throwaway loopback HTTP server. `onReq` decides how it responds (or + * deliberately stalls). Returns the base URL and a close fn. + */ +async function loopback( + onReq: (req: http.IncomingMessage, res: http.ServerResponse) => void, +): Promise<{ url: string; close: () => Promise }> { + const server = http.createServer(onReq); + await new Promise((resolve) => server.listen(0, '127.0.0.1', resolve)); + const { port } = server.address() as { port: number }; + return { + url: `http://127.0.0.1:${port}`, + close: () => + new Promise((resolve) => server.close(() => resolve())), + }; +} + +async function codeChain(p: Promise): Promise { + try { + await p; + return ['']; + } catch (e) { + const chain: string[] = []; + let cur: unknown = e; + for (let d = 0; d < 4 && cur; d++) { + const x = cur as { code?: string; cause?: unknown }; + // Capture BOTH the undici error .code (when present) and the stringified + // error (which carries the class name, e.g. RequestContentLengthMismatchError) + // so the chain is meaningful regardless of how the code surfaces. + chain.push(`${x.code ?? ''} ${String(x)}`.trim()); + cur = x.cause; + } + return chain; + } +} + /** * Light, dependency-free unit checks for the shared AI HTTP layer. The module * constructs its undici dispatcher eagerly at import time, so importing it here @@ -44,4 +81,117 @@ describe('ai-http', () => { spy.mockRestore(); } }); + + /** + * #140 ROOT-CAUSE LOCK. These tests reproduce, against a loopback server that + * NEVER sends response headers (standing in for z.ai's slow-first-byte + * reasoning turn), exactly why the issue-#141 transport config "broke every + * time" — and assert the corrected config does not. They use SHORT finite + * timeouts on locally-built agents so the suite stays fast; the shipped + * `aiFetch` disables these timeouts entirely (see HEADERS_TIMEOUT_MS). + */ + describe('#140 headers-timeout retry hazard', () => { + it('BAD (issue #141): retrying a POST on UND_ERR_HEADERS_TIMEOUT throws CONTENT_LENGTH_MISMATCH', async () => { + // A server that consumes the request body but never replies -> the client + // headersTimeout fires. This mirrors z.ai stalling before the first byte. + const srv = await loopback((req) => req.resume()); + const base = new Agent({ + headersTimeout: 300, + bodyTimeout: 300, + connect: { timeout: 1000 }, + }); + // The exact mistake of #141: retry POST on the headers-timeout code. + const bad = new RetryAgent(base, { + maxRetries: 2, + minTimeout: 50, + maxTimeout: 100, + methods: ['POST'], + statusCodes: [], + errorCodes: ['ECONNRESET', 'UND_ERR_HEADERS_TIMEOUT'], + }); + const chain = await codeChain( + fetch(`${srv.url}/chat/completions`, { + method: 'POST', + headers: { 'content-type': 'application/json' }, + // Multibyte body, like a real Russian chat turn. + body: JSON.stringify({ msg: `Привет ${'слово '.repeat(20)}😀` }), + dispatcher: bad, + } as RequestInit), + ); + // The hallmark production error (#140/#141) — proves the retry corrupts + // the re-sent POST body rather than surfacing the real timeout. undici + // exposes it as code UND_ERR_REQ_CONTENT_LENGTH_MISMATCH / + // RequestContentLengthMismatchError. + expect(chain.join('|')).toMatch(/ContentLengthMismatch|CONTENT_LENGTH/); + await bad.close(); // closing the RetryAgent wrapper closes the base pool + await srv.close(); + }); + + it('GOOD (this fix): NOT retrying the headers-timeout surfaces the honest UND_ERR_HEADERS_TIMEOUT', async () => { + const srv = await loopback((req) => req.resume()); + const base = new Agent({ + headersTimeout: 300, + bodyTimeout: 300, + connect: { timeout: 1000 }, + }); + // Our shipped retry set: connection resets only, NO timeout codes. + const good = new RetryAgent(base, { + maxRetries: 2, + minTimeout: 50, + maxTimeout: 100, + methods: ['POST'], + statusCodes: [], + errorCodes: ['ECONNRESET', 'UND_ERR_SOCKET', 'EPIPE'], + }); + const chain = await codeChain( + fetch(`${srv.url}/chat/completions`, { + method: 'POST', + headers: { 'content-type': 'application/json' }, + body: JSON.stringify({ msg: `Привет ${'слово '.repeat(20)}😀` }), + dispatcher: good, + } as RequestInit), + ); + // The honest timeout (UND_ERR_HEADERS_TIMEOUT / HeadersTimeoutError) — no + // CONTENT_LENGTH_MISMATCH corruption. + expect(chain.join('|')).toMatch(/HeadersTimeout|HEADERS_TIMEOUT/); + expect(chain.join('|')).not.toMatch(/ContentLengthMismatch|CONTENT_LENGTH/); + await good.close(); // closing the RetryAgent wrapper closes the base pool + await srv.close(); + }); + + it('aiFetch awaits a slow-first-byte response that a finite headersTimeout would abort (curl parity)', async () => { + // The SAME server delays the response headers by 2.5s (z.ai's slow first + // byte). A control agent with a finite headersTimeout aborts it; the shipped + // aiFetch (timeouts disabled) must instead deliver the 200 — proving it waits + // like curl rather than killing the stream the way #141's 60s cap would. The + // gap is wide because undici's timeout timer wheel is coarse (~1s); a 2.5s + // delay vs a 1s control timeout makes the abort deterministic, not flaky. + const srv = await loopback((_req, res) => { + setTimeout(() => { + res.writeHead(200, { 'content-type': 'text/event-stream' }); + res.end('data: ok\n\n'); + }, 2500); + }); + const url = `${srv.url}/chat/completions`; + const init: RequestInit = { + method: 'POST', + headers: { 'content-type': 'application/json' }, + body: JSON.stringify({ msg: 'Привет 😀' }), + }; + + // CONTROL: a finite headersTimeout (like #141) aborts the slow first byte. + const finite = new Agent({ headersTimeout: 1000, bodyTimeout: 1000 }); + const controlChain = await codeChain( + fetch(url, { ...init, dispatcher: finite } as RequestInit), + ); + expect(controlChain.join('|')).toMatch(/HeadersTimeout|HEADERS_TIMEOUT/); + await finite.close(); + + // THE FIX: aiFetch (disabled timeouts) waits and delivers the 200. + const res = await aiFetch(url, init); + expect(res.status).toBe(200); + await res.text(); + await srv.close(); + }); + }); }); diff --git a/apps/server/src/integrations/ai/ai-http.ts b/apps/server/src/integrations/ai/ai-http.ts index 650c5f60..9e34fa1c 100644 --- a/apps/server/src/integrations/ai/ai-http.ts +++ b/apps/server/src/integrations/ai/ai-http.ts @@ -39,8 +39,49 @@ import { Logger } from '@nestjs/common'; * and the payload was partially consumed" instead of the raw ECONNRESET. The * stream is NEVER corrupted (undici guards against concatenation) — only the * error message for that rarer mid-stream case changes. + * + * ROOT CAUSE OF #140 (z.ai chat stream "breaks every time", curl does not) + * ---------------------------------------------------------------------- + * z.ai's coding endpoint (`api/coding/paas/v4`, GLM-5.2 — a REASONING model) + * has a long, variable TIME-TO-FIRST-RESPONSE-HEADER on a heavy chat request + * (tools + system prompt + injected document + history): it sits in its + * reasoning phase emitting nothing for tens of seconds before the first SSE + * byte. A trivial `generateText` ping returns in <2s, which is why the Settings + * "test connection" always passes while the real chat stalls. `curl` succeeds + * because it imposes NO time-to-first-header limit — it simply waits. + * + * The earlier "fix" (issue #141) made this STRICTLY WORSE in two ways: + * 1. It set `headersTimeout: 60_000`, so undici ABORTS every heavy turn at + * ~60s — deterministically, before z.ai answers (the prod logs show every + * request failing at ~61-62s). That is the "ломается КАЖДЫЙ раз" regression. + * 2. It added `UND_ERR_HEADERS_TIMEOUT` to the RetryAgent retry codes. Retrying + * a POST-with-body after a headers-timeout abort makes undici re-send the + * body against a torn-down request and throw + * `UND_ERR_REQ_CONTENT_LENGTH_MISMATCH` — the exact production error. + * (Reproduced in ai-http.spec.ts.) + * + * THE FIX: behave like curl. Disable the time-to-first-header / inter-chunk + * timeouts by default (env-overridable) so the transport WAITS for z.ai's slow + * first byte instead of aborting it, and NEVER retry a header/body timeout at the + * transport layer (only genuine connection resets are retried on a fresh socket). */ +// Time-to-FIRST-response-headers (`headersTimeout`) and gap-between-streamed- +// body-chunks (`bodyTimeout`) bounds, in ms. Default 0 = DISABLED — wait like +// `curl`. See the #140 root-cause note above: any finite headersTimeout (undici's +// 300s default, or the 60s of #141) aborts z.ai's slow-first-byte reasoning turn +// before it answers. Operators who want a finite cap can set the env vars. +// +// undici REQUIRES a non-negative integer here and throws at construction time on +// anything else, so a typo'd env value (e.g. "60s") must NOT reach it — that +// would crash the whole AI layer at import. Sanitize: unset/invalid/negative → 0. +const envTimeoutMs = (name: string): number => { + const n = Number(process.env[name]); + return Number.isInteger(n) && n >= 0 ? n : 0; +}; +const HEADERS_TIMEOUT_MS = envTimeoutMs('AI_HTTP_HEADERS_TIMEOUT_MS'); +const BODY_TIMEOUT_MS = envTimeoutMs('AI_HTTP_BODY_TIMEOUT_MS'); + const baseAgent = new Agent({ // Cap TCP/TLS connect so a stuck connect fails fast and gets retried instead // of hanging indefinitely. @@ -49,8 +90,11 @@ const baseAgent = new Agent({ // a stale/half-closed socket can be reused, which is exactly the condition // that produces `read ECONNRESET`. Do NOT raise this. keepAliveTimeout: 4_000, - // Do NOT override headersTimeout/bodyTimeout — keep undici defaults so - // long-lived SSE streaming responses are not killed mid-stream. + // 0 = disabled: wait for z.ai's slow first byte / streamed chunks like curl, + // instead of aborting the heavy chat stream prematurely (#140). Do NOT lower + // these to a finite value without re-reading the root-cause note above. + headersTimeout: HEADERS_TIMEOUT_MS, + bodyTimeout: BODY_TIMEOUT_MS, }); const dispatcher: Dispatcher = new RetryAgent(baseAgent, { @@ -71,6 +115,15 @@ const dispatcher: Dispatcher = new RetryAgent(baseAgent, { // An explicit copy of undici 7.x's default connection-error code set, pinned // here so a future undici upgrade can't silently change which transport errors // we reconnect on. These are the errors we retry on a FRESH connection. + // + // CRITICAL — do NOT add timeout codes here: + // - `UND_ERR_HEADERS_TIMEOUT` / `UND_ERR_BODY_TIMEOUT` are NOT connection + // resets; retrying them re-sends a POST body against a torn-down request + // and throws `UND_ERR_REQ_CONTENT_LENGTH_MISMATCH` (the #140/#141 prod + // error — see ai-http.spec.ts), and merely re-incurs the same slow wait. + // A header/body timeout means the upstream is slow, not that the socket is + // poisoned — it must surface, not retry. (Timeouts are disabled anyway; see + // HEADERS_TIMEOUT_MS / BODY_TIMEOUT_MS.) errorCodes: [ 'ECONNRESET', 'ECONNREFUSED', diff --git a/apps/server/src/integrations/ai/ai.service.ts b/apps/server/src/integrations/ai/ai.service.ts index f5f4dca3..83fa7b9f 100644 --- a/apps/server/src/integrations/ai/ai.service.ts +++ b/apps/server/src/integrations/ai/ai.service.ts @@ -296,6 +296,9 @@ export class AiService { ); } const url = `${baseURL.replace(/\/$/, '')}/audio/transcriptions`; + // Bound the STT call explicitly. aiFetch disables transport-level + // headers/body timeouts by default (see ai-http.ts #140 note), so without an + // app-level signal a hung STT endpoint would wait forever. Mirror embedTexts. const res = await aiFetch(url, { method: 'POST', headers: { @@ -310,6 +313,7 @@ export class AiService { format, }, }), + signal: AbortSignal.timeout(AiService.sttTimeoutMs()), }); if (!res.ok) { // Surface status + body so the real reason reaches the user; never log the key. @@ -376,6 +380,16 @@ export class AiService { return Number.isFinite(raw) && raw > 0 ? raw : 120_000; } + /** + * Per-transcription (STT) call timeout in ms. Configurable via + * AI_STT_TIMEOUT_MS; falls back to 120000 (2 min) when unset or invalid. + * Needed because aiFetch disables transport-level timeouts by default (#140). + */ + private static sttTimeoutMs(): number { + const raw = Number(process.env.AI_STT_TIMEOUT_MS); + return Number.isFinite(raw) && raw > 0 ? raw : 120_000; + } + // Build a tiny valid WAV (mono, 16-bit PCM, 16 kHz, ~1s of silence), used only // as a connectivity probe for the STT endpoint in testConnection. private static silentWavProbe(): Uint8Array {