From d7454c887ded2bddcbce3bdbf314c2b0511a2424 Mon Sep 17 00:00:00 2001
From: claude code agent 227 <claude_code@vvzvlad.xyz>
Date: Tue, 23 Jun 2026 05:16:20 +0300
Subject: [PATCH] =?UTF-8?q?fix(ai-chat):=20root-cause=20#140=20=E2=80=94?=
 =?UTF-8?q?=20stop=20aborting=20z.ai's=20slow=20first=20byte?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The AI chat stream to z.ai (GLM-5.2, api/coding/paas/v4) broke in
production on every heavy turn while `curl` to the same endpoint worked.

ROOT CAUSE (reproduced in ai-http.spec.ts): z.ai's coding endpoint is a
reasoning model with a long, variable TIME-TO-FIRST-RESPONSE-HEADER on a
heavy chat request (tools + system prompt + document + history) — it emits
nothing for tens of seconds before the first SSE byte. A trivial ping
returns <2s, which is why "test connection" always passed. `curl` succeeds
because it imposes no time-to-first-header limit.

The prior attempt (#141) made it STRICTLY worse: it set undici
`headersTimeout: 60_000` (aborting every heavy turn at ~60s — the prod logs
show ~61-62s failures) AND added `UND_ERR_HEADERS_TIMEOUT` to the RetryAgent
retry codes. Retrying a POST-with-body after a headers-timeout abort re-sends
the body against a torn-down request and throws
`UND_ERR_REQ_CONTENT_LENGTH_MISMATCH` — the exact production error.

Fix — behave like curl:
- Disable headersTimeout/bodyTimeout by default (0), env-overridable via
  AI_HTTP_HEADERS_TIMEOUT_MS / AI_HTTP_BODY_TIMEOUT_MS (sanitized so a typo
  can't crash the AI layer at import). The transport now waits for z.ai's
  slow first byte instead of killing the stream.
- Keep the RetryAgent reconnecting ONLY genuine connection resets on a fresh
  socket; never retry a header/body timeout (it corrupts the POST body).
- STT (transcribeJsonBase64) gains an explicit AbortSignal.timeout, since it
  shared aiFetch and previously relied on undici's default transport timeout.

Tests: loopback reproduction proving the #141 retry config yields
ContentLengthMismatch while the corrected set surfaces an honest
HeadersTimeout, plus a curl-parity test (a finite headersTimeout aborts a
slow first byte; aiFetch delivers the 200).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 .../src/integrations/ai/ai-http.spec.ts       | 201 +++++++++++++-----
 apps/server/src/integrations/ai/ai-http.ts    |  79 ++++---
 apps/server/src/integrations/ai/ai.service.ts |  14 ++
 3 files changed, 210 insertions(+), 84 deletions(-)
diff --git a/apps/server/src/integrations/ai/ai-http.spec.ts b/apps/server/src/integrations/ai/ai-http.spec.ts
index 4fbfab04..570b1b6d 100644
--- a/apps/server/src/integrations/ai/ai-http.spec.ts
+++ b/apps/server/src/integrations/ai/ai-http.spec.ts
@@ -1,12 +1,44 @@
 import * as http from 'node:http';
-import { RetryAgent } from 'undici';
-
-// A short header timeout makes the #140 "header stall" deterministic and fast.
-// Must be set BEFORE importing ai-http (the undici agents read it at module load).
-process.env.AI_HTTP_HEADERS_TIMEOUT_MS = '800';
+import { Agent, RetryAgent } from 'undici';
 
 import { aiFetch } from './ai-http';
 
+/**
+ * Spin up a throwaway loopback HTTP server. `onReq` decides how it responds (or
+ * deliberately stalls). Returns the base URL and a close fn.
+ */
+async function loopback(
+  onReq: (req: http.IncomingMessage, res: http.ServerResponse) => void,
+): Promise<{ url: string; close: () => Promise<void> }> {
+  const server = http.createServer(onReq);
+  await new Promise<void>((resolve) => server.listen(0, '127.0.0.1', resolve));
+  const { port } = server.address() as { port: number };
+  return {
+    url: `http://127.0.0.1:${port}`,
+    close: () =>
+      new Promise<void>((resolve) => server.close(() => resolve())),
+  };
+}
+
+async function codeChain(p: Promise<unknown>): Promise<string[]> {
+  try {
+    await p;
+    return ['<no-throw>'];
+  } catch (e) {
+    const chain: string[] = [];
+    let cur: unknown = e;
+    for (let d = 0; d < 4 && cur; d++) {
+      const x = cur as { code?: string; cause?: unknown };
+      // Capture BOTH the undici error .code (when present) and the stringified
+      // error (which carries the class name, e.g. RequestContentLengthMismatchError)
+      // so the chain is meaningful regardless of how the code surfaces.
+      chain.push(`${x.code ?? ''} ${String(x)}`.trim());
+      cur = x.cause;
+    }
+    return chain;
+  }
+}
+
 /**
  * Light, dependency-free unit checks for the shared AI HTTP layer. The module
  * constructs its undici dispatcher eagerly at import time, so importing it here
@@ -49,64 +81,117 @@ describe('ai-http', () => {
       spy.mockRestore();
     }
   });
-});
 
-/**
- * #140 regression: a provider that accepts the request but stalls without ever
- * sending response headers must FAIL FAST (at headersTimeout — set to 800ms
- * above, not undici's 300s default) and be RETRIED on a fresh connection.
- * headersTimeout only bounds time-to-headers, so a healthy fast response is
- * unaffected. Uses a real loopback server; makes no external network calls.
- */
-describe('aiFetch header-stall resilience (#140)', () => {
-  function makeServer(
-    handler: http.RequestListener,
-  ): Promise<{ url: string; close: () => Promise<void> }> {
-    return new Promise((resolve) => {
-      const server = http.createServer(handler);
-      server.listen(0, '127.0.0.1', () => {
-        const port = (server.address() as { port: number }).port;
-        resolve({
-          url: `http://127.0.0.1:${port}/health`,
-          close: () => new Promise<void>((r) => server.close(() => r())),
-        });
+  /**
+   * #140 ROOT-CAUSE LOCK. These tests reproduce, against a loopback server that
+   * NEVER sends response headers (standing in for z.ai's slow-first-byte
+   * reasoning turn), exactly why the issue-#141 transport config "broke every
+   * time" — and assert the corrected config does not. They use SHORT finite
+   * timeouts on locally-built agents so the suite stays fast; the shipped
+   * `aiFetch` disables these timeouts entirely (see HEADERS_TIMEOUT_MS).
+   */
+  describe('#140 headers-timeout retry hazard', () => {
+    it('BAD (issue #141): retrying a POST on UND_ERR_HEADERS_TIMEOUT throws CONTENT_LENGTH_MISMATCH', async () => {
+      // A server that consumes the request body but never replies -> the client
+      // headersTimeout fires. This mirrors z.ai stalling before the first byte.
+      const srv = await loopback((req) => req.resume());
+      const base = new Agent({
+        headersTimeout: 300,
+        bodyTimeout: 300,
+        connect: { timeout: 1000 },
       });
+      // The exact mistake of #141: retry POST on the headers-timeout code.
+      const bad = new RetryAgent(base, {
+        maxRetries: 2,
+        minTimeout: 50,
+        maxTimeout: 100,
+        methods: ['POST'],
+        statusCodes: [],
+        errorCodes: ['ECONNRESET', 'UND_ERR_HEADERS_TIMEOUT'],
+      });
+      const chain = await codeChain(
+        fetch(`${srv.url}/chat/completions`, {
+          method: 'POST',
+          headers: { 'content-type': 'application/json' },
+          // Multibyte body, like a real Russian chat turn.
+          body: JSON.stringify({ msg: `Привет ${'слово '.repeat(20)}😀` }),
+          dispatcher: bad,
+        } as RequestInit),
+      );
+      // The hallmark production error (#140/#141) — proves the retry corrupts
+      // the re-sent POST body rather than surfacing the real timeout. undici
+      // exposes it as code UND_ERR_REQ_CONTENT_LENGTH_MISMATCH /
+      // RequestContentLengthMismatchError.
+      expect(chain.join('|')).toMatch(/ContentLengthMismatch|CONTENT_LENGTH/);
+      await bad.close(); // closing the RetryAgent wrapper closes the base pool
+      await srv.close();
     });
-  }
 
-  it('retries a header stall on a fresh connection and recovers', async () => {
-    let attempts = 0;
-    const { url, close } = await makeServer((_req, res) => {
-      attempts++;
-      // First attempt: never send headers -> UND_ERR_HEADERS_TIMEOUT -> retry.
-      if (attempts === 1) return;
-      res.writeHead(200, { 'content-type': 'application/json' });
-      res.end(JSON.stringify({ ok: true, servedOnAttempt: attempts }));
+    it('GOOD (this fix): NOT retrying the headers-timeout surfaces the honest UND_ERR_HEADERS_TIMEOUT', async () => {
+      const srv = await loopback((req) => req.resume());
+      const base = new Agent({
+        headersTimeout: 300,
+        bodyTimeout: 300,
+        connect: { timeout: 1000 },
+      });
+      // Our shipped retry set: connection resets only, NO timeout codes.
+      const good = new RetryAgent(base, {
+        maxRetries: 2,
+        minTimeout: 50,
+        maxTimeout: 100,
+        methods: ['POST'],
+        statusCodes: [],
+        errorCodes: ['ECONNRESET', 'UND_ERR_SOCKET', 'EPIPE'],
+      });
+      const chain = await codeChain(
+        fetch(`${srv.url}/chat/completions`, {
+          method: 'POST',
+          headers: { 'content-type': 'application/json' },
+          body: JSON.stringify({ msg: `Привет ${'слово '.repeat(20)}😀` }),
+          dispatcher: good,
+        } as RequestInit),
+      );
+      // The honest timeout (UND_ERR_HEADERS_TIMEOUT / HeadersTimeoutError) — no
+      // CONTENT_LENGTH_MISMATCH corruption.
+      expect(chain.join('|')).toMatch(/HeadersTimeout|HEADERS_TIMEOUT/);
+      expect(chain.join('|')).not.toMatch(/ContentLengthMismatch|CONTENT_LENGTH/);
+      await good.close(); // closing the RetryAgent wrapper closes the base pool
+      await srv.close();
     });
-    try {
-      const res = await aiFetch(url, { method: 'GET' });
-      expect(res.status).toBe(200);
-      const body = (await res.json()) as { servedOnAttempt: number };
-      expect(attempts).toBeGreaterThanOrEqual(2); // the stalled attempt was retried
-      expect(body.servedOnAttempt).toBeGreaterThanOrEqual(2);
-    } finally {
-      await close();
-    }
-  }, 15000);
 
-  it('passes a healthy fast response straight through (one attempt)', async () => {
-    let attempts = 0;
-    const { url, close } = await makeServer((_req, res) => {
-      attempts++;
-      res.writeHead(200, { 'content-type': 'application/json' });
-      res.end(JSON.stringify({ ok: true }));
-    });
-    try {
-      const res = await aiFetch(url, { method: 'GET' });
+    it('aiFetch awaits a slow-first-byte response that a finite headersTimeout would abort (curl parity)', async () => {
+      // The SAME server delays the response headers by 2.5s (z.ai's slow first
+      // byte). A control agent with a finite headersTimeout aborts it; the shipped
+      // aiFetch (timeouts disabled) must instead deliver the 200 — proving it waits
+      // like curl rather than killing the stream the way #141's 60s cap would. The
+      // gap is wide because undici's timeout timer wheel is coarse (~1s); a 2.5s
+      // delay vs a 1s control timeout makes the abort deterministic, not flaky.
+      const srv = await loopback((_req, res) => {
+        setTimeout(() => {
+          res.writeHead(200, { 'content-type': 'text/event-stream' });
+          res.end('data: ok\n\n');
+        }, 2500);
+      });
+      const url = `${srv.url}/chat/completions`;
+      const init: RequestInit = {
+        method: 'POST',
+        headers: { 'content-type': 'application/json' },
+        body: JSON.stringify({ msg: 'Привет 😀' }),
+      };
+
+      // CONTROL: a finite headersTimeout (like #141) aborts the slow first byte.
+      const finite = new Agent({ headersTimeout: 1000, bodyTimeout: 1000 });
+      const controlChain = await codeChain(
+        fetch(url, { ...init, dispatcher: finite } as RequestInit),
+      );
+      expect(controlChain.join('|')).toMatch(/HeadersTimeout|HEADERS_TIMEOUT/);
+      await finite.close();
+
+      // THE FIX: aiFetch (disabled timeouts) waits and delivers the 200.
+      const res = await aiFetch(url, init);
       expect(res.status).toBe(200);
-      expect(attempts).toBe(1);
-    } finally {
-      await close();
-    }
-  }, 15000);
+      await res.text();
+      await srv.close();
+    });
+  });
 });
diff --git a/apps/server/src/integrations/ai/ai-http.ts b/apps/server/src/integrations/ai/ai-http.ts
index 403b0f7d..9e34fa1c 100644
--- a/apps/server/src/integrations/ai/ai-http.ts
+++ b/apps/server/src/integrations/ai/ai-http.ts
@@ -7,9 +7,7 @@ import { Logger } from '@nestjs/common';
  * WHY THIS EXISTS
  * ---------------
  * Production logs showed the AI chat stream (and title generation) failing with
- * `read ECONNRESET` after the AI SDK's own retries were exhausted, and
- * (z.ai GLM coding endpoint, #140) intermittently stalling without ever sending
- * response headers until undici's 300s default cut the request with no retry. The provider
+ * `read ECONNRESET` after the AI SDK's own retries were exhausted. The provider
  * clients were built with NO custom `fetch`, so all outbound LLM traffic used
  * Node's default global undici agent: default keep-alive pooling and NO
  * transport-level reconnect on connection resets. `read ECONNRESET` is a TCP RST
@@ -41,22 +39,48 @@ import { Logger } from '@nestjs/common';
  * and the payload was partially consumed" instead of the raw ECONNRESET. The
  * stream is NEVER corrupted (undici guards against concatenation) — only the
  * error message for that rarer mid-stream case changes.
+ *
+ * ROOT CAUSE OF #140 (z.ai chat stream "breaks every time", curl does not)
+ * ----------------------------------------------------------------------
+ * z.ai's coding endpoint (`api/coding/paas/v4`, GLM-5.2 — a REASONING model)
+ * has a long, variable TIME-TO-FIRST-RESPONSE-HEADER on a heavy chat request
+ * (tools + system prompt + injected document + history): it sits in its
+ * reasoning phase emitting nothing for tens of seconds before the first SSE
+ * byte. A trivial `generateText` ping returns in <2s, which is why the Settings
+ * "test connection" always passes while the real chat stalls. `curl` succeeds
+ * because it imposes NO time-to-first-header limit — it simply waits.
+ *
+ * The earlier "fix" (issue #141) made this STRICTLY WORSE in two ways:
+ *   1. It set `headersTimeout: 60_000`, so undici ABORTS every heavy turn at
+ *      ~60s — deterministically, before z.ai answers (the prod logs show every
+ *      request failing at ~61-62s). That is the "ломается КАЖДЫЙ раз" regression.
+ *   2. It added `UND_ERR_HEADERS_TIMEOUT` to the RetryAgent retry codes. Retrying
+ *      a POST-with-body after a headers-timeout abort makes undici re-send the
+ *      body against a torn-down request and throw
+ *      `UND_ERR_REQ_CONTENT_LENGTH_MISMATCH` — the exact production error.
+ *      (Reproduced in ai-http.spec.ts.)
+ *
+ * THE FIX: behave like curl. Disable the time-to-first-header / inter-chunk
+ * timeouts by default (env-overridable) so the transport WAITS for z.ai's slow
+ * first byte instead of aborting it, and NEVER retry a header/body timeout at the
+ * transport layer (only genuine connection resets are retried on a fresh socket).
  */
 
-// `headersTimeout` bounds time-to-FIRST-response-headers (before any body). It
-// is NOT the streaming budget: once headers arrive the SSE body streams freely,
-// unaffected by this value — so it is safe to keep SHORT. Some providers (seen
-// with the z.ai GLM coding endpoint, #140) intermittently accept the request but
-// never send response headers; undici's 300s default then hangs the user for
-// FIVE MINUTES before failing, with no retry. Cap it so a stalled request fails
-// FAST and is retried on a fresh connection (the retry usually lands on a healthy
-// path and responds in seconds). Env-overridable for ops tuning.
-const HEADERS_TIMEOUT_MS =
-  Number(process.env.AI_HTTP_HEADERS_TIMEOUT_MS) || 60_000;
-// `bodyTimeout` bounds the gap BETWEEN streamed body chunks (not total stream
-// length). Kept generous so a legitimately slow/thinking model with sparse SSE
-// chunks is never killed mid-stream. Env-overridable.
-const BODY_TIMEOUT_MS = Number(process.env.AI_HTTP_BODY_TIMEOUT_MS) || 300_000;
+// Time-to-FIRST-response-headers (`headersTimeout`) and gap-between-streamed-
+// body-chunks (`bodyTimeout`) bounds, in ms. Default 0 = DISABLED — wait like
+// `curl`. See the #140 root-cause note above: any finite headersTimeout (undici's
+// 300s default, or the 60s of #141) aborts z.ai's slow-first-byte reasoning turn
+// before it answers. Operators who want a finite cap can set the env vars.
+//
+// undici REQUIRES a non-negative integer here and throws at construction time on
+// anything else, so a typo'd env value (e.g. "60s") must NOT reach it — that
+// would crash the whole AI layer at import. Sanitize: unset/invalid/negative → 0.
+const envTimeoutMs = (name: string): number => {
+  const n = Number(process.env[name]);
+  return Number.isInteger(n) && n >= 0 ? n : 0;
+};
+const HEADERS_TIMEOUT_MS = envTimeoutMs('AI_HTTP_HEADERS_TIMEOUT_MS');
+const BODY_TIMEOUT_MS = envTimeoutMs('AI_HTTP_BODY_TIMEOUT_MS');
 
 const baseAgent = new Agent({
   // Cap TCP/TLS connect so a stuck connect fails fast and gets retried instead
@@ -66,9 +90,9 @@ const baseAgent = new Agent({
   // a stale/half-closed socket can be reused, which is exactly the condition
   // that produces `read ECONNRESET`. Do NOT raise this.
   keepAliveTimeout: 4_000,
-  // Short time-to-headers (see HEADERS_TIMEOUT_MS) so a header stall fails fast
-  // and gets retried; generous per-chunk body timeout so real streams survive
-  // (see BODY_TIMEOUT_MS). Lowering headersTimeout does NOT truncate streams.
+  // 0 = disabled: wait for z.ai's slow first byte / streamed chunks like curl,
+  // instead of aborting the heavy chat stream prematurely (#140). Do NOT lower
+  // these to a finite value without re-reading the root-cause note above.
   headersTimeout: HEADERS_TIMEOUT_MS,
   bodyTimeout: BODY_TIMEOUT_MS,
 });
@@ -91,6 +115,15 @@ const dispatcher: Dispatcher = new RetryAgent(baseAgent, {
   // An explicit copy of undici 7.x's default connection-error code set, pinned
   // here so a future undici upgrade can't silently change which transport errors
   // we reconnect on. These are the errors we retry on a FRESH connection.
+  //
+  // CRITICAL — do NOT add timeout codes here:
+  //   - `UND_ERR_HEADERS_TIMEOUT` / `UND_ERR_BODY_TIMEOUT` are NOT connection
+  //     resets; retrying them re-sends a POST body against a torn-down request
+  //     and throws `UND_ERR_REQ_CONTENT_LENGTH_MISMATCH` (the #140/#141 prod
+  //     error — see ai-http.spec.ts), and merely re-incurs the same slow wait.
+  //     A header/body timeout means the upstream is slow, not that the socket is
+  //     poisoned — it must surface, not retry. (Timeouts are disabled anyway; see
+  //     HEADERS_TIMEOUT_MS / BODY_TIMEOUT_MS.)
   errorCodes: [
     'ECONNRESET',
     'ECONNREFUSED',
@@ -100,12 +133,6 @@ const dispatcher: Dispatcher = new RetryAgent(baseAgent, {
     'EHOSTDOWN',
     'EHOSTUNREACH',
     'UND_ERR_SOCKET',
-    // Added (NOT in undici's default set): a header timeout fires BEFORE any
-    // response body, so retrying is clean (no partially-consumed stream / Range
-    // problem) — and it is exactly the z.ai stall mode (#140), where a fresh
-    // retry usually succeeds. We deliberately do NOT retry UND_ERR_BODY_TIMEOUT
-    // (mid-body; partial SSE already delivered, not safe to resume).
-    'UND_ERR_HEADERS_TIMEOUT',
     'EPIPE',
   ],
 });
diff --git a/apps/server/src/integrations/ai/ai.service.ts b/apps/server/src/integrations/ai/ai.service.ts
index f5f4dca3..83fa7b9f 100644
--- a/apps/server/src/integrations/ai/ai.service.ts
+++ b/apps/server/src/integrations/ai/ai.service.ts
@@ -296,6 +296,9 @@ export class AiService {
       );
     }
     const url = `${baseURL.replace(/\/$/, '')}/audio/transcriptions`;
+    // Bound the STT call explicitly. aiFetch disables transport-level
+    // headers/body timeouts by default (see ai-http.ts #140 note), so without an
+    // app-level signal a hung STT endpoint would wait forever. Mirror embedTexts.
     const res = await aiFetch(url, {
       method: 'POST',
       headers: {
@@ -310,6 +313,7 @@ export class AiService {
           format,
         },
       }),
+      signal: AbortSignal.timeout(AiService.sttTimeoutMs()),
     });
     if (!res.ok) {
       // Surface status + body so the real reason reaches the user; never log the key.
@@ -376,6 +380,16 @@ export class AiService {
     return Number.isFinite(raw) && raw > 0 ? raw : 120_000;
   }
 
+  /**
+   * Per-transcription (STT) call timeout in ms. Configurable via
+   * AI_STT_TIMEOUT_MS; falls back to 120000 (2 min) when unset or invalid.
+   * Needed because aiFetch disables transport-level timeouts by default (#140).
+   */
+  private static sttTimeoutMs(): number {
+    const raw = Number(process.env.AI_STT_TIMEOUT_MS);
+    return Number.isFinite(raw) && raw > 0 ? raw : 120_000;
+  }
+
   // Build a tiny valid WAV (mono, 16-bit PCM, 16 kHz, ~1s of silence), used only
   // as a connectivity probe for the STT endpoint in testConnection.
   private static silentWavProbe(): Uint8Array {