Merge pull request 'fix(ai-http): fail fast + retry on provider header stall (#140)' (#141) from fix/ai-stream-headers-timeout into develop

Reviewed-on: #141
2026-06-23 04:16:37 +03:00
parent 7884dc2e1a d6cd275469
commit 6efb865625
2 changed files with 94 additions and 3 deletions
--- a/apps/server/src/integrations/ai/ai-http.spec.ts
+++ b/apps/server/src/integrations/ai/ai-http.spec.ts
@@ -1,5 +1,10 @@
+import * as http from 'node:http';
 import { RetryAgent } from 'undici';

+// Short header timeout so the #140 "header stall" is deterministic and fast. Must be set BEFORE ai-http loads
+// (it reads this env var once, at module load). NOTE(review): relies on imports not being hoisted above this line.
+process.env.AI_HTTP_HEADERS_TIMEOUT_MS = '800';
+
 import { aiFetch } from './ai-http';

 /**
@@ -45,3 +50,63 @@ describe('ai-http', () => {
    }
  });
 });
+
+/**
+ * #140 regression: a provider that accepts the request but stalls without ever
+ * sending response headers must FAIL FAST (at headersTimeout — set to 800ms
+ * above, not undici's 300s default) and be RETRIED on a fresh connection.
+ * headersTimeout only bounds time-to-headers, so a healthy fast response is
+ * unaffected. Uses a real loopback server; makes no external network calls.
+ */
+describe('aiFetch header-stall resilience (#140)', () => {
+  function makeServer(
+    handler: http.RequestListener,
+  ): Promise<{ url: string; close: () => Promise<void> }> {
+    return new Promise((resolve) => {
+      const server = http.createServer(handler);
+      server.listen(0, '127.0.0.1', () => {
+        const port = (server.address() as { port: number }).port;
+        resolve({
+          url: `http://127.0.0.1:${port}/health`,
+          close: () => new Promise<void>((r) => server.close(() => r())),
+        });
+      });
+    });
+  }
+
+  it('retries a header stall on a fresh connection and recovers', async () => {
+    let attempts = 0;
+    const { url, close } = await makeServer((_req, res) => {
+      attempts++;
+      // First attempt: never send headers -> UND_ERR_HEADERS_TIMEOUT -> retry.
+      if (attempts === 1) return;
+      res.writeHead(200, { 'content-type': 'application/json' });
+      res.end(JSON.stringify({ ok: true, servedOnAttempt: attempts }));
+    });
+    try {
+      const res = await aiFetch(url, { method: 'GET' });
+      expect(res.status).toBe(200);
+      const body = (await res.json()) as { servedOnAttempt: number };
+      expect(attempts).toBeGreaterThanOrEqual(2); // the stalled attempt was retried
+      expect(body.servedOnAttempt).toBeGreaterThanOrEqual(2);
+    } finally {
+      await close();
+    }
+  }, 15000);
+
+  it('passes a healthy fast response straight through (one attempt)', async () => {
+    let attempts = 0;
+    const { url, close } = await makeServer((_req, res) => {
+      attempts++;
+      res.writeHead(200, { 'content-type': 'application/json' });
+      res.end(JSON.stringify({ ok: true }));
+    });
+    try {
+      const res = await aiFetch(url, { method: 'GET' });
+      expect(res.status).toBe(200);
+      expect(attempts).toBe(1);
+    } finally {
+      await close();
+    }
+  }, 15000);
+});
--- a/apps/server/src/integrations/ai/ai-http.ts
+++ b/apps/server/src/integrations/ai/ai-http.ts
@@ -7,7 +7,9 @@ import { Logger } from '@nestjs/common';
 * WHY THIS EXISTS
 * ---------------
 * Production logs showed the AI chat stream (and title generation) failing with
- * `read ECONNRESET` after the AI SDK's own retries were exhausted. The provider
+ * `read ECONNRESET` after the AI SDK's own retries were exhausted, and (z.ai GLM
+ * coding endpoint, #140) intermittently stalling without ever sending response
+ * headers until undici's 300s default `headersTimeout` aborted the request with no retry. The provider
 * clients were built with NO custom `fetch`, so all outbound LLM traffic used
 * Node's default global undici agent: default keep-alive pooling and NO
 * transport-level reconnect on connection resets. `read ECONNRESET` is a TCP RST
@@ -41,6 +43,21 @@ import { Logger } from '@nestjs/common';
 * error message for that rarer mid-stream case changes.
 */

+// `headersTimeout` bounds time-to-FIRST-response-headers (before any body). It
+// is NOT the streaming budget: once headers arrive the SSE body streams freely,
+// unaffected by this value — so it is safe to keep SHORT. Some providers (seen
+// with the z.ai GLM coding endpoint, #140) intermittently accept the request but
+// never send response headers; undici's 300s default then hangs the user for
+// FIVE MINUTES before failing, with no retry. Cap it so a stalled request fails
+// FAST and is retried on a fresh connection (the retry usually lands on a healthy
+// path and responds in seconds). Env-overridable; unset, non-numeric, or 0 falls back to 60s.
+const HEADERS_TIMEOUT_MS =
+  Number(process.env.AI_HTTP_HEADERS_TIMEOUT_MS) || 60_000;
+// `bodyTimeout` bounds the gap BETWEEN streamed body chunks (not total stream
+// length). Kept generous so a legitimately slow/thinking model with sparse SSE
+// chunks is not killed mid-stream. Env-overridable; unset, non-numeric, or 0 falls back to 300s.
+const BODY_TIMEOUT_MS = Number(process.env.AI_HTTP_BODY_TIMEOUT_MS) || 300_000;
+
 const baseAgent = new Agent({
  // Cap TCP/TLS connect so a stuck connect fails fast and gets retried instead
  // of hanging indefinitely.
@@ -49,8 +66,11 @@ const baseAgent = new Agent({
  // a stale/half-closed socket can be reused, which is exactly the condition
  // that produces `read ECONNRESET`. Do NOT raise this.
  keepAliveTimeout: 4_000,
-  // Do NOT override headersTimeout/bodyTimeout — keep undici defaults so
-  // long-lived SSE streaming responses are not killed mid-stream.
+  // Short time-to-headers (see HEADERS_TIMEOUT_MS) so a header stall fails fast
+  // and gets retried; generous per-chunk body timeout so real streams survive
+  // (see BODY_TIMEOUT_MS). Lowering headersTimeout does NOT truncate streams.
+  headersTimeout: HEADERS_TIMEOUT_MS,
+  bodyTimeout: BODY_TIMEOUT_MS,
 });

 const dispatcher: Dispatcher = new RetryAgent(baseAgent, {
@@ -80,6 +100,12 @@ const dispatcher: Dispatcher = new RetryAgent(baseAgent, {
    'EHOSTDOWN',
    'EHOSTUNREACH',
    'UND_ERR_SOCKET',
+    // Added (NOT in undici's default set): a header timeout fires BEFORE any
+    // response body, so retrying is clean (no partially-consumed stream / Range
+    // problem) — and it is exactly the z.ai stall mode (#140), where a fresh
+    // retry usually succeeds. We deliberately do NOT retry UND_ERR_BODY_TIMEOUT
+    // (mid-body; partial SSE already delivered, not safe to resume).
+    'UND_ERR_HEADERS_TIMEOUT',
    'EPIPE',
  ],
 });