From 7c308728de1a35fff2cb8949674a9926ae8afda9 Mon Sep 17 00:00:00 2001
From: claude_code <claude_code@vvzvlad.xyz>
Date: Tue, 23 Jun 2026 02:13:54 +0300
Subject: [PATCH] chore(ai-chat): add stream timing logs + env-gated aiFetch
 bypass (diagnostics)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The streaming chat turn hangs in all browsers while the non-streaming test
endpoint works — both use the same model/transport (createOpenAI + aiFetch),
so the suspect is the streaming path / custom undici RetryAgent transport.

- ai-http.ts: wrap aiFetch with per-request timing logs (start, ms-to-headers
  on success, elapsed ms + cause on failure). Chat at info, embeddings at
  debug. Only host+path logged.
- ai-chat.controller.ts / ai-chat.service.ts: log turn START, first-chunk
  latency, FINISHED duration, and elapsed ms on disconnect/error/abort.
- ai.service.ts: AI_BYPASS_RESILIENT_FETCH=true makes the CHAT model omit
  fetch:aiFetch and use the default global fetch — isolates transport vs
  request-shape. Chat-only; embeddings/STT untouched; reversible via env.
- .env.example: document the flag.

No timeout/retry change. tsc clean; ai-chat + ai suites pass (292).
---
 .env.example                                  |  5 ++
 .../src/core/ai-chat/ai-chat.controller.ts    |  9 +++-
 .../src/core/ai-chat/ai-chat.service.ts       | 20 +++++++-
 apps/server/src/integrations/ai/ai-http.ts    | 47 ++++++++++++++++++-
 apps/server/src/integrations/ai/ai.service.ts | 31 ++++++++++--
 5 files changed, 103 insertions(+), 9 deletions(-)

diff --git a/.env.example b/.env.example
index 1948fafe..ae932ee5 100644
--- a/.env.example
+++ b/.env.example
@@ -128,6 +128,11 @@ MCP_DOCMOST_PASSWORD=
 # A slow/hung embeddings endpoint fails after this and the batch continues.
 # AI_EMBEDDING_TIMEOUT_MS=120000
 
+# Diagnostic: bypass the resilient outbound HTTP layer (custom undici RetryAgent)
+# for the CHAT model, using the default global fetch instead. Use only to isolate
+# a streaming/transport issue; leave unset in normal operation.
+# AI_BYPASS_RESILIENT_FETCH=true
+
 # --- Anonymous public-share AI assistant ---
 # Opt-in per workspace (AI settings -> "public share assistant"; off by default).
 # When enabled, anonymous visitors of a published share can ask an AI about that
diff --git a/apps/server/src/core/ai-chat/ai-chat.controller.ts b/apps/server/src/core/ai-chat/ai-chat.controller.ts
index 0870969e..965e8a84 100644
--- a/apps/server/src/core/ai-chat/ai-chat.controller.ts
+++ b/apps/server/src/core/ai-chat/ai-chat.controller.ts
@@ -142,6 +142,9 @@ export class AiChatController {
 
     const body = (req.body ?? {}) as AiChatStreamBody;
 
+    // Diagnostic timing baseline for this turn (see START / terminal logs below).
+    const startedAt = Date.now();
+
     // Resolve the agent role for this turn BEFORE hijack: existing chats read it
     // from ai_chats.role_id (authoritative), a new chat from body.roleId. The
     // role drives both the persona and the optional model override below.
@@ -167,7 +170,7 @@ export class AiChatController {
       // so log it here before aborting the agent loop.
       if (!res.raw.writableEnded) {
         this.logger.warn(
-          'AI chat stream: client disconnected before completion; aborting turn',
+          `AI chat stream: client disconnected before completion after ${Date.now() - startedAt}ms; aborting turn`,
         );
         controller.abort();
       }
@@ -175,6 +178,10 @@ export class AiChatController {
     req.raw.once('close', onClose);
     res.raw.once('finish', () => req.raw.off('close', onClose));
 
+    this.logger.log(
+      `AI chat stream START chat=${body.chatId ?? 'new'} ua="${req.headers['user-agent'] ?? ''}"`,
+    );
+
     // Commit to streaming: hijack so Fastify stops managing the response and
     // the AI SDK can write the UI-message stream directly to the Node socket.
     res.hijack();
diff --git a/apps/server/src/core/ai-chat/ai-chat.service.ts b/apps/server/src/core/ai-chat/ai-chat.service.ts
index 457682b8..c74b9945 100644
--- a/apps/server/src/core/ai-chat/ai-chat.service.ts
+++ b/apps/server/src/core/ai-chat/ai-chat.service.ts
@@ -192,6 +192,7 @@ export class AiChatService {
     model,
     role,
   }: AiChatStreamArgs): Promise<void> {
+    const turnStartedAt = Date.now();
     // Resolve / create the chat. A new chat is created when no valid chatId is
     // supplied or the supplied one does not belong to this workspace.
     let isNewChat = false;
@@ -380,6 +381,10 @@ export class AiChatService {
     const capturedSteps: StepLike[] = [];
     let inProgressText = '';
 
+    // Log only the FIRST streamed chunk so we can see the provider's observed
+    // time-to-first-token without flooding the log with every delta.
+    let firstChunkLogged = false;
+
     // NOTE: streamText is synchronous in v6 — do NOT await it. A synchronous
     // failure here (or in pipe below) would skip the terminal callbacks, so the
     // catch releases the leased external clients to avoid a connection leak.
@@ -404,6 +409,12 @@ export class AiChatService {
       prepareStep: ({ stepNumber }) => prepareAgentStep(stepNumber, system),
       abortSignal: signal,
       onChunk: ({ chunk }) => {
+        if (!firstChunkLogged) {
+          firstChunkLogged = true;
+          this.logger.log(
+            `AI chat stream first chunk (${chunk.type}) chat=${chatId} after ${Date.now() - turnStartedAt}ms`,
+          );
+        }
         // 'text-delta' is the assistant's prose; tool-call args are separate chunk
         // types — so this mirrors exactly what streams to the client.
         if (chunk.type === 'text-delta') inProgressText += chunk.text;
@@ -415,6 +426,9 @@ export class AiChatService {
         inProgressText = '';
       },
       onFinish: async ({ text, finishReason, totalUsage, usage, steps }) => {
+        this.logger.log(
+          `AI chat stream FINISHED chat=${chatId} in ${Date.now() - turnStartedAt}ms, ${steps.length} step(s)`,
+        );
         await persistAssistant({
           text,
           toolCalls: serializeSteps(steps),
@@ -444,6 +458,9 @@ export class AiChatService {
         const e = error as { stack?: string };
         const errorText = describeProviderError(error, String(error));
         this.logger.error(`AI chat stream error: ${errorText}`, e?.stack);
+        this.logger.warn(
+          `AI chat stream ERROR terminal chat=${chatId} after ${Date.now() - turnStartedAt}ms`,
+        );
         // Persist the PARTIAL answer streamed before the failure (text + any
         // finished tool steps) WITH the error in metadata, so the turn shows what
         // the user already saw plus the cause — not just a bare error.
@@ -466,7 +483,8 @@ export class AiChatService {
         // invisible in the logs. Log it (warn) so the abort is traceable.
         this.logger.warn(
           `AI chat stream aborted (chat ${chatId}) after ${steps.length} ` +
-            `step(s), ${partialChars} chars partial text; persisting partial turn.`,
+            `step(s), ${partialChars} chars partial text; persisting partial turn` +
+            ` after ${Date.now() - turnStartedAt}ms`,
         );
         await persistAssistant(
           buildPartialAssistantRecord(capturedSteps, inProgressText, 'aborted'),
diff --git a/apps/server/src/integrations/ai/ai-http.ts b/apps/server/src/integrations/ai/ai-http.ts
index 5182393c..b894ae9b 100644
--- a/apps/server/src/integrations/ai/ai-http.ts
+++ b/apps/server/src/integrations/ai/ai-http.ts
@@ -1,4 +1,5 @@
 import { Agent, RetryAgent, type Dispatcher } from 'undici';
+import { Logger } from '@nestjs/common';
 
 /**
  * Dedicated, resilient outbound HTTP layer for ALL AI provider calls.
@@ -83,11 +84,53 @@ const dispatcher: Dispatcher = new RetryAgent(baseAgent, {
   ],
 });
 
+const logger = new Logger('AiHttp');
+let requestSeq = 0;
+
 /**
  * A `fetch`-compatible function that routes the request through the shared,
  * resilient AI dispatcher. Injected into AI SDK provider factories via their
  * `fetch` option. Follows the repo convention (see mcp-clients.service.ts
  * `guardedFetch`).
+ *
+ * Wrapped with timing logs so provider latency is visible: for streaming
+ * responses `fetch` resolves when RESPONSE HEADERS arrive (the body streams
+ * after), so "in <ms>ms (headers received)" is exactly the provider's
+ * time-to-first-byte, and a rejection time pinpoints a headers/body timeout.
+ * Chat/Responses calls log at info; bulk embedding calls log at debug so RAG
+ * indexing never floods the logs. No secrets are logged — only host + pathname.
  */
-export const aiFetch: typeof fetch = (input, init) =>
-  fetch(input, { ...init, dispatcher } as RequestInit);
+export const aiFetch: typeof fetch = async (input, init) => {
+  const id = ++requestSeq;
+  const method = (init?.method ?? 'GET').toUpperCase();
+  const rawUrl =
+    typeof input === 'string'
+      ? input
+      : input instanceof URL
+        ? input.href
+        : (input as Request).url;
+  let path = rawUrl;
+  try {
+    const u = new URL(rawUrl);
+    path = u.host + u.pathname;
+  } catch {
+    // Non-absolute / unparseable URL: keep the raw string (still no secrets).
+  }
+  const isChat = /\/(chat\/completions|responses)\b/.test(path);
+  const log = (msg: string): void =>
+    isChat ? logger.log(msg) : logger.debug(msg);
+  const startedAt = performance.now();
+  log(`provider request #${id} -> ${method} ${path}`);
+  try {
+    const res = await fetch(input, { ...init, dispatcher } as RequestInit);
+    const ms = Math.round(performance.now() - startedAt);
+    log(`provider request #${id} <- ${res.status} in ${ms}ms (headers received)`);
+    return res;
+  } catch (err) {
+    const ms = Math.round(performance.now() - startedAt);
+    logger.warn(
+      `provider request #${id} x after ${ms}ms: ${(err as Error)?.message ?? String(err)}`,
+    );
+    throw err;
+  }
+};
diff --git a/apps/server/src/integrations/ai/ai.service.ts b/apps/server/src/integrations/ai/ai.service.ts
index 442ebf9b..f5f4dca3 100644
--- a/apps/server/src/integrations/ai/ai.service.ts
+++ b/apps/server/src/integrations/ai/ai.service.ts
@@ -133,6 +133,19 @@ export class AiService {
       throw new AiNotConfiguredException();
     }
 
+    // Diagnostic toggle: when AI_BYPASS_RESILIENT_FETCH=true the chat model
+    // bypasses the resilient aiFetch (custom undici RetryAgent) and uses the
+    // default global fetch. Isolates whether the streaming chat hang comes from
+    // the custom transport vs the request shape. Reversible via env, no rebuild.
+    const bypassResilientFetch =
+      process.env.AI_BYPASS_RESILIENT_FETCH === 'true';
+    if (bypassResilientFetch) {
+      this.logger.warn(
+        'AI chat: resilient aiFetch BYPASSED for chat model ' +
+          '(AI_BYPASS_RESILIENT_FETCH=true; using default fetch)',
+      );
+    }
+
     switch (driver) {
       case 'openai':
         // baseURL (when set) covers openai-compatible endpoints. Use Chat
@@ -141,14 +154,22 @@ export class AiService {
         // Responses API (/responses), which OpenAI-compatible gateways
         // (OpenRouter, etc.) reject on multi-turn requests (history with
         // assistant messages) → 400.
-        return createOpenAI({ apiKey, baseURL: baseUrl, fetch: aiFetch }).chat(
-          chatModel,
-        );
+        return createOpenAI({
+          apiKey,
+          baseURL: baseUrl,
+          ...(bypassResilientFetch ? {} : { fetch: aiFetch }),
+        }).chat(chatModel);
       case 'gemini':
-        return createGoogleGenerativeAI({ apiKey, fetch: aiFetch })(chatModel);
+        return createGoogleGenerativeAI({
+          apiKey,
+          ...(bypassResilientFetch ? {} : { fetch: aiFetch }),
+        })(chatModel);
       case 'ollama':
         // Ollama needs no API key.
-        return createOllama({ baseURL: baseUrl, fetch: aiFetch })(chatModel);
+        return createOllama({
+          baseURL: baseUrl,
+          ...(bypassResilientFetch ? {} : { fetch: aiFetch }),
+        })(chatModel);
       default:
         throw new AiNotConfiguredException();
     }