chore(ai): passive z.ai provider HTTP telemetry (#175)

Investigate the intermittent (~20-30%) long-turn failure "Lost connection to the AI provider" = AI_RetryError / read ECONNRESET on the gitmost->z.ai link (browser-agnostic, mid-turn). Pure instrumentation, no behavior change: - ai-http-diagnostics.ts: a passive fetch wrapper injected into the OpenAI-compatible (z.ai) client. Per provider HTTP call it logs time-to-headers/status on success, and on a pre-response rejection the latency, error code/cause, request-body size and idle-gap since the previous call. The Response is returned untouched (streaming intact), errors rethrown unchanged; no retry/timeout/dispatcher. - ai.service.ts: wire the instrumented fetch into the openai case only. Lets us classify the reset as connection-phase vs mid-stream before choosing a fix, without repeating the reverted RetryAgent (#140). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-24 21:24:05 +03:00
parent 04a418e1a6
commit 4cc8df836f
2 changed files with 91 additions and 1 deletions
--- a/apps/server/src/integrations/ai/ai-http-diagnostics.ts
+++ b/apps/server/src/integrations/ai/ai-http-diagnostics.ts
@@ -0,0 +1,75 @@
+import { Logger } from '@nestjs/common';
+
+/**
+ * DIAGNOSTIC (provider ECONNRESET investigation) — temporary.
+ *
+ * A PASSIVE, behavior-neutral wrapper around the global `fetch`, injected into
+ * the OpenAI-compatible provider client (`createOpenAI({ fetch })`, the z.ai
+ * path). Per provider HTTP call it logs: time-to-response-headers + status +
+ * request-body size on success; and on a pre-response rejection the failure
+ * latency + error code/cause + request-body size + the idle gap since the
+ * previous provider call. It NEVER retries, times out, swaps the dispatcher, or
+ * reads/clones the response body — the Response is returned untouched (streaming
+ * unaffected) and any error is rethrown unchanged.
+ *
+ * How to read the result (a long agentic turn makes one provider call per step):
+ *  - a failed turn whose last provider line is "PRE-RESPONSE FAILED ... ECONNRESET"
+ *    => the reset is in the CONNECTION phase of a step's request (the provider
+ *    never replied) — usually a poisoned keep-alive socket or the provider/middle
+ *    box resetting that request (large body / idle gap are the suspects, hence
+ *    reqBytes + idleSincePrevCall below).
+ *  - the last line is "OK status=200" and the turn still errors with NO
+ *    "PRE-RESPONSE FAILED" => the cut happened MID-STREAM (after headers), a
+ *    different failure mode.
+ *
+ * The seq/last-call timestamps are module-level, so under concurrent turns the
+ * idle-gap figure is approximate (fine for single-user reproduction).
+ */
+export function createDiagnosticFetch(context: string): typeof fetch {
+  const logger = new Logger(context);
+  let callSeq = 0;
+  let lastCallStartedAt: number | undefined;
+
+  return async (input: Parameters<typeof fetch>[0], init?: Parameters<typeof fetch>[1]): Promise<Response> => {
+    const callId = ++callSeq;
+    const startedAt = Date.now();
+    const idleSincePrev =
+      lastCallStartedAt === undefined ? undefined : startedAt - lastCallStartedAt;
+    lastCallStartedAt = startedAt;
+    // Request body size: the chat payload is a JSON string. Used to test whether
+    // failures correlate with the large accumulated context on later agent steps.
+    const body = init?.body as unknown;
+    const bodyBytes =
+      typeof body === 'string'
+        ? body.length
+        : body instanceof Uint8Array
+          ? body.byteLength
+          : undefined;
+    try {
+      // Delegate to global fetch; return the Response UNTOUCHED (never read/clone
+      // the body) so the streamed SSE response is unaffected.
+      const res = await fetch(input, init);
+      logger.log(
+        `provider HTTP DIAGNOSTIC: call#${callId} OK ` +
+          `headersAfter=${Date.now() - startedAt}ms status=${res.status} ` +
+          `reqBytes=${bodyBytes ?? 'n/a'} idleSincePrevCall=${idleSincePrev ?? 'n/a'}ms`,
+      );
+      return res;
+    } catch (err) {
+      // fetch() rejected => PRE-RESPONSE failure (no headers/body received yet):
+      // the connection/request phase. Log it and rethrow the SAME error.
+      const e = err as {
+        name?: string;
+        message?: string;
+        cause?: { code?: string; message?: string };
+      };
+      logger.warn(
+        `provider HTTP DIAGNOSTIC: call#${callId} PRE-RESPONSE FAILED ` +
+          `after=${Date.now() - startedAt}ms code=${e?.cause?.code ?? 'none'} ` +
+          `name=${e?.name ?? 'Error'} cause=${e?.cause?.message ?? e?.message ?? 'unknown'} ` +
+          `reqBytes=${bodyBytes ?? 'n/a'} idleSincePrevCall=${idleSincePrev ?? 'n/a'}ms`,
+      );
+      throw err;
+    }
+  };
+}
--- a/apps/server/src/integrations/ai/ai.service.ts
+++ b/apps/server/src/integrations/ai/ai.service.ts
@@ -14,6 +14,8 @@ import { AiNotConfiguredException } from './ai-not-configured.exception';
 import { AiEmbeddingNotConfiguredException } from './ai-embedding-not-configured.exception';
 import { AiSttNotConfiguredException } from './ai-stt-not-configured.exception';
 import { describeProviderError } from './ai-error.util';
+// DIAGNOSTIC (provider ECONNRESET investigation) — temporary.
+import { createDiagnosticFetch } from './ai-http-diagnostics';
 import { AiProviderCredentialsRepo } from '@docmost/db/repos/ai-chat/ai-provider-credentials.repo';
 import { SecretBoxService } from '../crypto/secret-box';
 import { AiDriver } from './ai.types';
@@ -43,6 +45,13 @@ export interface ChatModelOverride {
 export class AiService {
  private readonly logger = new Logger(AiService.name);

+  // DIAGNOSTIC (provider ECONNRESET investigation) — temporary: passive
+  // instrumentation of the OpenAI-compatible provider HTTP calls (z.ai).
+  // Logs call timing/outcome only — no behavior change.
+  private readonly aiDiagnosticFetch = createDiagnosticFetch(
+    'AiService:provider-http',
+  );
+
  constructor(
    private readonly aiSettings: AiSettingsService,
    private readonly aiProviderCredentialsRepo: AiProviderCredentialsRepo,
@@ -140,7 +149,13 @@ export class AiService {
        // Responses API (/responses), which OpenAI-compatible gateways
        // (OpenRouter, etc.) reject on multi-turn requests (history with
        // assistant messages) → 400.
-        return createOpenAI({ apiKey, baseURL: baseUrl }).chat(chatModel);
+        // DIAGNOSTIC (provider ECONNRESET investigation) — temporary: pass the
+        // passive instrumented fetch (logging only; no behavior change).
+        return createOpenAI({
+          apiKey,
+          baseURL: baseUrl,
+          fetch: this.aiDiagnosticFetch,
+        }).chat(chatModel);
      case 'gemini':
        return createGoogleGenerativeAI({ apiKey })(chatModel);
      case 'ollama':