From 4cc8df836f1bb88e45bca6252a5f7a0b4bd7176e Mon Sep 17 00:00:00 2001
From: claude_code
Date: Wed, 24 Jun 2026 21:24:05 +0300
Subject: [PATCH] chore(ai): passive z.ai provider HTTP telemetry (#175)

Investigate the intermittent (~20-30%) long-turn failure "Lost connection to
the AI provider" = AI_RetryError / read ECONNRESET on the gitmost->z.ai link
(browser-agnostic, mid-turn). Pure instrumentation, no behavior change:

- ai-http-diagnostics.ts: a passive fetch wrapper injected into the
  OpenAI-compatible (z.ai) client. Per provider HTTP call it logs
  time-to-headers/status on success, and on a pre-response rejection the
  latency, error code/cause, request-body size and idle-gap since the previous
  call. The Response is returned untouched (streaming intact), errors rethrown
  unchanged; no retry/timeout/dispatcher.
- ai.service.ts: wire the instrumented fetch into the openai case only.

Lets us classify the reset as connection-phase vs mid-stream before choosing a
fix, without repeating the reverted RetryAgent (#140).

Co-Authored-By: Claude Opus 4.8
---
 .../integrations/ai/ai-http-diagnostics.ts    | 75 +++++++++++++++++++
 apps/server/src/integrations/ai/ai.service.ts | 17 ++++-
 2 files changed, 91 insertions(+), 1 deletion(-)
 create mode 100644 apps/server/src/integrations/ai/ai-http-diagnostics.ts

diff --git a/apps/server/src/integrations/ai/ai-http-diagnostics.ts b/apps/server/src/integrations/ai/ai-http-diagnostics.ts
new file mode 100644
index 00000000..eb9beeb2
--- /dev/null
+++ b/apps/server/src/integrations/ai/ai-http-diagnostics.ts
@@ -0,0 +1,75 @@
+import { Logger } from '@nestjs/common';
+
+/**
+ * DIAGNOSTIC (provider ECONNRESET investigation) — temporary.
+ *
+ * A PASSIVE, behavior-neutral wrapper around the global `fetch`, injected into
+ * the OpenAI-compatible provider client (`createOpenAI({ fetch })`, the z.ai
+ * path). Per provider HTTP call it logs: time-to-response-headers + status +
+ * request-body size on success; and on a pre-response rejection the failure
+ * latency + error code/cause + request-body size + the idle gap since the
+ * previous provider call. It NEVER retries, times out, swaps the dispatcher, or
+ * reads/clones the response body — the Response is returned untouched (streaming
+ * unaffected) and any error is rethrown unchanged.
+ *
+ * How to read the result (a long agentic turn makes one provider call per step):
+ *   - a failed turn whose last provider line is "PRE-RESPONSE FAILED ... ECONNRESET"
+ *     => the reset is in the CONNECTION phase of a step's request (the provider
+ *     never replied) — usually a poisoned keep-alive socket or the provider/middle
+ *     box resetting that request (large body / idle gap are the suspects, hence
+ *     reqBytes + idleSincePrevCall below).
+ *   - the last line is "OK status=200" and the turn still errors with NO
+ *     "PRE-RESPONSE FAILED" => the cut happened MID-STREAM (after headers), a
+ *     different failure mode.
+ *
+ * The seq/last-call state is shared by every call through one wrapper, so under
+ * concurrent turns the idle-gap is approximate (fine for single-user reproduction).
+ */
+export function createDiagnosticFetch(context: string): typeof fetch {
+  const logger = new Logger(context);
+  let callSeq = 0;
+  let lastCallStartedAt: number | undefined;
+
+  return async (input: Parameters<typeof fetch>[0], init?: Parameters<typeof fetch>[1]): Promise<Response> => {
+    const callId = ++callSeq;
+    const startedAt = Date.now();
+    const idleSincePrev =
+      lastCallStartedAt === undefined ? undefined : startedAt - lastCallStartedAt;
+    lastCallStartedAt = startedAt;
+    // Request body size in UTF-8 bytes (the chat payload is a JSON string). Used to
+    // test whether failures correlate with the large context on later agent steps.
+    const body = init?.body as unknown;
+    const bodyBytes =
+      typeof body === 'string'
+        ? Buffer.byteLength(body, 'utf8')
+        : body instanceof Uint8Array
+          ? body.byteLength
+          : undefined;
+    try {
+      // Delegate to global fetch; return the Response UNTOUCHED (never read/clone
+      // the body) so the streamed SSE response is unaffected.
+      const res = await fetch(input, init);
+      logger.log(
+        `provider HTTP DIAGNOSTIC: call#${callId} OK ` +
+          `headersAfter=${Date.now() - startedAt}ms status=${res.status} ` +
+          `reqBytes=${bodyBytes ?? 'n/a'} idleSincePrevCall=${idleSincePrev ?? 'n/a'}ms`,
+      );
+      return res;
+    } catch (err) {
+      // fetch() rejected => PRE-RESPONSE failure (no headers/body received yet):
+      // the connection/request phase. Log it and rethrow the SAME error.
+      const e = err as {
+        name?: string;
+        message?: string;
+        cause?: { code?: string; message?: string };
+      };
+      logger.warn(
+        `provider HTTP DIAGNOSTIC: call#${callId} PRE-RESPONSE FAILED ` +
+          `after=${Date.now() - startedAt}ms code=${e?.cause?.code ?? 'none'} ` +
+          `name=${e?.name ?? 'Error'} cause=${e?.cause?.message ?? e?.message ?? 'unknown'} ` +
+          `reqBytes=${bodyBytes ?? 'n/a'} idleSincePrevCall=${idleSincePrev ?? 'n/a'}ms`,
+      );
+      throw err;
+    }
+  };
+}
diff --git a/apps/server/src/integrations/ai/ai.service.ts b/apps/server/src/integrations/ai/ai.service.ts
index 078de791..4f72d23b 100644
--- a/apps/server/src/integrations/ai/ai.service.ts
+++ b/apps/server/src/integrations/ai/ai.service.ts
@@ -14,6 +14,8 @@ import { AiNotConfiguredException } from './ai-not-configured.exception';
 import { AiEmbeddingNotConfiguredException } from './ai-embedding-not-configured.exception';
 import { AiSttNotConfiguredException } from './ai-stt-not-configured.exception';
 import { describeProviderError } from './ai-error.util';
+// DIAGNOSTIC (provider ECONNRESET investigation) — temporary.
+import { createDiagnosticFetch } from './ai-http-diagnostics';
 import { AiProviderCredentialsRepo } from '@docmost/db/repos/ai-chat/ai-provider-credentials.repo';
 import { SecretBoxService } from '../crypto/secret-box';
 import { AiDriver } from './ai.types';
@@ -43,6 +45,13 @@ export interface ChatModelOverride {
 export class AiService {
   private readonly logger = new Logger(AiService.name);
 
+  // DIAGNOSTIC (provider ECONNRESET investigation) — temporary: passive
+  // instrumentation of the OpenAI-compatible provider HTTP calls (z.ai).
+  // Logs call timing/outcome only — no behavior change.
+  private readonly aiDiagnosticFetch = createDiagnosticFetch(
+    'AiService:provider-http',
+  );
+
   constructor(
     private readonly aiSettings: AiSettingsService,
     private readonly aiProviderCredentialsRepo: AiProviderCredentialsRepo,
@@ -140,7 +149,13 @@ export class AiService {
         // Responses API (/responses), which OpenAI-compatible gateways
         // (OpenRouter, etc.) reject on multi-turn requests (history with
         // assistant messages) → 400.
-        return createOpenAI({ apiKey, baseURL: baseUrl }).chat(chatModel);
+        // DIAGNOSTIC (provider ECONNRESET investigation) — temporary: pass the
+        // passive instrumented fetch (logging only; no behavior change).
+        return createOpenAI({
+          apiKey,
+          baseURL: baseUrl,
+          fetch: this.aiDiagnosticFetch,
+        }).chat(chatModel);
       case 'gemini':
         return createGoogleGenerativeAI({ apiKey })(chatModel);
       case 'ollama':