chore(ai): passive z.ai provider HTTP telemetry (#175)

Investigate the intermittent (~20-30%) long-turn failure
"Lost connection to the AI provider" = AI_RetryError / read ECONNRESET
on the gitmost->z.ai link (browser-agnostic, mid-turn). Pure
instrumentation, no behavior change:

- ai-http-diagnostics.ts: a passive fetch wrapper injected into the
  OpenAI-compatible (z.ai) client. Per provider HTTP call it logs
  time-to-headers/status on success, and on a pre-response rejection the
  latency, error code/cause, request-body size and idle-gap since the
  previous call. The Response is returned untouched (streaming intact),
  errors rethrown unchanged; no retry/timeout/dispatcher.
- ai.service.ts: wire the instrumented fetch into the openai case only.

Lets us classify the reset as connection-phase vs mid-stream before
choosing a fix, without repeating the reverted RetryAgent (#140).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
claude_code
2026-06-24 21:24:05 +03:00
parent 04a418e1a6
commit 4cc8df836f
2 changed files with 91 additions and 1 deletions

View File

@@ -0,0 +1,75 @@
import { Logger } from '@nestjs/common';
/**
 * DIAGNOSTIC (provider ECONNRESET investigation) — temporary.
 *
 * Builds a PASSIVE, behavior-neutral wrapper around the global `fetch`, meant to
 * be injected into the OpenAI-compatible provider client
 * (`createOpenAI({ fetch })`, the z.ai path). Per provider HTTP call it logs:
 * time-to-response-headers + status + request-body size + idle gap on success;
 * and on a pre-response rejection the failure latency + error code/cause +
 * request-body size + the idle gap since the previous provider call. It NEVER
 * retries, times out, swaps the dispatcher, or reads/clones the response body —
 * the Response is returned untouched (streaming unaffected) and any error is
 * rethrown unchanged.
 *
 * How to read the result (a long agentic turn makes one provider call per step):
 *  - a failed turn whose last provider line is "PRE-RESPONSE FAILED ... ECONNRESET"
 *    => the reset is in the CONNECTION phase of a step's request (the provider
 *    never replied) — usually a poisoned keep-alive socket or the provider/middle
 *    box resetting that request (large body / idle gap are the suspects, hence
 *    reqBytes + idleSincePrevCall below).
 *  - the last line is "OK status=200" and the turn still errors with NO
 *    "PRE-RESPONSE FAILED" => the cut happened MID-STREAM (after headers), a
 *    different failure mode.
 *
 * `reqBytes` is the UTF-8 encoded size for string bodies and the byte length for
 * ArrayBuffer / typed-array bodies; any other body kind (or a body carried by a
 * `Request` object passed as `input`) is reported as "n/a".
 *
 * The call counter and last-call timestamp live in this factory's closure: they
 * are shared by every call made through ONE returned wrapper, and each call to
 * `createDiagnosticFetch` starts a fresh pair. Under concurrent turns going
 * through the same wrapper the idle-gap figure is therefore approximate (fine
 * for single-user reproduction).
 *
 * @param context Logger context name used for every diagnostic line.
 * @returns A `fetch`-compatible function that delegates to the global `fetch`.
 */
export function createDiagnosticFetch(context: string): typeof fetch {
  const logger = new Logger(context);
  let callSeq = 0;
  let lastCallStartedAt: number | undefined;

  return async (
    input: Parameters<typeof fetch>[0],
    init?: Parameters<typeof fetch>[1],
  ): Promise<Response> => {
    const callId = ++callSeq;
    const startedAt = Date.now();
    const idleSincePrev =
      lastCallStartedAt === undefined ? undefined : startedAt - lastCallStartedAt;
    lastCallStartedAt = startedAt;

    // Request body size: used to test whether failures correlate with the large
    // accumulated context on later agent steps.
    const bodyBytes = measureRequestBodyBytes(init?.body);
    // Both log lines end with the same size/idle-gap suffix.
    const sizeAndIdle =
      `reqBytes=${bodyBytes ?? 'n/a'} ` +
      `idleSincePrevCall=${formatMillis(idleSincePrev)}`;

    try {
      // Delegate to global fetch; return the Response UNTOUCHED (never read/clone
      // the body) so the streamed SSE response is unaffected.
      const res = await fetch(input, init);
      logger.log(
        `provider HTTP DIAGNOSTIC: call#${callId} OK ` +
          `headersAfter=${Date.now() - startedAt}ms status=${res.status} ` +
          sizeAndIdle,
      );
      return res;
    } catch (err) {
      // fetch() rejected => PRE-RESPONSE failure (no headers/body received yet):
      // the connection/request phase. Log it and rethrow the SAME error.
      // The cast only describes the fields we try to read; optional chaining
      // keeps this safe even if something other than an Error was thrown.
      const e = err as {
        name?: string;
        message?: string;
        cause?: { code?: string; message?: string };
      };
      logger.warn(
        `provider HTTP DIAGNOSTIC: call#${callId} PRE-RESPONSE FAILED ` +
          `after=${Date.now() - startedAt}ms code=${e?.cause?.code ?? 'none'} ` +
          `name=${e?.name ?? 'Error'} cause=${e?.cause?.message ?? e?.message ?? 'unknown'} ` +
          sizeAndIdle,
      );
      throw err;
    }
  };
}

/**
 * Size in bytes of a request body whose length is known without consuming it.
 * Strings are measured as UTF-8 (what goes on the wire), not UTF-16 code units.
 * Returns undefined for streams, FormData, Blob, etc.
 */
function measureRequestBodyBytes(body: unknown): number | undefined {
  if (typeof body === 'string') {
    return Buffer.byteLength(body, 'utf8');
  }
  if (body instanceof ArrayBuffer || ArrayBuffer.isView(body)) {
    return body.byteLength;
  }
  return undefined;
}

/** Renders a millisecond figure with its unit, or "n/a" when there is none. */
function formatMillis(value: number | undefined): string {
  return value === undefined ? 'n/a' : `${value}ms`;
}

View File

@@ -14,6 +14,8 @@ import { AiNotConfiguredException } from './ai-not-configured.exception';
import { AiEmbeddingNotConfiguredException } from './ai-embedding-not-configured.exception';
import { AiSttNotConfiguredException } from './ai-stt-not-configured.exception';
import { describeProviderError } from './ai-error.util';
// DIAGNOSTIC (provider ECONNRESET investigation) — temporary.
import { createDiagnosticFetch } from './ai-http-diagnostics';
import { AiProviderCredentialsRepo } from '@docmost/db/repos/ai-chat/ai-provider-credentials.repo';
import { SecretBoxService } from '../crypto/secret-box';
import { AiDriver } from './ai.types';
@@ -43,6 +45,13 @@ export interface ChatModelOverride {
export class AiService {
private readonly logger = new Logger(AiService.name);
// DIAGNOSTIC (provider ECONNRESET investigation) — temporary: passive
// instrumentation of the OpenAI-compatible provider HTTP calls (z.ai).
// Logs call timing/outcome only — no behavior change.
// A single wrapper is created per AiService instance, so its call counter and
// idle-gap timestamp are shared by every provider call routed through it;
// all lines are logged under the 'AiService:provider-http' logger context.
private readonly aiDiagnosticFetch = createDiagnosticFetch(
'AiService:provider-http',
);
constructor(
private readonly aiSettings: AiSettingsService,
private readonly aiProviderCredentialsRepo: AiProviderCredentialsRepo,
@@ -140,7 +149,13 @@ export class AiService {
// Responses API (/responses), which OpenAI-compatible gateways
// (OpenRouter, etc.) reject on multi-turn requests (history with
// assistant messages) → 400.
return createOpenAI({ apiKey, baseURL: baseUrl }).chat(chatModel);
// DIAGNOSTIC (provider ECONNRESET investigation) — temporary: pass the
// passive instrumented fetch (logging only; no behavior change).
return createOpenAI({
apiKey,
baseURL: baseUrl,
fetch: this.aiDiagnosticFetch,
}).chat(chatModel);
case 'gemini':
return createGoogleGenerativeAI({ apiKey })(chatModel);
case 'ollama':