- Invert the transport layers so the pre-response retry is OUTERMOST and the
provider-HTTP instrumentation is INNER. Before, the retry lived inside
createStreamingFetch (under the instrumentation), so a reset the retry
recovered from logged only a clean "OK status=200" — the
"PRE-RESPONSE FAILED ... ECONNRESET ... idleSincePrevCall" signal went blind
exactly when the fix works, and AI_STREAM_KEEPALIVE_MS couldn't be tuned from
prod data. Now createStreamingFetch is the dispatcher-bound BASE (no retry) and
a new withPreResponseRetry() wraps it; ai.service composes
withPreResponseRetry(createInstrumentedFetch('AiService:provider-http',
createStreamingFetch())), so every attempt — including recovered resets — flows
through the instrumentation. (Also expresses the keepAlive-config vs retry-
behavior boundary structurally, per review #3.)
- Add the retry-exhaustion test: a server that resets EVERY connection, asserting
the call rejects with a retryable connection error AND exactly
PRE_RESPONSE_CONNECT_RETRIES + 1 (= 3) requests reached the server — pinning the
bound and that the final error propagates (guards an off-by-one / infinite loop
/ swallowed error). Existing happy-retry + abort tests moved onto
withPreResponseRetry.
Verified on the stand: a normal turn still streams (reasoning + finish) and the
provider-HTTP telemetry still logs. server tsc + ai/mcp specs green (30).
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
236 lines
8.8 KiB
TypeScript
236 lines
8.8 KiB
TypeScript
import * as http from 'node:http';
|
|
import {
|
|
createStreamingFetch,
|
|
withPreResponseRetry,
|
|
streamTimeoutMs,
|
|
streamKeepAliveMs,
|
|
streamingDispatcherOptions,
|
|
isRetryableConnectError,
|
|
} from './ai-streaming-fetch';
|
|
|
|
/**
|
|
* #175: undici's default 300s headers/body timeouts severed long agent turns.
|
|
* The streaming fetch raises them to a generous-but-FINITE silence timeout (not
|
|
* 0 — a true hang must still break). We pin: the configured value + env override,
|
|
* that both dispatcher timeouts use it, and that a delayed response streams.
|
|
*/
|
|
describe('streamTimeoutMs', () => {
|
|
const ORIG = process.env.AI_STREAM_TIMEOUT_MS;
|
|
afterEach(() => {
|
|
if (ORIG === undefined) delete process.env.AI_STREAM_TIMEOUT_MS;
|
|
else process.env.AI_STREAM_TIMEOUT_MS = ORIG;
|
|
});
|
|
|
|
it('defaults to a generous-but-finite 15 minutes', () => {
|
|
delete process.env.AI_STREAM_TIMEOUT_MS;
|
|
expect(streamTimeoutMs()).toBe(900_000);
|
|
// Finite — NOT disabled (0 would let a hung provider leak forever).
|
|
expect(streamTimeoutMs()).toBeGreaterThan(0);
|
|
expect(Number.isFinite(streamTimeoutMs())).toBe(true);
|
|
});
|
|
|
|
it('honours a positive AI_STREAM_TIMEOUT_MS override', () => {
|
|
process.env.AI_STREAM_TIMEOUT_MS = '120000';
|
|
expect(streamTimeoutMs()).toBe(120000);
|
|
});
|
|
|
|
it('ignores an invalid / non-positive override (falls back to default)', () => {
|
|
for (const bad of ['0', '-5', 'abc', '']) {
|
|
process.env.AI_STREAM_TIMEOUT_MS = bad;
|
|
expect(streamTimeoutMs()).toBe(900_000);
|
|
}
|
|
});
|
|
|
|
it('applies the silence timeout + keep-alive recycle window to the dispatcher', () => {
|
|
delete process.env.AI_STREAM_TIMEOUT_MS;
|
|
delete process.env.AI_STREAM_KEEPALIVE_MS;
|
|
expect(streamingDispatcherOptions()).toEqual({
|
|
headersTimeout: 900_000,
|
|
bodyTimeout: 900_000,
|
|
keepAliveTimeout: 10_000,
|
|
keepAliveMaxTimeout: 10_000,
|
|
});
|
|
});
|
|
});
|
|
|
|
describe('streamKeepAliveMs', () => {
|
|
const ORIG = process.env.AI_STREAM_KEEPALIVE_MS;
|
|
afterEach(() => {
|
|
if (ORIG === undefined) delete process.env.AI_STREAM_KEEPALIVE_MS;
|
|
else process.env.AI_STREAM_KEEPALIVE_MS = ORIG;
|
|
});
|
|
|
|
it('defaults to 10s (recycle idle sockets so a NAT/proxy drop cannot poison reuse)', () => {
|
|
delete process.env.AI_STREAM_KEEPALIVE_MS;
|
|
expect(streamKeepAliveMs()).toBe(10_000);
|
|
});
|
|
|
|
it('honours a positive override and ignores invalid/non-positive', () => {
|
|
process.env.AI_STREAM_KEEPALIVE_MS = '4000';
|
|
expect(streamKeepAliveMs()).toBe(4000);
|
|
for (const bad of ['0', '-1', 'x', '']) {
|
|
process.env.AI_STREAM_KEEPALIVE_MS = bad;
|
|
expect(streamKeepAliveMs()).toBe(10_000);
|
|
}
|
|
});
|
|
});
|
|
|
|
describe('isRetryableConnectError', () => {
|
|
it('matches connection-level codes on the error or its cause', () => {
|
|
expect(isRetryableConnectError({ cause: { code: 'ECONNRESET' } })).toBe(true);
|
|
expect(isRetryableConnectError({ cause: { code: 'UND_ERR_SOCKET' } })).toBe(true);
|
|
expect(isRetryableConnectError({ code: 'ECONNREFUSED' })).toBe(true);
|
|
});
|
|
it('does NOT match aborts / unrelated errors', () => {
|
|
expect(isRetryableConnectError({ name: 'AbortError', cause: { code: 'ABORT_ERR' } })).toBe(false);
|
|
expect(isRetryableConnectError({ cause: { code: 'UND_ERR_HEADERS_TIMEOUT' } })).toBe(false);
|
|
expect(isRetryableConnectError(new Error('plain'))).toBe(false);
|
|
expect(isRetryableConnectError(undefined)).toBe(false);
|
|
});
|
|
});
|
|
|
|
describe('createStreamingFetch — against a delayed server', () => {
|
|
const ORIG = process.env.AI_STREAM_TIMEOUT_MS;
|
|
let server: http.Server;
|
|
let url: string;
|
|
// The server waits before sending ANY byte (a long time-to-first-token). It is
|
|
// > undici's ~1s timeout-timer granularity so a sub-second configured timeout
|
|
// fires deterministically in the load-bearing test below.
|
|
const DELAY = 1500;
|
|
|
|
beforeAll(async () => {
|
|
server = http.createServer((_req, res) => {
|
|
setTimeout(() => {
|
|
res.writeHead(200, { 'Content-Type': 'text/plain' });
|
|
res.end('ok');
|
|
}, DELAY);
|
|
});
|
|
await new Promise<void>((resolve) => server.listen(0, '127.0.0.1', resolve));
|
|
const addr = server.address() as import('node:net').AddressInfo;
|
|
url = `http://127.0.0.1:${addr.port}/`;
|
|
});
|
|
|
|
afterAll(async () => {
|
|
await new Promise<void>((resolve) => server.close(() => resolve()));
|
|
});
|
|
|
|
afterEach(() => {
|
|
if (ORIG === undefined) delete process.env.AI_STREAM_TIMEOUT_MS;
|
|
else process.env.AI_STREAM_TIMEOUT_MS = ORIG;
|
|
});
|
|
|
|
it('streams the delayed response at the default (generous) timeout', async () => {
|
|
delete process.env.AI_STREAM_TIMEOUT_MS; // default 15 min >> DELAY
|
|
const streamingFetch = createStreamingFetch();
|
|
const res = await streamingFetch(url);
|
|
expect(res.status).toBe(200);
|
|
expect(await res.text()).toBe('ok');
|
|
});
|
|
|
|
it('LOAD-BEARING: a sub-DELAY AI_STREAM_TIMEOUT_MS actually severs the response', async () => {
|
|
// Proves the configured dispatcher is wired into the fetch: with the timeout
|
|
// set below DELAY the call must reject with undici's headers-timeout. If the
|
|
// dispatcher were lost (fallback to global fetch's 300s default), the 1.5s
|
|
// response would slip through and this would NOT throw.
|
|
process.env.AI_STREAM_TIMEOUT_MS = '500';
|
|
const streamingFetch = createStreamingFetch();
|
|
let caught: unknown;
|
|
const startedAt = Date.now();
|
|
try {
|
|
await streamingFetch(url).then((r) => r.text());
|
|
} catch (e) {
|
|
caught = e;
|
|
}
|
|
// It rejected (a lost dispatcher -> global 300s default would NOT reject on a
|
|
// 1.5s response) and it did so BEFORE the response would have arrived (DELAY).
|
|
// Use `.name` (realm-safe) — undici's TypeError fails cross-realm instanceof.
|
|
expect(caught).toBeDefined();
|
|
expect((caught as Error)?.name).toBe('TypeError');
|
|
expect(Date.now() - startedAt).toBeLessThan(DELAY);
|
|
// When present, the undici cause is the headers timeout.
|
|
const code = (caught as { cause?: { code?: string } })?.cause?.code;
|
|
if (code) expect(code).toBe('UND_ERR_HEADERS_TIMEOUT');
|
|
});
|
|
});
|
|
|
|
describe('withPreResponseRetry', () => {
|
|
// The retry is the OUTERMOST layer (over the dispatcher-bound streaming fetch),
|
|
// matching ai.service's withPreResponseRetry(instrument(createStreamingFetch())).
|
|
// PRE_RESPONSE_CONNECT_RETRIES is 2 -> at most 3 total attempts.
|
|
const MAX_ATTEMPTS = 3;
|
|
let server: http.Server;
|
|
let url: string;
|
|
let requests = 0;
|
|
// 'first' resets only the first connection; 'all' resets every connection.
|
|
let resetMode: 'first' | 'all' = 'first';
|
|
|
|
const retryingFetch = () => withPreResponseRetry(createStreamingFetch());
|
|
|
|
beforeAll(async () => {
|
|
server = http.createServer((req, res) => {
|
|
requests += 1;
|
|
const shouldReset = resetMode === 'all' || requests === 1;
|
|
if (shouldReset) {
|
|
// Reset before any response byte (a poisoned/stale keep-alive socket).
|
|
const sock = req.socket as import('node:net').Socket & {
|
|
resetAndDestroy?: () => void;
|
|
};
|
|
if (typeof sock.resetAndDestroy === 'function') sock.resetAndDestroy();
|
|
else sock.destroy();
|
|
return;
|
|
}
|
|
res.writeHead(200, { 'Content-Type': 'text/plain' });
|
|
res.end('ok');
|
|
});
|
|
await new Promise<void>((resolve) => server.listen(0, '127.0.0.1', resolve));
|
|
const addr = server.address() as import('node:net').AddressInfo;
|
|
url = `http://127.0.0.1:${addr.port}/`;
|
|
});
|
|
|
|
afterAll(async () => {
|
|
await new Promise<void>((resolve) => server.close(() => resolve()));
|
|
});
|
|
|
|
beforeEach(() => {
|
|
requests = 0;
|
|
resetMode = 'first';
|
|
});
|
|
|
|
it('retries a pre-response reset on a fresh connection and succeeds', async () => {
|
|
resetMode = 'first';
|
|
const res = await retryingFetch()(url);
|
|
expect(res.status).toBe(200);
|
|
expect(await res.text()).toBe('ok');
|
|
// first request reset -> retry -> second request served.
|
|
expect(requests).toBe(2);
|
|
});
|
|
|
|
it('gives up after the retry bound and rethrows the original reset', async () => {
|
|
resetMode = 'all'; // every attempt resets -> retries exhaust
|
|
let caught: unknown;
|
|
try {
|
|
await retryingFetch()(url);
|
|
} catch (e) {
|
|
caught = e;
|
|
}
|
|
expect(caught).toBeDefined();
|
|
// A retryable connection error reached the caller (not swallowed).
|
|
expect(isRetryableConnectError(caught)).toBe(true);
|
|
// Bounded: exactly PRE_RESPONSE_CONNECT_RETRIES + 1 attempts hit the server
|
|
// (pins both the limit and that the final error propagates — guards an
|
|
// off-by-one or an infinite loop).
|
|
expect(requests).toBe(MAX_ATTEMPTS);
|
|
});
|
|
|
|
it('does NOT retry an aborted request (no retry storm)', async () => {
|
|
resetMode = 'all';
|
|
const ctrl = new AbortController();
|
|
ctrl.abort();
|
|
await expect(
|
|
retryingFetch()(url, { signal: ctrl.signal }),
|
|
).rejects.toBeDefined();
|
|
// Pre-aborted: the request never reached the server, so nothing was retried.
|
|
expect(requests).toBe(0);
|
|
});
|
|
});
|