diff --git a/apps/server/src/core/telemetry/client-telemetry.module.spec.ts b/apps/server/src/core/telemetry/client-telemetry.module.spec.ts new file mode 100644 index 00000000..eadeba35 --- /dev/null +++ b/apps/server/src/core/telemetry/client-telemetry.module.spec.ts @@ -0,0 +1,47 @@ +import { ClientTelemetryModule } from './client-telemetry.module'; +import { VitalsController } from './vitals.controller'; +import { VitalsService } from './vitals.service'; + +// The register() gate is the CORE of the maintainer's E1=B decision: the public, +// unauthenticated /api/telemetry/vitals endpoint must be OFF by default, so a +// self-host deploy has no anonymous disk-fill surface into `client_metrics`. A +// regression that inverts the flag (or a truthiness bug where "" / "false" +// registers the route) would silently reopen that surface — pin it here. +describe('ClientTelemetryModule.register (E1=B gate)', () => { + const original = process.env.CLIENT_TELEMETRY_ENABLED; + afterEach(() => { + if (original === undefined) delete process.env.CLIENT_TELEMETRY_ENABLED; + else process.env.CLIENT_TELEMETRY_ENABLED = original; + }); + + it('OFF by default (flag unset) — no controller, no provider (endpoint absent)', () => { + delete process.env.CLIENT_TELEMETRY_ENABLED; + const mod = ClientTelemetryModule.register(); + expect(mod.controllers).toEqual([]); + expect(mod.providers).toEqual([]); + }); + + it.each(['false', 'False', '0', '', 'yes', '1'])( + 'stays OFF for non-"true" value %p (no route)', + (val) => { + process.env.CLIENT_TELEMETRY_ENABLED = val; + const mod = ClientTelemetryModule.register(); + expect(mod.controllers).toEqual([]); + expect(mod.providers).toEqual([]); + }, + ); + + it('ON only for "true" — registers VitalsController + VitalsService', () => { + process.env.CLIENT_TELEMETRY_ENABLED = 'true'; + const mod = ClientTelemetryModule.register(); + expect(mod.controllers).toContain(VitalsController); + expect(mod.providers).toContain(VitalsService); + }); + + it('ON is case-insensitive ("TRUE")', () => { + process.env.CLIENT_TELEMETRY_ENABLED = 'TRUE'; + const mod = ClientTelemetryModule.register(); + expect(mod.controllers).toContain(VitalsController); + expect(mod.providers).toContain(VitalsService); + }); +}); diff --git a/apps/server/src/database/database.module.ts b/apps/server/src/database/database.module.ts index ea56ed11..d0fb81a7 100644 --- a/apps/server/src/database/database.module.ts +++ b/apps/server/src/database/database.module.ts @@ -40,7 +40,10 @@ import { PageListener } from '@docmost/db/listeners/page.listener'; import { PostgresJSDialect } from 'kysely-postgres-js'; import * as postgres from 'postgres'; import { normalizePostgresUrl } from '../common/helpers'; -import { observeDbQuery } from '../integrations/metrics/metrics.registry'; +import { + observeDbQuery, + isMetricsEnabled, +} from '../integrations/metrics/metrics.registry'; import { firstSqlToken } from '../integrations/metrics/metrics.constants'; @Global() @@ -70,12 +73,16 @@ import { firstSqlToken } from '../integrations/metrics/metrics.constants'; plugins: [new CamelCasePlugin()], log: (event: LogEvent) => { // #355 — db_query_duration_seconds, labelled by the leading SQL token - // (bounded cardinality). No-op when METRICS_PORT is unset. Runs for - // every query, independent of the dev-only debug logging below. - observeDbQuery( - firstSqlToken(event.query.sql), - event.queryDurationMillis / 1000, - ); + // (bounded cardinality). Gated on isMetricsEnabled() so the token work + // (regex + Set lookup) is skipped entirely when metrics are OFF — not + // just observeDbQuery no-op'd — so a non-metrics deployment pays nothing + // per query. Runs independent of the dev-only debug logging below. + if (isMetricsEnabled()) { + observeDbQuery( + firstSqlToken(event.query.sql), + event.queryDurationMillis / 1000, + ); + } if (environmentService.getNodeEnv() !== 'development') return; const logger = new Logger(DatabaseModule.name); diff --git a/apps/server/src/integrations/metrics/metrics.constants.ts b/apps/server/src/integrations/metrics/metrics.constants.ts index 16ad4950..b802e2f0 100644 --- a/apps/server/src/integrations/metrics/metrics.constants.ts +++ b/apps/server/src/integrations/metrics/metrics.constants.ts @@ -30,28 +30,32 @@ export const JOB_BUCKETS = [ * the full query text would blow up label cardinality; the leading keyword is a * finite set. Unknown/empty queries collapse to `other`. */ +// The bounded set of SQL leading keywords used as db_query_duration_seconds +// labels. Module-const so it is built ONCE, not per query (this runs on every DB +// query when metrics are enabled). +const KNOWN_SQL_TOKENS = new Set([ + 'select', + 'insert', + 'update', + 'delete', + 'with', + 'begin', + 'commit', + 'rollback', + 'alter', + 'create', + 'drop', + 'truncate', + 'explain', +]); + export function firstSqlToken(sql: string | undefined): string { if (!sql) return 'other'; // Skip leading whitespace / comments and grab the first word. const match = /^[\s(]*([a-zA-Z]+)/.exec(sql); if (!match) return 'other'; const token = match[1].toLowerCase(); - const known = new Set([ - 'select', - 'insert', - 'update', - 'delete', - 'with', - 'begin', - 'commit', - 'rollback', - 'alter', - 'create', - 'drop', - 'truncate', - 'explain', - ]); - return known.has(token) ? token : 'other'; + return KNOWN_SQL_TOKENS.has(token) ? token : 'other'; } /** diff --git a/apps/server/src/integrations/metrics/metrics.server.ts b/apps/server/src/integrations/metrics/metrics.server.ts index 3d09547a..a5c4d3e8 100644 --- a/apps/server/src/integrations/metrics/metrics.server.ts +++ b/apps/server/src/integrations/metrics/metrics.server.ts @@ -73,5 +73,14 @@ export function closeMetricsServer(): Promise { if (!server) return Promise.resolve(); return new Promise((resolve) => { server.close(() => resolve()); + // server.close() stops accepting NEW connections but its callback does not + // fire until existing keep-alive sockets drain. The scraper (VictoriaMetrics/ + // vmagent) holds an idle HTTP keep-alive socket, so without this the callback + // — and thus shutdown — would hang until the scraper disconnects or the + // orchestrator escalates to SIGKILL on the kill-grace window. Force-close idle + // keep-alive sockets so close() completes immediately, and unref so this + // server never keeps the event loop alive on its own. + server.closeIdleConnections(); + server.unref(); }); }