diff --git a/.env.example b/.env.example index d3d63309..44acb5ca 100644 --- a/.env.example +++ b/.env.example @@ -242,3 +242,27 @@ MCP_DOCMOST_PASSWORD= # FAILS CLOSED if Redis is unavailable (default: 1,000,000 tokens per workspace # per rolling day). # SHARE_AI_WORKSPACE_TOKEN_BUDGET_PER_DAY=1000000 + +# --- Observability / perf metrics (#355) --- +# +# Two INDEPENDENT toggles, both OFF by default: +# +# 1) METRICS_PORT — the server-side Prometheus scrape endpoint. +# UNSET (default) => the whole prom subsystem is OFF: no registry, no +# collectors, and NOTHING is exposed on the main app port. There is NO +# default port — leaving it blank disables it. When set to a port (e.g. +# 9464), a SEPARATE bare node:http listener serves GET /metrics on that port +# only (never on the main :3000 app listener), for a scraper such as +# VictoriaMetrics/Prometheus reaching it as :/metrics. +# METRICS_PORT=9464 +# +# 2) CLIENT_TELEMETRY_ENABLED — the public client perf-telemetry sink. +# OFF by default. When true, the unauthenticated POST /api/telemetry/vitals +# endpoint is registered and browsers collect + send web-vitals / editor +# metrics into the `client_metrics` table (read directly by Grafana, separate +# from METRICS_PORT). Leave OFF unless you actually consume this data: the +# endpoint is public and the table has NO app-side retention, so enabling it +# requires an EXTERNAL pruner to bound `client_metrics` growth (the deployed +# infra prunes rows >90d via a maintenance container). When off, the endpoint +# does not exist and the client installs no observers. +# CLIENT_TELEMETRY_ENABLED=false diff --git a/apps/client/src/features/editor/page-editor.tsx b/apps/client/src/features/editor/page-editor.tsx index 8b2f2fc9..e1244ee5 100644 --- a/apps/client/src/features/editor/page-editor.tsx +++ b/apps/client/src/features/editor/page-editor.tsx @@ -94,6 +94,7 @@ import { isCollabSynced, } from "@/features/editor/editor-sync-state"; import { + isVitalsActive, measurePageOpen, reportEditorTx, } from "@/lib/telemetry/vitals"; @@ -356,32 +357,38 @@ export default function PageEditor({ handleScrollTo(editor); editorRef.current = editor; - // #355 — page_open_ms: this is the first editor-content render, so - // measure against any page-open mark set on the tree-row/link click. - measurePageOpen(); + // #355 — perf instrumentation. Skip ALL of it when telemetry is + // disabled (F1 flag off) or this session isn't sampled: no page-open + // measure, and crucially NO dispatch wrapping, so a non-collecting + // session pays zero per-transaction cost. + if (isVitalsActive()) { + // page_open_ms: this is the first editor-content render, so measure + // against any page-open mark set on the tree-row/link click. + measurePageOpen(); - // #355 — editor_tx_ms: time the SYNCHRONOUS part of applying each - // transaction (state.apply + updateState) by wrapping the view's - // dispatch. Only slow syncs (>8ms) are reported (see reportEditorTx), - // so the common path adds just one performance.now() pair. Passive: - // the original dispatch still runs unchanged. - try { - const view = editor.view as unknown as { - dispatch: (tr: unknown) => void; - }; - const originalDispatch = view.dispatch.bind(view); - view.dispatch = (tr: unknown) => { - const started = performance.now(); - originalDispatch(tr); - const elapsed = performance.now() - started; - try { - reportEditorTx(elapsed, editor.state.doc.content.size); - } catch { - // never let telemetry break editing - } - }; - } catch { - // if the view shape changes, skip editor_tx instrumentation + // editor_tx_ms: time the SYNCHRONOUS part of applying each + // transaction (state.apply + updateState) by wrapping the view's + // dispatch. Only slow syncs (>8ms) are reported (see reportEditorTx), + // so the common path adds just one performance.now() pair. Passive: + // the original dispatch still runs unchanged. + try { + const view = editor.view as unknown as { + dispatch: (tr: unknown) => void; + }; + const originalDispatch = view.dispatch.bind(view); + view.dispatch = (tr: unknown) => { + const started = performance.now(); + originalDispatch(tr); + const elapsed = performance.now() - started; + try { + reportEditorTx(elapsed, editor.state.doc.content.size); + } catch { + // never let telemetry break editing + } + }; + } catch { + // if the view shape changes, skip editor_tx instrumentation + } } } }, diff --git a/apps/client/src/lib/config.ts b/apps/client/src/lib/config.ts index adc5615d..4ee12ee2 100644 --- a/apps/client/src/lib/config.ts +++ b/apps/client/src/lib/config.ts @@ -47,6 +47,13 @@ export function isCompactPageTreeEnabled(): boolean { return castToBoolean(getConfigValue("COMPACT_PAGE_TREE", "true")); } +// #355 — operator toggle for client perf-telemetry. DEFAULT OFF: the server +// mirrors CLIENT_TELEMETRY_ENABLED into window.CONFIG; when off the client +// installs no observers and sends nothing (the sink endpoint doesn't exist). +export function isClientTelemetryEnabled(): boolean { + return castToBoolean(getConfigValue("CLIENT_TELEMETRY_ENABLED", "false")); +} + export function getAvatarUrl( avatarUrl: string, type: AvatarIconType = AvatarIconType.AVATAR, diff --git a/apps/client/src/lib/telemetry/vitals.ts b/apps/client/src/lib/telemetry/vitals.ts index b18bcba0..20a67bc5 100644 --- a/apps/client/src/lib/telemetry/vitals.ts +++ b/apps/client/src/lib/telemetry/vitals.ts @@ -8,6 +8,7 @@ import { type LCPMetricWithAttribution, type TTFBMetricWithAttribution, } from "web-vitals/attribution"; +import { isClientTelemetryEnabled } from "@/lib/config"; import { currentRouteTemplate } from "./route-template"; /** @@ -75,6 +76,16 @@ export function isVitalsSampled(): boolean { } } +/** + * True only when telemetry is BOTH enabled by the operator (F1 flag) AND this + * session is sampled. Callers outside initVitals (e.g. the editor dispatch + * wrapper) use this to skip ALL instrumentation cost on disabled/non-sampled + * sessions — no observers, no per-transaction timing. + */ +export function isVitalsActive(): boolean { + return isClientTelemetryEnabled() && isVitalsSampled(); +} + function truncateAttr(value: unknown): string | undefined { if (typeof value !== "string" || value.length === 0) return undefined; return value.slice(0, MAX_ATTR_LENGTH); @@ -126,7 +137,7 @@ export function reportClientMetric( value: number, extra?: { docSize?: number }, ): void { - if (!isVitalsSampled()) return; + if (!isVitalsActive()) return; if (!Number.isFinite(value)) return; enqueue({ name, @@ -200,6 +211,10 @@ export function initVitals(): void { if (initialised) return; initialised = true; + // Operator flag gate (F1, default OFF): when telemetry is disabled the sink + // endpoint does not even exist server-side, so install ZERO observers. + if (!isClientTelemetryEnabled()) return; + // Sampling gate is evaluated BEFORE any observer subscription. if (!isVitalsSampled()) return; diff --git a/apps/server/src/app.module.ts b/apps/server/src/app.module.ts index 381970fa..527035b9 100644 --- a/apps/server/src/app.module.ts +++ b/apps/server/src/app.module.ts @@ -96,7 +96,9 @@ try { AiModule, AiChatModule, MetricsModule, - ClientTelemetryModule, + // Gated OFF by default: only registers the public vitals sink controller + // when CLIENT_TELEMETRY_ENABLED=true (maintainer decision E1=B). + ClientTelemetryModule.register(), ...enterpriseModules, ], controllers: [AppController], diff --git a/apps/server/src/core/auth/auth.controller.ts b/apps/server/src/core/auth/auth.controller.ts index 84cdea96..f15d82e4 100644 --- a/apps/server/src/core/auth/auth.controller.ts +++ b/apps/server/src/core/auth/auth.controller.ts @@ -16,6 +16,7 @@ import { AUTH_THROTTLER, PAGE_TEMPLATE_THROTTLER, PUBLIC_SHARE_AI_THROTTLER, + VITALS_THROTTLER, } from '../../integrations/throttle/throttler-names'; import { LoginDto } from './dto/login.dto'; import { AuthService } from './services/auth.service'; @@ -184,16 +185,21 @@ export class AuthController { } // The global ThrottlerGuard applies ALL named throttlers to every route by - // default, so each non-AUTH bucket (AI chat, page template, public-share AI) - // is explicitly skipped here. collab-token is auth-guarded (JwtAuthGuard), - // per-user and client-cached, so those feature buckets are irrelevant to it; - // skipping them avoids spurious 429s when a user opens many pages in a short - // window. The AUTH bucket is skipped too for the same per-user, cached reason. + // default, so each non-AUTH bucket (AI chat, page template, public-share AI, + // client vitals) is explicitly skipped here. collab-token is auth-guarded + // (JwtAuthGuard), per-user and client-cached, so those feature buckets are + // irrelevant to it; skipping them avoids spurious 429s when a user opens many + // pages in a short window. The VITALS bucket must be skipped too: it is a + // process-wide named throttler, so without this skip its per-IP limit would + // silently cap collab-token (the one route that opts out of every other + // bucket) and break editing behind shared/NAT IPs. The AUTH bucket is skipped + // for the same per-user, cached reason. @SkipThrottle({ [AUTH_THROTTLER]: true, [AI_CHAT_THROTTLER]: true, [PAGE_TEMPLATE_THROTTLER]: true, [PUBLIC_SHARE_AI_THROTTLER]: true, + [VITALS_THROTTLER]: true, }) @UseGuards(JwtAuthGuard) @HttpCode(HttpStatus.OK) diff --git a/apps/server/src/core/telemetry/client-metrics.constants.ts b/apps/server/src/core/telemetry/client-metrics.constants.ts index 4f4cc70f..a23fd06f 100644 --- a/apps/server/src/core/telemetry/client-metrics.constants.ts +++ b/apps/server/src/core/telemetry/client-metrics.constants.ts @@ -37,6 +37,12 @@ export const MAX_ATTR_LENGTH = 120; // route label sanity cap (client sends a template like /s/:space/p/:slug). export const MAX_ROUTE_LENGTH = 200; +// `client_metrics.doc_size` is a Postgres `int` (int4). A garbage/huge docSize +// on a single event would overflow int4 and make Postgres reject the WHOLE +// batch INSERT, losing every event in it. Values outside this range are DROPPED +// to null (the event is still kept) so one bad field never loses the batch. +export const DOC_SIZE_MAX = 2147483647; // 2^31 - 1 (int4 max) + export interface ClientMetricRow { name: string; value: number; @@ -88,6 +94,12 @@ export function sanitizeVitalEvent( // Accept snake_case too, in case a client sends the raw column name. docSize = Math.trunc(e.doc_size as number); } + // Guard the int4 column: an out-of-range docSize would overflow int4 and make + // Postgres reject the whole batch INSERT. Drop the field (keep the event) + // rather than lose every other event in the batch. + if (docSize !== null && (docSize < 0 || docSize > DOC_SIZE_MAX)) { + docSize = null; + } return { name, value, rating, route, attr, docSize, workspaceId }; } diff --git a/apps/server/src/core/telemetry/client-telemetry.module.ts b/apps/server/src/core/telemetry/client-telemetry.module.ts index cf94efc0..a16d3275 100644 --- a/apps/server/src/core/telemetry/client-telemetry.module.ts +++ b/apps/server/src/core/telemetry/client-telemetry.module.ts @@ -1,4 +1,4 @@ -import { Module } from '@nestjs/common'; +import { DynamicModule, Module } from '@nestjs/common'; import { VitalsController } from './vitals.controller'; import { VitalsService } from './vitals.service'; @@ -7,9 +7,26 @@ import { VitalsService } from './vitals.service'; * persists web-vitals + custom client metrics into `client_metrics`. * Named ClientTelemetryModule to avoid confusion with the unrelated * integrations/telemetry (product usage ping) module. + * + * GATED OFF BY DEFAULT (maintainer decision E1=B). The public, unauthenticated + * endpoint is only registered when CLIENT_TELEMETRY_ENABLED=true — otherwise the + * route does NOT exist at all (no anonymous disk-fill surface, and no unbounded + * `client_metrics` growth on a self-host deploy without an external pruner). The + * client is told the same flag via window.CONFIG and skips sending when off. */ -@Module({ - controllers: [VitalsController], - providers: [VitalsService], -}) -export class ClientTelemetryModule {} +@Module({}) +export class ClientTelemetryModule { + static register(): DynamicModule { + // Read process.env directly (not EnvironmentService) so the toggle is + // resolved at module-registration time, identical to how the metrics + // subsystem reads METRICS_PORT. Absent/anything-but-"true" => OFF. + const enabled = + (process.env.CLIENT_TELEMETRY_ENABLED ?? '').toLowerCase() === 'true'; + + return { + module: ClientTelemetryModule, + controllers: enabled ? [VitalsController] : [], + providers: enabled ? [VitalsService] : [], + }; + } +} diff --git a/apps/server/src/core/telemetry/vitals.controller.ts b/apps/server/src/core/telemetry/vitals.controller.ts index 62812671..242dfa54 100644 --- a/apps/server/src/core/telemetry/vitals.controller.ts +++ b/apps/server/src/core/telemetry/vitals.controller.ts @@ -6,10 +6,16 @@ import { Req, UseGuards, } from '@nestjs/common'; -import { Throttle, ThrottlerGuard } from '@nestjs/throttler'; +import { SkipThrottle, Throttle, ThrottlerGuard } from '@nestjs/throttler'; import { FastifyRequest } from 'fastify'; import { Public } from '../../common/decorators/public.decorator'; -import { VITALS_THROTTLER } from '../../integrations/throttle/throttler-names'; +import { + AI_CHAT_THROTTLER, + AUTH_THROTTLER, + PAGE_TEMPLATE_THROTTLER, + PUBLIC_SHARE_AI_THROTTLER, + VITALS_THROTTLER, +} from '../../integrations/throttle/throttler-names'; import { VitalsService } from './vitals.service'; /** @@ -25,6 +31,17 @@ export class VitalsController { @Public() @UseGuards(ThrottlerGuard) + // The global ThrottlerGuard applies ALL named throttlers to every route, so + // every OTHER bucket must be skipped here — otherwise the strictest of them + // (public-share AI at 5/min) would override the intended vitals limit and cap + // this route at 5/min instead of 120/min. Skip them all so ONLY the VITALS + // bucket below applies. + @SkipThrottle({ + [AUTH_THROTTLER]: true, + [AI_CHAT_THROTTLER]: true, + [PAGE_TEMPLATE_THROTTLER]: true, + [PUBLIC_SHARE_AI_THROTTLER]: true, + }) @Throttle({ [VITALS_THROTTLER]: { limit: 120, ttl: 60_000 } }) @Post('vitals') @HttpCode(200) diff --git a/apps/server/src/core/telemetry/vitals.service.spec.ts b/apps/server/src/core/telemetry/vitals.service.spec.ts index 2fca1228..12fb78b9 100644 --- a/apps/server/src/core/telemetry/vitals.service.spec.ts +++ b/apps/server/src/core/telemetry/vitals.service.spec.ts @@ -119,4 +119,31 @@ describe('VitalsService.buildRows', () => { const rows = svc.buildRows({ events: [{ name: 'LCP', value: 1 }] }, null); expect(rows[0].workspaceId).toBeNull(); }); + + it('drops an out-of-int4-range docSize to null without losing the batch', () => { + const rows = svc.buildRows( + { + events: [ + // Garbage docSize overflowing int4 must NOT reject the whole batch: + // the field is dropped to null and the event is kept. + { name: 'editor_tx_ms', value: 10, docSize: 9_999_999_999 }, + { name: 'editor_tx_ms', value: 20, docSize: -5 }, + { name: 'editor_tx_ms', value: 30, docSize: 4096 }, + ], + }, + WS, + ); + expect(rows).toHaveLength(3); + expect(rows[0].docSize).toBeNull(); + expect(rows[1].docSize).toBeNull(); + expect(rows[2].docSize).toBe(4096); + }); + + it('keeps a docSize exactly at the int4 max', () => { + const rows = svc.buildRows( + { events: [{ name: 'editor_tx_ms', value: 1, docSize: 2147483647 }] }, + WS, + ); + expect(rows[0].docSize).toBe(2147483647); + }); }); diff --git a/apps/server/src/integrations/environment/environment.service.ts b/apps/server/src/integrations/environment/environment.service.ts index 15169152..6cad98bf 100644 --- a/apps/server/src/integrations/environment/environment.service.ts +++ b/apps/server/src/integrations/environment/environment.service.ts @@ -227,6 +227,22 @@ export class EnvironmentService { return compactTree === 'true'; } + /** + * Operator toggle for the public client-telemetry sink (#355). DEFAULT OFF: + * the unauthenticated POST /api/telemetry/vitals endpoint + client vitals + * collection are only wired when this is explicitly true. Kept SEPARATE from + * METRICS_PORT (the server Prometheus half) because Grafana reads the + * `client_metrics` table directly, independent of the scrape port — and + * because `client_metrics` has no app-side retention, so an operator must opt + * in and run an external pruner. + */ + isClientTelemetryEnabled(): boolean { + const enabled = this.configService + .get('CLIENT_TELEMETRY_ENABLED', 'false') + .toLowerCase(); + return enabled === 'true'; + } + getStripePublishableKey(): string { return this.configService.get('STRIPE_PUBLISHABLE_KEY'); } diff --git a/apps/server/src/integrations/metrics/metrics-server.lifecycle.ts b/apps/server/src/integrations/metrics/metrics-server.lifecycle.ts new file mode 100644 index 00000000..1c4de068 --- /dev/null +++ b/apps/server/src/integrations/metrics/metrics-server.lifecycle.ts @@ -0,0 +1,16 @@ +import { Injectable, OnModuleDestroy } from '@nestjs/common'; +import { closeMetricsServer } from './metrics.server'; + +/** + * Ties the bare node:http metrics scrape server (started in main.ts after the + * Fastify app is up, outside the DI container) into Nest's shutdown lifecycle. + * With `app.enableShutdownHooks()`, onModuleDestroy fires on SIGTERM/SIGINT and + * closes the listener so it is not left dangling (jest/e2e never exits, and a + * prod restart doesn't leak the port). No-op when metrics are disabled. + */ +@Injectable() +export class MetricsServerLifecycle implements OnModuleDestroy { + async onModuleDestroy(): Promise { + await closeMetricsServer(); + } +} diff --git a/apps/server/src/integrations/metrics/metrics.module.ts b/apps/server/src/integrations/metrics/metrics.module.ts index a89bdff0..4c73584f 100644 --- a/apps/server/src/integrations/metrics/metrics.module.ts +++ b/apps/server/src/integrations/metrics/metrics.module.ts @@ -1,13 +1,15 @@ import { Module } from '@nestjs/common'; import { MetricsBullService } from './metrics-bull.service'; +import { MetricsServerLifecycle } from './metrics-server.lifecycle'; /** * Wires the BullMQ collectors (#355). The queues are provided by the @Global * QueueModule (which exports BullModule), so no re-registration is needed here. * The HTTP histogram, DB-query and collab-store collectors live in module-level * singletons (metrics.registry) and are wired directly at their call sites. + * MetricsServerLifecycle closes the scrape server on shutdown. */ @Module({ - providers: [MetricsBullService], + providers: [MetricsBullService, MetricsServerLifecycle], }) export class MetricsModule {} diff --git a/apps/server/src/integrations/metrics/metrics.server.ts b/apps/server/src/integrations/metrics/metrics.server.ts index f1b90b15..3d09547a 100644 --- a/apps/server/src/integrations/metrics/metrics.server.ts +++ b/apps/server/src/integrations/metrics/metrics.server.ts @@ -3,13 +3,19 @@ import { Logger } from '@nestjs/common'; import { getMetricsRegistry, isMetricsEnabled } from './metrics.registry'; /** - * Start the Prometheus scrape endpoint on a SEPARATE port (default 9464, - * overridable via `METRICS_PORT`). This is a bare node:http server, NOT part of - * the Fastify app, so `/metrics` never exists on the public :3000 listener. + * Start the Prometheus scrape endpoint on a SEPARATE port, taken from + * `METRICS_PORT`. There is NO default port: when `METRICS_PORT` is unset the + * whole metrics subsystem is OFF and this returns null. This is a bare node:http + * server, NOT part of the Fastify app, so `/metrics` never exists on the public + * :3000 listener. * * Returns the http.Server (so callers can close it on shutdown) or null when - * metrics are disabled. + * metrics are disabled. The reference is also kept module-side so the Nest + * lifecycle (see MetricsModule) can close it on application shutdown without + * threading the handle back through the non-DI bootstrap. */ +let metricsServer: Server | null = null; + export function startMetricsServer(): Server | null { if (!isMetricsEnabled()) return null; @@ -52,5 +58,20 @@ export function startMetricsServer(): Server | null { logger.error(`Metrics server error: ${err?.message}`); }); + metricsServer = server; return server; } + +/** + * Close the metrics scrape server if one is running. Idempotent and safe to call + * when metrics are disabled (no server was ever started). Wired into Nest's + * shutdown lifecycle so the listener is not left dangling on shutdown. + */ +export function closeMetricsServer(): Promise { + const server = metricsServer; + metricsServer = null; + if (!server) return Promise.resolve(); + return new Promise((resolve) => { + server.close(() => resolve()); + }); +} diff --git a/apps/server/src/integrations/static/static.module.ts b/apps/server/src/integrations/static/static.module.ts index b7565d7f..220d5944 100644 --- a/apps/server/src/integrations/static/static.module.ts +++ b/apps/server/src/integrations/static/static.module.ts @@ -50,6 +50,10 @@ export class StaticModule implements OnModuleInit { : undefined, POSTHOG_HOST: this.environmentService.getPostHogHost(), POSTHOG_KEY: this.environmentService.getPostHogKey(), + // #355 — mirrors the server-side CLIENT_TELEMETRY_ENABLED gate so the + // client only collects/sends vitals when the operator opts in. + CLIENT_TELEMETRY_ENABLED: + this.environmentService.isClientTelemetryEnabled(), }; const windowScriptContent = ``; diff --git a/apps/server/src/main.ts b/apps/server/src/main.ts index e31f2ddc..04df81d8 100644 --- a/apps/server/src/main.ts +++ b/apps/server/src/main.ts @@ -195,8 +195,9 @@ async function bootstrap() { ); }); - // #355 — Prometheus scrape endpoint on a SEPARATE port (METRICS_PORT, default - // 9464), started after the app is up. No-op when METRICS_PORT is unset. + // #355 — Prometheus scrape endpoint on a SEPARATE port (METRICS_PORT), + // started after the app is up. No default port: a no-op when METRICS_PORT is + // unset. Closed on shutdown by MetricsServerLifecycle (MetricsModule). startMetricsServer(); }