diff --git a/apps/client/package.json b/apps/client/package.json index 76d617da..b922efe9 100644 --- a/apps/client/package.json +++ b/apps/client/package.json @@ -61,6 +61,7 @@ "react-clear-modal": "^2.0.18", "react-dom": "^18.3.1", "react-drawio": "1.0.7", + "web-vitals": "^5.1.0", "react-error-boundary": "6.1.1", "react-helmet-async": "3.0.0", "react-i18next": "16.5.8", diff --git a/apps/client/src/features/editor/page-editor.tsx b/apps/client/src/features/editor/page-editor.tsx index 9953d6c5..8b2f2fc9 100644 --- a/apps/client/src/features/editor/page-editor.tsx +++ b/apps/client/src/features/editor/page-editor.tsx @@ -93,6 +93,10 @@ import { isBodyEditable, isCollabSynced, } from "@/features/editor/editor-sync-state"; +import { + measurePageOpen, + reportEditorTx, +} from "@/lib/telemetry/vitals"; interface PageEditorProps { pageId: string; @@ -351,6 +355,34 @@ export default function PageEditor({ editor.storage.pageId = pageId; handleScrollTo(editor); editorRef.current = editor; + + // #355 — page_open_ms: this is the first editor-content render, so + // measure against any page-open mark set on the tree-row/link click. + measurePageOpen(); + + // #355 — editor_tx_ms: time the SYNCHRONOUS part of applying each + // transaction (state.apply + updateState) by wrapping the view's + // dispatch. Only slow syncs (>8ms) are reported (see reportEditorTx), + // so the common path adds just one performance.now() pair. Passive: + // the original dispatch still runs unchanged. + try { + const view = editor.view as unknown as { + dispatch: (tr: unknown) => void; + }; + const originalDispatch = view.dispatch.bind(view); + view.dispatch = (tr: unknown) => { + const started = performance.now(); + originalDispatch(tr); + const elapsed = performance.now() - started; + try { + reportEditorTx(elapsed, editor.state.doc.content.size); + } catch { + // never let telemetry break editing + } + }; + } catch { + // if the view shape changes, skip editor_tx instrumentation + } } }, onUpdate({ editor }) { diff --git a/apps/client/src/lib/telemetry/route-template.test.ts b/apps/client/src/lib/telemetry/route-template.test.ts new file mode 100644 index 00000000..aa798a3d --- /dev/null +++ b/apps/client/src/lib/telemetry/route-template.test.ts @@ -0,0 +1,35 @@ +import { describe, it, expect } from "vitest"; +import { templateRoute } from "./route-template"; + +describe("templateRoute", () => { + it("templates a space page path (never leaks slugs)", () => { + const t = templateRoute("/s/engineering/p/design-doc-abc123"); + expect(t).toBe("/s/:space/p/:slug"); + expect(t).not.toContain("engineering"); + expect(t).not.toContain("design-doc"); + }); + + it("templates share, redirect and space paths", () => { + expect(templateRoute("/share/abc/p/xyz")).toBe("/share/:shareId/p/:slug"); + expect(templateRoute("/share/p/xyz")).toBe("/share/p/:slug"); + expect(templateRoute("/p/some-slug")).toBe("/p/:slug"); + expect(templateRoute("/s/team")).toBe("/s/:space"); + expect(templateRoute("/s/team/trash")).toBe("/s/:space/trash"); + expect(templateRoute("/labels/urgent")).toBe("/labels/:label"); + }); + + it("keeps known static routes verbatim", () => { + expect(templateRoute("/home")).toBe("/home"); + expect(templateRoute("/settings/members")).toBe("/settings/members"); + expect(templateRoute("/")).toBe("/"); + }); + + it("normalises a trailing slash", () => { + expect(templateRoute("/s/team/p/slug/")).toBe("/s/:space/p/:slug"); + }); + + it("collapses unknown paths to 'other' (bounded cardinality)", () => { + expect(templateRoute("/weird/unknown/thing")).toBe("other"); + expect(templateRoute("/s/team/p/slug/extra/segments")).toBe("other"); + }); +}); diff --git a/apps/client/src/lib/telemetry/route-template.ts b/apps/client/src/lib/telemetry/route-template.ts new file mode 100644 index 00000000..45fe4855 --- /dev/null +++ b/apps/client/src/lib/telemetry/route-template.ts @@ -0,0 +1,70 @@ +/** + * Map a raw pathname to a BOUNDED route TEMPLATE (#355). + * + * Perf metrics must be labelled by route template only — never a raw path with + * slugs/ids — so the server-side `route` column and any downstream aggregation + * stay low-cardinality and carry NO page slugs/titles (privacy). Anything that + * does not match a known pattern collapses to `other`. + * + * The template vocabulary mirrors the issue's example (`/s/:space/p/:slug`). + */ +const ROUTE_PATTERNS: { re: RegExp; template: string }[] = [ + // Share pages (public). + { re: /^\/share\/[^/]+\/p\/[^/]+$/, template: '/share/:shareId/p/:slug' }, + { re: /^\/share\/p\/[^/]+$/, template: '/share/p/:slug' }, + { re: /^\/share\/[^/]+$/, template: '/share/:shareId' }, + // Page redirect. + { re: /^\/p\/[^/]+$/, template: '/p/:slug' }, + // Space + page. + { re: /^\/s\/[^/]+\/p\/[^/]+$/, template: '/s/:space/p/:slug' }, + { re: /^\/s\/[^/]+\/trash$/, template: '/s/:space/trash' }, + { re: /^\/s\/[^/]+$/, template: '/s/:space' }, + // Misc dynamic. + { re: /^\/labels\/[^/]+$/, template: '/labels/:label' }, + { re: /^\/invites\/[^/]+$/, template: '/invites/:invitationId' }, + { re: /^\/settings\/groups\/[^/]+$/, template: '/settings/groups/:groupId' }, +]; + +// Static routes we accept verbatim (finite set). +const STATIC_ROUTES = new Set([ + '/home', + '/spaces', + '/favorites', + '/login', + '/forgot-password', + '/password-reset', + '/setup/register', + '/settings/account/profile', + '/settings/account/preferences', + '/settings/workspace', + '/settings/ai', + '/settings/members', + '/settings/groups', + '/settings/spaces', + '/settings/sharing', +]); + +export function templateRoute(pathname: string): string { + // Normalise a trailing slash (except root). + const path = + pathname.length > 1 && pathname.endsWith('/') + ? pathname.slice(0, -1) + : pathname; + + if (path === '' || path === '/') return '/'; + if (STATIC_ROUTES.has(path)) return path; + + for (const { re, template } of ROUTE_PATTERNS) { + if (re.test(path)) return template; + } + return 'other'; +} + +/** Template for the current window location. */ +export function currentRouteTemplate(): string { + try { + return templateRoute(window.location.pathname); + } catch { + return 'other'; + } +} diff --git a/apps/client/src/lib/telemetry/vitals.ts b/apps/client/src/lib/telemetry/vitals.ts new file mode 100644 index 00000000..b18bcba0 --- /dev/null +++ b/apps/client/src/lib/telemetry/vitals.ts @@ -0,0 +1,275 @@ +import { + onCLS, + onINP, + onLCP, + onTTFB, + type CLSMetricWithAttribution, + type INPMetricWithAttribution, + type LCPMetricWithAttribution, + type TTFBMetricWithAttribution, +} from "web-vitals/attribution"; +import { currentRouteTemplate } from "./route-template"; + +/** + * Client perf-telemetry (#355): web-vitals + custom metrics buffered and posted + * to POST /api/telemetry/vitals via sendBeacon. + * + * Design constraints from the issue: + * - Sampling is decided ONCE per session (25%), cached in sessionStorage, + * BEFORE any observer is subscribed. Non-sampled sessions send nothing. + * - Route labels are TEMPLATES only; attr is truncated to 120 chars; no page + * titles/slugs/text ever leave the browser. + * - Observers are passive and reporting is best-effort — telemetry must not + * degrade the perf it measures. + */ + +const ENDPOINT = "/api/telemetry/vitals"; +const SAMPLE_RATE = 0.25; +const SAMPLE_KEY = "gm_vitals_sampled"; +const FLUSH_INTERVAL_MS = 15_000; +const MAX_BUFFER = 40; // flush early if the buffer fills between timers +const MAX_ATTR_LENGTH = 120; +const EDITOR_TX_MIN_MS = 8; // only report editor transactions slower than this + +const ALLOWED_NAMES = new Set([ + "INP", + "LCP", + "CLS", + "TTFB", + "editor_tx_ms", + "page_open_ms", + "longtask_ms", +]); + +interface VitalEvent { + name: string; + value: number; + rating?: string; + route?: string; + attr?: string; + docSize?: number; +} + +let sampledCache: boolean | null = null; +let initialised = false; +let buffer: VitalEvent[] = []; +let longtaskSum = 0; // accumulated longtask duration (ms) for the current window + +/** + * Decide once per session whether this session is sampled. Cached in + * sessionStorage so the choice is stable across reloads within the session and + * identical for every observer/custom-metric caller. + */ +export function isVitalsSampled(): boolean { + if (sampledCache !== null) return sampledCache; + try { + const stored = sessionStorage.getItem(SAMPLE_KEY); + if (stored === "1") return (sampledCache = true); + if (stored === "0") return (sampledCache = false); + const sampled = Math.random() < SAMPLE_RATE; + sessionStorage.setItem(SAMPLE_KEY, sampled ? "1" : "0"); + return (sampledCache = sampled); + } catch { + // sessionStorage unavailable (private mode / SSR): default to not sampled. + return (sampledCache = false); + } +} + +function truncateAttr(value: unknown): string | undefined { + if (typeof value !== "string" || value.length === 0) return undefined; + return value.slice(0, MAX_ATTR_LENGTH); +} + +function enqueue(event: VitalEvent): void { + if (!ALLOWED_NAMES.has(event.name)) return; + if (!Number.isFinite(event.value)) return; + buffer.push(event); + if (buffer.length >= MAX_BUFFER) flush(); +} + +function flush(): void { + // Fold any pending longtask total into the batch first. + if (longtaskSum > 0) { + buffer.push({ + name: "longtask_ms", + value: Math.round(longtaskSum), + route: currentRouteTemplate(), + }); + longtaskSum = 0; + } + if (buffer.length === 0) return; + + const payload = JSON.stringify({ events: buffer }); + buffer = []; + + try { + const blob = new Blob([payload], { type: "application/json" }); + if (navigator.sendBeacon && navigator.sendBeacon(ENDPOINT, blob)) return; + // Fallback for browsers without sendBeacon: keepalive fetch. + void fetch(ENDPOINT, { + method: "POST", + body: payload, + headers: { "Content-Type": "application/json" }, + keepalive: true, + }).catch(() => undefined); + } catch { + // Best-effort: never throw out of telemetry. + } +} + +/** + * Report a custom client metric (editor_tx_ms, page_open_ms). No-op unless the + * session is sampled. Route is always the current TEMPLATE. + */ +export function reportClientMetric( + name: "editor_tx_ms" | "page_open_ms", + value: number, + extra?: { docSize?: number }, +): void { + if (!isVitalsSampled()) return; + if (!Number.isFinite(value)) return; + enqueue({ + name, + value, + route: currentRouteTemplate(), + docSize: extra?.docSize, + }); +} + +/** Threshold-gated editor transaction reporter (only reports slow syncs). */ +export function reportEditorTx(ms: number, docSize: number): void { + if (ms <= EDITOR_TX_MIN_MS) return; + reportClientMetric("editor_tx_ms", ms, { docSize }); +} + +const PAGE_OPEN_MARK = "gm_page_open_start"; + +/** Mark the start of a page-open interaction (tree-row / link click). */ +export function markPageOpenStart(): void { + try { + performance.clearMarks(PAGE_OPEN_MARK); + performance.mark(PAGE_OPEN_MARK); + } catch { + // ignore + } +} + +/** + * Measure page_open_ms at first editor-content render, if a start mark exists. + * Consumes the mark so a later render doesn't double-count. + */ +export function measurePageOpen(): void { + try { + const marks = performance.getEntriesByName(PAGE_OPEN_MARK, "mark"); + if (marks.length === 0) return; + const started = marks[0].startTime; + const elapsed = performance.now() - started; + performance.clearMarks(PAGE_OPEN_MARK); + if (elapsed > 0 && Number.isFinite(elapsed)) { + reportClientMetric("page_open_ms", elapsed); + } + } catch { + // ignore + } +} + +function attrTarget( + metric: + | INPMetricWithAttribution + | LCPMetricWithAttribution + | CLSMetricWithAttribution, +): string | undefined { + const a = metric.attribution as Record | undefined; + if (!a) return undefined; + // Different vitals expose their culprit element under different keys; only a + // CSS-selector-ish target string is taken (no text content / titles). + return ( + truncateAttr(a.interactionTarget) ?? + truncateAttr(a.element) ?? + truncateAttr(a.largestShiftTarget) ?? + undefined + ); +} + +/** + * Initialise client telemetry. Safe to call multiple times (idempotent). Returns + * immediately without subscribing when the session is not sampled — so a + * non-sampled session subscribes to NO observers and sends nothing. + */ +export function initVitals(): void { + if (initialised) return; + initialised = true; + + // Sampling gate is evaluated BEFORE any observer subscription. + if (!isVitalsSampled()) return; + + const report = ( + metric: + | INPMetricWithAttribution + | LCPMetricWithAttribution + | CLSMetricWithAttribution + | TTFBMetricWithAttribution, + ) => { + enqueue({ + name: metric.name, + value: metric.value, + rating: metric.rating, + route: currentRouteTemplate(), + attr: + metric.name === "TTFB" + ? undefined + : attrTarget( + metric as + | INPMetricWithAttribution + | LCPMetricWithAttribution + | CLSMetricWithAttribution, + ), + }); + }; + + onINP(report); + onLCP(report); + onCLS(report); + onTTFB(report); + + // Long tasks: aggregate the total blocking time per flush window (a passive + // observer; individual entries are summed, never stored/sent individually). + try { + if (typeof PerformanceObserver !== "undefined") { + const observer = new PerformanceObserver((list) => { + for (const entry of list.getEntries()) { + longtaskSum += entry.duration; + } + }); + observer.observe({ type: "longtask", buffered: true }); + } + } catch { + // longtask entry type unsupported: skip silently. + } + + // page_open_ms start: mark when the user clicks a page link/tree-row (any + // anchor navigating to a page URL). Passive capture listener; the matching + // measure fires at first editor-content render (measurePageOpen). No page + // titles/slugs are read — only the click timing is marked. + document.addEventListener( + "click", + (event) => { + const target = event.target as Element | null; + const anchor = target?.closest?.("a[href]") as HTMLAnchorElement | null; + if (!anchor) return; + const href = anchor.getAttribute("href") ?? ""; + // A page link is `/s/:space/p/:slug`, `/p/:slug` or a share page path. + if (/\/p\//.test(href)) markPageOpenStart(); + }, + { capture: true, passive: true }, + ); + + // Flush on tab hide (most reliable delivery point) and periodically. + const onHidden = () => { + if (document.visibilityState === "hidden") flush(); + }; + document.addEventListener("visibilitychange", onHidden); + window.addEventListener("pagehide", flush); + + setInterval(flush, FLUSH_INTERVAL_MS); +} diff --git a/apps/client/src/main.tsx b/apps/client/src/main.tsx index 5dbf6b81..0c19fe28 100644 --- a/apps/client/src/main.tsx +++ b/apps/client/src/main.tsx @@ -22,6 +22,7 @@ import { isPostHogEnabled, } from "@/lib/config.ts"; import posthog from "posthog-js"; +import { initVitals } from "@/lib/telemetry/vitals"; export const queryClient = new QueryClient({ defaultOptions: { @@ -43,6 +44,10 @@ if (isCloud() && isPostHogEnabled) { }); } +// #355 — client perf-telemetry. Decides sampling ONCE (25%/session) before +// subscribing to any observer; non-sampled sessions send nothing. +initVitals(); + const container = document.getElementById("root") as HTMLElement; const root = (container as any).__reactRoot ??= ReactDOM.createRoot(container); diff --git a/apps/server/package.json b/apps/server/package.json index 4e566d9d..517f1b3d 100644 --- a/apps/server/package.json +++ b/apps/server/package.json @@ -111,6 +111,7 @@ "pino-pretty": "^13.1.3", "postgres": "^3.4.8", "postmark": "^4.0.7", + "prom-client": "^15.1.3", "react": "^18.3.1", "react-email": "6.0.8", "reflect-metadata": "^0.2.2", diff --git a/apps/server/src/app.module.ts b/apps/server/src/app.module.ts index e257fe9b..381970fa 100644 --- a/apps/server/src/app.module.ts +++ b/apps/server/src/app.module.ts @@ -31,6 +31,8 @@ import { McpModule } from './integrations/mcp/mcp.module'; import { SandboxModule } from './integrations/sandbox/sandbox.module'; import { AiModule } from './integrations/ai/ai.module'; import { AiChatModule } from './core/ai-chat/ai-chat.module'; +import { MetricsModule } from './integrations/metrics/metrics.module'; +import { ClientTelemetryModule } from './core/telemetry/client-telemetry.module'; const enterpriseModules = []; try { @@ -93,6 +95,8 @@ try { SandboxModule, AiModule, AiChatModule, + MetricsModule, + ClientTelemetryModule, ...enterpriseModules, ], controllers: [AppController], diff --git a/apps/server/src/collaboration/extensions/persistence.extension.ts b/apps/server/src/collaboration/extensions/persistence.extension.ts index 18eb99e6..35310cf4 100644 --- a/apps/server/src/collaboration/extensions/persistence.extension.ts +++ b/apps/server/src/collaboration/extensions/persistence.extension.ts @@ -41,6 +41,7 @@ import { HISTORY_INTERVAL, } from '../constants'; import { TransclusionService } from '../../core/page/transclusion/transclusion.service'; +import { observeCollabStore } from '../../integrations/metrics/metrics.registry'; /** * #251 — wire format of the client→server stateless message that signals a @@ -192,6 +193,17 @@ export class PersistenceExtension implements Extension { } async onStoreDocument(data: onStoreDocumentPayload) { + // #355 — time the full store (persist + post-store side effects) into + // collab_store_duration_seconds. No-op when METRICS_PORT is unset. + const startedAt = performance.now(); + try { + await this.storeDocument(data); + } finally { + observeCollabStore((performance.now() - startedAt) / 1000); + } + } + + private async storeDocument(data: onStoreDocumentPayload) { const { documentName, document, context } = data; const pageId = getPageId(documentName); diff --git a/apps/server/src/core/telemetry/client-metrics.constants.ts b/apps/server/src/core/telemetry/client-metrics.constants.ts new file mode 100644 index 00000000..4f4cc70f --- /dev/null +++ b/apps/server/src/core/telemetry/client-metrics.constants.ts @@ -0,0 +1,93 @@ +/** + * Server-side whitelist + limits for POST /api/telemetry/vitals (#355). + * + * The endpoint is PUBLIC (browsers post it, no auth) so it is a privacy and + * abuse surface: everything not on these lists is silently DROPPED and the + * request still returns 200 (never 400 — a 400 would make browsers retry). + */ + +// The only metric names accepted. Anything else is dropped. +export const ALLOWED_METRIC_NAMES = new Set([ + 'INP', + 'LCP', + 'CLS', + 'TTFB', + 'editor_tx_ms', + 'page_open_ms', + 'longtask_ms', +]); + +// The only rating values accepted (web-vitals). Anything else -> null. +export const ALLOWED_RATINGS = new Set([ + 'good', + 'needs-improvement', + 'poor', +]); + +// Max events accepted per batch; the rest are ignored. +export const MAX_EVENTS_PER_BATCH = 50; + +// Defence-in-depth body cap (~16KB). Fastify's global bodyLimit is far larger, +// so we re-check the parsed payload size here and drop oversized batches. +export const MAX_BODY_BYTES = 16 * 1024; + +// attr is truncated to this many characters (attribution target only, no PII). +export const MAX_ATTR_LENGTH = 120; + +// route label sanity cap (client sends a template like /s/:space/p/:slug). +export const MAX_ROUTE_LENGTH = 200; + +export interface ClientMetricRow { + name: string; + value: number; + rating: string | null; + route: string | null; + attr: string | null; + docSize: number | null; + workspaceId: string | null; +} + +/** + * Validate + normalise a single incoming event into a DB row, or return null to + * DROP it. Pure so it is directly unit-testable. Enforces the name whitelist, + * numeric value, rating whitelist, attr truncation and doc_size (int) coercion. + */ +export function sanitizeVitalEvent( + raw: unknown, + workspaceId: string | null, +): ClientMetricRow | null { + if (!raw || typeof raw !== 'object') return null; + const e = raw as Record; + + const name = e.name; + if (typeof name !== 'string' || !ALLOWED_METRIC_NAMES.has(name)) return null; + + const value = + typeof e.value === 'number' && Number.isFinite(e.value) ? e.value : null; + if (value === null) return null; + + const rating = + typeof e.rating === 'string' && ALLOWED_RATINGS.has(e.rating) + ? e.rating + : null; + + let route: string | null = null; + if (typeof e.route === 'string' && e.route.length > 0) { + route = e.route.slice(0, MAX_ROUTE_LENGTH); + } + + let attr: string | null = null; + if (typeof e.attr === 'string' && e.attr.length > 0) { + attr = e.attr.slice(0, MAX_ATTR_LENGTH); + } + + let docSize: number | null = null; + if (typeof e.docSize === 'number' && Number.isFinite(e.docSize)) { + docSize = Math.trunc(e.docSize); + } else if (typeof e.doc_size === 'number' && Number.isFinite(e.doc_size)) { + // Accept snake_case too, in case a client sends the raw column name. + docSize = Math.trunc(e.doc_size as number); + } + + return { name, value, rating, route, attr, docSize, workspaceId }; +} diff --git a/apps/server/src/core/telemetry/client-telemetry.module.ts b/apps/server/src/core/telemetry/client-telemetry.module.ts new file mode 100644 index 00000000..cf94efc0 --- /dev/null +++ b/apps/server/src/core/telemetry/client-telemetry.module.ts @@ -0,0 +1,15 @@ +import { Module } from '@nestjs/common'; +import { VitalsController } from './vitals.controller'; +import { VitalsService } from './vitals.service'; + +/** + * Client perf-telemetry (#355): the public /api/telemetry/vitals sink that + * persists web-vitals + custom client metrics into `client_metrics`. + * Named ClientTelemetryModule to avoid confusion with the unrelated + * integrations/telemetry (product usage ping) module. + */ +@Module({ + controllers: [VitalsController], + providers: [VitalsService], +}) +export class ClientTelemetryModule {} diff --git a/apps/server/src/core/telemetry/vitals.controller.ts b/apps/server/src/core/telemetry/vitals.controller.ts new file mode 100644 index 00000000..62812671 --- /dev/null +++ b/apps/server/src/core/telemetry/vitals.controller.ts @@ -0,0 +1,47 @@ +import { + Body, + Controller, + HttpCode, + Post, + Req, + UseGuards, +} from '@nestjs/common'; +import { Throttle, ThrottlerGuard } from '@nestjs/throttler'; +import { FastifyRequest } from 'fastify'; +import { Public } from '../../common/decorators/public.decorator'; +import { VITALS_THROTTLER } from '../../integrations/throttle/throttler-names'; +import { VitalsService } from './vitals.service'; + +/** + * POST /api/telemetry/vitals (#355) — public client perf-metrics sink. + * + * PUBLIC (browsers post via sendBeacon, no session) but IP-throttled. Always + * returns 200 with no body of interest: invalid/foreign/oversized payloads are + * silently dropped by the service rather than 400'd, so browsers never retry. + */ +@Controller('telemetry') +export class VitalsController { + constructor(private readonly vitalsService: VitalsService) {} + + @Public() + @UseGuards(ThrottlerGuard) + @Throttle({ [VITALS_THROTTLER]: { limit: 120, ttl: 60_000 } }) + @Post('vitals') + @HttpCode(200) + async vitals( + @Body() body: unknown, + @Req() req: FastifyRequest, + ): Promise<{ ok: true }> { + // workspaceId is resolved by the workspace-host middleware onto req.raw when + // the browser posts from a workspace host; null otherwise. No other PII. + const workspaceId = + ((req.raw as unknown as { workspaceId?: string })?.workspaceId ?? null) || + null; + try { + await this.vitalsService.ingest(body, workspaceId); + } catch { + // Never surface storage errors to the browser; telemetry is best-effort. + } + return { ok: true }; + } +} diff --git a/apps/server/src/core/telemetry/vitals.service.spec.ts b/apps/server/src/core/telemetry/vitals.service.spec.ts new file mode 100644 index 00000000..2fca1228 --- /dev/null +++ b/apps/server/src/core/telemetry/vitals.service.spec.ts @@ -0,0 +1,122 @@ +import { VitalsService } from './vitals.service'; +import { MAX_ATTR_LENGTH } from './client-metrics.constants'; + +// buildRows is pure (no DB access), so a null db is fine here. +const svc = new VitalsService(null as any); + +describe('VitalsService.buildRows', () => { + const WS = 'ws-uuid'; + + it('accepts a valid batch and maps whitelisted fields to rows', () => { + const body = { + events: [ + { name: 'INP', value: 123.4, rating: 'good', route: '/s/:space/p/:slug' }, + { name: 'editor_tx_ms', value: 12, route: '/s/:space/p/:slug', docSize: 4096 }, + ], + }; + const rows = svc.buildRows(body, WS); + expect(rows).toHaveLength(2); + expect(rows[0]).toEqual({ + name: 'INP', + value: 123.4, + rating: 'good', + route: '/s/:space/p/:slug', + attr: null, + docSize: null, + workspaceId: WS, + }); + expect(rows[1].name).toBe('editor_tx_ms'); + expect(rows[1].docSize).toBe(4096); + expect(rows[1].workspaceId).toBe(WS); + }); + + it('accepts a bare array body', () => { + const rows = svc.buildRows([{ name: 'LCP', value: 1 }], WS); + expect(rows).toHaveLength(1); + expect(rows[0].name).toBe('LCP'); + }); + + it('drops events with foreign metric names', () => { + const rows = svc.buildRows( + { events: [{ name: 'evil_metric', value: 1 }, { name: 'LCP', value: 2 }] }, + WS, + ); + expect(rows).toHaveLength(1); + expect(rows[0].name).toBe('LCP'); + }); + + it('drops events with a non-numeric or missing value', () => { + const rows = svc.buildRows( + { + events: [ + { name: 'CLS', value: 'nan' }, + { name: 'CLS' }, + { name: 'CLS', value: 0.1 }, + ], + }, + WS, + ); + expect(rows).toHaveLength(1); + expect(rows[0].value).toBe(0.1); + }); + + it('strips foreign fields and only keeps whitelisted columns', () => { + const rows = svc.buildRows( + { events: [{ name: 'TTFB', value: 5, secret: 'drop-me', title: 'my page' }] }, + WS, + ); + expect(rows).toHaveLength(1); + expect(Object.keys(rows[0]).sort()).toEqual( + ['attr', 'docSize', 'name', 'rating', 'route', 'value', 'workspaceId'].sort(), + ); + expect((rows[0] as any).secret).toBeUndefined(); + expect((rows[0] as any).title).toBeUndefined(); + }); + + it('rejects a rating outside the allowed set (-> null)', () => { + const rows = svc.buildRows( + { events: [{ name: 'INP', value: 1, rating: 'terrible' }] }, + WS, + ); + expect(rows[0].rating).toBeNull(); + }); + + it('truncates attr to 120 chars', () => { + const longAttr = 'a'.repeat(500); + const rows = svc.buildRows( + { events: [{ name: 'INP', value: 1, attr: longAttr }] }, + WS, + ); + expect(rows[0].attr).toHaveLength(MAX_ATTR_LENGTH); + }); + + it('caps the batch at 50 events', () => { + const events = Array.from({ length: 200 }, () => ({ name: 'CLS', value: 1 })); + const rows = svc.buildRows({ events }, WS); + expect(rows).toHaveLength(50); + }); + + it('drops an oversized (>16KB) payload wholesale', () => { + const events = Array.from({ length: 50 }, () => ({ + name: 'INP', + value: 1, + attr: 'x'.repeat(400), + route: '/s/:space/p/:slug', + })); + // Serialised body far exceeds 16KB. + const rows = svc.buildRows({ events }, WS); + expect(rows).toHaveLength(0); + }); + + it('returns [] for malformed bodies', () => { + expect(svc.buildRows(null, WS)).toEqual([]); + expect(svc.buildRows('nope', WS)).toEqual([]); + expect(svc.buildRows({ notEvents: 1 }, WS)).toEqual([]); + expect(svc.buildRows(42, WS)).toEqual([]); + }); + + it('carries a null workspaceId through', () => { + const rows = svc.buildRows({ events: [{ name: 'LCP', value: 1 }] }, null); + expect(rows[0].workspaceId).toBeNull(); + }); +}); diff --git a/apps/server/src/core/telemetry/vitals.service.ts b/apps/server/src/core/telemetry/vitals.service.ts new file mode 100644 index 00000000..076a2506 --- /dev/null +++ b/apps/server/src/core/telemetry/vitals.service.ts @@ -0,0 +1,70 @@ +import { Injectable } from '@nestjs/common'; +import { InjectKysely } from 'nestjs-kysely'; +import { KyselyDB } from '@docmost/db/types/kysely.types'; +import { + ClientMetricRow, + MAX_BODY_BYTES, + MAX_EVENTS_PER_BATCH, + sanitizeVitalEvent, +} from './client-metrics.constants'; + +@Injectable() +export class VitalsService { + constructor(@InjectKysely() private readonly db: KyselyDB) {} + + /** + * Turn a raw request body into the (bounded, whitelisted) rows to persist. + * Pure/synchronous so it is unit-testable without a DB. Returns [] for any + * malformed / oversized / foreign input — the caller still responds 200. + */ + buildRows(body: unknown, workspaceId: string | null): ClientMetricRow[] { + if (!body || typeof body !== 'object') return []; + + // Defence-in-depth body cap (~16KB): drop oversized batches wholesale. + try { + if (JSON.stringify(body).length > MAX_BODY_BYTES) return []; + } catch { + return []; + } + + // Accept either a bare array or `{ events: [...] }`. + const events = Array.isArray(body) + ? body + : Array.isArray((body as { events?: unknown }).events) + ? ((body as { events: unknown[] }).events as unknown[]) + : null; + if (!events) return []; + + const rows: ClientMetricRow[] = []; + for (const event of events) { + if (rows.length >= MAX_EVENTS_PER_BATCH) break; + const row = sanitizeVitalEvent(event, workspaceId); + if (row) rows.push(row); + } + return rows; + } + + /** Batch-insert the sanitised rows in a single statement. No-op on []. */ + async insertRows(rows: ClientMetricRow[]): Promise { + if (rows.length === 0) return; + await this.db + .insertInto('clientMetrics') + .values( + rows.map((r) => ({ + name: r.name, + value: r.value, + rating: r.rating, + route: r.route, + attr: r.attr, + docSize: r.docSize, + workspaceId: r.workspaceId, + })), + ) + .execute(); + } + + async ingest(body: unknown, workspaceId: string | null): Promise { + const rows = this.buildRows(body, workspaceId); + await this.insertRows(rows); + } +} diff --git a/apps/server/src/database/database.module.ts b/apps/server/src/database/database.module.ts index 4e9c5d13..ea56ed11 100644 --- a/apps/server/src/database/database.module.ts +++ b/apps/server/src/database/database.module.ts @@ -40,6 +40,8 @@ import { PageListener } from '@docmost/db/listeners/page.listener'; import { PostgresJSDialect } from 'kysely-postgres-js'; import * as postgres from 'postgres'; import { normalizePostgresUrl } from '../common/helpers'; +import { observeDbQuery } from '../integrations/metrics/metrics.registry'; +import { firstSqlToken } from '../integrations/metrics/metrics.constants'; @Global() @Module({ @@ -67,6 +69,14 @@ import { normalizePostgresUrl } from '../common/helpers'; }), plugins: [new CamelCasePlugin()], log: (event: LogEvent) => { + // #355 — db_query_duration_seconds, labelled by the leading SQL token + // (bounded cardinality). No-op when METRICS_PORT is unset. Runs for + // every query, independent of the dev-only debug logging below. + observeDbQuery( + firstSqlToken(event.query.sql), + event.queryDurationMillis / 1000, + ); + if (environmentService.getNodeEnv() !== 'development') return; const logger = new Logger(DatabaseModule.name); if (process.env.DEBUG_DB?.toLowerCase() === 'true') { diff --git a/apps/server/src/database/migrations/20260704T120000-client-metrics.ts b/apps/server/src/database/migrations/20260704T120000-client-metrics.ts new file mode 100644 index 00000000..92c42334 --- /dev/null +++ b/apps/server/src/database/migrations/20260704T120000-client-metrics.ts @@ -0,0 +1,52 @@ +import { type Kysely, sql } from 'kysely'; + +/** + * #355 — `client_metrics`: raw sink for client-side perf telemetry (web-vitals + * + custom editor/page metrics) posted to /api/telemetry/vitals. + * + * The table/columns/indexes here are a FIXED contract shared with the deployed + * Grafana infra (the `grafana_ro` role reads this table; a separate maintenance + * container prunes rows >90d and re-GRANTs daily). No app-side retention is + * added on purpose. Written as raw SQL to match that contract 1:1 (identity PK, + * conditional GRANT). + */ +export async function up(db: Kysely): Promise { + await sql` + CREATE TABLE client_metrics ( + id bigint GENERATED ALWAYS AS IDENTITY PRIMARY KEY, + created_at timestamptz NOT NULL DEFAULT now(), + name text NOT NULL, -- INP|LCP|CLS|TTFB|editor_tx_ms|page_open_ms|longtask_ms + value double precision NOT NULL, + rating text, -- good|needs-improvement|poor (web-vitals only) + route text, -- templated: /s/:space/p/:slug — never raw slugs + attr text, -- attribution target, truncated to 120 chars + doc_size int, -- editor_tx_ms only + workspace_id uuid + ) + `.execute(db); + + await sql` + CREATE INDEX idx_client_metrics_name_created + ON client_metrics (name, created_at) + `.execute(db); + + await sql` + CREATE INDEX idx_client_metrics_created + ON client_metrics (created_at) + `.execute(db); + + // The read-only Grafana role only exists in the deployed environment; guard so + // the migration still applies cleanly in dev/CI where the role is absent. + await sql` + DO $$ + BEGIN + IF EXISTS (SELECT FROM pg_roles WHERE rolname = 'grafana_ro') THEN + GRANT SELECT ON client_metrics TO grafana_ro; + END IF; + END $$; + `.execute(db); +} + +export async function down(db: Kysely): Promise { + await sql`DROP TABLE IF EXISTS client_metrics`.execute(db); +} diff --git a/apps/server/src/database/types/db.d.ts b/apps/server/src/database/types/db.d.ts index dab98b3d..d5ec8d5e 100644 --- a/apps/server/src/database/types/db.d.ts +++ b/apps/server/src/database/types/db.d.ts @@ -156,6 +156,18 @@ export interface Billing { workspaceId: string; } +export interface ClientMetrics { + id: Generated; + createdAt: Generated; + name: string; + value: number; + rating: string | null; + route: string | null; + attr: string | null; + docSize: number | null; + workspaceId: string | null; +} + export interface Comments { aiChatId: string | null; content: Json | null; @@ -691,6 +703,7 @@ export interface DB { authProviders: AuthProviders; backlinks: Backlinks; billing: Billing; + clientMetrics: ClientMetrics; comments: Comments; favorites: Favorites; fileTasks: FileTasks; diff --git a/apps/server/src/integrations/metrics/http-metrics.hook.ts b/apps/server/src/integrations/metrics/http-metrics.hook.ts new file mode 100644 index 00000000..a01a7d1a --- /dev/null +++ b/apps/server/src/integrations/metrics/http-metrics.hook.ts @@ -0,0 +1,46 @@ +import { FastifyReply, FastifyRequest } from 'fastify'; +import { isStreamingResponse } from './metrics.constants'; +import { observeHttp } from './metrics.registry'; + +/** + * Resolve the BOUNDED route label for an HTTP response. + * + * HARD REQUIREMENT (#355): use the ROUTE TEMPLATE (`/pages/:id`), NEVER the raw + * URL (`/pages/abc-123`), so label cardinality stays finite. Fastify exposes the + * matched template on `req.routeOptions.url`. On 404s (no route matched) that is + * missing → collapse to the literal `unknown`. + */ +export function resolveRouteLabel(req: FastifyRequest): string { + const url = req.routeOptions?.url; + return typeof url === 'string' && url.length > 0 ? url : 'unknown'; +} + +/** + * Fastify onResponse handler that records http_request_duration_seconds. + * No-op when metrics are disabled (the hook is only registered when enabled, + * but the observe helpers are also guarded). Never throws into the response + * pipeline — telemetry must not break request handling. + */ +export function recordHttpResponse( + req: FastifyRequest, + reply: FastifyReply, +): void { + try { + const route = resolveRouteLabel(req); + + // Exclude SSE/streaming responses: onResponse fires at connection close for + // those, so it would record the stream lifetime and poison p95/p99. + const contentType = reply.getHeader('content-type'); + if (isStreamingResponse(contentType, route)) return; + + observeHttp( + req.method, + route, + reply.statusCode, + // Fastify measures elapsed time in ms; the metric is in seconds. + reply.elapsedTime / 1000, + ); + } catch { + // Swallow: a telemetry failure must never affect the served response. + } +} diff --git a/apps/server/src/integrations/metrics/metrics-bull.service.ts b/apps/server/src/integrations/metrics/metrics-bull.service.ts new file mode 100644 index 00000000..04722cd1 --- /dev/null +++ b/apps/server/src/integrations/metrics/metrics-bull.service.ts @@ -0,0 +1,146 @@ +import { + Injectable, + Logger, + OnModuleDestroy, + OnModuleInit, +} from '@nestjs/common'; +import { InjectQueue } from '@nestjs/bullmq'; +import { Queue, QueueEvents } from 'bullmq'; +import { QueueName } from '../queue/constants'; +import { EnvironmentService } from '../environment/environment.service'; +import { parseRedisUrl } from '../../common/helpers'; +import { + isMetricsEnabled, + observeJobDuration, + setQueueDepth, +} from './metrics.registry'; + +const POLL_INTERVAL_MS = 15_000; +// Cap the in-flight start-time map so a job that never emits completed/failed +// (worker crash) cannot leak memory unbounded. Well above realistic concurrency. +const MAX_INFLIGHT = 10_000; + +/** + * BullMQ instrumentation for #355: + * - `bullmq_queue_depth{queue}`: polled from getJobCounts() every 15s. + * - `bullmq_job_duration_seconds{queue}`: wall-clock time between a job going + * `active` and `completed`/`failed`, observed via per-queue QueueEvents. + * + * Queue names are a FINITE list (the QueueName enum), so labels are bounded — no + * job ids ever enter a label. Everything is gated on METRICS_PORT: when metrics + * are off, onModuleInit does nothing (no interval, no QueueEvents connections). + */ +@Injectable() +export class MetricsBullService implements OnModuleInit, OnModuleDestroy { + private readonly logger = new Logger(MetricsBullService.name); + private readonly queues: { label: string; queue: Queue }[]; + private timer: NodeJS.Timeout | null = null; + private queueEvents: QueueEvents[] = []; + // jobId -> start timestamp (ms). Bounded by MAX_INFLIGHT. + private readonly inflight = new Map(); + + constructor( + private readonly environmentService: EnvironmentService, + @InjectQueue(QueueName.EMAIL_QUEUE) emailQueue: Queue, + @InjectQueue(QueueName.ATTACHMENT_QUEUE) attachmentQueue: Queue, + @InjectQueue(QueueName.GENERAL_QUEUE) generalQueue: Queue, + @InjectQueue(QueueName.BILLING_QUEUE) billingQueue: Queue, + @InjectQueue(QueueName.FILE_TASK_QUEUE) fileTaskQueue: Queue, + @InjectQueue(QueueName.SEARCH_QUEUE) searchQueue: Queue, + @InjectQueue(QueueName.AI_QUEUE) aiQueue: Queue, + @InjectQueue(QueueName.HISTORY_QUEUE) historyQueue: Queue, + @InjectQueue(QueueName.NOTIFICATION_QUEUE) notificationQueue: Queue, + @InjectQueue(QueueName.AUDIT_QUEUE) auditQueue: Queue, + ) { + this.queues = [ + { label: 'email', queue: emailQueue }, + { label: 'attachment', queue: attachmentQueue }, + { label: 'general', queue: generalQueue }, + { label: 'billing', queue: billingQueue }, + { label: 'file-task', queue: fileTaskQueue }, + { label: 'search', queue: searchQueue }, + { label: 'ai', queue: aiQueue }, + { label: 'history', queue: historyQueue }, + { label: 'notification', queue: notificationQueue }, + { label: 'audit', queue: auditQueue }, + ]; + } + + onModuleInit(): void { + if (!isMetricsEnabled()) return; + + // Poll queue depth. + this.timer = setInterval(() => { + void this.pollDepths(); + }, POLL_INTERVAL_MS); + // Do not keep the event loop alive solely for polling. + this.timer.unref?.(); + void this.pollDepths(); + + // Wire per-queue job-duration events. + const redisConfig = parseRedisUrl(this.environmentService.getRedisUrl()); + const connection = { + host: redisConfig.host, + port: redisConfig.port, + password: redisConfig.password, + db: redisConfig.db, + family: redisConfig.family, + }; + + for (const { label, queue } of this.queues) { + const events = new QueueEvents(queue.name, { connection }); + events.on('active', ({ jobId }) => { + if (this.inflight.size >= MAX_INFLIGHT) { + // Drop the oldest tracked start to keep the map bounded. + const oldest = this.inflight.keys().next().value; + if (oldest !== undefined) this.inflight.delete(oldest); + } + this.inflight.set(jobId, Date.now()); + }); + const finalize = ({ jobId }: { jobId: string }) => { + const start = this.inflight.get(jobId); + if (start === undefined) return; + this.inflight.delete(jobId); + observeJobDuration(label, (Date.now() - start) / 1000); + }; + events.on('completed', finalize); + events.on('failed', finalize); + events.on('error', (err) => { + this.logger.debug(`QueueEvents error (${label}): ${err?.message}`); + }); + this.queueEvents.push(events); + } + } + + private async pollDepths(): Promise { + for (const { label, queue } of this.queues) { + try { + const counts = await queue.getJobCounts(); + // "Depth" = jobs not yet finished (backlog + in-flight). + const depth = + (counts.waiting ?? 0) + + (counts.active ?? 0) + + (counts.delayed ?? 0) + + (counts.prioritized ?? 0) + + (counts.paused ?? 0); + setQueueDepth(label, depth); + } catch (err) { + this.logger.debug( + `Failed to read job counts for ${label}: ${(err as Error)?.message}`, + ); + } + } + } + + async onModuleDestroy(): Promise { + if (this.timer) { + clearInterval(this.timer); + this.timer = null; + } + await Promise.all( + this.queueEvents.map((e) => e.close().catch(() => undefined)), + ); + this.queueEvents = []; + this.inflight.clear(); + } +} diff --git a/apps/server/src/integrations/metrics/metrics.constants.ts b/apps/server/src/integrations/metrics/metrics.constants.ts new file mode 100644 index 00000000..16ad4950 --- /dev/null +++ b/apps/server/src/integrations/metrics/metrics.constants.ts @@ -0,0 +1,80 @@ +/** + * Perf-metrics contract (#355). These names/labels are FIXED by the already + * deployed scrape+dashboard infra (VictoriaMetrics scraping docmost:9464, + * Grafana dashboards, alerts). Do NOT rename them. + */ +export const METRIC_HTTP_REQUEST_DURATION = 'http_request_duration_seconds'; +export const METRIC_DB_QUERY_DURATION = 'db_query_duration_seconds'; +export const METRIC_BULLMQ_QUEUE_DEPTH = 'bullmq_queue_depth'; +export const METRIC_BULLMQ_JOB_DURATION = 'bullmq_job_duration_seconds'; +export const METRIC_COLLAB_STORE_DURATION = 'collab_store_duration_seconds'; + +// Histogram buckets (seconds). Chosen to give useful p50/p95/p99 resolution +// for typical web/DB latencies without exploding series cardinality. +export const HTTP_BUCKETS = [ + 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, +]; +export const DB_BUCKETS = [ + 0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, +]; +export const COLLAB_BUCKETS = [ + 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, +]; +export const JOB_BUCKETS = [ + 0.01, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30, 60, 120, +]; + +/** + * Extract the first SQL token (select/insert/update/delete/...) from a query, + * lower-cased, to use as a BOUNDED label for db_query_duration_seconds. Using + * the full query text would blow up label cardinality; the leading keyword is a + * finite set. Unknown/empty queries collapse to `other`. + */ +export function firstSqlToken(sql: string | undefined): string { + if (!sql) return 'other'; + // Skip leading whitespace / comments and grab the first word. + const match = /^[\s(]*([a-zA-Z]+)/.exec(sql); + if (!match) return 'other'; + const token = match[1].toLowerCase(); + const known = new Set([ + 'select', + 'insert', + 'update', + 'delete', + 'with', + 'begin', + 'commit', + 'rollback', + 'alter', + 'create', + 'drop', + 'truncate', + 'explain', + ]); + return known.has(token) ? token : 'other'; +} + +/** + * Whether an HTTP response must be EXCLUDED from http_request_duration_seconds. + * + * SSE/streaming responses (the AI-chat `text/event-stream`) keep the connection + * open for the whole conversation, so Fastify's onResponse fires only when the + * client disconnects — recording the connection lifetime, not a response time, + * which would poison p95/p99. We skip by content-type (authoritative) with a + * route-suffix fallback for the two known stream endpoints. + */ +export function isStreamingResponse( + contentType: unknown, + route: string | undefined, +): boolean { + if ( + typeof contentType === 'string' && + contentType.toLowerCase().includes('text/event-stream') + ) { + return true; + } + // Fallback: the AI-chat stream routes (/api/ai-chat/stream, + // /api/shares/ai/stream) both end in `/stream`. + if (route && route.endsWith('/stream')) return true; + return false; +} diff --git a/apps/server/src/integrations/metrics/metrics.module.ts b/apps/server/src/integrations/metrics/metrics.module.ts new file mode 100644 index 00000000..a89bdff0 --- /dev/null +++ b/apps/server/src/integrations/metrics/metrics.module.ts @@ -0,0 +1,13 @@ +import { Module } from '@nestjs/common'; +import { MetricsBullService } from './metrics-bull.service'; + +/** + * Wires the BullMQ collectors (#355). The queues are provided by the @Global + * QueueModule (which exports BullModule), so no re-registration is needed here. + * The HTTP histogram, DB-query and collab-store collectors live in module-level + * singletons (metrics.registry) and are wired directly at their call sites. + */ +@Module({ + providers: [MetricsBullService], +}) +export class MetricsModule {} diff --git a/apps/server/src/integrations/metrics/metrics.registry.ts b/apps/server/src/integrations/metrics/metrics.registry.ts new file mode 100644 index 00000000..d62be165 --- /dev/null +++ b/apps/server/src/integrations/metrics/metrics.registry.ts @@ -0,0 +1,126 @@ +import { + collectDefaultMetrics, + Histogram, + Gauge, + Registry, +} from 'prom-client'; +import { + COLLAB_BUCKETS, + DB_BUCKETS, + HTTP_BUCKETS, + JOB_BUCKETS, + METRIC_BULLMQ_JOB_DURATION, + METRIC_BULLMQ_QUEUE_DEPTH, + METRIC_COLLAB_STORE_DURATION, + METRIC_DB_QUERY_DURATION, + METRIC_HTTP_REQUEST_DURATION, +} from './metrics.constants'; + +/** + * Process-wide perf-metrics registry (#355). + * + * This is a plain module singleton (NOT a Nest provider) because the collectors + * are cross-cutting: the Kysely `log` callback (built in a DI factory), the + * Fastify onResponse hook (main.ts, before the Nest container hands out + * providers) and the collab persistence extension all need the SAME instruments + * without threading DI through them. + * + * HARD CONTRACT: when `METRICS_PORT` is unset the whole subsystem is OFF — the + * registry is never created, `collectDefaultMetrics` never runs, and every + * observe/set helper is a cheap no-op. Nothing is exposed on :3000. + */ + +// Decided once at process start. Deliberately read here (not via +// EnvironmentService) so the toggle is identical for the DI and non-DI callers. +const enabled = Boolean(process.env.METRICS_PORT); + +let registry: Registry | null = null; +let httpHist: Histogram<'method' | 'route' | 'status'> | null = null; +let dbHist: Histogram<'op'> | null = null; +let queueDepthGauge: Gauge<'queue'> | null = null; +let jobHist: Histogram<'queue'> | null = null; +let collabHist: Histogram | null = null; + +function init(): void { + if (registry || !enabled) return; + + registry = new Registry(); + + // Node/runtime metrics: gives nodejs_eventloop_lag_p99_seconds, GC, heap, etc. + collectDefaultMetrics({ register: registry }); + + httpHist = new Histogram({ + name: METRIC_HTTP_REQUEST_DURATION, + help: 'HTTP request duration in seconds, by method, route template and status', + labelNames: ['method', 'route', 'status'], + buckets: HTTP_BUCKETS, + registers: [registry], + }); + + dbHist = new Histogram({ + name: METRIC_DB_QUERY_DURATION, + help: 'Database query duration in seconds, by leading SQL keyword', + labelNames: ['op'], + buckets: DB_BUCKETS, + registers: [registry], + }); + + queueDepthGauge = new Gauge({ + name: METRIC_BULLMQ_QUEUE_DEPTH, + help: 'Number of not-yet-finished BullMQ jobs per queue', + labelNames: ['queue'], + registers: [registry], + }); + + jobHist = new Histogram({ + name: METRIC_BULLMQ_JOB_DURATION, + help: 'BullMQ job processing duration in seconds, per queue', + labelNames: ['queue'], + buckets: JOB_BUCKETS, + registers: [registry], + }); + + collabHist = new Histogram({ + name: METRIC_COLLAB_STORE_DURATION, + help: 'Collaboration onStoreDocument duration in seconds', + buckets: COLLAB_BUCKETS, + registers: [registry], + }); +} + +// Runs once when this module is first imported. Safe to call again (idempotent). +init(); + +export function isMetricsEnabled(): boolean { + return enabled; +} + +/** The prom-client registry, or null when metrics are disabled. */ +export function getMetricsRegistry(): Registry | null { + return registry; +} + +export function observeHttp( + method: string, + route: string, + status: number, + seconds: number, +): void { + httpHist?.observe({ method, route, status }, seconds); +} + +export function observeDbQuery(op: string, seconds: number): void { + dbHist?.observe({ op }, seconds); +} + +export function setQueueDepth(queue: string, depth: number): void { + queueDepthGauge?.set({ queue }, depth); +} + +export function observeJobDuration(queue: string, seconds: number): void { + jobHist?.observe({ queue }, seconds); +} + +export function observeCollabStore(seconds: number): void { + collabHist?.observe(seconds); +} diff --git a/apps/server/src/integrations/metrics/metrics.server.ts b/apps/server/src/integrations/metrics/metrics.server.ts new file mode 100644 index 00000000..f1b90b15 --- /dev/null +++ b/apps/server/src/integrations/metrics/metrics.server.ts @@ -0,0 +1,56 @@ +import { createServer, Server } from 'node:http'; +import { Logger } from '@nestjs/common'; +import { getMetricsRegistry, isMetricsEnabled } from './metrics.registry'; + +/** + * Start the Prometheus scrape endpoint on a SEPARATE port (default 9464, + * overridable via `METRICS_PORT`). This is a bare node:http server, NOT part of + * the Fastify app, so `/metrics` never exists on the public :3000 listener. + * + * Returns the http.Server (so callers can close it on shutdown) or null when + * metrics are disabled. + */ +export function startMetricsServer(): Server | null { + if (!isMetricsEnabled()) return null; + + const logger = new Logger('MetricsServer'); + const register = getMetricsRegistry(); + if (!register) return null; + + const port = Number(process.env.METRICS_PORT); + if (!Number.isInteger(port) || port <= 0) { + logger.warn( + `Invalid METRICS_PORT="${process.env.METRICS_PORT}", metrics endpoint not started`, + ); + return null; + } + + const server = createServer(async (req, res) => { + if (req.method === 'GET' && req.url === '/metrics') { + try { + const body = await register.metrics(); + res.setHeader('Content-Type', register.contentType); + res.statusCode = 200; + res.end(body); + } catch (err) { + res.statusCode = 500; + res.end(String((err as Error)?.message ?? 'error')); + } + return; + } + res.statusCode = 404; + res.end(); + }); + + // Bind on all interfaces: the scraper (VictoriaMetrics) reaches this from + // another container as docmost:9464. The port is not published to the host. + server.listen(port, '0.0.0.0', () => { + logger.log(`Metrics endpoint listening on :${port}/metrics`); + }); + + server.on('error', (err) => { + logger.error(`Metrics server error: ${err?.message}`); + }); + + return server; +} diff --git a/apps/server/src/integrations/metrics/metrics.spec.ts b/apps/server/src/integrations/metrics/metrics.spec.ts new file mode 100644 index 00000000..ffcc11e6 --- /dev/null +++ b/apps/server/src/integrations/metrics/metrics.spec.ts @@ -0,0 +1,70 @@ +import { FastifyRequest } from 'fastify'; +import { resolveRouteLabel } from './http-metrics.hook'; +import { firstSqlToken, isStreamingResponse } from './metrics.constants'; + +describe('resolveRouteLabel (histogram route label)', () => { + it('uses the ROUTE TEMPLATE, never the raw URL', () => { + // routeOptions.url is the matched template; url is the raw path with the id. + const req = { + url: '/api/pages/abc-123-def', + routeOptions: { url: '/api/pages/:id' }, + } as unknown as FastifyRequest; + expect(resolveRouteLabel(req)).toBe('/api/pages/:id'); + expect(resolveRouteLabel(req)).not.toContain('abc-123-def'); + }); + + it('falls back to "unknown" on a 404 (no matched route template)', () => { + const req = { + url: '/totally/unmatched/path', + routeOptions: {}, + } as unknown as FastifyRequest; + expect(resolveRouteLabel(req)).toBe('unknown'); + }); + + it('falls back to "unknown" when routeOptions is missing', () => { + const req = { url: '/x' } as unknown as FastifyRequest; + expect(resolveRouteLabel(req)).toBe('unknown'); + }); +}); + +describe('isStreamingResponse (SSE exclusion)', () => { + it('excludes text/event-stream responses by content-type', () => { + expect(isStreamingResponse('text/event-stream', '/api/ai-chat/stream')).toBe( + true, + ); + expect(isStreamingResponse('text/event-stream; charset=utf-8', '/x')).toBe( + true, + ); + }); + + it('excludes known /stream routes by suffix as a fallback', () => { + expect(isStreamingResponse('application/json', '/api/ai-chat/stream')).toBe( + true, + ); + expect(isStreamingResponse(undefined, '/api/shares/ai/stream')).toBe(true); + }); + + it('does not exclude ordinary JSON responses', () => { + expect(isStreamingResponse('application/json', '/api/pages/:id')).toBe( + false, + ); + expect(isStreamingResponse(undefined, '/api/pages/:id')).toBe(false); + }); +}); + +describe('firstSqlToken (bounded db label)', () => { + it('returns the lower-cased leading keyword', () => { + expect(firstSqlToken('SELECT * FROM pages')).toBe('select'); + expect(firstSqlToken(' insert into x values (1)')).toBe('insert'); + expect(firstSqlToken('UPDATE pages SET a=1')).toBe('update'); + expect(firstSqlToken('delete from pages')).toBe('delete'); + expect(firstSqlToken('(SELECT 1)')).toBe('select'); + }); + + it('collapses unknown/empty queries to "other"', () => { + expect(firstSqlToken('')).toBe('other'); + expect(firstSqlToken(undefined)).toBe('other'); + expect(firstSqlToken('123 not sql')).toBe('other'); + expect(firstSqlToken('vacuum analyze')).toBe('other'); + }); +}); diff --git a/apps/server/src/integrations/throttle/throttle.module.ts b/apps/server/src/integrations/throttle/throttle.module.ts index db197015..4ad0f9c4 100644 --- a/apps/server/src/integrations/throttle/throttle.module.ts +++ b/apps/server/src/integrations/throttle/throttle.module.ts @@ -9,6 +9,7 @@ import { AI_CHAT_THROTTLER, PAGE_TEMPLATE_THROTTLER, PUBLIC_SHARE_AI_THROTTLER, + VITALS_THROTTLER, } from './throttler-names'; @Module({ @@ -29,6 +30,8 @@ import { { name: PAGE_TEMPLATE_THROTTLER, ttl: 60_000, limit: 30 }, // Anonymous public-share assistant: ~5 req/min per IP. { name: PUBLIC_SHARE_AI_THROTTLER, ttl: 60_000, limit: 5 }, + // Anonymous client perf-telemetry sink: 120 batched posts/min per IP. + { name: VITALS_THROTTLER, ttl: 60_000, limit: 120 }, ], errorMessage: 'Too many requests', // Pass ioredis options (not a pre-built Redis instance) so diff --git a/apps/server/src/integrations/throttle/throttler-names.ts b/apps/server/src/integrations/throttle/throttler-names.ts index f1ab971e..c54b0898 100644 --- a/apps/server/src/integrations/throttle/throttler-names.ts +++ b/apps/server/src/integrations/throttle/throttler-names.ts @@ -6,3 +6,7 @@ export const PAGE_TEMPLATE_THROTTLER = 'page-template'; // ThrottlerGuard tracker) to bound anonymous abuse — the workspace owner pays // for the tokens. export const PUBLIC_SHARE_AI_THROTTLER = 'public-share-ai'; +// IP-keyed throttler for the anonymous client perf-telemetry sink +// (POST /api/telemetry/vitals). Browsers batch metrics, so the limit is +// generous; it only exists to bound abuse of the public, unauthenticated route. +export const VITALS_THROTTLER = 'vitals'; diff --git a/apps/server/src/main.ts b/apps/server/src/main.ts index b5fa0027..e31f2ddc 100644 --- a/apps/server/src/main.ts +++ b/apps/server/src/main.ts @@ -16,6 +16,9 @@ import { EnvironmentService } from './integrations/environment/environment.servi import { SANDBOX_API_PATH } from './integrations/sandbox/sandbox.constants'; import { resolveFrameHeader } from './common/helpers'; import { resolveTrustProxy } from './integrations/environment/trust-proxy.util'; +import { isMetricsEnabled } from './integrations/metrics/metrics.registry'; +import { recordHttpResponse } from './integrations/metrics/http-metrics.hook'; +import { startMetricsServer } from './integrations/metrics/metrics.server'; async function bootstrap() { const app = await NestFactory.create( @@ -91,6 +94,19 @@ async function bootstrap() { done(); }); + // #355 — HTTP request-duration histogram. Registered ONLY when METRICS_PORT is + // set (otherwise no collector runs at all). Uses the bounded route template + // label and excludes SSE/streaming responses (see recordHttpResponse). + if (isMetricsEnabled()) { + app + .getHttpAdapter() + .getInstance() + .addHook('onResponse', (req, reply, done) => { + recordHttpResponse(req, reply); + done(); + }); + } + app .getHttpAdapter() .getInstance() @@ -127,6 +143,9 @@ async function bootstrap() { '/api/workspace/create', '/api/workspace/joined', '/api/workspace/find-by-email', + // Public client perf-telemetry sink: browsers post it without a + // resolved workspace host, so the workspace-resolution gate must not 404 it. + '/api/telemetry/vitals', // Anonymous in-RAM blob sandbox: a remote consumer fetches blobs by an // unguessable UUID without any workspace host context, so the // workspace-resolution gate must not apply. @@ -175,6 +194,10 @@ async function bootstrap() { `Listening on http://127.0.0.1:${port} / ${process.env.APP_URL}`, ); }); + + // #355 — Prometheus scrape endpoint on a SEPARATE port (METRICS_PORT, default + // 9464), started after the app is up. No-op when METRICS_PORT is unset. + startMetricsServer(); } bootstrap(); diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 299f5e51..0d99a612 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -416,6 +416,9 @@ importers: socket.io-client: specifier: 4.8.3 version: 4.8.3 + web-vitals: + specifier: ^5.1.0 + version: 5.1.0 zod: specifier: 4.3.6 version: 4.3.6 @@ -744,6 +747,9 @@ importers: postmark: specifier: ^4.0.7 version: 4.0.7 + prom-client: + specifier: ^15.1.3 + version: 15.1.3 react: specifier: ^18.3.1 version: 18.3.1 @@ -5988,6 +5994,9 @@ packages: bind-event-listener@3.0.0: resolution: {integrity: sha512-PJvH288AWQhKs2v9zyfYdPzlPqf5bXbGMmhmUIY9x4dAUGIWgomO771oBQNwJnMQSnUIXhKu6sgzpBRXTlvb8Q==} + bintrees@1.0.2: + resolution: {integrity: sha512-VOMgTMwjAaUG580SXn3LacVgjurrbMme7ZZNYGSSV7mmtY6QQRh0Eg3pwIcntQ77DErK1L0NxkbetjcoXzVwKw==} + bl@4.1.0: resolution: {integrity: sha512-1W07cM9gS6DcLperZfFSj+bWLtaPGSOHWhPiGzXmvVJbRLdG82sH/Kn8EtW1VqWVA54AKf2h5k5BbnIbwF3h6w==} @@ -9318,6 +9327,10 @@ packages: process-warning@5.0.0: resolution: {integrity: sha512-a39t9ApHNx2L4+HBnQKqxxHNs1r7KF+Intd8Q/g1bUh6q0WIp9voPXJ/x0j+ZL45KF1pJd9+q2jLIRMfvEshkA==} + prom-client@15.1.3: + resolution: {integrity: sha512-6ZiOBfCywsD4k1BN9IX0uZhF+tJkV8q8llP64G5Hajs4JOeVLPCwpPVcpXy3BwYiUGgyJzsJJQeOIv7+hDSq8g==} + engines: {node: ^16 || ^18 || >=20} + prompts@2.4.2: resolution: {integrity: sha512-NxNv/kLguCA7p3jE8oL2aEBsrJWgAakBpgmgK6lpPWV+WuOmY6r2/zbAVnP+T8bQlA0nzHXSJSJW0Hq7ylaD2Q==} engines: {node: '>= 6'} @@ -10145,6 +10158,9 @@ packages: resolution: {integrity: sha512-4LeEWl96twnS2Q7Bz4MGqgazLqO+hJN63GZxXoIqh1T3VweYD997gbU1ItNsQafqqXTXd5WFyFdReLtwvRBNiw==} engines: {node: '>=18'} + tdigest@0.1.2: + resolution: {integrity: sha512-+G0LLgjjo9BZX2MfdvPfH+MKLCrxlXSYec5DaPYP1fe6Iyhf0/fSmJ0bFiZ1F8BT6cGXl2LpltQptzjXKWEkKA==} + terser-webpack-plugin@5.4.0: resolution: {integrity: sha512-Bn5vxm48flOIfkdl5CaD2+1CiUVbonWQ3KQPyP7/EuIl9Gbzq/gQFOzaMFUEgVjB1396tcK0SG8XcNJ/2kDH8g==} engines: {node: '>= 10.13.0'} @@ -16153,7 +16169,7 @@ snapshots: obug: 2.1.1 std-env: 4.1.0 tinyrainbow: 3.1.0 - vitest: 4.1.6(@opentelemetry/api@1.9.0)(@types/node@25.5.0)(@vitest/coverage-v8@4.1.6)(happy-dom@20.8.9)(jsdom@27.4.0(@noble/hashes@2.0.1))(vite@8.0.5(@types/node@25.5.0)(esbuild@0.28.0)(jiti@2.4.2)(less@4.2.0)(sugarss@5.0.1(postcss@8.5.14))(terser@5.39.0)(tsx@4.21.0)(yaml@2.8.3)) + vitest: 4.1.6(@opentelemetry/api@1.9.0)(@types/node@22.19.1)(@vitest/coverage-v8@4.1.6)(happy-dom@20.8.9)(jsdom@25.0.0)(vite@8.0.5(@types/node@22.19.1)(esbuild@0.28.0)(jiti@2.4.2)(less@4.2.0)(sugarss@5.0.1(postcss@8.5.14))(terser@5.39.0)(tsx@4.21.0)(yaml@2.8.3)) '@vitest/expect@4.1.6': dependencies: @@ -16665,6 +16681,8 @@ snapshots: bind-event-listener@3.0.0: {} + bintrees@1.0.2: {} + bl@4.1.0: dependencies: buffer: 5.7.1 @@ -20476,6 +20494,11 @@ snapshots: process-warning@5.0.0: {} + prom-client@15.1.3: + dependencies: + '@opentelemetry/api': 1.9.0 + tdigest: 0.1.2 + prompts@2.4.2: dependencies: kleur: 3.0.3 @@ -21521,6 +21544,10 @@ snapshots: minizlib: 3.1.0 yallist: 5.0.0 + tdigest@0.1.2: + dependencies: + bintrees: 1.0.2 + terser-webpack-plugin@5.4.0(@swc/core@1.5.25(@swc/helpers@0.5.5))(webpack@5.106.0(@swc/core@1.5.25(@swc/helpers@0.5.5))): dependencies: '@jridgewell/trace-mapping': 0.3.31