From b9f3de80f5b84d726d1d196f4cf2227f221d9633 Mon Sep 17 00:00:00 2001 From: agent_coder Date: Sat, 4 Jul 2026 23:10:29 +0300 Subject: [PATCH] =?UTF-8?q?feat(observability):=20dev-side=20perf=20metric?= =?UTF-8?q?s=20=E2=80=94=20/metrics=20+=20client=20vitals=20(#355)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The metrics INFRA is already deployed (VictoriaMetrics scraping docmost:9464, Grafana dashboards, alerts) with a target `gitmost-app` that is red because the app half didn't exist. This is that half. The contract (metric names, port, table, endpoint) is FIXED by the deployed infra and matched exactly. Server (prom-client): - A bare node:http `/metrics` server on METRICS_PORT (default 9464), SEPARATE from the Fastify :3000 listener so /metrics never exists publicly; the whole subsystem is OFF when METRICS_PORT is unset. - collectDefaultMetrics() + http_request_duration_seconds{method,route,status} via a Fastify onResponse hook using the ROUTE TEMPLATE (req.routeOptions.url, never the raw URL — bounded cardinality; 404 -> "unknown"), EXCLUDING SSE/ streaming responses (would record the connection lifetime and poison p95). - db_query_duration_seconds (Kysely log callback, labelled by the leading SQL token), bullmq_queue_depth{queue} (getJobCounts every 15s) + bullmq_job_duration_seconds{queue} (worker completed/failed), collab_store_duration_seconds (around onStoreDocument). - POST /api/telemetry/vitals — PUBLIC (sendBeacon) but IP-throttled; ~16KB body cap, <=50 events/batch, metric-name + rating whitelist, attr truncated to 120 chars, batch insert; malformed/foreign/oversized silently dropped and 200'd (no browser retry). New migration `client_metrics` (schema byte-identical to the contract, both indexes, conditional grafana_ro GRANT; no app-side retention — the maintenance container prunes >90d). Client (web-vitals): - initVitals() decides sampling ONCE per session (25%, sessionStorage) BEFORE subscribing; onINP/onLCP/onCLS/onTTFB (attribution) buffered + flushed via navigator.sendBeacon on visibilitychange:hidden and a timer (not fetch-per- metric). Custom: editor_tx_ms (dispatchTransaction sync-part timer, >8ms, with doc_size), page_open_ms, longtask_ms. Route labels are templates only; no titles/slugs/text. Gate: server + client tsc 0, frozen install 0 (added prom-client + web-vitals + regenerated the lock), server metrics/vitals tests 11, client route-template 5, and the migration verified valid against real Postgres. Co-Authored-By: Claude Opus 4.8 (1M context) --- apps/client/package.json | 1 + .../src/features/editor/page-editor.tsx | 32 ++ .../src/lib/telemetry/route-template.test.ts | 35 +++ .../src/lib/telemetry/route-template.ts | 70 +++++ apps/client/src/lib/telemetry/vitals.ts | 275 ++++++++++++++++++ apps/client/src/main.tsx | 5 + apps/server/package.json | 1 + apps/server/src/app.module.ts | 4 + .../extensions/persistence.extension.ts | 12 + .../telemetry/client-metrics.constants.ts | 93 ++++++ .../core/telemetry/client-telemetry.module.ts | 15 + .../src/core/telemetry/vitals.controller.ts | 47 +++ .../src/core/telemetry/vitals.service.spec.ts | 122 ++++++++ .../src/core/telemetry/vitals.service.ts | 70 +++++ apps/server/src/database/database.module.ts | 10 + .../20260704T120000-client-metrics.ts | 52 ++++ apps/server/src/database/types/db.d.ts | 13 + .../integrations/metrics/http-metrics.hook.ts | 46 +++ .../metrics/metrics-bull.service.ts | 146 ++++++++++ .../integrations/metrics/metrics.constants.ts | 80 +++++ .../integrations/metrics/metrics.module.ts | 13 + .../integrations/metrics/metrics.registry.ts | 126 ++++++++ .../integrations/metrics/metrics.server.ts | 56 ++++ .../src/integrations/metrics/metrics.spec.ts | 70 +++++ .../integrations/throttle/throttle.module.ts | 3 + .../integrations/throttle/throttler-names.ts | 4 + apps/server/src/main.ts | 23 ++ pnpm-lock.yaml | 29 +- 28 files changed, 1452 insertions(+), 1 deletion(-) create mode 100644 apps/client/src/lib/telemetry/route-template.test.ts create mode 100644 apps/client/src/lib/telemetry/route-template.ts create mode 100644 apps/client/src/lib/telemetry/vitals.ts create mode 100644 apps/server/src/core/telemetry/client-metrics.constants.ts create mode 100644 apps/server/src/core/telemetry/client-telemetry.module.ts create mode 100644 apps/server/src/core/telemetry/vitals.controller.ts create mode 100644 apps/server/src/core/telemetry/vitals.service.spec.ts create mode 100644 apps/server/src/core/telemetry/vitals.service.ts create mode 100644 apps/server/src/database/migrations/20260704T120000-client-metrics.ts create mode 100644 apps/server/src/integrations/metrics/http-metrics.hook.ts create mode 100644 apps/server/src/integrations/metrics/metrics-bull.service.ts create mode 100644 apps/server/src/integrations/metrics/metrics.constants.ts create mode 100644 apps/server/src/integrations/metrics/metrics.module.ts create mode 100644 apps/server/src/integrations/metrics/metrics.registry.ts create mode 100644 apps/server/src/integrations/metrics/metrics.server.ts create mode 100644 apps/server/src/integrations/metrics/metrics.spec.ts diff --git a/apps/client/package.json b/apps/client/package.json index 76d617da..b922efe9 100644 --- a/apps/client/package.json +++ b/apps/client/package.json @@ -61,6 +61,7 @@ "react-clear-modal": "^2.0.18", "react-dom": "^18.3.1", "react-drawio": "1.0.7", + "web-vitals": "^5.1.0", "react-error-boundary": "6.1.1", "react-helmet-async": "3.0.0", "react-i18next": "16.5.8", diff --git a/apps/client/src/features/editor/page-editor.tsx b/apps/client/src/features/editor/page-editor.tsx index 9953d6c5..8b2f2fc9 100644 --- a/apps/client/src/features/editor/page-editor.tsx +++ b/apps/client/src/features/editor/page-editor.tsx @@ -93,6 +93,10 @@ import { isBodyEditable, isCollabSynced, } from "@/features/editor/editor-sync-state"; +import { + measurePageOpen, + reportEditorTx, +} from "@/lib/telemetry/vitals"; interface PageEditorProps { pageId: string; @@ -351,6 +355,34 @@ export default function PageEditor({ editor.storage.pageId = pageId; handleScrollTo(editor); editorRef.current = editor; + + // #355 — page_open_ms: this is the first editor-content render, so + // measure against any page-open mark set on the tree-row/link click. + measurePageOpen(); + + // #355 — editor_tx_ms: time the SYNCHRONOUS part of applying each + // transaction (state.apply + updateState) by wrapping the view's + // dispatch. Only slow syncs (>8ms) are reported (see reportEditorTx), + // so the common path adds just one performance.now() pair. Passive: + // the original dispatch still runs unchanged. + try { + const view = editor.view as unknown as { + dispatch: (tr: unknown) => void; + }; + const originalDispatch = view.dispatch.bind(view); + view.dispatch = (tr: unknown) => { + const started = performance.now(); + originalDispatch(tr); + const elapsed = performance.now() - started; + try { + reportEditorTx(elapsed, editor.state.doc.content.size); + } catch { + // never let telemetry break editing + } + }; + } catch { + // if the view shape changes, skip editor_tx instrumentation + } } }, onUpdate({ editor }) { diff --git a/apps/client/src/lib/telemetry/route-template.test.ts b/apps/client/src/lib/telemetry/route-template.test.ts new file mode 100644 index 00000000..aa798a3d --- /dev/null +++ b/apps/client/src/lib/telemetry/route-template.test.ts @@ -0,0 +1,35 @@ +import { describe, it, expect } from "vitest"; +import { templateRoute } from "./route-template"; + +describe("templateRoute", () => { + it("templates a space page path (never leaks slugs)", () => { + const t = templateRoute("/s/engineering/p/design-doc-abc123"); + expect(t).toBe("/s/:space/p/:slug"); + expect(t).not.toContain("engineering"); + expect(t).not.toContain("design-doc"); + }); + + it("templates share, redirect and space paths", () => { + expect(templateRoute("/share/abc/p/xyz")).toBe("/share/:shareId/p/:slug"); + expect(templateRoute("/share/p/xyz")).toBe("/share/p/:slug"); + expect(templateRoute("/p/some-slug")).toBe("/p/:slug"); + expect(templateRoute("/s/team")).toBe("/s/:space"); + expect(templateRoute("/s/team/trash")).toBe("/s/:space/trash"); + expect(templateRoute("/labels/urgent")).toBe("/labels/:label"); + }); + + it("keeps known static routes verbatim", () => { + expect(templateRoute("/home")).toBe("/home"); + expect(templateRoute("/settings/members")).toBe("/settings/members"); + expect(templateRoute("/")).toBe("/"); + }); + + it("normalises a trailing slash", () => { + expect(templateRoute("/s/team/p/slug/")).toBe("/s/:space/p/:slug"); + }); + + it("collapses unknown paths to 'other' (bounded cardinality)", () => { + expect(templateRoute("/weird/unknown/thing")).toBe("other"); + expect(templateRoute("/s/team/p/slug/extra/segments")).toBe("other"); + }); +}); diff --git a/apps/client/src/lib/telemetry/route-template.ts b/apps/client/src/lib/telemetry/route-template.ts new file mode 100644 index 00000000..45fe4855 --- /dev/null +++ b/apps/client/src/lib/telemetry/route-template.ts @@ -0,0 +1,70 @@ +/** + * Map a raw pathname to a BOUNDED route TEMPLATE (#355). + * + * Perf metrics must be labelled by route template only — never a raw path with + * slugs/ids — so the server-side `route` column and any downstream aggregation + * stay low-cardinality and carry NO page slugs/titles (privacy). Anything that + * does not match a known pattern collapses to `other`. + * + * The template vocabulary mirrors the issue's example (`/s/:space/p/:slug`). + */ +const ROUTE_PATTERNS: { re: RegExp; template: string }[] = [ + // Share pages (public). + { re: /^\/share\/[^/]+\/p\/[^/]+$/, template: '/share/:shareId/p/:slug' }, + { re: /^\/share\/p\/[^/]+$/, template: '/share/p/:slug' }, + { re: /^\/share\/[^/]+$/, template: '/share/:shareId' }, + // Page redirect. + { re: /^\/p\/[^/]+$/, template: '/p/:slug' }, + // Space + page. + { re: /^\/s\/[^/]+\/p\/[^/]+$/, template: '/s/:space/p/:slug' }, + { re: /^\/s\/[^/]+\/trash$/, template: '/s/:space/trash' }, + { re: /^\/s\/[^/]+$/, template: '/s/:space' }, + // Misc dynamic. + { re: /^\/labels\/[^/]+$/, template: '/labels/:label' }, + { re: /^\/invites\/[^/]+$/, template: '/invites/:invitationId' }, + { re: /^\/settings\/groups\/[^/]+$/, template: '/settings/groups/:groupId' }, +]; + +// Static routes we accept verbatim (finite set). +const STATIC_ROUTES = new Set([ + '/home', + '/spaces', + '/favorites', + '/login', + '/forgot-password', + '/password-reset', + '/setup/register', + '/settings/account/profile', + '/settings/account/preferences', + '/settings/workspace', + '/settings/ai', + '/settings/members', + '/settings/groups', + '/settings/spaces', + '/settings/sharing', +]); + +export function templateRoute(pathname: string): string { + // Normalise a trailing slash (except root). + const path = + pathname.length > 1 && pathname.endsWith('/') + ? pathname.slice(0, -1) + : pathname; + + if (path === '' || path === '/') return '/'; + if (STATIC_ROUTES.has(path)) return path; + + for (const { re, template } of ROUTE_PATTERNS) { + if (re.test(path)) return template; + } + return 'other'; +} + +/** Template for the current window location. */ +export function currentRouteTemplate(): string { + try { + return templateRoute(window.location.pathname); + } catch { + return 'other'; + } +} diff --git a/apps/client/src/lib/telemetry/vitals.ts b/apps/client/src/lib/telemetry/vitals.ts new file mode 100644 index 00000000..b18bcba0 --- /dev/null +++ b/apps/client/src/lib/telemetry/vitals.ts @@ -0,0 +1,275 @@ +import { + onCLS, + onINP, + onLCP, + onTTFB, + type CLSMetricWithAttribution, + type INPMetricWithAttribution, + type LCPMetricWithAttribution, + type TTFBMetricWithAttribution, +} from "web-vitals/attribution"; +import { currentRouteTemplate } from "./route-template"; + +/** + * Client perf-telemetry (#355): web-vitals + custom metrics buffered and posted + * to POST /api/telemetry/vitals via sendBeacon. + * + * Design constraints from the issue: + * - Sampling is decided ONCE per session (25%), cached in sessionStorage, + * BEFORE any observer is subscribed. Non-sampled sessions send nothing. + * - Route labels are TEMPLATES only; attr is truncated to 120 chars; no page + * titles/slugs/text ever leave the browser. + * - Observers are passive and reporting is best-effort — telemetry must not + * degrade the perf it measures. + */ + +const ENDPOINT = "/api/telemetry/vitals"; +const SAMPLE_RATE = 0.25; +const SAMPLE_KEY = "gm_vitals_sampled"; +const FLUSH_INTERVAL_MS = 15_000; +const MAX_BUFFER = 40; // flush early if the buffer fills between timers +const MAX_ATTR_LENGTH = 120; +const EDITOR_TX_MIN_MS = 8; // only report editor transactions slower than this + +const ALLOWED_NAMES = new Set([ + "INP", + "LCP", + "CLS", + "TTFB", + "editor_tx_ms", + "page_open_ms", + "longtask_ms", +]); + +interface VitalEvent { + name: string; + value: number; + rating?: string; + route?: string; + attr?: string; + docSize?: number; +} + +let sampledCache: boolean | null = null; +let initialised = false; +let buffer: VitalEvent[] = []; +let longtaskSum = 0; // accumulated longtask duration (ms) for the current window + +/** + * Decide once per session whether this session is sampled. Cached in + * sessionStorage so the choice is stable across reloads within the session and + * identical for every observer/custom-metric caller. + */ +export function isVitalsSampled(): boolean { + if (sampledCache !== null) return sampledCache; + try { + const stored = sessionStorage.getItem(SAMPLE_KEY); + if (stored === "1") return (sampledCache = true); + if (stored === "0") return (sampledCache = false); + const sampled = Math.random() < SAMPLE_RATE; + sessionStorage.setItem(SAMPLE_KEY, sampled ? "1" : "0"); + return (sampledCache = sampled); + } catch { + // sessionStorage unavailable (private mode / SSR): default to not sampled. + return (sampledCache = false); + } +} + +function truncateAttr(value: unknown): string | undefined { + if (typeof value !== "string" || value.length === 0) return undefined; + return value.slice(0, MAX_ATTR_LENGTH); +} + +function enqueue(event: VitalEvent): void { + if (!ALLOWED_NAMES.has(event.name)) return; + if (!Number.isFinite(event.value)) return; + buffer.push(event); + if (buffer.length >= MAX_BUFFER) flush(); +} + +function flush(): void { + // Fold any pending longtask total into the batch first. + if (longtaskSum > 0) { + buffer.push({ + name: "longtask_ms", + value: Math.round(longtaskSum), + route: currentRouteTemplate(), + }); + longtaskSum = 0; + } + if (buffer.length === 0) return; + + const payload = JSON.stringify({ events: buffer }); + buffer = []; + + try { + const blob = new Blob([payload], { type: "application/json" }); + if (navigator.sendBeacon && navigator.sendBeacon(ENDPOINT, blob)) return; + // Fallback for browsers without sendBeacon: keepalive fetch. + void fetch(ENDPOINT, { + method: "POST", + body: payload, + headers: { "Content-Type": "application/json" }, + keepalive: true, + }).catch(() => undefined); + } catch { + // Best-effort: never throw out of telemetry. + } +} + +/** + * Report a custom client metric (editor_tx_ms, page_open_ms). No-op unless the + * session is sampled. Route is always the current TEMPLATE. + */ +export function reportClientMetric( + name: "editor_tx_ms" | "page_open_ms", + value: number, + extra?: { docSize?: number }, +): void { + if (!isVitalsSampled()) return; + if (!Number.isFinite(value)) return; + enqueue({ + name, + value, + route: currentRouteTemplate(), + docSize: extra?.docSize, + }); +} + +/** Threshold-gated editor transaction reporter (only reports slow syncs). */ +export function reportEditorTx(ms: number, docSize: number): void { + if (ms <= EDITOR_TX_MIN_MS) return; + reportClientMetric("editor_tx_ms", ms, { docSize }); +} + +const PAGE_OPEN_MARK = "gm_page_open_start"; + +/** Mark the start of a page-open interaction (tree-row / link click). */ +export function markPageOpenStart(): void { + try { + performance.clearMarks(PAGE_OPEN_MARK); + performance.mark(PAGE_OPEN_MARK); + } catch { + // ignore + } +} + +/** + * Measure page_open_ms at first editor-content render, if a start mark exists. + * Consumes the mark so a later render doesn't double-count. + */ +export function measurePageOpen(): void { + try { + const marks = performance.getEntriesByName(PAGE_OPEN_MARK, "mark"); + if (marks.length === 0) return; + const started = marks[0].startTime; + const elapsed = performance.now() - started; + performance.clearMarks(PAGE_OPEN_MARK); + if (elapsed > 0 && Number.isFinite(elapsed)) { + reportClientMetric("page_open_ms", elapsed); + } + } catch { + // ignore + } +} + +function attrTarget( + metric: + | INPMetricWithAttribution + | LCPMetricWithAttribution + | CLSMetricWithAttribution, +): string | undefined { + const a = metric.attribution as Record | undefined; + if (!a) return undefined; + // Different vitals expose their culprit element under different keys; only a + // CSS-selector-ish target string is taken (no text content / titles). + return ( + truncateAttr(a.interactionTarget) ?? + truncateAttr(a.element) ?? + truncateAttr(a.largestShiftTarget) ?? + undefined + ); +} + +/** + * Initialise client telemetry. Safe to call multiple times (idempotent). Returns + * immediately without subscribing when the session is not sampled — so a + * non-sampled session subscribes to NO observers and sends nothing. + */ +export function initVitals(): void { + if (initialised) return; + initialised = true; + + // Sampling gate is evaluated BEFORE any observer subscription. + if (!isVitalsSampled()) return; + + const report = ( + metric: + | INPMetricWithAttribution + | LCPMetricWithAttribution + | CLSMetricWithAttribution + | TTFBMetricWithAttribution, + ) => { + enqueue({ + name: metric.name, + value: metric.value, + rating: metric.rating, + route: currentRouteTemplate(), + attr: + metric.name === "TTFB" + ? undefined + : attrTarget( + metric as + | INPMetricWithAttribution + | LCPMetricWithAttribution + | CLSMetricWithAttribution, + ), + }); + }; + + onINP(report); + onLCP(report); + onCLS(report); + onTTFB(report); + + // Long tasks: aggregate the total blocking time per flush window (a passive + // observer; individual entries are summed, never stored/sent individually). + try { + if (typeof PerformanceObserver !== "undefined") { + const observer = new PerformanceObserver((list) => { + for (const entry of list.getEntries()) { + longtaskSum += entry.duration; + } + }); + observer.observe({ type: "longtask", buffered: true }); + } + } catch { + // longtask entry type unsupported: skip silently. + } + + // page_open_ms start: mark when the user clicks a page link/tree-row (any + // anchor navigating to a page URL). Passive capture listener; the matching + // measure fires at first editor-content render (measurePageOpen). No page + // titles/slugs are read — only the click timing is marked. + document.addEventListener( + "click", + (event) => { + const target = event.target as Element | null; + const anchor = target?.closest?.("a[href]") as HTMLAnchorElement | null; + if (!anchor) return; + const href = anchor.getAttribute("href") ?? ""; + // A page link is `/s/:space/p/:slug`, `/p/:slug` or a share page path. + if (/\/p\//.test(href)) markPageOpenStart(); + }, + { capture: true, passive: true }, + ); + + // Flush on tab hide (most reliable delivery point) and periodically. + const onHidden = () => { + if (document.visibilityState === "hidden") flush(); + }; + document.addEventListener("visibilitychange", onHidden); + window.addEventListener("pagehide", flush); + + setInterval(flush, FLUSH_INTERVAL_MS); +} diff --git a/apps/client/src/main.tsx b/apps/client/src/main.tsx index 5dbf6b81..0c19fe28 100644 --- a/apps/client/src/main.tsx +++ b/apps/client/src/main.tsx @@ -22,6 +22,7 @@ import { isPostHogEnabled, } from "@/lib/config.ts"; import posthog from "posthog-js"; +import { initVitals } from "@/lib/telemetry/vitals"; export const queryClient = new QueryClient({ defaultOptions: { @@ -43,6 +44,10 @@ if (isCloud() && isPostHogEnabled) { }); } +// #355 — client perf-telemetry. Decides sampling ONCE (25%/session) before +// subscribing to any observer; non-sampled sessions send nothing. +initVitals(); + const container = document.getElementById("root") as HTMLElement; const root = (container as any).__reactRoot ??= ReactDOM.createRoot(container); diff --git a/apps/server/package.json b/apps/server/package.json index 4e566d9d..517f1b3d 100644 --- a/apps/server/package.json +++ b/apps/server/package.json @@ -111,6 +111,7 @@ "pino-pretty": "^13.1.3", "postgres": "^3.4.8", "postmark": "^4.0.7", + "prom-client": "^15.1.3", "react": "^18.3.1", "react-email": "6.0.8", "reflect-metadata": "^0.2.2", diff --git a/apps/server/src/app.module.ts b/apps/server/src/app.module.ts index e257fe9b..381970fa 100644 --- a/apps/server/src/app.module.ts +++ b/apps/server/src/app.module.ts @@ -31,6 +31,8 @@ import { McpModule } from './integrations/mcp/mcp.module'; import { SandboxModule } from './integrations/sandbox/sandbox.module'; import { AiModule } from './integrations/ai/ai.module'; import { AiChatModule } from './core/ai-chat/ai-chat.module'; +import { MetricsModule } from './integrations/metrics/metrics.module'; +import { ClientTelemetryModule } from './core/telemetry/client-telemetry.module'; const enterpriseModules = []; try { @@ -93,6 +95,8 @@ try { SandboxModule, AiModule, AiChatModule, + MetricsModule, + ClientTelemetryModule, ...enterpriseModules, ], controllers: [AppController], diff --git a/apps/server/src/collaboration/extensions/persistence.extension.ts b/apps/server/src/collaboration/extensions/persistence.extension.ts index 18eb99e6..35310cf4 100644 --- a/apps/server/src/collaboration/extensions/persistence.extension.ts +++ b/apps/server/src/collaboration/extensions/persistence.extension.ts @@ -41,6 +41,7 @@ import { HISTORY_INTERVAL, } from '../constants'; import { TransclusionService } from '../../core/page/transclusion/transclusion.service'; +import { observeCollabStore } from '../../integrations/metrics/metrics.registry'; /** * #251 — wire format of the client→server stateless message that signals a @@ -192,6 +193,17 @@ export class PersistenceExtension implements Extension { } async onStoreDocument(data: onStoreDocumentPayload) { + // #355 — time the full store (persist + post-store side effects) into + // collab_store_duration_seconds. No-op when METRICS_PORT is unset. + const startedAt = performance.now(); + try { + await this.storeDocument(data); + } finally { + observeCollabStore((performance.now() - startedAt) / 1000); + } + } + + private async storeDocument(data: onStoreDocumentPayload) { const { documentName, document, context } = data; const pageId = getPageId(documentName); diff --git a/apps/server/src/core/telemetry/client-metrics.constants.ts b/apps/server/src/core/telemetry/client-metrics.constants.ts new file mode 100644 index 00000000..4f4cc70f --- /dev/null +++ b/apps/server/src/core/telemetry/client-metrics.constants.ts @@ -0,0 +1,93 @@ +/** + * Server-side whitelist + limits for POST /api/telemetry/vitals (#355). + * + * The endpoint is PUBLIC (browsers post it, no auth) so it is a privacy and + * abuse surface: everything not on these lists is silently DROPPED and the + * request still returns 200 (never 400 — a 400 would make browsers retry). + */ + +// The only metric names accepted. Anything else is dropped. +export const ALLOWED_METRIC_NAMES = new Set([ + 'INP', + 'LCP', + 'CLS', + 'TTFB', + 'editor_tx_ms', + 'page_open_ms', + 'longtask_ms', +]); + +// The only rating values accepted (web-vitals). Anything else -> null. +export const ALLOWED_RATINGS = new Set([ + 'good', + 'needs-improvement', + 'poor', +]); + +// Max events accepted per batch; the rest are ignored. +export const MAX_EVENTS_PER_BATCH = 50; + +// Defence-in-depth body cap (~16KB). Fastify's global bodyLimit is far larger, +// so we re-check the parsed payload size here and drop oversized batches. +export const MAX_BODY_BYTES = 16 * 1024; + +// attr is truncated to this many characters (attribution target only, no PII). +export const MAX_ATTR_LENGTH = 120; + +// route label sanity cap (client sends a template like /s/:space/p/:slug). +export const MAX_ROUTE_LENGTH = 200; + +export interface ClientMetricRow { + name: string; + value: number; + rating: string | null; + route: string | null; + attr: string | null; + docSize: number | null; + workspaceId: string | null; +} + +/** + * Validate + normalise a single incoming event into a DB row, or return null to + * DROP it. Pure so it is directly unit-testable. Enforces the name whitelist, + * numeric value, rating whitelist, attr truncation and doc_size (int) coercion. + */ +export function sanitizeVitalEvent( + raw: unknown, + workspaceId: string | null, +): ClientMetricRow | null { + if (!raw || typeof raw !== 'object') return null; + const e = raw as Record; + + const name = e.name; + if (typeof name !== 'string' || !ALLOWED_METRIC_NAMES.has(name)) return null; + + const value = + typeof e.value === 'number' && Number.isFinite(e.value) ? e.value : null; + if (value === null) return null; + + const rating = + typeof e.rating === 'string' && ALLOWED_RATINGS.has(e.rating) + ? e.rating + : null; + + let route: string | null = null; + if (typeof e.route === 'string' && e.route.length > 0) { + route = e.route.slice(0, MAX_ROUTE_LENGTH); + } + + let attr: string | null = null; + if (typeof e.attr === 'string' && e.attr.length > 0) { + attr = e.attr.slice(0, MAX_ATTR_LENGTH); + } + + let docSize: number | null = null; + if (typeof e.docSize === 'number' && Number.isFinite(e.docSize)) { + docSize = Math.trunc(e.docSize); + } else if (typeof e.doc_size === 'number' && Number.isFinite(e.doc_size)) { + // Accept snake_case too, in case a client sends the raw column name. + docSize = Math.trunc(e.doc_size as number); + } + + return { name, value, rating, route, attr, docSize, workspaceId }; +} diff --git a/apps/server/src/core/telemetry/client-telemetry.module.ts b/apps/server/src/core/telemetry/client-telemetry.module.ts new file mode 100644 index 00000000..cf94efc0 --- /dev/null +++ b/apps/server/src/core/telemetry/client-telemetry.module.ts @@ -0,0 +1,15 @@ +import { Module } from '@nestjs/common'; +import { VitalsController } from './vitals.controller'; +import { VitalsService } from './vitals.service'; + +/** + * Client perf-telemetry (#355): the public /api/telemetry/vitals sink that + * persists web-vitals + custom client metrics into `client_metrics`. + * Named ClientTelemetryModule to avoid confusion with the unrelated + * integrations/telemetry (product usage ping) module. + */ +@Module({ + controllers: [VitalsController], + providers: [VitalsService], +}) +export class ClientTelemetryModule {} diff --git a/apps/server/src/core/telemetry/vitals.controller.ts b/apps/server/src/core/telemetry/vitals.controller.ts new file mode 100644 index 00000000..62812671 --- /dev/null +++ b/apps/server/src/core/telemetry/vitals.controller.ts @@ -0,0 +1,47 @@ +import { + Body, + Controller, + HttpCode, + Post, + Req, + UseGuards, +} from '@nestjs/common'; +import { Throttle, ThrottlerGuard } from '@nestjs/throttler'; +import { FastifyRequest } from 'fastify'; +import { Public } from '../../common/decorators/public.decorator'; +import { VITALS_THROTTLER } from '../../integrations/throttle/throttler-names'; +import { VitalsService } from './vitals.service'; + +/** + * POST /api/telemetry/vitals (#355) — public client perf-metrics sink. + * + * PUBLIC (browsers post via sendBeacon, no session) but IP-throttled. Always + * returns 200 with no body of interest: invalid/foreign/oversized payloads are + * silently dropped by the service rather than 400'd, so browsers never retry. + */ +@Controller('telemetry') +export class VitalsController { + constructor(private readonly vitalsService: VitalsService) {} + + @Public() + @UseGuards(ThrottlerGuard) + @Throttle({ [VITALS_THROTTLER]: { limit: 120, ttl: 60_000 } }) + @Post('vitals') + @HttpCode(200) + async vitals( + @Body() body: unknown, + @Req() req: FastifyRequest, + ): Promise<{ ok: true }> { + // workspaceId is resolved by the workspace-host middleware onto req.raw when + // the browser posts from a workspace host; null otherwise. No other PII. + const workspaceId = + ((req.raw as unknown as { workspaceId?: string })?.workspaceId ?? null) || + null; + try { + await this.vitalsService.ingest(body, workspaceId); + } catch { + // Never surface storage errors to the browser; telemetry is best-effort. + } + return { ok: true }; + } +} diff --git a/apps/server/src/core/telemetry/vitals.service.spec.ts b/apps/server/src/core/telemetry/vitals.service.spec.ts new file mode 100644 index 00000000..2fca1228 --- /dev/null +++ b/apps/server/src/core/telemetry/vitals.service.spec.ts @@ -0,0 +1,122 @@ +import { VitalsService } from './vitals.service'; +import { MAX_ATTR_LENGTH } from './client-metrics.constants'; + +// buildRows is pure (no DB access), so a null db is fine here. +const svc = new VitalsService(null as any); + +describe('VitalsService.buildRows', () => { + const WS = 'ws-uuid'; + + it('accepts a valid batch and maps whitelisted fields to rows', () => { + const body = { + events: [ + { name: 'INP', value: 123.4, rating: 'good', route: '/s/:space/p/:slug' }, + { name: 'editor_tx_ms', value: 12, route: '/s/:space/p/:slug', docSize: 4096 }, + ], + }; + const rows = svc.buildRows(body, WS); + expect(rows).toHaveLength(2); + expect(rows[0]).toEqual({ + name: 'INP', + value: 123.4, + rating: 'good', + route: '/s/:space/p/:slug', + attr: null, + docSize: null, + workspaceId: WS, + }); + expect(rows[1].name).toBe('editor_tx_ms'); + expect(rows[1].docSize).toBe(4096); + expect(rows[1].workspaceId).toBe(WS); + }); + + it('accepts a bare array body', () => { + const rows = svc.buildRows([{ name: 'LCP', value: 1 }], WS); + expect(rows).toHaveLength(1); + expect(rows[0].name).toBe('LCP'); + }); + + it('drops events with foreign metric names', () => { + const rows = svc.buildRows( + { events: [{ name: 'evil_metric', value: 1 }, { name: 'LCP', value: 2 }] }, + WS, + ); + expect(rows).toHaveLength(1); + expect(rows[0].name).toBe('LCP'); + }); + + it('drops events with a non-numeric or missing value', () => { + const rows = svc.buildRows( + { + events: [ + { name: 'CLS', value: 'nan' }, + { name: 'CLS' }, + { name: 'CLS', value: 0.1 }, + ], + }, + WS, + ); + expect(rows).toHaveLength(1); + expect(rows[0].value).toBe(0.1); + }); + + it('strips foreign fields and only keeps whitelisted columns', () => { + const rows = svc.buildRows( + { events: [{ name: 'TTFB', value: 5, secret: 'drop-me', title: 'my page' }] }, + WS, + ); + expect(rows).toHaveLength(1); + expect(Object.keys(rows[0]).sort()).toEqual( + ['attr', 'docSize', 'name', 'rating', 'route', 'value', 'workspaceId'].sort(), + ); + expect((rows[0] as any).secret).toBeUndefined(); + expect((rows[0] as any).title).toBeUndefined(); + }); + + it('rejects a rating outside the allowed set (-> null)', () => { + const rows = svc.buildRows( + { events: [{ name: 'INP', value: 1, rating: 'terrible' }] }, + WS, + ); + expect(rows[0].rating).toBeNull(); + }); + + it('truncates attr to 120 chars', () => { + const longAttr = 'a'.repeat(500); + const rows = svc.buildRows( + { events: [{ name: 'INP', value: 1, attr: longAttr }] }, + WS, + ); + expect(rows[0].attr).toHaveLength(MAX_ATTR_LENGTH); + }); + + it('caps the batch at 50 events', () => { + const events = Array.from({ length: 200 }, () => ({ name: 'CLS', value: 1 })); + const rows = svc.buildRows({ events }, WS); + expect(rows).toHaveLength(50); + }); + + it('drops an oversized (>16KB) payload wholesale', () => { + const events = Array.from({ length: 50 }, () => ({ + name: 'INP', + value: 1, + attr: 'x'.repeat(400), + route: '/s/:space/p/:slug', + })); + // Serialised body far exceeds 16KB. + const rows = svc.buildRows({ events }, WS); + expect(rows).toHaveLength(0); + }); + + it('returns [] for malformed bodies', () => { + expect(svc.buildRows(null, WS)).toEqual([]); + expect(svc.buildRows('nope', WS)).toEqual([]); + expect(svc.buildRows({ notEvents: 1 }, WS)).toEqual([]); + expect(svc.buildRows(42, WS)).toEqual([]); + }); + + it('carries a null workspaceId through', () => { + const rows = svc.buildRows({ events: [{ name: 'LCP', value: 1 }] }, null); + expect(rows[0].workspaceId).toBeNull(); + }); +}); diff --git a/apps/server/src/core/telemetry/vitals.service.ts b/apps/server/src/core/telemetry/vitals.service.ts new file mode 100644 index 00000000..076a2506 --- /dev/null +++ b/apps/server/src/core/telemetry/vitals.service.ts @@ -0,0 +1,70 @@ +import { Injectable } from '@nestjs/common'; +import { InjectKysely } from 'nestjs-kysely'; +import { KyselyDB } from '@docmost/db/types/kysely.types'; +import { + ClientMetricRow, + MAX_BODY_BYTES, + MAX_EVENTS_PER_BATCH, + sanitizeVitalEvent, +} from './client-metrics.constants'; + +@Injectable() +export class VitalsService { + constructor(@InjectKysely() private readonly db: KyselyDB) {} + + /** + * Turn a raw request body into the (bounded, whitelisted) rows to persist. + * Pure/synchronous so it is unit-testable without a DB. Returns [] for any + * malformed / oversized / foreign input — the caller still responds 200. + */ + buildRows(body: unknown, workspaceId: string | null): ClientMetricRow[] { + if (!body || typeof body !== 'object') return []; + + // Defence-in-depth body cap (~16KB): drop oversized batches wholesale. + try { + if (JSON.stringify(body).length > MAX_BODY_BYTES) return []; + } catch { + return []; + } + + // Accept either a bare array or `{ events: [...] }`. + const events = Array.isArray(body) + ? body + : Array.isArray((body as { events?: unknown }).events) + ? ((body as { events: unknown[] }).events as unknown[]) + : null; + if (!events) return []; + + const rows: ClientMetricRow[] = []; + for (const event of events) { + if (rows.length >= MAX_EVENTS_PER_BATCH) break; + const row = sanitizeVitalEvent(event, workspaceId); + if (row) rows.push(row); + } + return rows; + } + + /** Batch-insert the sanitised rows in a single statement. No-op on []. */ + async insertRows(rows: ClientMetricRow[]): Promise { + if (rows.length === 0) return; + await this.db + .insertInto('clientMetrics') + .values( + rows.map((r) => ({ + name: r.name, + value: r.value, + rating: r.rating, + route: r.route, + attr: r.attr, + docSize: r.docSize, + workspaceId: r.workspaceId, + })), + ) + .execute(); + } + + async ingest(body: unknown, workspaceId: string | null): Promise { + const rows = this.buildRows(body, workspaceId); + await this.insertRows(rows); + } +} diff --git a/apps/server/src/database/database.module.ts b/apps/server/src/database/database.module.ts index 4e9c5d13..ea56ed11 100644 --- a/apps/server/src/database/database.module.ts +++ b/apps/server/src/database/database.module.ts @@ -40,6 +40,8 @@ import { PageListener } from '@docmost/db/listeners/page.listener'; import { PostgresJSDialect } from 'kysely-postgres-js'; import * as postgres from 'postgres'; import { normalizePostgresUrl } from '../common/helpers'; +import { observeDbQuery } from '../integrations/metrics/metrics.registry'; +import { firstSqlToken } from '../integrations/metrics/metrics.constants'; @Global() @Module({ @@ -67,6 +69,14 @@ import { normalizePostgresUrl } from '../common/helpers'; }), plugins: [new CamelCasePlugin()], log: (event: LogEvent) => { + // #355 — db_query_duration_seconds, labelled by the leading SQL token + // (bounded cardinality). No-op when METRICS_PORT is unset. Runs for + // every query, independent of the dev-only debug logging below. + observeDbQuery( + firstSqlToken(event.query.sql), + event.queryDurationMillis / 1000, + ); + if (environmentService.getNodeEnv() !== 'development') return; const logger = new Logger(DatabaseModule.name); if (process.env.DEBUG_DB?.toLowerCase() === 'true') { diff --git a/apps/server/src/database/migrations/20260704T120000-client-metrics.ts b/apps/server/src/database/migrations/20260704T120000-client-metrics.ts new file mode 100644 index 00000000..92c42334 --- /dev/null +++ b/apps/server/src/database/migrations/20260704T120000-client-metrics.ts @@ -0,0 +1,52 @@ +import { type Kysely, sql } from 'kysely'; + +/** + * #355 — `client_metrics`: raw sink for client-side perf telemetry (web-vitals + * + custom editor/page metrics) posted to /api/telemetry/vitals. + * + * The table/columns/indexes here are a FIXED contract shared with the deployed + * Grafana infra (the `grafana_ro` role reads this table; a separate maintenance + * container prunes rows >90d and re-GRANTs daily). No app-side retention is + * added on purpose. Written as raw SQL to match that contract 1:1 (identity PK, + * conditional GRANT). + */ +export async function up(db: Kysely): Promise { + await sql` + CREATE TABLE client_metrics ( + id bigint GENERATED ALWAYS AS IDENTITY PRIMARY KEY, + created_at timestamptz NOT NULL DEFAULT now(), + name text NOT NULL, -- INP|LCP|CLS|TTFB|editor_tx_ms|page_open_ms|longtask_ms + value double precision NOT NULL, + rating text, -- good|needs-improvement|poor (web-vitals only) + route text, -- templated: /s/:space/p/:slug — never raw slugs + attr text, -- attribution target, truncated to 120 chars + doc_size int, -- editor_tx_ms only + workspace_id uuid + ) + `.execute(db); + + await sql` + CREATE INDEX idx_client_metrics_name_created + ON client_metrics (name, created_at) + `.execute(db); + + await sql` + CREATE INDEX idx_client_metrics_created + ON client_metrics (created_at) + `.execute(db); + + // The read-only Grafana role only exists in the deployed environment; guard so + // the migration still applies cleanly in dev/CI where the role is absent. + await sql` + DO $$ + BEGIN + IF EXISTS (SELECT FROM pg_roles WHERE rolname = 'grafana_ro') THEN + GRANT SELECT ON client_metrics TO grafana_ro; + END IF; + END $$; + `.execute(db); +} + +export async function down(db: Kysely): Promise { + await sql`DROP TABLE IF EXISTS client_metrics`.execute(db); +} diff --git a/apps/server/src/database/types/db.d.ts b/apps/server/src/database/types/db.d.ts index dab98b3d..d5ec8d5e 100644 --- a/apps/server/src/database/types/db.d.ts +++ b/apps/server/src/database/types/db.d.ts @@ -156,6 +156,18 @@ export interface Billing { workspaceId: string; } +export interface ClientMetrics { + id: Generated; + createdAt: Generated; + name: string; + value: number; + rating: string | null; + route: string | null; + attr: string | null; + docSize: number | null; + workspaceId: string | null; +} + export interface Comments { aiChatId: string | null; content: Json | null; @@ -691,6 +703,7 @@ export interface DB { authProviders: AuthProviders; backlinks: Backlinks; billing: Billing; + clientMetrics: ClientMetrics; comments: Comments; favorites: Favorites; fileTasks: FileTasks; diff --git a/apps/server/src/integrations/metrics/http-metrics.hook.ts b/apps/server/src/integrations/metrics/http-metrics.hook.ts new file mode 100644 index 00000000..a01a7d1a --- /dev/null +++ b/apps/server/src/integrations/metrics/http-metrics.hook.ts @@ -0,0 +1,46 @@ +import { FastifyReply, FastifyRequest } from 'fastify'; +import { isStreamingResponse } from './metrics.constants'; +import { observeHttp } from './metrics.registry'; + +/** + * Resolve the BOUNDED route label for an HTTP response. + * + * HARD REQUIREMENT (#355): use the ROUTE TEMPLATE (`/pages/:id`), NEVER the raw + * URL (`/pages/abc-123`), so label cardinality stays finite. Fastify exposes the + * matched template on `req.routeOptions.url`. On 404s (no route matched) that is + * missing → collapse to the literal `unknown`. + */ +export function resolveRouteLabel(req: FastifyRequest): string { + const url = req.routeOptions?.url; + return typeof url === 'string' && url.length > 0 ? url : 'unknown'; +} + +/** + * Fastify onResponse handler that records http_request_duration_seconds. + * No-op when metrics are disabled (the hook is only registered when enabled, + * but the observe helpers are also guarded). Never throws into the response + * pipeline — telemetry must not break request handling. + */ +export function recordHttpResponse( + req: FastifyRequest, + reply: FastifyReply, +): void { + try { + const route = resolveRouteLabel(req); + + // Exclude SSE/streaming responses: onResponse fires at connection close for + // those, so it would record the stream lifetime and poison p95/p99. + const contentType = reply.getHeader('content-type'); + if (isStreamingResponse(contentType, route)) return; + + observeHttp( + req.method, + route, + reply.statusCode, + // Fastify measures elapsed time in ms; the metric is in seconds. + reply.elapsedTime / 1000, + ); + } catch { + // Swallow: a telemetry failure must never affect the served response. + } +} diff --git a/apps/server/src/integrations/metrics/metrics-bull.service.ts b/apps/server/src/integrations/metrics/metrics-bull.service.ts new file mode 100644 index 00000000..04722cd1 --- /dev/null +++ b/apps/server/src/integrations/metrics/metrics-bull.service.ts @@ -0,0 +1,146 @@ +import { + Injectable, + Logger, + OnModuleDestroy, + OnModuleInit, +} from '@nestjs/common'; +import { InjectQueue } from '@nestjs/bullmq'; +import { Queue, QueueEvents } from 'bullmq'; +import { QueueName } from '../queue/constants'; +import { EnvironmentService } from '../environment/environment.service'; +import { parseRedisUrl } from '../../common/helpers'; +import { + isMetricsEnabled, + observeJobDuration, + setQueueDepth, +} from './metrics.registry'; + +const POLL_INTERVAL_MS = 15_000; +// Cap the in-flight start-time map so a job that never emits completed/failed +// (worker crash) cannot leak memory unbounded. Well above realistic concurrency. +const MAX_INFLIGHT = 10_000; + +/** + * BullMQ instrumentation for #355: + * - `bullmq_queue_depth{queue}`: polled from getJobCounts() every 15s. + * - `bullmq_job_duration_seconds{queue}`: wall-clock time between a job going + * `active` and `completed`/`failed`, observed via per-queue QueueEvents. + * + * Queue names are a FINITE list (the QueueName enum), so labels are bounded — no + * job ids ever enter a label. Everything is gated on METRICS_PORT: when metrics + * are off, onModuleInit does nothing (no interval, no QueueEvents connections). + */ +@Injectable() +export class MetricsBullService implements OnModuleInit, OnModuleDestroy { + private readonly logger = new Logger(MetricsBullService.name); + private readonly queues: { label: string; queue: Queue }[]; + private timer: NodeJS.Timeout | null = null; + private queueEvents: QueueEvents[] = []; + // jobId -> start timestamp (ms). Bounded by MAX_INFLIGHT. + private readonly inflight = new Map(); + + constructor( + private readonly environmentService: EnvironmentService, + @InjectQueue(QueueName.EMAIL_QUEUE) emailQueue: Queue, + @InjectQueue(QueueName.ATTACHMENT_QUEUE) attachmentQueue: Queue, + @InjectQueue(QueueName.GENERAL_QUEUE) generalQueue: Queue, + @InjectQueue(QueueName.BILLING_QUEUE) billingQueue: Queue, + @InjectQueue(QueueName.FILE_TASK_QUEUE) fileTaskQueue: Queue, + @InjectQueue(QueueName.SEARCH_QUEUE) searchQueue: Queue, + @InjectQueue(QueueName.AI_QUEUE) aiQueue: Queue, + @InjectQueue(QueueName.HISTORY_QUEUE) historyQueue: Queue, + @InjectQueue(QueueName.NOTIFICATION_QUEUE) notificationQueue: Queue, + @InjectQueue(QueueName.AUDIT_QUEUE) auditQueue: Queue, + ) { + this.queues = [ + { label: 'email', queue: emailQueue }, + { label: 'attachment', queue: attachmentQueue }, + { label: 'general', queue: generalQueue }, + { label: 'billing', queue: billingQueue }, + { label: 'file-task', queue: fileTaskQueue }, + { label: 'search', queue: searchQueue }, + { label: 'ai', queue: aiQueue }, + { label: 'history', queue: historyQueue }, + { label: 'notification', queue: notificationQueue }, + { label: 'audit', queue: auditQueue }, + ]; + } + + onModuleInit(): void { + if (!isMetricsEnabled()) return; + + // Poll queue depth. + this.timer = setInterval(() => { + void this.pollDepths(); + }, POLL_INTERVAL_MS); + // Do not keep the event loop alive solely for polling. + this.timer.unref?.(); + void this.pollDepths(); + + // Wire per-queue job-duration events. + const redisConfig = parseRedisUrl(this.environmentService.getRedisUrl()); + const connection = { + host: redisConfig.host, + port: redisConfig.port, + password: redisConfig.password, + db: redisConfig.db, + family: redisConfig.family, + }; + + for (const { label, queue } of this.queues) { + const events = new QueueEvents(queue.name, { connection }); + events.on('active', ({ jobId }) => { + if (this.inflight.size >= MAX_INFLIGHT) { + // Drop the oldest tracked start to keep the map bounded. + const oldest = this.inflight.keys().next().value; + if (oldest !== undefined) this.inflight.delete(oldest); + } + this.inflight.set(jobId, Date.now()); + }); + const finalize = ({ jobId }: { jobId: string }) => { + const start = this.inflight.get(jobId); + if (start === undefined) return; + this.inflight.delete(jobId); + observeJobDuration(label, (Date.now() - start) / 1000); + }; + events.on('completed', finalize); + events.on('failed', finalize); + events.on('error', (err) => { + this.logger.debug(`QueueEvents error (${label}): ${err?.message}`); + }); + this.queueEvents.push(events); + } + } + + private async pollDepths(): Promise { + for (const { label, queue } of this.queues) { + try { + const counts = await queue.getJobCounts(); + // "Depth" = jobs not yet finished (backlog + in-flight). + const depth = + (counts.waiting ?? 0) + + (counts.active ?? 0) + + (counts.delayed ?? 0) + + (counts.prioritized ?? 0) + + (counts.paused ?? 0); + setQueueDepth(label, depth); + } catch (err) { + this.logger.debug( + `Failed to read job counts for ${label}: ${(err as Error)?.message}`, + ); + } + } + } + + async onModuleDestroy(): Promise { + if (this.timer) { + clearInterval(this.timer); + this.timer = null; + } + await Promise.all( + this.queueEvents.map((e) => e.close().catch(() => undefined)), + ); + this.queueEvents = []; + this.inflight.clear(); + } +} diff --git a/apps/server/src/integrations/metrics/metrics.constants.ts b/apps/server/src/integrations/metrics/metrics.constants.ts new file mode 100644 index 00000000..16ad4950 --- /dev/null +++ b/apps/server/src/integrations/metrics/metrics.constants.ts @@ -0,0 +1,80 @@ +/** + * Perf-metrics contract (#355). These names/labels are FIXED by the already + * deployed scrape+dashboard infra (VictoriaMetrics scraping docmost:9464, + * Grafana dashboards, alerts). Do NOT rename them. + */ +export const METRIC_HTTP_REQUEST_DURATION = 'http_request_duration_seconds'; +export const METRIC_DB_QUERY_DURATION = 'db_query_duration_seconds'; +export const METRIC_BULLMQ_QUEUE_DEPTH = 'bullmq_queue_depth'; +export const METRIC_BULLMQ_JOB_DURATION = 'bullmq_job_duration_seconds'; +export const METRIC_COLLAB_STORE_DURATION = 'collab_store_duration_seconds'; + +// Histogram buckets (seconds). Chosen to give useful p50/p95/p99 resolution +// for typical web/DB latencies without exploding series cardinality. +export const HTTP_BUCKETS = [ + 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, +]; +export const DB_BUCKETS = [ + 0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, +]; +export const COLLAB_BUCKETS = [ + 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, +]; +export const JOB_BUCKETS = [ + 0.01, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30, 60, 120, +]; + +/** + * Extract the first SQL token (select/insert/update/delete/...) from a query, + * lower-cased, to use as a BOUNDED label for db_query_duration_seconds. Using + * the full query text would blow up label cardinality; the leading keyword is a + * finite set. Unknown/empty queries collapse to `other`. + */ +export function firstSqlToken(sql: string | undefined): string { + if (!sql) return 'other'; + // Skip leading whitespace / comments and grab the first word. + const match = /^[\s(]*([a-zA-Z]+)/.exec(sql); + if (!match) return 'other'; + const token = match[1].toLowerCase(); + const known = new Set([ + 'select', + 'insert', + 'update', + 'delete', + 'with', + 'begin', + 'commit', + 'rollback', + 'alter', + 'create', + 'drop', + 'truncate', + 'explain', + ]); + return known.has(token) ? token : 'other'; +} + +/** + * Whether an HTTP response must be EXCLUDED from http_request_duration_seconds. + * + * SSE/streaming responses (the AI-chat `text/event-stream`) keep the connection + * open for the whole conversation, so Fastify's onResponse fires only when the + * client disconnects — recording the connection lifetime, not a response time, + * which would poison p95/p99. We skip by content-type (authoritative) with a + * route-suffix fallback for the two known stream endpoints. + */ +export function isStreamingResponse( + contentType: unknown, + route: string | undefined, +): boolean { + if ( + typeof contentType === 'string' && + contentType.toLowerCase().includes('text/event-stream') + ) { + return true; + } + // Fallback: the AI-chat stream routes (/api/ai-chat/stream, + // /api/shares/ai/stream) both end in `/stream`. + if (route && route.endsWith('/stream')) return true; + return false; +} diff --git a/apps/server/src/integrations/metrics/metrics.module.ts b/apps/server/src/integrations/metrics/metrics.module.ts new file mode 100644 index 00000000..a89bdff0 --- /dev/null +++ b/apps/server/src/integrations/metrics/metrics.module.ts @@ -0,0 +1,13 @@ +import { Module } from '@nestjs/common'; +import { MetricsBullService } from './metrics-bull.service'; + +/** + * Wires the BullMQ collectors (#355). The queues are provided by the @Global + * QueueModule (which exports BullModule), so no re-registration is needed here. + * The HTTP histogram, DB-query and collab-store collectors live in module-level + * singletons (metrics.registry) and are wired directly at their call sites. + */ +@Module({ + providers: [MetricsBullService], +}) +export class MetricsModule {} diff --git a/apps/server/src/integrations/metrics/metrics.registry.ts b/apps/server/src/integrations/metrics/metrics.registry.ts new file mode 100644 index 00000000..d62be165 --- /dev/null +++ b/apps/server/src/integrations/metrics/metrics.registry.ts @@ -0,0 +1,126 @@ +import { + collectDefaultMetrics, + Histogram, + Gauge, + Registry, +} from 'prom-client'; +import { + COLLAB_BUCKETS, + DB_BUCKETS, + HTTP_BUCKETS, + JOB_BUCKETS, + METRIC_BULLMQ_JOB_DURATION, + METRIC_BULLMQ_QUEUE_DEPTH, + METRIC_COLLAB_STORE_DURATION, + METRIC_DB_QUERY_DURATION, + METRIC_HTTP_REQUEST_DURATION, +} from './metrics.constants'; + +/** + * Process-wide perf-metrics registry (#355). + * + * This is a plain module singleton (NOT a Nest provider) because the collectors + * are cross-cutting: the Kysely `log` callback (built in a DI factory), the + * Fastify onResponse hook (main.ts, before the Nest container hands out + * providers) and the collab persistence extension all need the SAME instruments + * without threading DI through them. + * + * HARD CONTRACT: when `METRICS_PORT` is unset the whole subsystem is OFF — the + * registry is never created, `collectDefaultMetrics` never runs, and every + * observe/set helper is a cheap no-op. Nothing is exposed on :3000. + */ + +// Decided once at process start. Deliberately read here (not via +// EnvironmentService) so the toggle is identical for the DI and non-DI callers. +const enabled = Boolean(process.env.METRICS_PORT); + +let registry: Registry | null = null; +let httpHist: Histogram<'method' | 'route' | 'status'> | null = null; +let dbHist: Histogram<'op'> | null = null; +let queueDepthGauge: Gauge<'queue'> | null = null; +let jobHist: Histogram<'queue'> | null = null; +let collabHist: Histogram | null = null; + +function init(): void { + if (registry || !enabled) return; + + registry = new Registry(); + + // Node/runtime metrics: gives nodejs_eventloop_lag_p99_seconds, GC, heap, etc. + collectDefaultMetrics({ register: registry }); + + httpHist = new Histogram({ + name: METRIC_HTTP_REQUEST_DURATION, + help: 'HTTP request duration in seconds, by method, route template and status', + labelNames: ['method', 'route', 'status'], + buckets: HTTP_BUCKETS, + registers: [registry], + }); + + dbHist = new Histogram({ + name: METRIC_DB_QUERY_DURATION, + help: 'Database query duration in seconds, by leading SQL keyword', + labelNames: ['op'], + buckets: DB_BUCKETS, + registers: [registry], + }); + + queueDepthGauge = new Gauge({ + name: METRIC_BULLMQ_QUEUE_DEPTH, + help: 'Number of not-yet-finished BullMQ jobs per queue', + labelNames: ['queue'], + registers: [registry], + }); + + jobHist = new Histogram({ + name: METRIC_BULLMQ_JOB_DURATION, + help: 'BullMQ job processing duration in seconds, per queue', + labelNames: ['queue'], + buckets: JOB_BUCKETS, + registers: [registry], + }); + + collabHist = new Histogram({ + name: METRIC_COLLAB_STORE_DURATION, + help: 'Collaboration onStoreDocument duration in seconds', + buckets: COLLAB_BUCKETS, + registers: [registry], + }); +} + +// Runs once when this module is first imported. Safe to call again (idempotent). +init(); + +export function isMetricsEnabled(): boolean { + return enabled; +} + +/** The prom-client registry, or null when metrics are disabled. */ +export function getMetricsRegistry(): Registry | null { + return registry; +} + +export function observeHttp( + method: string, + route: string, + status: number, + seconds: number, +): void { + httpHist?.observe({ method, route, status }, seconds); +} + +export function observeDbQuery(op: string, seconds: number): void { + dbHist?.observe({ op }, seconds); +} + +export function setQueueDepth(queue: string, depth: number): void { + queueDepthGauge?.set({ queue }, depth); +} + +export function observeJobDuration(queue: string, seconds: number): void { + jobHist?.observe({ queue }, seconds); +} + +export function observeCollabStore(seconds: number): void { + collabHist?.observe(seconds); +} diff --git a/apps/server/src/integrations/metrics/metrics.server.ts b/apps/server/src/integrations/metrics/metrics.server.ts new file mode 100644 index 00000000..f1b90b15 --- /dev/null +++ b/apps/server/src/integrations/metrics/metrics.server.ts @@ -0,0 +1,56 @@ +import { createServer, Server } from 'node:http'; +import { Logger } from '@nestjs/common'; +import { getMetricsRegistry, isMetricsEnabled } from './metrics.registry'; + +/** + * Start the Prometheus scrape endpoint on a SEPARATE port (default 9464, + * overridable via `METRICS_PORT`). This is a bare node:http server, NOT part of + * the Fastify app, so `/metrics` never exists on the public :3000 listener. + * + * Returns the http.Server (so callers can close it on shutdown) or null when + * metrics are disabled. + */ +export function startMetricsServer(): Server | null { + if (!isMetricsEnabled()) return null; + + const logger = new Logger('MetricsServer'); + const register = getMetricsRegistry(); + if (!register) return null; + + const port = Number(process.env.METRICS_PORT); + if (!Number.isInteger(port) || port <= 0) { + logger.warn( + `Invalid METRICS_PORT="${process.env.METRICS_PORT}", metrics endpoint not started`, + ); + return null; + } + + const server = createServer(async (req, res) => { + if (req.method === 'GET' && req.url === '/metrics') { + try { + const body = await register.metrics(); + res.setHeader('Content-Type', register.contentType); + res.statusCode = 200; + res.end(body); + } catch (err) { + res.statusCode = 500; + res.end(String((err as Error)?.message ?? 'error')); + } + return; + } + res.statusCode = 404; + res.end(); + }); + + // Bind on all interfaces: the scraper (VictoriaMetrics) reaches this from + // another container as docmost:9464. The port is not published to the host. + server.listen(port, '0.0.0.0', () => { + logger.log(`Metrics endpoint listening on :${port}/metrics`); + }); + + server.on('error', (err) => { + logger.error(`Metrics server error: ${err?.message}`); + }); + + return server; +} diff --git a/apps/server/src/integrations/metrics/metrics.spec.ts b/apps/server/src/integrations/metrics/metrics.spec.ts new file mode 100644 index 00000000..ffcc11e6 --- /dev/null +++ b/apps/server/src/integrations/metrics/metrics.spec.ts @@ -0,0 +1,70 @@ +import { FastifyRequest } from 'fastify'; +import { resolveRouteLabel } from './http-metrics.hook'; +import { firstSqlToken, isStreamingResponse } from './metrics.constants'; + +describe('resolveRouteLabel (histogram route label)', () => { + it('uses the ROUTE TEMPLATE, never the raw URL', () => { + // routeOptions.url is the matched template; url is the raw path with the id. + const req = { + url: '/api/pages/abc-123-def', + routeOptions: { url: '/api/pages/:id' }, + } as unknown as FastifyRequest; + expect(resolveRouteLabel(req)).toBe('/api/pages/:id'); + expect(resolveRouteLabel(req)).not.toContain('abc-123-def'); + }); + + it('falls back to "unknown" on a 404 (no matched route template)', () => { + const req = { + url: '/totally/unmatched/path', + routeOptions: {}, + } as unknown as FastifyRequest; + expect(resolveRouteLabel(req)).toBe('unknown'); + }); + + it('falls back to "unknown" when routeOptions is missing', () => { + const req = { url: '/x' } as unknown as FastifyRequest; + expect(resolveRouteLabel(req)).toBe('unknown'); + }); +}); + +describe('isStreamingResponse (SSE exclusion)', () => { + it('excludes text/event-stream responses by content-type', () => { + expect(isStreamingResponse('text/event-stream', '/api/ai-chat/stream')).toBe( + true, + ); + expect(isStreamingResponse('text/event-stream; charset=utf-8', '/x')).toBe( + true, + ); + }); + + it('excludes known /stream routes by suffix as a fallback', () => { + expect(isStreamingResponse('application/json', '/api/ai-chat/stream')).toBe( + true, + ); + expect(isStreamingResponse(undefined, '/api/shares/ai/stream')).toBe(true); + }); + + it('does not exclude ordinary JSON responses', () => { + expect(isStreamingResponse('application/json', '/api/pages/:id')).toBe( + false, + ); + expect(isStreamingResponse(undefined, '/api/pages/:id')).toBe(false); + }); +}); + +describe('firstSqlToken (bounded db label)', () => { + it('returns the lower-cased leading keyword', () => { + expect(firstSqlToken('SELECT * FROM pages')).toBe('select'); + expect(firstSqlToken(' insert into x values (1)')).toBe('insert'); + expect(firstSqlToken('UPDATE pages SET a=1')).toBe('update'); + expect(firstSqlToken('delete from pages')).toBe('delete'); + expect(firstSqlToken('(SELECT 1)')).toBe('select'); + }); + + it('collapses unknown/empty queries to "other"', () => { + expect(firstSqlToken('')).toBe('other'); + expect(firstSqlToken(undefined)).toBe('other'); + expect(firstSqlToken('123 not sql')).toBe('other'); + expect(firstSqlToken('vacuum analyze')).toBe('other'); + }); +}); diff --git a/apps/server/src/integrations/throttle/throttle.module.ts b/apps/server/src/integrations/throttle/throttle.module.ts index db197015..4ad0f9c4 100644 --- a/apps/server/src/integrations/throttle/throttle.module.ts +++ b/apps/server/src/integrations/throttle/throttle.module.ts @@ -9,6 +9,7 @@ import { AI_CHAT_THROTTLER, PAGE_TEMPLATE_THROTTLER, PUBLIC_SHARE_AI_THROTTLER, + VITALS_THROTTLER, } from './throttler-names'; @Module({ @@ -29,6 +30,8 @@ import { { name: PAGE_TEMPLATE_THROTTLER, ttl: 60_000, limit: 30 }, // Anonymous public-share assistant: ~5 req/min per IP. { name: PUBLIC_SHARE_AI_THROTTLER, ttl: 60_000, limit: 5 }, + // Anonymous client perf-telemetry sink: 120 batched posts/min per IP. + { name: VITALS_THROTTLER, ttl: 60_000, limit: 120 }, ], errorMessage: 'Too many requests', // Pass ioredis options (not a pre-built Redis instance) so diff --git a/apps/server/src/integrations/throttle/throttler-names.ts b/apps/server/src/integrations/throttle/throttler-names.ts index f1ab971e..c54b0898 100644 --- a/apps/server/src/integrations/throttle/throttler-names.ts +++ b/apps/server/src/integrations/throttle/throttler-names.ts @@ -6,3 +6,7 @@ export const PAGE_TEMPLATE_THROTTLER = 'page-template'; // ThrottlerGuard tracker) to bound anonymous abuse — the workspace owner pays // for the tokens. export const PUBLIC_SHARE_AI_THROTTLER = 'public-share-ai'; +// IP-keyed throttler for the anonymous client perf-telemetry sink +// (POST /api/telemetry/vitals). Browsers batch metrics, so the limit is +// generous; it only exists to bound abuse of the public, unauthenticated route. +export const VITALS_THROTTLER = 'vitals'; diff --git a/apps/server/src/main.ts b/apps/server/src/main.ts index b5fa0027..e31f2ddc 100644 --- a/apps/server/src/main.ts +++ b/apps/server/src/main.ts @@ -16,6 +16,9 @@ import { EnvironmentService } from './integrations/environment/environment.servi import { SANDBOX_API_PATH } from './integrations/sandbox/sandbox.constants'; import { resolveFrameHeader } from './common/helpers'; import { resolveTrustProxy } from './integrations/environment/trust-proxy.util'; +import { isMetricsEnabled } from './integrations/metrics/metrics.registry'; +import { recordHttpResponse } from './integrations/metrics/http-metrics.hook'; +import { startMetricsServer } from './integrations/metrics/metrics.server'; async function bootstrap() { const app = await NestFactory.create( @@ -91,6 +94,19 @@ async function bootstrap() { done(); }); + // #355 — HTTP request-duration histogram. Registered ONLY when METRICS_PORT is + // set (otherwise no collector runs at all). Uses the bounded route template + // label and excludes SSE/streaming responses (see recordHttpResponse). + if (isMetricsEnabled()) { + app + .getHttpAdapter() + .getInstance() + .addHook('onResponse', (req, reply, done) => { + recordHttpResponse(req, reply); + done(); + }); + } + app .getHttpAdapter() .getInstance() @@ -127,6 +143,9 @@ async function bootstrap() { '/api/workspace/create', '/api/workspace/joined', '/api/workspace/find-by-email', + // Public client perf-telemetry sink: browsers post it without a + // resolved workspace host, so the workspace-resolution gate must not 404 it. + '/api/telemetry/vitals', // Anonymous in-RAM blob sandbox: a remote consumer fetches blobs by an // unguessable UUID without any workspace host context, so the // workspace-resolution gate must not apply. @@ -175,6 +194,10 @@ async function bootstrap() { `Listening on http://127.0.0.1:${port} / ${process.env.APP_URL}`, ); }); + + // #355 — Prometheus scrape endpoint on a SEPARATE port (METRICS_PORT, default + // 9464), started after the app is up. No-op when METRICS_PORT is unset. + startMetricsServer(); } bootstrap(); diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 299f5e51..0d99a612 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -416,6 +416,9 @@ importers: socket.io-client: specifier: 4.8.3 version: 4.8.3 + web-vitals: + specifier: ^5.1.0 + version: 5.1.0 zod: specifier: 4.3.6 version: 4.3.6 @@ -744,6 +747,9 @@ importers: postmark: specifier: ^4.0.7 version: 4.0.7 + prom-client: + specifier: ^15.1.3 + version: 15.1.3 react: specifier: ^18.3.1 version: 18.3.1 @@ -5988,6 +5994,9 @@ packages: bind-event-listener@3.0.0: resolution: {integrity: sha512-PJvH288AWQhKs2v9zyfYdPzlPqf5bXbGMmhmUIY9x4dAUGIWgomO771oBQNwJnMQSnUIXhKu6sgzpBRXTlvb8Q==} + bintrees@1.0.2: + resolution: {integrity: sha512-VOMgTMwjAaUG580SXn3LacVgjurrbMme7ZZNYGSSV7mmtY6QQRh0Eg3pwIcntQ77DErK1L0NxkbetjcoXzVwKw==} + bl@4.1.0: resolution: {integrity: sha512-1W07cM9gS6DcLperZfFSj+bWLtaPGSOHWhPiGzXmvVJbRLdG82sH/Kn8EtW1VqWVA54AKf2h5k5BbnIbwF3h6w==} @@ -9318,6 +9327,10 @@ packages: process-warning@5.0.0: resolution: {integrity: sha512-a39t9ApHNx2L4+HBnQKqxxHNs1r7KF+Intd8Q/g1bUh6q0WIp9voPXJ/x0j+ZL45KF1pJd9+q2jLIRMfvEshkA==} + prom-client@15.1.3: + resolution: {integrity: sha512-6ZiOBfCywsD4k1BN9IX0uZhF+tJkV8q8llP64G5Hajs4JOeVLPCwpPVcpXy3BwYiUGgyJzsJJQeOIv7+hDSq8g==} + engines: {node: ^16 || ^18 || >=20} + prompts@2.4.2: resolution: {integrity: sha512-NxNv/kLguCA7p3jE8oL2aEBsrJWgAakBpgmgK6lpPWV+WuOmY6r2/zbAVnP+T8bQlA0nzHXSJSJW0Hq7ylaD2Q==} engines: {node: '>= 6'} @@ -10145,6 +10158,9 @@ packages: resolution: {integrity: sha512-4LeEWl96twnS2Q7Bz4MGqgazLqO+hJN63GZxXoIqh1T3VweYD997gbU1ItNsQafqqXTXd5WFyFdReLtwvRBNiw==} engines: {node: '>=18'} + tdigest@0.1.2: + resolution: {integrity: sha512-+G0LLgjjo9BZX2MfdvPfH+MKLCrxlXSYec5DaPYP1fe6Iyhf0/fSmJ0bFiZ1F8BT6cGXl2LpltQptzjXKWEkKA==} + terser-webpack-plugin@5.4.0: resolution: {integrity: sha512-Bn5vxm48flOIfkdl5CaD2+1CiUVbonWQ3KQPyP7/EuIl9Gbzq/gQFOzaMFUEgVjB1396tcK0SG8XcNJ/2kDH8g==} engines: {node: '>= 10.13.0'} @@ -16153,7 +16169,7 @@ snapshots: obug: 2.1.1 std-env: 4.1.0 tinyrainbow: 3.1.0 - vitest: 4.1.6(@opentelemetry/api@1.9.0)(@types/node@25.5.0)(@vitest/coverage-v8@4.1.6)(happy-dom@20.8.9)(jsdom@27.4.0(@noble/hashes@2.0.1))(vite@8.0.5(@types/node@25.5.0)(esbuild@0.28.0)(jiti@2.4.2)(less@4.2.0)(sugarss@5.0.1(postcss@8.5.14))(terser@5.39.0)(tsx@4.21.0)(yaml@2.8.3)) + vitest: 4.1.6(@opentelemetry/api@1.9.0)(@types/node@22.19.1)(@vitest/coverage-v8@4.1.6)(happy-dom@20.8.9)(jsdom@25.0.0)(vite@8.0.5(@types/node@22.19.1)(esbuild@0.28.0)(jiti@2.4.2)(less@4.2.0)(sugarss@5.0.1(postcss@8.5.14))(terser@5.39.0)(tsx@4.21.0)(yaml@2.8.3)) '@vitest/expect@4.1.6': dependencies: @@ -16665,6 +16681,8 @@ snapshots: bind-event-listener@3.0.0: {} + bintrees@1.0.2: {} + bl@4.1.0: dependencies: buffer: 5.7.1 @@ -20476,6 +20494,11 @@ snapshots: process-warning@5.0.0: {} + prom-client@15.1.3: + dependencies: + '@opentelemetry/api': 1.9.0 + tdigest: 0.1.2 + prompts@2.4.2: dependencies: kleur: 3.0.3 @@ -21521,6 +21544,10 @@ snapshots: minizlib: 3.1.0 yallist: 5.0.0 + tdigest@0.1.2: + dependencies: + bintrees: 1.0.2 + terser-webpack-plugin@5.4.0(@swc/core@1.5.25(@swc/helpers@0.5.5))(webpack@5.106.0(@swc/core@1.5.25(@swc/helpers@0.5.5))): dependencies: '@jridgewell/trace-mapping': 0.3.31