Merge pull request 'feat(observability): дев-часть перф-метрик — /metrics :9464 + client vitals (#355)' (#358) from feat/355-perf-metrics into develop

Reviewed-on: #358
This commit was merged in pull request #358.
This commit is contained in:
2026-07-05 00:31:04 +03:00
35 changed files with 1718 additions and 6 deletions
+24
View File
@@ -242,3 +242,27 @@ MCP_DOCMOST_PASSWORD=
# FAILS CLOSED if Redis is unavailable (default: 1,000,000 tokens per workspace
# per rolling day).
# SHARE_AI_WORKSPACE_TOKEN_BUDGET_PER_DAY=1000000
# --- Observability / perf metrics (#355) ---
#
# Two INDEPENDENT toggles, both OFF by default:
#
# 1) METRICS_PORT — the server-side Prometheus scrape endpoint.
# UNSET (default) => the whole prom subsystem is OFF: no registry, no
# collectors, and NOTHING is exposed on the main app port. There is NO
# default port — leaving it blank disables it. When set to a port (e.g.
# 9464), a SEPARATE bare node:http listener serves GET /metrics on that port
# only (never on the main :3000 app listener), for a scraper such as
# VictoriaMetrics/Prometheus reaching it as <host>:<port>/metrics.
# METRICS_PORT=9464
#
# 2) CLIENT_TELEMETRY_ENABLED — the public client perf-telemetry sink.
# OFF by default. When true, the unauthenticated POST /api/telemetry/vitals
# endpoint is registered and browsers collect + send web-vitals / editor
# metrics into the `client_metrics` table (read directly by Grafana, separate
# from METRICS_PORT). Leave OFF unless you actually consume this data: the
# endpoint is public and the table has NO app-side retention, so enabling it
# requires an EXTERNAL pruner to bound `client_metrics` growth (the deployed
# infra prunes rows >90d via a maintenance container). When off, the endpoint
# does not exist and the client installs no observers.
# CLIENT_TELEMETRY_ENABLED=false
+1
View File
@@ -61,6 +61,7 @@
"react-clear-modal": "^2.0.18",
"react-dom": "^18.3.1",
"react-drawio": "1.0.7",
"web-vitals": "^5.1.0",
"react-error-boundary": "6.1.1",
"react-helmet-async": "3.0.0",
"react-i18next": "16.5.8",
@@ -93,6 +93,11 @@ import {
isBodyEditable,
isCollabSynced,
} from "@/features/editor/editor-sync-state";
import {
isVitalsActive,
measurePageOpen,
reportEditorTx,
} from "@/lib/telemetry/vitals";
interface PageEditorProps {
pageId: string;
@@ -351,6 +356,40 @@ export default function PageEditor({
editor.storage.pageId = pageId;
handleScrollTo(editor);
editorRef.current = editor;
// #355 — perf instrumentation. Skip ALL of it when telemetry is
// disabled (F1 flag off) or this session isn't sampled: no page-open
// measure, and crucially NO dispatch wrapping, so a non-collecting
// session pays zero per-transaction cost.
if (isVitalsActive()) {
// page_open_ms: this is the first editor-content render, so measure
// against any page-open mark set on the tree-row/link click.
measurePageOpen();
// editor_tx_ms: time the SYNCHRONOUS part of applying each
// transaction (state.apply + updateState) by wrapping the view's
// dispatch. Only slow syncs (>8ms) are reported (see reportEditorTx),
// so the common path adds just one performance.now() pair. Passive:
// the original dispatch still runs unchanged.
try {
const view = editor.view as unknown as {
dispatch: (tr: unknown) => void;
};
const originalDispatch = view.dispatch.bind(view);
view.dispatch = (tr: unknown) => {
const started = performance.now();
originalDispatch(tr);
const elapsed = performance.now() - started;
try {
reportEditorTx(elapsed, editor.state.doc.content.size);
} catch {
// never let telemetry break editing
}
};
} catch {
// if the view shape changes, skip editor_tx instrumentation
}
}
}
},
onUpdate({ editor }) {
+7
View File
@@ -47,6 +47,13 @@ export function isCompactPageTreeEnabled(): boolean {
return castToBoolean(getConfigValue("COMPACT_PAGE_TREE", "true"));
}
// #355 — operator toggle for client perf-telemetry. DEFAULT OFF: the server
// mirrors CLIENT_TELEMETRY_ENABLED into window.CONFIG; when off the client
// installs no observers and sends nothing (the sink endpoint doesn't exist).
export function isClientTelemetryEnabled(): boolean {
return castToBoolean(getConfigValue("CLIENT_TELEMETRY_ENABLED", "false"));
}
export function getAvatarUrl(
avatarUrl: string,
type: AvatarIconType = AvatarIconType.AVATAR,
@@ -0,0 +1,35 @@
import { describe, it, expect } from "vitest";
import { templateRoute } from "./route-template";
describe("templateRoute", () => {
it("templates a space page path (never leaks slugs)", () => {
const t = templateRoute("/s/engineering/p/design-doc-abc123");
expect(t).toBe("/s/:space/p/:slug");
expect(t).not.toContain("engineering");
expect(t).not.toContain("design-doc");
});
it("templates share, redirect and space paths", () => {
expect(templateRoute("/share/abc/p/xyz")).toBe("/share/:shareId/p/:slug");
expect(templateRoute("/share/p/xyz")).toBe("/share/p/:slug");
expect(templateRoute("/p/some-slug")).toBe("/p/:slug");
expect(templateRoute("/s/team")).toBe("/s/:space");
expect(templateRoute("/s/team/trash")).toBe("/s/:space/trash");
expect(templateRoute("/labels/urgent")).toBe("/labels/:label");
});
it("keeps known static routes verbatim", () => {
expect(templateRoute("/home")).toBe("/home");
expect(templateRoute("/settings/members")).toBe("/settings/members");
expect(templateRoute("/")).toBe("/");
});
it("normalises a trailing slash", () => {
expect(templateRoute("/s/team/p/slug/")).toBe("/s/:space/p/:slug");
});
it("collapses unknown paths to 'other' (bounded cardinality)", () => {
expect(templateRoute("/weird/unknown/thing")).toBe("other");
expect(templateRoute("/s/team/p/slug/extra/segments")).toBe("other");
});
});
@@ -0,0 +1,70 @@
/**
* Map a raw pathname to a BOUNDED route TEMPLATE (#355).
*
* Perf metrics must be labelled by route template only — never a raw path with
* slugs/ids — so the server-side `route` column and any downstream aggregation
* stay low-cardinality and carry NO page slugs/titles (privacy). Anything that
* does not match a known pattern collapses to `other`.
*
* The template vocabulary mirrors the issue's example (`/s/:space/p/:slug`).
*/
const ROUTE_PATTERNS: { re: RegExp; template: string }[] = [
// Share pages (public).
{ re: /^\/share\/[^/]+\/p\/[^/]+$/, template: '/share/:shareId/p/:slug' },
{ re: /^\/share\/p\/[^/]+$/, template: '/share/p/:slug' },
{ re: /^\/share\/[^/]+$/, template: '/share/:shareId' },
// Page redirect.
{ re: /^\/p\/[^/]+$/, template: '/p/:slug' },
// Space + page.
{ re: /^\/s\/[^/]+\/p\/[^/]+$/, template: '/s/:space/p/:slug' },
{ re: /^\/s\/[^/]+\/trash$/, template: '/s/:space/trash' },
{ re: /^\/s\/[^/]+$/, template: '/s/:space' },
// Misc dynamic.
{ re: /^\/labels\/[^/]+$/, template: '/labels/:label' },
{ re: /^\/invites\/[^/]+$/, template: '/invites/:invitationId' },
{ re: /^\/settings\/groups\/[^/]+$/, template: '/settings/groups/:groupId' },
];
// Static routes we accept verbatim (finite set).
const STATIC_ROUTES = new Set<string>([
'/home',
'/spaces',
'/favorites',
'/login',
'/forgot-password',
'/password-reset',
'/setup/register',
'/settings/account/profile',
'/settings/account/preferences',
'/settings/workspace',
'/settings/ai',
'/settings/members',
'/settings/groups',
'/settings/spaces',
'/settings/sharing',
]);
export function templateRoute(pathname: string): string {
// Normalise a trailing slash (except root).
const path =
pathname.length > 1 && pathname.endsWith('/')
? pathname.slice(0, -1)
: pathname;
if (path === '' || path === '/') return '/';
if (STATIC_ROUTES.has(path)) return path;
for (const { re, template } of ROUTE_PATTERNS) {
if (re.test(path)) return template;
}
return 'other';
}
/** Template for the current window location. */
export function currentRouteTemplate(): string {
try {
return templateRoute(window.location.pathname);
} catch {
return 'other';
}
}
+290
View File
@@ -0,0 +1,290 @@
import {
onCLS,
onINP,
onLCP,
onTTFB,
type CLSMetricWithAttribution,
type INPMetricWithAttribution,
type LCPMetricWithAttribution,
type TTFBMetricWithAttribution,
} from "web-vitals/attribution";
import { isClientTelemetryEnabled } from "@/lib/config";
import { currentRouteTemplate } from "./route-template";
/**
* Client perf-telemetry (#355): web-vitals + custom metrics buffered and posted
* to POST /api/telemetry/vitals via sendBeacon.
*
* Design constraints from the issue:
* - Sampling is decided ONCE per session (25%), cached in sessionStorage,
* BEFORE any observer is subscribed. Non-sampled sessions send nothing.
* - Route labels are TEMPLATES only; attr is truncated to 120 chars; no page
* titles/slugs/text ever leave the browser.
* - Observers are passive and reporting is best-effort — telemetry must not
* degrade the perf it measures.
*/
const ENDPOINT = "/api/telemetry/vitals";
const SAMPLE_RATE = 0.25;
const SAMPLE_KEY = "gm_vitals_sampled";
const FLUSH_INTERVAL_MS = 15_000;
const MAX_BUFFER = 40; // flush early if the buffer fills between timers
const MAX_ATTR_LENGTH = 120;
const EDITOR_TX_MIN_MS = 8; // only report editor transactions slower than this
const ALLOWED_NAMES = new Set([
"INP",
"LCP",
"CLS",
"TTFB",
"editor_tx_ms",
"page_open_ms",
"longtask_ms",
]);
interface VitalEvent {
name: string;
value: number;
rating?: string;
route?: string;
attr?: string;
docSize?: number;
}
let sampledCache: boolean | null = null;
let initialised = false;
let buffer: VitalEvent[] = [];
let longtaskSum = 0; // accumulated longtask duration (ms) for the current window
/**
* Decide once per session whether this session is sampled. Cached in
* sessionStorage so the choice is stable across reloads within the session and
* identical for every observer/custom-metric caller.
*/
export function isVitalsSampled(): boolean {
if (sampledCache !== null) return sampledCache;
try {
const stored = sessionStorage.getItem(SAMPLE_KEY);
if (stored === "1") return (sampledCache = true);
if (stored === "0") return (sampledCache = false);
const sampled = Math.random() < SAMPLE_RATE;
sessionStorage.setItem(SAMPLE_KEY, sampled ? "1" : "0");
return (sampledCache = sampled);
} catch {
// sessionStorage unavailable (private mode / SSR): default to not sampled.
return (sampledCache = false);
}
}
/**
* True only when telemetry is BOTH enabled by the operator (F1 flag) AND this
* session is sampled. Callers outside initVitals (e.g. the editor dispatch
* wrapper) use this to skip ALL instrumentation cost on disabled/non-sampled
* sessions — no observers, no per-transaction timing.
*/
export function isVitalsActive(): boolean {
return isClientTelemetryEnabled() && isVitalsSampled();
}
function truncateAttr(value: unknown): string | undefined {
if (typeof value !== "string" || value.length === 0) return undefined;
return value.slice(0, MAX_ATTR_LENGTH);
}
function enqueue(event: VitalEvent): void {
if (!ALLOWED_NAMES.has(event.name)) return;
if (!Number.isFinite(event.value)) return;
buffer.push(event);
if (buffer.length >= MAX_BUFFER) flush();
}
function flush(): void {
// Fold any pending longtask total into the batch first.
if (longtaskSum > 0) {
buffer.push({
name: "longtask_ms",
value: Math.round(longtaskSum),
route: currentRouteTemplate(),
});
longtaskSum = 0;
}
if (buffer.length === 0) return;
const payload = JSON.stringify({ events: buffer });
buffer = [];
try {
const blob = new Blob([payload], { type: "application/json" });
if (navigator.sendBeacon && navigator.sendBeacon(ENDPOINT, blob)) return;
// Fallback for browsers without sendBeacon: keepalive fetch.
void fetch(ENDPOINT, {
method: "POST",
body: payload,
headers: { "Content-Type": "application/json" },
keepalive: true,
}).catch(() => undefined);
} catch {
// Best-effort: never throw out of telemetry.
}
}
/**
* Report a custom client metric (editor_tx_ms, page_open_ms). No-op unless the
* session is sampled. Route is always the current TEMPLATE.
*/
export function reportClientMetric(
name: "editor_tx_ms" | "page_open_ms",
value: number,
extra?: { docSize?: number },
): void {
if (!isVitalsActive()) return;
if (!Number.isFinite(value)) return;
enqueue({
name,
value,
route: currentRouteTemplate(),
docSize: extra?.docSize,
});
}
/** Threshold-gated editor transaction reporter (only reports slow syncs). */
export function reportEditorTx(ms: number, docSize: number): void {
if (ms <= EDITOR_TX_MIN_MS) return;
reportClientMetric("editor_tx_ms", ms, { docSize });
}
const PAGE_OPEN_MARK = "gm_page_open_start";
/** Mark the start of a page-open interaction (tree-row / link click). */
export function markPageOpenStart(): void {
try {
performance.clearMarks(PAGE_OPEN_MARK);
performance.mark(PAGE_OPEN_MARK);
} catch {
// ignore
}
}
/**
* Measure page_open_ms at first editor-content render, if a start mark exists.
* Consumes the mark so a later render doesn't double-count.
*/
export function measurePageOpen(): void {
try {
const marks = performance.getEntriesByName(PAGE_OPEN_MARK, "mark");
if (marks.length === 0) return;
const started = marks[0].startTime;
const elapsed = performance.now() - started;
performance.clearMarks(PAGE_OPEN_MARK);
if (elapsed > 0 && Number.isFinite(elapsed)) {
reportClientMetric("page_open_ms", elapsed);
}
} catch {
// ignore
}
}
function attrTarget(
metric:
| INPMetricWithAttribution
| LCPMetricWithAttribution
| CLSMetricWithAttribution,
): string | undefined {
const a = metric.attribution as Record<string, unknown> | undefined;
if (!a) return undefined;
// Different vitals expose their culprit element under different keys; only a
// CSS-selector-ish target string is taken (no text content / titles).
return (
truncateAttr(a.interactionTarget) ??
truncateAttr(a.element) ??
truncateAttr(a.largestShiftTarget) ??
undefined
);
}
/**
* Initialise client telemetry. Safe to call multiple times (idempotent). Returns
* immediately without subscribing when the session is not sampled — so a
* non-sampled session subscribes to NO observers and sends nothing.
*/
export function initVitals(): void {
if (initialised) return;
initialised = true;
// Operator flag gate (F1, default OFF): when telemetry is disabled the sink
// endpoint does not even exist server-side, so install ZERO observers.
if (!isClientTelemetryEnabled()) return;
// Sampling gate is evaluated BEFORE any observer subscription.
if (!isVitalsSampled()) return;
const report = (
metric:
| INPMetricWithAttribution
| LCPMetricWithAttribution
| CLSMetricWithAttribution
| TTFBMetricWithAttribution,
) => {
enqueue({
name: metric.name,
value: metric.value,
rating: metric.rating,
route: currentRouteTemplate(),
attr:
metric.name === "TTFB"
? undefined
: attrTarget(
metric as
| INPMetricWithAttribution
| LCPMetricWithAttribution
| CLSMetricWithAttribution,
),
});
};
onINP(report);
onLCP(report);
onCLS(report);
onTTFB(report);
// Long tasks: aggregate the total blocking time per flush window (a passive
// observer; individual entries are summed, never stored/sent individually).
try {
if (typeof PerformanceObserver !== "undefined") {
const observer = new PerformanceObserver((list) => {
for (const entry of list.getEntries()) {
longtaskSum += entry.duration;
}
});
observer.observe({ type: "longtask", buffered: true });
}
} catch {
// longtask entry type unsupported: skip silently.
}
// page_open_ms start: mark when the user clicks a page link/tree-row (any
// anchor navigating to a page URL). Passive capture listener; the matching
// measure fires at first editor-content render (measurePageOpen). No page
// titles/slugs are read — only the click timing is marked.
document.addEventListener(
"click",
(event) => {
const target = event.target as Element | null;
const anchor = target?.closest?.("a[href]") as HTMLAnchorElement | null;
if (!anchor) return;
const href = anchor.getAttribute("href") ?? "";
// A page link is `/s/:space/p/:slug`, `/p/:slug` or a share page path.
if (/\/p\//.test(href)) markPageOpenStart();
},
{ capture: true, passive: true },
);
// Flush on tab hide (most reliable delivery point) and periodically.
const onHidden = () => {
if (document.visibilityState === "hidden") flush();
};
document.addEventListener("visibilitychange", onHidden);
window.addEventListener("pagehide", flush);
setInterval(flush, FLUSH_INTERVAL_MS);
}
+5
View File
@@ -22,6 +22,7 @@ import {
isPostHogEnabled,
} from "@/lib/config.ts";
import posthog from "posthog-js";
import { initVitals } from "@/lib/telemetry/vitals";
export const queryClient = new QueryClient({
defaultOptions: {
@@ -43,6 +44,10 @@ if (isCloud() && isPostHogEnabled) {
});
}
// #355 — client perf-telemetry. Decides sampling ONCE (25%/session) before
// subscribing to any observer; non-sampled sessions send nothing.
initVitals();
const container = document.getElementById("root") as HTMLElement;
const root = (container as any).__reactRoot ??= ReactDOM.createRoot(container);
+1
View File
@@ -111,6 +111,7 @@
"pino-pretty": "^13.1.3",
"postgres": "^3.4.8",
"postmark": "^4.0.7",
"prom-client": "^15.1.3",
"react": "^18.3.1",
"react-email": "6.0.8",
"reflect-metadata": "^0.2.2",
+6
View File
@@ -31,6 +31,8 @@ import { McpModule } from './integrations/mcp/mcp.module';
import { SandboxModule } from './integrations/sandbox/sandbox.module';
import { AiModule } from './integrations/ai/ai.module';
import { AiChatModule } from './core/ai-chat/ai-chat.module';
import { MetricsModule } from './integrations/metrics/metrics.module';
import { ClientTelemetryModule } from './core/telemetry/client-telemetry.module';
const enterpriseModules = [];
try {
@@ -93,6 +95,10 @@ try {
SandboxModule,
AiModule,
AiChatModule,
MetricsModule,
// Gated OFF by default: only registers the public vitals sink controller
// when CLIENT_TELEMETRY_ENABLED=true (maintainer decision E1=B).
ClientTelemetryModule.register(),
...enterpriseModules,
],
controllers: [AppController],
@@ -41,6 +41,7 @@ import {
HISTORY_INTERVAL,
} from '../constants';
import { TransclusionService } from '../../core/page/transclusion/transclusion.service';
import { observeCollabStore } from '../../integrations/metrics/metrics.registry';
/**
* #251 — wire format of the client→server stateless message that signals a
@@ -192,6 +193,17 @@ export class PersistenceExtension implements Extension {
}
async onStoreDocument(data: onStoreDocumentPayload) {
// #355 — time the full store (persist + post-store side effects) into
// collab_store_duration_seconds. No-op when METRICS_PORT is unset.
const startedAt = performance.now();
try {
await this.storeDocument(data);
} finally {
observeCollabStore((performance.now() - startedAt) / 1000);
}
}
private async storeDocument(data: onStoreDocumentPayload) {
const { documentName, document, context } = data;
const pageId = getPageId(documentName);
+11 -5
View File
@@ -16,6 +16,7 @@ import {
AUTH_THROTTLER,
PAGE_TEMPLATE_THROTTLER,
PUBLIC_SHARE_AI_THROTTLER,
VITALS_THROTTLER,
} from '../../integrations/throttle/throttler-names';
import { LoginDto } from './dto/login.dto';
import { AuthService } from './services/auth.service';
@@ -184,16 +185,21 @@ export class AuthController {
}
// The global ThrottlerGuard applies ALL named throttlers to every route by
// default, so each non-AUTH bucket (AI chat, page template, public-share AI)
// is explicitly skipped here. collab-token is auth-guarded (JwtAuthGuard),
// per-user and client-cached, so those feature buckets are irrelevant to it;
// skipping them avoids spurious 429s when a user opens many pages in a short
// window. The AUTH bucket is skipped too for the same per-user, cached reason.
// default, so each non-AUTH bucket (AI chat, page template, public-share AI,
// client vitals) is explicitly skipped here. collab-token is auth-guarded
// (JwtAuthGuard), per-user and client-cached, so those feature buckets are
// irrelevant to it; skipping them avoids spurious 429s when a user opens many
// pages in a short window. The VITALS bucket must be skipped too: it is a
// process-wide named throttler, so without this skip its per-IP limit would
// silently cap collab-token (the one route that opts out of every other
// bucket) and break editing behind shared/NAT IPs. The AUTH bucket is skipped
// for the same per-user, cached reason.
@SkipThrottle({
[AUTH_THROTTLER]: true,
[AI_CHAT_THROTTLER]: true,
[PAGE_TEMPLATE_THROTTLER]: true,
[PUBLIC_SHARE_AI_THROTTLER]: true,
[VITALS_THROTTLER]: true,
})
@UseGuards(JwtAuthGuard)
@HttpCode(HttpStatus.OK)
@@ -0,0 +1,105 @@
/**
* Server-side whitelist + limits for POST /api/telemetry/vitals (#355).
*
* The endpoint is PUBLIC (browsers post it, no auth) so it is a privacy and
* abuse surface: everything not on these lists is silently DROPPED and the
* request still returns 200 (never 400 — a 400 would make browsers retry).
*/
// The only metric names accepted. Anything else is dropped.
export const ALLOWED_METRIC_NAMES = new Set<string>([
'INP',
'LCP',
'CLS',
'TTFB',
'editor_tx_ms',
'page_open_ms',
'longtask_ms',
]);
// The only rating values accepted (web-vitals). Anything else -> null.
export const ALLOWED_RATINGS = new Set<string>([
'good',
'needs-improvement',
'poor',
]);
// Max events accepted per batch; the rest are ignored.
export const MAX_EVENTS_PER_BATCH = 50;
// Defence-in-depth body cap (~16KB). Fastify's global bodyLimit is far larger,
// so we re-check the parsed payload size here and drop oversized batches.
export const MAX_BODY_BYTES = 16 * 1024;
// attr is truncated to this many characters (attribution target only, no PII).
export const MAX_ATTR_LENGTH = 120;
// route label sanity cap (client sends a template like /s/:space/p/:slug).
export const MAX_ROUTE_LENGTH = 200;
// `client_metrics.doc_size` is a Postgres `int` (int4). A garbage/huge docSize
// on a single event would overflow int4 and make Postgres reject the WHOLE
// batch INSERT, losing every event in it. Values outside this range are DROPPED
// to null (the event is still kept) so one bad field never loses the batch.
export const DOC_SIZE_MAX = 2147483647; // 2^31 - 1 (int4 max)
export interface ClientMetricRow {
name: string;
value: number;
rating: string | null;
route: string | null;
attr: string | null;
docSize: number | null;
workspaceId: string | null;
}
/**
* Validate + normalise a single incoming event into a DB row, or return null to
* DROP it. Pure so it is directly unit-testable. Enforces the name whitelist,
* numeric value, rating whitelist, attr truncation and doc_size (int) coercion.
*/
export function sanitizeVitalEvent(
raw: unknown,
workspaceId: string | null,
): ClientMetricRow | null {
if (!raw || typeof raw !== 'object') return null;
const e = raw as Record<string, unknown>;
const name = e.name;
if (typeof name !== 'string' || !ALLOWED_METRIC_NAMES.has(name)) return null;
const value =
typeof e.value === 'number' && Number.isFinite(e.value) ? e.value : null;
if (value === null) return null;
const rating =
typeof e.rating === 'string' && ALLOWED_RATINGS.has(e.rating)
? e.rating
: null;
let route: string | null = null;
if (typeof e.route === 'string' && e.route.length > 0) {
route = e.route.slice(0, MAX_ROUTE_LENGTH);
}
let attr: string | null = null;
if (typeof e.attr === 'string' && e.attr.length > 0) {
attr = e.attr.slice(0, MAX_ATTR_LENGTH);
}
let docSize: number | null = null;
if (typeof e.docSize === 'number' && Number.isFinite(e.docSize)) {
docSize = Math.trunc(e.docSize);
} else if (typeof e.doc_size === 'number' && Number.isFinite(e.doc_size)) {
// Accept snake_case too, in case a client sends the raw column name.
docSize = Math.trunc(e.doc_size as number);
}
// Guard the int4 column: an out-of-range docSize would overflow int4 and make
// Postgres reject the whole batch INSERT. Drop the field (keep the event)
// rather than lose every other event in the batch.
if (docSize !== null && (docSize < 0 || docSize > DOC_SIZE_MAX)) {
docSize = null;
}
return { name, value, rating, route, attr, docSize, workspaceId };
}
@@ -0,0 +1,47 @@
import { ClientTelemetryModule } from './client-telemetry.module';
import { VitalsController } from './vitals.controller';
import { VitalsService } from './vitals.service';
// The register() gate is the CORE of the maintainer's E1=B decision: the public,
// unauthenticated /api/telemetry/vitals endpoint must be OFF by default, so a
// self-host deploy has no anonymous disk-fill surface into `client_metrics`. A
// regression that inverts the flag (or a truthiness bug where "" / "false"
// registers the route) would silently reopen that surface — pin it here.
describe('ClientTelemetryModule.register (E1=B gate)', () => {
const original = process.env.CLIENT_TELEMETRY_ENABLED;
afterEach(() => {
if (original === undefined) delete process.env.CLIENT_TELEMETRY_ENABLED;
else process.env.CLIENT_TELEMETRY_ENABLED = original;
});
it('OFF by default (flag unset) — no controller, no provider (endpoint absent)', () => {
delete process.env.CLIENT_TELEMETRY_ENABLED;
const mod = ClientTelemetryModule.register();
expect(mod.controllers).toEqual([]);
expect(mod.providers).toEqual([]);
});
it.each(['false', 'False', '0', '', 'yes', '1'])(
'stays OFF for non-"true" value %p (no route)',
(val) => {
process.env.CLIENT_TELEMETRY_ENABLED = val;
const mod = ClientTelemetryModule.register();
expect(mod.controllers).toEqual([]);
expect(mod.providers).toEqual([]);
},
);
it('ON only for "true" — registers VitalsController + VitalsService', () => {
process.env.CLIENT_TELEMETRY_ENABLED = 'true';
const mod = ClientTelemetryModule.register();
expect(mod.controllers).toContain(VitalsController);
expect(mod.providers).toContain(VitalsService);
});
it('ON is case-insensitive ("TRUE")', () => {
process.env.CLIENT_TELEMETRY_ENABLED = 'TRUE';
const mod = ClientTelemetryModule.register();
expect(mod.controllers).toContain(VitalsController);
expect(mod.providers).toContain(VitalsService);
});
});
@@ -0,0 +1,32 @@
import { DynamicModule, Module } from '@nestjs/common';
import { VitalsController } from './vitals.controller';
import { VitalsService } from './vitals.service';
/**
* Client perf-telemetry (#355): the public /api/telemetry/vitals sink that
* persists web-vitals + custom client metrics into `client_metrics`.
* Named ClientTelemetryModule to avoid confusion with the unrelated
* integrations/telemetry (product usage ping) module.
*
* GATED OFF BY DEFAULT (maintainer decision E1=B). The public, unauthenticated
* endpoint is only registered when CLIENT_TELEMETRY_ENABLED=true — otherwise the
* route does NOT exist at all (no anonymous disk-fill surface, and no unbounded
* `client_metrics` growth on a self-host deploy without an external pruner). The
* client is told the same flag via window.CONFIG and skips sending when off.
*/
@Module({})
export class ClientTelemetryModule {
static register(): DynamicModule {
// Read process.env directly (not EnvironmentService) so the toggle is
// resolved at module-registration time, identical to how the metrics
// subsystem reads METRICS_PORT. Absent/anything-but-"true" => OFF.
const enabled =
(process.env.CLIENT_TELEMETRY_ENABLED ?? '').toLowerCase() === 'true';
return {
module: ClientTelemetryModule,
controllers: enabled ? [VitalsController] : [],
providers: enabled ? [VitalsService] : [],
};
}
}
@@ -0,0 +1,64 @@
import {
Body,
Controller,
HttpCode,
Post,
Req,
UseGuards,
} from '@nestjs/common';
import { SkipThrottle, Throttle, ThrottlerGuard } from '@nestjs/throttler';
import { FastifyRequest } from 'fastify';
import { Public } from '../../common/decorators/public.decorator';
import {
AI_CHAT_THROTTLER,
AUTH_THROTTLER,
PAGE_TEMPLATE_THROTTLER,
PUBLIC_SHARE_AI_THROTTLER,
VITALS_THROTTLER,
} from '../../integrations/throttle/throttler-names';
import { VitalsService } from './vitals.service';
/**
* POST /api/telemetry/vitals (#355) — public client perf-metrics sink.
*
* PUBLIC (browsers post via sendBeacon, no session) but IP-throttled. Always
* returns 200 with no body of interest: invalid/foreign/oversized payloads are
* silently dropped by the service rather than 400'd, so browsers never retry.
*/
@Controller('telemetry')
export class VitalsController {
constructor(private readonly vitalsService: VitalsService) {}
@Public()
@UseGuards(ThrottlerGuard)
// The global ThrottlerGuard applies ALL named throttlers to every route, so
// every OTHER bucket must be skipped here — otherwise the strictest of them
// (public-share AI at 5/min) would override the intended vitals limit and cap
// this route at 5/min instead of 120/min. Skip them all so ONLY the VITALS
// bucket below applies.
@SkipThrottle({
[AUTH_THROTTLER]: true,
[AI_CHAT_THROTTLER]: true,
[PAGE_TEMPLATE_THROTTLER]: true,
[PUBLIC_SHARE_AI_THROTTLER]: true,
})
@Throttle({ [VITALS_THROTTLER]: { limit: 120, ttl: 60_000 } })
@Post('vitals')
@HttpCode(200)
async vitals(
@Body() body: unknown,
@Req() req: FastifyRequest,
): Promise<{ ok: true }> {
// workspaceId is resolved by the workspace-host middleware onto req.raw when
// the browser posts from a workspace host; null otherwise. No other PII.
const workspaceId =
((req.raw as unknown as { workspaceId?: string })?.workspaceId ?? null) ||
null;
try {
await this.vitalsService.ingest(body, workspaceId);
} catch {
// Never surface storage errors to the browser; telemetry is best-effort.
}
return { ok: true };
}
}
@@ -0,0 +1,149 @@
import { VitalsService } from './vitals.service';
import { MAX_ATTR_LENGTH } from './client-metrics.constants';
// buildRows is pure (no DB access), so a null db is fine here.
const svc = new VitalsService(null as any);
describe('VitalsService.buildRows', () => {
const WS = 'ws-uuid';
it('accepts a valid batch and maps whitelisted fields to rows', () => {
const body = {
events: [
{ name: 'INP', value: 123.4, rating: 'good', route: '/s/:space/p/:slug' },
{ name: 'editor_tx_ms', value: 12, route: '/s/:space/p/:slug', docSize: 4096 },
],
};
const rows = svc.buildRows(body, WS);
expect(rows).toHaveLength(2);
expect(rows[0]).toEqual({
name: 'INP',
value: 123.4,
rating: 'good',
route: '/s/:space/p/:slug',
attr: null,
docSize: null,
workspaceId: WS,
});
expect(rows[1].name).toBe('editor_tx_ms');
expect(rows[1].docSize).toBe(4096);
expect(rows[1].workspaceId).toBe(WS);
});
it('accepts a bare array body', () => {
const rows = svc.buildRows([{ name: 'LCP', value: 1 }], WS);
expect(rows).toHaveLength(1);
expect(rows[0].name).toBe('LCP');
});
it('drops events with foreign metric names', () => {
const rows = svc.buildRows(
{ events: [{ name: 'evil_metric', value: 1 }, { name: 'LCP', value: 2 }] },
WS,
);
expect(rows).toHaveLength(1);
expect(rows[0].name).toBe('LCP');
});
it('drops events with a non-numeric or missing value', () => {
const rows = svc.buildRows(
{
events: [
{ name: 'CLS', value: 'nan' },
{ name: 'CLS' },
{ name: 'CLS', value: 0.1 },
],
},
WS,
);
expect(rows).toHaveLength(1);
expect(rows[0].value).toBe(0.1);
});
it('strips foreign fields and only keeps whitelisted columns', () => {
const rows = svc.buildRows(
{ events: [{ name: 'TTFB', value: 5, secret: 'drop-me', title: 'my page' }] },
WS,
);
expect(rows).toHaveLength(1);
expect(Object.keys(rows[0]).sort()).toEqual(
['attr', 'docSize', 'name', 'rating', 'route', 'value', 'workspaceId'].sort(),
);
expect((rows[0] as any).secret).toBeUndefined();
expect((rows[0] as any).title).toBeUndefined();
});
it('rejects a rating outside the allowed set (-> null)', () => {
const rows = svc.buildRows(
{ events: [{ name: 'INP', value: 1, rating: 'terrible' }] },
WS,
);
expect(rows[0].rating).toBeNull();
});
it('truncates attr to 120 chars', () => {
const longAttr = 'a'.repeat(500);
const rows = svc.buildRows(
{ events: [{ name: 'INP', value: 1, attr: longAttr }] },
WS,
);
expect(rows[0].attr).toHaveLength(MAX_ATTR_LENGTH);
});
it('caps the batch at 50 events', () => {
const events = Array.from({ length: 200 }, () => ({ name: 'CLS', value: 1 }));
const rows = svc.buildRows({ events }, WS);
expect(rows).toHaveLength(50);
});
it('drops an oversized (>16KB) payload wholesale', () => {
const events = Array.from({ length: 50 }, () => ({
name: 'INP',
value: 1,
attr: 'x'.repeat(400),
route: '/s/:space/p/:slug',
}));
// Serialised body far exceeds 16KB.
const rows = svc.buildRows({ events }, WS);
expect(rows).toHaveLength(0);
});
it('returns [] for malformed bodies', () => {
expect(svc.buildRows(null, WS)).toEqual([]);
expect(svc.buildRows('nope', WS)).toEqual([]);
expect(svc.buildRows({ notEvents: 1 }, WS)).toEqual([]);
expect(svc.buildRows(42, WS)).toEqual([]);
});
it('carries a null workspaceId through', () => {
const rows = svc.buildRows({ events: [{ name: 'LCP', value: 1 }] }, null);
expect(rows[0].workspaceId).toBeNull();
});
it('drops an out-of-int4-range docSize to null without losing the batch', () => {
const rows = svc.buildRows(
{
events: [
// Garbage docSize overflowing int4 must NOT reject the whole batch:
// the field is dropped to null and the event is kept.
{ name: 'editor_tx_ms', value: 10, docSize: 9_999_999_999 },
{ name: 'editor_tx_ms', value: 20, docSize: -5 },
{ name: 'editor_tx_ms', value: 30, docSize: 4096 },
],
},
WS,
);
expect(rows).toHaveLength(3);
expect(rows[0].docSize).toBeNull();
expect(rows[1].docSize).toBeNull();
expect(rows[2].docSize).toBe(4096);
});
it('keeps a docSize exactly at the int4 max', () => {
const rows = svc.buildRows(
{ events: [{ name: 'editor_tx_ms', value: 1, docSize: 2147483647 }] },
WS,
);
expect(rows[0].docSize).toBe(2147483647);
});
});
@@ -0,0 +1,70 @@
import { Injectable } from '@nestjs/common';
import { InjectKysely } from 'nestjs-kysely';
import { KyselyDB } from '@docmost/db/types/kysely.types';
import {
ClientMetricRow,
MAX_BODY_BYTES,
MAX_EVENTS_PER_BATCH,
sanitizeVitalEvent,
} from './client-metrics.constants';
@Injectable()
export class VitalsService {
constructor(@InjectKysely() private readonly db: KyselyDB) {}
/**
* Turn a raw request body into the (bounded, whitelisted) rows to persist.
* Pure/synchronous so it is unit-testable without a DB. Returns [] for any
* malformed / oversized / foreign input — the caller still responds 200.
*/
buildRows(body: unknown, workspaceId: string | null): ClientMetricRow[] {
if (!body || typeof body !== 'object') return [];
// Defence-in-depth body cap (~16KB): drop oversized batches wholesale.
try {
if (JSON.stringify(body).length > MAX_BODY_BYTES) return [];
} catch {
return [];
}
// Accept either a bare array or `{ events: [...] }`.
const events = Array.isArray(body)
? body
: Array.isArray((body as { events?: unknown }).events)
? ((body as { events: unknown[] }).events as unknown[])
: null;
if (!events) return [];
const rows: ClientMetricRow[] = [];
for (const event of events) {
if (rows.length >= MAX_EVENTS_PER_BATCH) break;
const row = sanitizeVitalEvent(event, workspaceId);
if (row) rows.push(row);
}
return rows;
}
/** Batch-insert the sanitised rows in a single statement. No-op on []. */
async insertRows(rows: ClientMetricRow[]): Promise<void> {
if (rows.length === 0) return;
await this.db
.insertInto('clientMetrics')
.values(
rows.map((r) => ({
name: r.name,
value: r.value,
rating: r.rating,
route: r.route,
attr: r.attr,
docSize: r.docSize,
workspaceId: r.workspaceId,
})),
)
.execute();
}
async ingest(body: unknown, workspaceId: string | null): Promise<void> {
const rows = this.buildRows(body, workspaceId);
await this.insertRows(rows);
}
}
@@ -40,6 +40,11 @@ import { PageListener } from '@docmost/db/listeners/page.listener';
import { PostgresJSDialect } from 'kysely-postgres-js';
import * as postgres from 'postgres';
import { normalizePostgresUrl } from '../common/helpers';
import {
observeDbQuery,
isMetricsEnabled,
} from '../integrations/metrics/metrics.registry';
import { firstSqlToken } from '../integrations/metrics/metrics.constants';
@Global()
@Module({
@@ -67,6 +72,18 @@ import { normalizePostgresUrl } from '../common/helpers';
}),
plugins: [new CamelCasePlugin()],
log: (event: LogEvent) => {
// #355 — db_query_duration_seconds, labelled by the leading SQL token
// (bounded cardinality). Gated on isMetricsEnabled() so the token work
// (regex + Set lookup) is skipped entirely when metrics are OFF — not
// just observeDbQuery no-op'd — so a non-metrics deployment pays nothing
// per query. Runs independent of the dev-only debug logging below.
if (isMetricsEnabled()) {
observeDbQuery(
firstSqlToken(event.query.sql),
event.queryDurationMillis / 1000,
);
}
if (environmentService.getNodeEnv() !== 'development') return;
const logger = new Logger(DatabaseModule.name);
if (process.env.DEBUG_DB?.toLowerCase() === 'true') {
@@ -0,0 +1,52 @@
import { type Kysely, sql } from 'kysely';
/**
* #355 — `client_metrics`: raw sink for client-side perf telemetry (web-vitals
* + custom editor/page metrics) posted to /api/telemetry/vitals.
*
* The table/columns/indexes here are a FIXED contract shared with the deployed
* Grafana infra (the `grafana_ro` role reads this table; a separate maintenance
* container prunes rows >90d and re-GRANTs daily). No app-side retention is
* added on purpose. Written as raw SQL to match that contract 1:1 (identity PK,
* conditional GRANT).
*/
export async function up(db: Kysely<any>): Promise<void> {
await sql`
CREATE TABLE client_metrics (
id bigint GENERATED ALWAYS AS IDENTITY PRIMARY KEY,
created_at timestamptz NOT NULL DEFAULT now(),
name text NOT NULL, -- INP|LCP|CLS|TTFB|editor_tx_ms|page_open_ms|longtask_ms
value double precision NOT NULL,
rating text, -- good|needs-improvement|poor (web-vitals only)
route text, -- templated: /s/:space/p/:slug — never raw slugs
attr text, -- attribution target, truncated to 120 chars
doc_size int, -- editor_tx_ms only
workspace_id uuid
)
`.execute(db);
await sql`
CREATE INDEX idx_client_metrics_name_created
ON client_metrics (name, created_at)
`.execute(db);
await sql`
CREATE INDEX idx_client_metrics_created
ON client_metrics (created_at)
`.execute(db);
// The read-only Grafana role only exists in the deployed environment; guard so
// the migration still applies cleanly in dev/CI where the role is absent.
await sql`
DO $$
BEGIN
IF EXISTS (SELECT FROM pg_roles WHERE rolname = 'grafana_ro') THEN
GRANT SELECT ON client_metrics TO grafana_ro;
END IF;
END $$;
`.execute(db);
}
export async function down(db: Kysely<any>): Promise<void> {
await sql`DROP TABLE IF EXISTS client_metrics`.execute(db);
}
+13
View File
@@ -156,6 +156,18 @@ export interface Billing {
workspaceId: string;
}
export interface ClientMetrics {
id: Generated<Int8>;
createdAt: Generated<Timestamp>;
name: string;
value: number;
rating: string | null;
route: string | null;
attr: string | null;
docSize: number | null;
workspaceId: string | null;
}
export interface Comments {
aiChatId: string | null;
content: Json | null;
@@ -691,6 +703,7 @@ export interface DB {
authProviders: AuthProviders;
backlinks: Backlinks;
billing: Billing;
clientMetrics: ClientMetrics;
comments: Comments;
favorites: Favorites;
fileTasks: FileTasks;
@@ -227,6 +227,22 @@ export class EnvironmentService {
return compactTree === 'true';
}
/**
* Operator toggle for the public client-telemetry sink (#355). DEFAULT OFF:
* the unauthenticated POST /api/telemetry/vitals endpoint + client vitals
* collection are only wired when this is explicitly true. Kept SEPARATE from
* METRICS_PORT (the server Prometheus half) because Grafana reads the
* `client_metrics` table directly, independent of the scrape port — and
* because `client_metrics` has no app-side retention, so an operator must opt
* in and run an external pruner.
*/
isClientTelemetryEnabled(): boolean {
const enabled = this.configService
.get<string>('CLIENT_TELEMETRY_ENABLED', 'false')
.toLowerCase();
return enabled === 'true';
}
getStripePublishableKey(): string {
return this.configService.get<string>('STRIPE_PUBLISHABLE_KEY');
}
@@ -0,0 +1,46 @@
import { FastifyReply, FastifyRequest } from 'fastify';
import { isStreamingResponse } from './metrics.constants';
import { observeHttp } from './metrics.registry';
/**
* Resolve the BOUNDED route label for an HTTP response.
*
* HARD REQUIREMENT (#355): use the ROUTE TEMPLATE (`/pages/:id`), NEVER the raw
* URL (`/pages/abc-123`), so label cardinality stays finite. Fastify exposes the
* matched template on `req.routeOptions.url`. On 404s (no route matched) that is
* missing → collapse to the literal `unknown`.
*/
export function resolveRouteLabel(req: FastifyRequest): string {
const url = req.routeOptions?.url;
return typeof url === 'string' && url.length > 0 ? url : 'unknown';
}
/**
* Fastify onResponse handler that records http_request_duration_seconds.
* No-op when metrics are disabled (the hook is only registered when enabled,
* but the observe helpers are also guarded). Never throws into the response
* pipeline — telemetry must not break request handling.
*/
export function recordHttpResponse(
req: FastifyRequest,
reply: FastifyReply,
): void {
try {
const route = resolveRouteLabel(req);
// Exclude SSE/streaming responses: onResponse fires at connection close for
// those, so it would record the stream lifetime and poison p95/p99.
const contentType = reply.getHeader('content-type');
if (isStreamingResponse(contentType, route)) return;
observeHttp(
req.method,
route,
reply.statusCode,
// Fastify measures elapsed time in ms; the metric is in seconds.
reply.elapsedTime / 1000,
);
} catch {
// Swallow: a telemetry failure must never affect the served response.
}
}
@@ -0,0 +1,146 @@
import {
Injectable,
Logger,
OnModuleDestroy,
OnModuleInit,
} from '@nestjs/common';
import { InjectQueue } from '@nestjs/bullmq';
import { Queue, QueueEvents } from 'bullmq';
import { QueueName } from '../queue/constants';
import { EnvironmentService } from '../environment/environment.service';
import { parseRedisUrl } from '../../common/helpers';
import {
isMetricsEnabled,
observeJobDuration,
setQueueDepth,
} from './metrics.registry';
const POLL_INTERVAL_MS = 15_000;
// Cap the in-flight start-time map so a job that never emits completed/failed
// (worker crash) cannot leak memory unbounded. Well above realistic concurrency.
const MAX_INFLIGHT = 10_000;
/**
* BullMQ instrumentation for #355:
* - `bullmq_queue_depth{queue}`: polled from getJobCounts() every 15s.
* - `bullmq_job_duration_seconds{queue}`: wall-clock time between a job going
* `active` and `completed`/`failed`, observed via per-queue QueueEvents.
*
* Queue names are a FINITE list (the QueueName enum), so labels are bounded — no
* job ids ever enter a label. Everything is gated on METRICS_PORT: when metrics
* are off, onModuleInit does nothing (no interval, no QueueEvents connections).
*/
@Injectable()
export class MetricsBullService implements OnModuleInit, OnModuleDestroy {
private readonly logger = new Logger(MetricsBullService.name);
private readonly queues: { label: string; queue: Queue }[];
private timer: NodeJS.Timeout | null = null;
private queueEvents: QueueEvents[] = [];
// jobId -> start timestamp (ms). Bounded by MAX_INFLIGHT.
private readonly inflight = new Map<string, number>();
constructor(
private readonly environmentService: EnvironmentService,
@InjectQueue(QueueName.EMAIL_QUEUE) emailQueue: Queue,
@InjectQueue(QueueName.ATTACHMENT_QUEUE) attachmentQueue: Queue,
@InjectQueue(QueueName.GENERAL_QUEUE) generalQueue: Queue,
@InjectQueue(QueueName.BILLING_QUEUE) billingQueue: Queue,
@InjectQueue(QueueName.FILE_TASK_QUEUE) fileTaskQueue: Queue,
@InjectQueue(QueueName.SEARCH_QUEUE) searchQueue: Queue,
@InjectQueue(QueueName.AI_QUEUE) aiQueue: Queue,
@InjectQueue(QueueName.HISTORY_QUEUE) historyQueue: Queue,
@InjectQueue(QueueName.NOTIFICATION_QUEUE) notificationQueue: Queue,
@InjectQueue(QueueName.AUDIT_QUEUE) auditQueue: Queue,
) {
this.queues = [
{ label: 'email', queue: emailQueue },
{ label: 'attachment', queue: attachmentQueue },
{ label: 'general', queue: generalQueue },
{ label: 'billing', queue: billingQueue },
{ label: 'file-task', queue: fileTaskQueue },
{ label: 'search', queue: searchQueue },
{ label: 'ai', queue: aiQueue },
{ label: 'history', queue: historyQueue },
{ label: 'notification', queue: notificationQueue },
{ label: 'audit', queue: auditQueue },
];
}
onModuleInit(): void {
if (!isMetricsEnabled()) return;
// Poll queue depth.
this.timer = setInterval(() => {
void this.pollDepths();
}, POLL_INTERVAL_MS);
// Do not keep the event loop alive solely for polling.
this.timer.unref?.();
void this.pollDepths();
// Wire per-queue job-duration events.
const redisConfig = parseRedisUrl(this.environmentService.getRedisUrl());
const connection = {
host: redisConfig.host,
port: redisConfig.port,
password: redisConfig.password,
db: redisConfig.db,
family: redisConfig.family,
};
for (const { label, queue } of this.queues) {
const events = new QueueEvents(queue.name, { connection });
events.on('active', ({ jobId }) => {
if (this.inflight.size >= MAX_INFLIGHT) {
// Drop the oldest tracked start to keep the map bounded.
const oldest = this.inflight.keys().next().value;
if (oldest !== undefined) this.inflight.delete(oldest);
}
this.inflight.set(jobId, Date.now());
});
const finalize = ({ jobId }: { jobId: string }) => {
const start = this.inflight.get(jobId);
if (start === undefined) return;
this.inflight.delete(jobId);
observeJobDuration(label, (Date.now() - start) / 1000);
};
events.on('completed', finalize);
events.on('failed', finalize);
events.on('error', (err) => {
this.logger.debug(`QueueEvents error (${label}): ${err?.message}`);
});
this.queueEvents.push(events);
}
}
private async pollDepths(): Promise<void> {
for (const { label, queue } of this.queues) {
try {
const counts = await queue.getJobCounts();
// "Depth" = jobs not yet finished (backlog + in-flight).
const depth =
(counts.waiting ?? 0) +
(counts.active ?? 0) +
(counts.delayed ?? 0) +
(counts.prioritized ?? 0) +
(counts.paused ?? 0);
setQueueDepth(label, depth);
} catch (err) {
this.logger.debug(
`Failed to read job counts for ${label}: ${(err as Error)?.message}`,
);
}
}
}
async onModuleDestroy(): Promise<void> {
if (this.timer) {
clearInterval(this.timer);
this.timer = null;
}
await Promise.all(
this.queueEvents.map((e) => e.close().catch(() => undefined)),
);
this.queueEvents = [];
this.inflight.clear();
}
}
@@ -0,0 +1,16 @@
import { Injectable, OnModuleDestroy } from '@nestjs/common';
import { closeMetricsServer } from './metrics.server';
/**
* Ties the bare node:http metrics scrape server (started in main.ts after the
* Fastify app is up, outside the DI container) into Nest's shutdown lifecycle.
* With `app.enableShutdownHooks()`, onModuleDestroy fires on SIGTERM/SIGINT and
* closes the listener so it is not left dangling (jest/e2e never exits, and a
* prod restart doesn't leak the port). No-op when metrics are disabled.
*/
@Injectable()
export class MetricsServerLifecycle implements OnModuleDestroy {
async onModuleDestroy(): Promise<void> {
await closeMetricsServer();
}
}
@@ -0,0 +1,84 @@
/**
* Perf-metrics contract (#355). These names/labels are FIXED by the already
* deployed scrape+dashboard infra (VictoriaMetrics scraping docmost:9464,
* Grafana dashboards, alerts). Do NOT rename them.
*/
export const METRIC_HTTP_REQUEST_DURATION = 'http_request_duration_seconds';
export const METRIC_DB_QUERY_DURATION = 'db_query_duration_seconds';
export const METRIC_BULLMQ_QUEUE_DEPTH = 'bullmq_queue_depth';
export const METRIC_BULLMQ_JOB_DURATION = 'bullmq_job_duration_seconds';
export const METRIC_COLLAB_STORE_DURATION = 'collab_store_duration_seconds';
// Histogram buckets (seconds). Chosen to give useful p50/p95/p99 resolution
// for typical web/DB latencies without exploding series cardinality.
export const HTTP_BUCKETS = [
0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10,
];
export const DB_BUCKETS = [
0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5,
];
export const COLLAB_BUCKETS = [
0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5,
];
export const JOB_BUCKETS = [
0.01, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30, 60, 120,
];
/**
* Extract the first SQL token (select/insert/update/delete/...) from a query,
* lower-cased, to use as a BOUNDED label for db_query_duration_seconds. Using
* the full query text would blow up label cardinality; the leading keyword is a
* finite set. Unknown/empty queries collapse to `other`.
*/
// The bounded set of SQL leading keywords used as db_query_duration_seconds
// labels. Module-const so it is built ONCE, not per query (this runs on every DB
// query when metrics are enabled).
const KNOWN_SQL_TOKENS = new Set([
'select',
'insert',
'update',
'delete',
'with',
'begin',
'commit',
'rollback',
'alter',
'create',
'drop',
'truncate',
'explain',
]);
export function firstSqlToken(sql: string | undefined): string {
if (!sql) return 'other';
// Skip leading whitespace / comments and grab the first word.
const match = /^[\s(]*([a-zA-Z]+)/.exec(sql);
if (!match) return 'other';
const token = match[1].toLowerCase();
return KNOWN_SQL_TOKENS.has(token) ? token : 'other';
}
/**
* Whether an HTTP response must be EXCLUDED from http_request_duration_seconds.
*
* SSE/streaming responses (the AI-chat `text/event-stream`) keep the connection
* open for the whole conversation, so Fastify's onResponse fires only when the
* client disconnects — recording the connection lifetime, not a response time,
* which would poison p95/p99. We skip by content-type (authoritative) with a
* route-suffix fallback for the two known stream endpoints.
*/
export function isStreamingResponse(
contentType: unknown,
route: string | undefined,
): boolean {
if (
typeof contentType === 'string' &&
contentType.toLowerCase().includes('text/event-stream')
) {
return true;
}
// Fallback: the AI-chat stream routes (/api/ai-chat/stream,
// /api/shares/ai/stream) both end in `/stream`.
if (route && route.endsWith('/stream')) return true;
return false;
}
@@ -0,0 +1,15 @@
import { Module } from '@nestjs/common';
import { MetricsBullService } from './metrics-bull.service';
import { MetricsServerLifecycle } from './metrics-server.lifecycle';
/**
* Wires the BullMQ collectors (#355). The queues are provided by the @Global
* QueueModule (which exports BullModule), so no re-registration is needed here.
* The HTTP histogram, DB-query and collab-store collectors live in module-level
* singletons (metrics.registry) and are wired directly at their call sites.
* MetricsServerLifecycle closes the scrape server on shutdown.
*/
@Module({
providers: [MetricsBullService, MetricsServerLifecycle],
})
export class MetricsModule {}
@@ -0,0 +1,126 @@
import {
collectDefaultMetrics,
Histogram,
Gauge,
Registry,
} from 'prom-client';
import {
COLLAB_BUCKETS,
DB_BUCKETS,
HTTP_BUCKETS,
JOB_BUCKETS,
METRIC_BULLMQ_JOB_DURATION,
METRIC_BULLMQ_QUEUE_DEPTH,
METRIC_COLLAB_STORE_DURATION,
METRIC_DB_QUERY_DURATION,
METRIC_HTTP_REQUEST_DURATION,
} from './metrics.constants';
/**
* Process-wide perf-metrics registry (#355).
*
* This is a plain module singleton (NOT a Nest provider) because the collectors
* are cross-cutting: the Kysely `log` callback (built in a DI factory), the
* Fastify onResponse hook (main.ts, before the Nest container hands out
* providers) and the collab persistence extension all need the SAME instruments
* without threading DI through them.
*
* HARD CONTRACT: when `METRICS_PORT` is unset the whole subsystem is OFF — the
* registry is never created, `collectDefaultMetrics` never runs, and every
* observe/set helper is a cheap no-op. Nothing is exposed on :3000.
*/
// Decided once at process start. Deliberately read here (not via
// EnvironmentService) so the toggle is identical for the DI and non-DI callers.
const enabled = Boolean(process.env.METRICS_PORT);
let registry: Registry | null = null;
let httpHist: Histogram<'method' | 'route' | 'status'> | null = null;
let dbHist: Histogram<'op'> | null = null;
let queueDepthGauge: Gauge<'queue'> | null = null;
let jobHist: Histogram<'queue'> | null = null;
let collabHist: Histogram | null = null;
function init(): void {
if (registry || !enabled) return;
registry = new Registry();
// Node/runtime metrics: gives nodejs_eventloop_lag_p99_seconds, GC, heap, etc.
collectDefaultMetrics({ register: registry });
httpHist = new Histogram({
name: METRIC_HTTP_REQUEST_DURATION,
help: 'HTTP request duration in seconds, by method, route template and status',
labelNames: ['method', 'route', 'status'],
buckets: HTTP_BUCKETS,
registers: [registry],
});
dbHist = new Histogram({
name: METRIC_DB_QUERY_DURATION,
help: 'Database query duration in seconds, by leading SQL keyword',
labelNames: ['op'],
buckets: DB_BUCKETS,
registers: [registry],
});
queueDepthGauge = new Gauge({
name: METRIC_BULLMQ_QUEUE_DEPTH,
help: 'Number of not-yet-finished BullMQ jobs per queue',
labelNames: ['queue'],
registers: [registry],
});
jobHist = new Histogram({
name: METRIC_BULLMQ_JOB_DURATION,
help: 'BullMQ job processing duration in seconds, per queue',
labelNames: ['queue'],
buckets: JOB_BUCKETS,
registers: [registry],
});
collabHist = new Histogram({
name: METRIC_COLLAB_STORE_DURATION,
help: 'Collaboration onStoreDocument duration in seconds',
buckets: COLLAB_BUCKETS,
registers: [registry],
});
}
// Runs once when this module is first imported. Safe to call again (idempotent).
init();
export function isMetricsEnabled(): boolean {
return enabled;
}
/** The prom-client registry, or null when metrics are disabled. */
export function getMetricsRegistry(): Registry | null {
return registry;
}
export function observeHttp(
method: string,
route: string,
status: number,
seconds: number,
): void {
httpHist?.observe({ method, route, status }, seconds);
}
export function observeDbQuery(op: string, seconds: number): void {
dbHist?.observe({ op }, seconds);
}
export function setQueueDepth(queue: string, depth: number): void {
queueDepthGauge?.set({ queue }, depth);
}
export function observeJobDuration(queue: string, seconds: number): void {
jobHist?.observe({ queue }, seconds);
}
export function observeCollabStore(seconds: number): void {
collabHist?.observe(seconds);
}
@@ -0,0 +1,86 @@
import { createServer, Server } from 'node:http';
import { Logger } from '@nestjs/common';
import { getMetricsRegistry, isMetricsEnabled } from './metrics.registry';
/**
* Start the Prometheus scrape endpoint on a SEPARATE port, taken from
* `METRICS_PORT`. There is NO default port: when `METRICS_PORT` is unset the
* whole metrics subsystem is OFF and this returns null. This is a bare node:http
* server, NOT part of the Fastify app, so `/metrics` never exists on the public
* :3000 listener.
*
* Returns the http.Server (so callers can close it on shutdown) or null when
* metrics are disabled. The reference is also kept module-side so the Nest
* lifecycle (see MetricsModule) can close it on application shutdown without
* threading the handle back through the non-DI bootstrap.
*/
let metricsServer: Server | null = null;
export function startMetricsServer(): Server | null {
if (!isMetricsEnabled()) return null;
const logger = new Logger('MetricsServer');
const register = getMetricsRegistry();
if (!register) return null;
const port = Number(process.env.METRICS_PORT);
if (!Number.isInteger(port) || port <= 0) {
logger.warn(
`Invalid METRICS_PORT="${process.env.METRICS_PORT}", metrics endpoint not started`,
);
return null;
}
const server = createServer(async (req, res) => {
if (req.method === 'GET' && req.url === '/metrics') {
try {
const body = await register.metrics();
res.setHeader('Content-Type', register.contentType);
res.statusCode = 200;
res.end(body);
} catch (err) {
res.statusCode = 500;
res.end(String((err as Error)?.message ?? 'error'));
}
return;
}
res.statusCode = 404;
res.end();
});
// Bind on all interfaces: the scraper (VictoriaMetrics) reaches this from
// another container as docmost:9464. The port is not published to the host.
server.listen(port, '0.0.0.0', () => {
logger.log(`Metrics endpoint listening on :${port}/metrics`);
});
server.on('error', (err) => {
logger.error(`Metrics server error: ${err?.message}`);
});
metricsServer = server;
return server;
}
/**
* Close the metrics scrape server if one is running. Idempotent and safe to call
* when metrics are disabled (no server was ever started). Wired into Nest's
* shutdown lifecycle so the listener is not left dangling on shutdown.
*/
export function closeMetricsServer(): Promise<void> {
const server = metricsServer;
metricsServer = null;
if (!server) return Promise.resolve();
return new Promise((resolve) => {
server.close(() => resolve());
// server.close() stops accepting NEW connections but its callback does not
// fire until existing keep-alive sockets drain. The scraper (VictoriaMetrics/
// vmagent) holds an idle HTTP keep-alive socket, so without this the callback
// — and thus shutdown — would hang until the scraper disconnects or the
// orchestrator escalates to SIGKILL on the kill-grace window. Force-close idle
// keep-alive sockets so close() completes immediately, and unref so this
// server never keeps the event loop alive on its own.
server.closeIdleConnections();
server.unref();
});
}
@@ -0,0 +1,70 @@
import { FastifyRequest } from 'fastify';
import { resolveRouteLabel } from './http-metrics.hook';
import { firstSqlToken, isStreamingResponse } from './metrics.constants';
describe('resolveRouteLabel (histogram route label)', () => {
it('uses the ROUTE TEMPLATE, never the raw URL', () => {
// routeOptions.url is the matched template; url is the raw path with the id.
const req = {
url: '/api/pages/abc-123-def',
routeOptions: { url: '/api/pages/:id' },
} as unknown as FastifyRequest;
expect(resolveRouteLabel(req)).toBe('/api/pages/:id');
expect(resolveRouteLabel(req)).not.toContain('abc-123-def');
});
it('falls back to "unknown" on a 404 (no matched route template)', () => {
const req = {
url: '/totally/unmatched/path',
routeOptions: {},
} as unknown as FastifyRequest;
expect(resolveRouteLabel(req)).toBe('unknown');
});
it('falls back to "unknown" when routeOptions is missing', () => {
const req = { url: '/x' } as unknown as FastifyRequest;
expect(resolveRouteLabel(req)).toBe('unknown');
});
});
describe('isStreamingResponse (SSE exclusion)', () => {
it('excludes text/event-stream responses by content-type', () => {
expect(isStreamingResponse('text/event-stream', '/api/ai-chat/stream')).toBe(
true,
);
expect(isStreamingResponse('text/event-stream; charset=utf-8', '/x')).toBe(
true,
);
});
it('excludes known /stream routes by suffix as a fallback', () => {
expect(isStreamingResponse('application/json', '/api/ai-chat/stream')).toBe(
true,
);
expect(isStreamingResponse(undefined, '/api/shares/ai/stream')).toBe(true);
});
it('does not exclude ordinary JSON responses', () => {
expect(isStreamingResponse('application/json', '/api/pages/:id')).toBe(
false,
);
expect(isStreamingResponse(undefined, '/api/pages/:id')).toBe(false);
});
});
describe('firstSqlToken (bounded db label)', () => {
it('returns the lower-cased leading keyword', () => {
expect(firstSqlToken('SELECT * FROM pages')).toBe('select');
expect(firstSqlToken(' insert into x values (1)')).toBe('insert');
expect(firstSqlToken('UPDATE pages SET a=1')).toBe('update');
expect(firstSqlToken('delete from pages')).toBe('delete');
expect(firstSqlToken('(SELECT 1)')).toBe('select');
});
it('collapses unknown/empty queries to "other"', () => {
expect(firstSqlToken('')).toBe('other');
expect(firstSqlToken(undefined)).toBe('other');
expect(firstSqlToken('123 not sql')).toBe('other');
expect(firstSqlToken('vacuum analyze')).toBe('other');
});
});
@@ -50,6 +50,10 @@ export class StaticModule implements OnModuleInit {
: undefined,
POSTHOG_HOST: this.environmentService.getPostHogHost(),
POSTHOG_KEY: this.environmentService.getPostHogKey(),
// #355 — mirrors the server-side CLIENT_TELEMETRY_ENABLED gate so the
// client only collects/sends vitals when the operator opts in.
CLIENT_TELEMETRY_ENABLED:
this.environmentService.isClientTelemetryEnabled(),
};
const windowScriptContent = `<script>window.CONFIG=${JSON.stringify(configString)};</script>`;
@@ -9,6 +9,7 @@ import {
AI_CHAT_THROTTLER,
PAGE_TEMPLATE_THROTTLER,
PUBLIC_SHARE_AI_THROTTLER,
VITALS_THROTTLER,
} from './throttler-names';
@Module({
@@ -29,6 +30,8 @@ import {
{ name: PAGE_TEMPLATE_THROTTLER, ttl: 60_000, limit: 30 },
// Anonymous public-share assistant: ~5 req/min per IP.
{ name: PUBLIC_SHARE_AI_THROTTLER, ttl: 60_000, limit: 5 },
// Anonymous client perf-telemetry sink: 120 batched posts/min per IP.
{ name: VITALS_THROTTLER, ttl: 60_000, limit: 120 },
],
errorMessage: 'Too many requests',
// Pass ioredis options (not a pre-built Redis instance) so
@@ -6,3 +6,7 @@ export const PAGE_TEMPLATE_THROTTLER = 'page-template';
// ThrottlerGuard tracker) to bound anonymous abuse — the workspace owner pays
// for the tokens.
export const PUBLIC_SHARE_AI_THROTTLER = 'public-share-ai';
// IP-keyed throttler for the anonymous client perf-telemetry sink
// (POST /api/telemetry/vitals). Browsers batch metrics, so the limit is
// generous; it only exists to bound abuse of the public, unauthenticated route.
export const VITALS_THROTTLER = 'vitals';
+24
View File
@@ -16,6 +16,9 @@ import { EnvironmentService } from './integrations/environment/environment.servi
import { SANDBOX_API_PATH } from './integrations/sandbox/sandbox.constants';
import { resolveFrameHeader } from './common/helpers';
import { resolveTrustProxy } from './integrations/environment/trust-proxy.util';
import { isMetricsEnabled } from './integrations/metrics/metrics.registry';
import { recordHttpResponse } from './integrations/metrics/http-metrics.hook';
import { startMetricsServer } from './integrations/metrics/metrics.server';
async function bootstrap() {
const app = await NestFactory.create<NestFastifyApplication>(
@@ -91,6 +94,19 @@ async function bootstrap() {
done();
});
// #355 — HTTP request-duration histogram. Registered ONLY when METRICS_PORT is
// set (otherwise no collector runs at all). Uses the bounded route template
// label and excludes SSE/streaming responses (see recordHttpResponse).
if (isMetricsEnabled()) {
app
.getHttpAdapter()
.getInstance()
.addHook('onResponse', (req, reply, done) => {
recordHttpResponse(req, reply);
done();
});
}
app
.getHttpAdapter()
.getInstance()
@@ -127,6 +143,9 @@ async function bootstrap() {
'/api/workspace/create',
'/api/workspace/joined',
'/api/workspace/find-by-email',
// Public client perf-telemetry sink: browsers post it without a
// resolved workspace host, so the workspace-resolution gate must not 404 it.
'/api/telemetry/vitals',
// Anonymous in-RAM blob sandbox: a remote consumer fetches blobs by an
// unguessable UUID without any workspace host context, so the
// workspace-resolution gate must not apply.
@@ -175,6 +194,11 @@ async function bootstrap() {
`Listening on http://127.0.0.1:${port} / ${process.env.APP_URL}`,
);
});
// #355 — Prometheus scrape endpoint on a SEPARATE port (METRICS_PORT),
// started after the app is up. No default port: a no-op when METRICS_PORT is
// unset. Closed on shutdown by MetricsServerLifecycle (MetricsModule).
startMetricsServer();
}
bootstrap();
+28 -1
View File
@@ -416,6 +416,9 @@ importers:
socket.io-client:
specifier: 4.8.3
version: 4.8.3
web-vitals:
specifier: ^5.1.0
version: 5.1.0
zod:
specifier: 4.3.6
version: 4.3.6
@@ -744,6 +747,9 @@ importers:
postmark:
specifier: ^4.0.7
version: 4.0.7
prom-client:
specifier: ^15.1.3
version: 15.1.3
react:
specifier: ^18.3.1
version: 18.3.1
@@ -5988,6 +5994,9 @@ packages:
bind-event-listener@3.0.0:
resolution: {integrity: sha512-PJvH288AWQhKs2v9zyfYdPzlPqf5bXbGMmhmUIY9x4dAUGIWgomO771oBQNwJnMQSnUIXhKu6sgzpBRXTlvb8Q==}
bintrees@1.0.2:
resolution: {integrity: sha512-VOMgTMwjAaUG580SXn3LacVgjurrbMme7ZZNYGSSV7mmtY6QQRh0Eg3pwIcntQ77DErK1L0NxkbetjcoXzVwKw==}
bl@4.1.0:
resolution: {integrity: sha512-1W07cM9gS6DcLperZfFSj+bWLtaPGSOHWhPiGzXmvVJbRLdG82sH/Kn8EtW1VqWVA54AKf2h5k5BbnIbwF3h6w==}
@@ -9318,6 +9327,10 @@ packages:
process-warning@5.0.0:
resolution: {integrity: sha512-a39t9ApHNx2L4+HBnQKqxxHNs1r7KF+Intd8Q/g1bUh6q0WIp9voPXJ/x0j+ZL45KF1pJd9+q2jLIRMfvEshkA==}
prom-client@15.1.3:
resolution: {integrity: sha512-6ZiOBfCywsD4k1BN9IX0uZhF+tJkV8q8llP64G5Hajs4JOeVLPCwpPVcpXy3BwYiUGgyJzsJJQeOIv7+hDSq8g==}
engines: {node: ^16 || ^18 || >=20}
prompts@2.4.2:
resolution: {integrity: sha512-NxNv/kLguCA7p3jE8oL2aEBsrJWgAakBpgmgK6lpPWV+WuOmY6r2/zbAVnP+T8bQlA0nzHXSJSJW0Hq7ylaD2Q==}
engines: {node: '>= 6'}
@@ -10145,6 +10158,9 @@ packages:
resolution: {integrity: sha512-4LeEWl96twnS2Q7Bz4MGqgazLqO+hJN63GZxXoIqh1T3VweYD997gbU1ItNsQafqqXTXd5WFyFdReLtwvRBNiw==}
engines: {node: '>=18'}
tdigest@0.1.2:
resolution: {integrity: sha512-+G0LLgjjo9BZX2MfdvPfH+MKLCrxlXSYec5DaPYP1fe6Iyhf0/fSmJ0bFiZ1F8BT6cGXl2LpltQptzjXKWEkKA==}
terser-webpack-plugin@5.4.0:
resolution: {integrity: sha512-Bn5vxm48flOIfkdl5CaD2+1CiUVbonWQ3KQPyP7/EuIl9Gbzq/gQFOzaMFUEgVjB1396tcK0SG8XcNJ/2kDH8g==}
engines: {node: '>= 10.13.0'}
@@ -16153,7 +16169,7 @@ snapshots:
obug: 2.1.1
std-env: 4.1.0
tinyrainbow: 3.1.0
vitest: 4.1.6(@opentelemetry/api@1.9.0)(@types/node@25.5.0)(@vitest/coverage-v8@4.1.6)(happy-dom@20.8.9)(jsdom@27.4.0(@noble/hashes@2.0.1))(vite@8.0.5(@types/node@25.5.0)(esbuild@0.28.0)(jiti@2.4.2)(less@4.2.0)(sugarss@5.0.1(postcss@8.5.14))(terser@5.39.0)(tsx@4.21.0)(yaml@2.8.3))
vitest: 4.1.6(@opentelemetry/api@1.9.0)(@types/node@22.19.1)(@vitest/coverage-v8@4.1.6)(happy-dom@20.8.9)(jsdom@25.0.0)(vite@8.0.5(@types/node@22.19.1)(esbuild@0.28.0)(jiti@2.4.2)(less@4.2.0)(sugarss@5.0.1(postcss@8.5.14))(terser@5.39.0)(tsx@4.21.0)(yaml@2.8.3))
'@vitest/expect@4.1.6':
dependencies:
@@ -16665,6 +16681,8 @@ snapshots:
bind-event-listener@3.0.0: {}
bintrees@1.0.2: {}
bl@4.1.0:
dependencies:
buffer: 5.7.1
@@ -20476,6 +20494,11 @@ snapshots:
process-warning@5.0.0: {}
prom-client@15.1.3:
dependencies:
'@opentelemetry/api': 1.9.0
tdigest: 0.1.2
prompts@2.4.2:
dependencies:
kleur: 3.0.3
@@ -21521,6 +21544,10 @@ snapshots:
minizlib: 3.1.0
yallist: 5.0.0
tdigest@0.1.2:
dependencies:
bintrees: 1.0.2
terser-webpack-plugin@5.4.0(@swc/core@1.5.25(@swc/helpers@0.5.5))(webpack@5.106.0(@swc/core@1.5.25(@swc/helpers@0.5.5))):
dependencies:
'@jridgewell/trace-mapping': 0.3.31