fix(#345): restore prom-client, harden normalizer against ReDoS, strip frontmatter (review round 1)

Addresses the round-1 review of #369:

F1 [CRITICAL] Restore prom-client. The prior commit removed it as a 'stray dep',
but metrics.registry.ts imports it unconditionally at startup (main.ts boot), so
a clean frozen install had no prom-client -> server tsc TS2307 + boot crash. It
was surviving only via hoisting from a warm store. Restored to apps/server
dependencies + regenerated the lock (prom-client/tdigest/bintrees return),
keeping the @docmost/prosemirror-markdown dep. Verified: clean frozen install ->
require.resolve('prom-client') ok, server tsc EXIT 0.

F2 [HIGH] Two quadratic ReDoS vectors in foreign-markdown.ts on untrusted import
(runs synchronously on the request thread, 30MB cap):
  (a) pass-2 was O(lines x defs) — a per-def RegExp rebuilt and run over every
      line. Replaced with ONE precompiled alternation regex over all def ids,
      built once per document, with an id->body lookup in the replacer: O(text).
  (b) the inline-code split alternation backtracks quadratically on a long
      UNCLOSED backtick run. Lines over 8KB now skip the split (left untouched) —
      a real footnote line is never that long.

F3 [WARNING] Restore the leading YAML front-matter strip that the retired
markdownToHtml layer did. Without it, Obsidian/Hugo/Jekyll/git-sync files leak
their front-matter into the body (and 'title:' renders as a setext heading that
title extraction can hijack).

F4 [WARNING] Extend the zip-import spec with an image (width+align) + callout
fidelity assertion through the PM->HTML->PM hop (the one hop the package suite
does not cover).

F5/F6 Update AGENTS.md (apps/server is now a prosemirror-markdown consumer) and
make the server pretest build prosemirror-markdown too.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
agent_coder
2026-07-05 04:54:07 +03:00
parent 2c2d60a5dc
commit e17d5bc060
6 changed files with 240 additions and 91 deletions
@@ -91,88 +91,127 @@ function chainable(result: any): any {
return proxy;
}
/**
* Run one markdown file through the REAL zip-import pipeline
* (`processGenericImport` -> `markdownToProseMirror` -> `jsonToHtml` ->
* `processHTML`/`htmlToJson`) and return the persisted page `content`. This is
* the server-specific PM->HTML->PM hop that the package's own PM<->MD tests do
* NOT cover.
*/
async function runZipImport(markdown: string): Promise<any> {
const extractDir = await fs.mkdtemp(path.join(os.tmpdir(), 'fit-canon-'));
await fs.writeFile(path.join(extractDir, 'note.md'), markdown, 'utf-8');
const importService = new ImportService(
{} as any,
{} as any,
{} as any,
{} as any,
);
jest
.spyOn(importService as any, 'createYdoc')
.mockResolvedValue(Buffer.from([]) as any);
let captured: any = null;
const trx = {
insertInto: (table: string) => ({
values: (v: any) => {
if (table === 'pages') captured = v;
return { execute: async () => {} };
},
}),
};
const db: any = {
selectFrom: () => chainable({ slug: 'space-slug' }),
transaction: () => ({ execute: (fn: any) => fn(trx) }),
};
const importAttachmentService = {
processAttachments: async ({ html }: any) => html,
};
const service = new FileImportTaskService(
{} as any, // storageService
importService as any,
{ nextPagePosition: async () => 'a0' } as any,
{ insertBacklink: jest.fn() } as any,
db,
importAttachmentService as any,
{ emit: jest.fn() } as any,
{ logBatchWithContext: jest.fn() } as any,
);
const fileTask: any = {
id: 'task-1',
source: 'generic',
spaceId: 'space-1',
workspaceId: 'ws-1',
creatorId: 'user-1',
};
try {
await service.processGenericImport({ extractDir, fileTask });
expect(captured).toBeTruthy();
return captured.content;
} finally {
await fs.rm(extractDir, { recursive: true, force: true });
}
}
/** Find the first node of a given type anywhere in a PM content tree. */
function findFirst(node: any, type: string): any {
if (!node || typeof node !== 'object') return null;
if (node.type === type) return node;
for (const child of node.content ?? []) {
const hit = findFirst(child, type);
if (hit) return hit;
}
return null;
}
describe('FileImportTaskService.processGenericImport — footnote canonicalization (#228)', () => {
it('orders footnotes by first reference, dedupes reuse, and drops orphans on zip import', async () => {
const extractDir = await fs.mkdtemp(path.join(os.tmpdir(), 'fit-canon-'));
await fs.writeFile(path.join(extractDir, 'note.md'), MARKDOWN, 'utf-8');
// Real ImportService for the html -> JSON conversion; stub the yjs encode.
const importService = new ImportService(
{} as any,
{} as any,
{} as any,
{} as any,
const content = await runZipImport(MARKDOWN);
// Definitions ordered by FIRST REFERENCE (C, A, B), NOT the markdown
// definition order (A, B, C). Ids are the parser's fresh `fn-*`, so pin
// the BODIES.
expect(footnoteListBodies(content)).toEqual(['note C', 'note A', 'note B']);
// Orphan [^z] dropped; reused [^a] collapses to one definition; one list.
expect(footnoteListBodies(content)).not.toContain('orphan note');
const lists = (content.content ?? []).filter(
(n: any) => n.type === 'footnotesList',
);
jest
.spyOn(importService as any, 'createYdoc')
.mockResolvedValue(Buffer.from([]) as any);
expect(lists).toHaveLength(1);
expect(
footnoteListBodies(content).filter((b) => b === 'note A'),
).toHaveLength(1);
});
let captured: any = null;
const trx = {
insertInto: (table: string) => ({
values: (v: any) => {
if (table === 'pages') captured = v;
return { execute: async () => {} };
},
}),
};
const db: any = {
selectFrom: () => chainable({ slug: 'space-slug' }),
transaction: () => ({ execute: (fn: any) => fn(trx) }),
};
// #345 F4: the zip path routes markdown through jsonToHtml -> processHTML ->
// htmlToJson (the shared HTML attachment pipeline). #345's headline is LOSSLESS
// image width/align via the `<!--img {...}-->` comment; a callout carries its
// `type`. This asserts those survive the PM->HTML->PM hop — the one hop the
// package's PM<->MD suite does not exercise.
it('preserves image width/align and callout type through the PM->HTML->PM hop', async () => {
const md = [
'# Doc',
'',
'![a picture](https://example.com/i.png) <!--img {"width":"320","align":"left"}-->',
'',
':::warning',
'Careful now.',
':::',
].join('\n');
const importAttachmentService = {
processAttachments: async ({ html }: any) => html,
};
const backlinkRepo = { insertBacklink: jest.fn() };
const eventEmitter = { emit: jest.fn() };
const auditService = { logBatchWithContext: jest.fn() };
const content = await runZipImport(md);
const pageService = { nextPagePosition: async () => 'a0' };
const image = findFirst(content, 'image');
expect(image).toBeTruthy();
// The lossless sizing/alignment must survive the HTML hop.
expect(String(image.attrs?.width)).toBe('320');
expect(image.attrs?.align).toBe('left');
const service = new FileImportTaskService(
{} as any, // storageService
importService as any,
pageService as any,
backlinkRepo as any,
db,
importAttachmentService as any,
eventEmitter as any,
auditService as any,
);
const fileTask: any = {
id: 'task-1',
source: 'generic',
spaceId: 'space-1',
workspaceId: 'ws-1',
creatorId: 'user-1',
};
try {
await service.processGenericImport({ extractDir, fileTask });
expect(captured).toBeTruthy();
const content = captured.content;
// Definitions ordered by FIRST REFERENCE (C, A, B), NOT the markdown
// definition order (A, B, C). Ids are the parser's fresh `fn-*`, so pin
// the BODIES.
expect(footnoteListBodies(content)).toEqual([
'note C',
'note A',
'note B',
]);
// Orphan [^z] dropped; reused [^a] collapses to one definition; one list.
expect(footnoteListBodies(content)).not.toContain('orphan note');
const lists = (content.content ?? []).filter(
(n: any) => n.type === 'footnotesList',
);
expect(lists).toHaveLength(1);
expect(
footnoteListBodies(content).filter((b) => b === 'note A'),
).toHaveLength(1);
} finally {
await fs.rm(extractDir, { recursive: true, force: true });
}
const callout = findFirst(content, 'callout');
expect(callout).toBeTruthy();
expect(callout.attrs?.type).toBe('warning');
});
});
@@ -83,6 +83,49 @@ describe('normalizeForeignMarkdown — GFM reference footnotes', () => {
const callouts = ':::info\nHi\n:::\n\n> [!warning]\n> Careful';
expect(normalizeForeignMarkdown(callouts)).toBe(callouts);
});
it('strips a leading YAML front-matter block (Obsidian/Hugo/git-sync files)', () => {
const out = normalizeForeignMarkdown(
'---\ntitle: My Page\ntags: [a, b]\n---\n\n# Heading\n\nBody.',
);
expect(out).toBe('# Heading\n\nBody.');
// The front-matter must not leak into the body as a setext heading.
expect(out).not.toContain('title: My Page');
expect(out).not.toContain('---');
});
it('does not strip a horizontal rule that is not leading front-matter', () => {
const md = 'Intro paragraph.\n\n---\n\nAfter the rule.';
expect(normalizeForeignMarkdown(md)).toBe(md);
});
it('is linear on a document with thousands of definitions (no quadratic blowup)', () => {
// F2(a): the pass-2 rewrite must be O(text), not O(text × defs). Build a
// pathological doc (many defs + many plain text lines) and assert it
// completes well under a second — a quadratic implementation took ~14s.
const N = 4000;
const refs = Array.from({ length: N }, (_, i) => `line ${i} plain text`).join('\n');
const defs = Array.from({ length: N }, (_, i) => `[^n${i}]: def ${i}`).join('\n');
const doc = `start[^n0] and[^n${N - 1}] end\n\n${refs}\n\n${defs}`;
const t0 = Date.now();
const out = normalizeForeignMarkdown(doc);
const elapsed = Date.now() - t0;
expect(elapsed).toBeLessThan(2000);
// Sanity: the two real references were still inlined.
expect(out).toContain('^[def 0]');
expect(out).toContain(`^[def ${N - 1}]`);
});
it('is bounded on a long unclosed backtick run (no inline-split ReDoS)', () => {
// F2(b): a huge unterminated backtick run must not cause quadratic
// backtracking in the inline-code split. Oversized lines skip the split
// entirely (left untouched), so this returns promptly.
const line = 'x' + '`'.repeat(200000);
const doc = `${line}\n\n[^1]: def`;
const t0 = Date.now();
normalizeForeignMarkdown(doc);
expect(Date.now() - t0).toBeLessThan(2000);
});
});
describe('foreign markdown import acceptance (normalizer + canonical parser)', () => {
@@ -74,10 +74,26 @@ function escapeFootnoteBody(body: string): string {
* We split the line on inline-code spans (paired backtick runs) and rewrite only
* the non-code segments.
*/
// Above this length a single line is not split into inline-code spans (see
// below). A genuine markdown line carrying a footnote reference is never tens of
// KB; the cap only bypasses the inline-code protection for pathological lines.
const INLINE_SPLIT_MAX_LINE = 8192;
function rewriteRefsOutsideInlineCode(
line: string,
replace: (text: string) => string,
): string {
// The inline-code split alternation `(`+)(?:(?!\1)[\s\S])*\1` backtracks
// quadratically on a long UNCLOSED backtick run (its middle can consume the
// rest of the line, then fail to find a closing run and retry from each
// position). On an untrusted import this is a request-thread ReDoS. A real
// footnote line is short, so for an oversized line we skip the inline-code
// protection entirely and leave the line UNTOUCHED (rewriting it wholesale
// could corrupt a `[^id]` that legitimately lives inside inline code). This is
// a conservative bypass: an over-8KB line simply does not get its reference
// footnotes inlined — acceptable for a pathological input.
if (line.length > INLINE_SPLIT_MAX_LINE) return line;
// Alternation: an inline-code span (one or more backticks, then anything up to
// the SAME run of backticks) OR a run of non-backtick text. Unterminated
// backticks fall through as ordinary text (matched by the second branch on the
@@ -161,6 +177,26 @@ function convertReferenceFootnotes(markdown: string): string {
return markdown;
}
// ONE precompiled alternation regex over ALL definition ids, built once per
// document (not once per definition per line). This makes pass 2 O(total text)
// instead of O(text × defs): a line with no reference pays a single failed
// scan, and the replacer looks the matched id up in `defs`. The previous
// per-def loop (`for (id) line.replace(new RegExp(...))`) was quadratic in the
// definition count — a modest upload with thousands of defs could freeze the
// request thread (and thus the whole instance, since import runs synchronously
// on it). The ids are escaped and joined; `defs` is the id→body lookup.
const refRe = new RegExp(
'\\[\\^(' + [...defs.keys()].map(escapeRegExp).join('|') + ')\\]',
'g',
);
const rewriteSegment = (segment: string): string =>
segment.replace(refRe, (whole, id: string) => {
const body = defs.get(id);
// A ref whose id is not a real definition should not be reachable (the
// alternation only contains real ids), but stay defensive: leave it as-is.
return body === undefined ? whole : `^[${escapeFootnoteBody(body)}]`;
});
// Pass 2: rewrite in-text references, skipping fenced code and dropped lines.
const out: string[] = [];
inFence = false;
@@ -185,27 +221,33 @@ function convertReferenceFootnotes(markdown: string): string {
continue;
}
line = rewriteRefsOutsideInlineCode(line, (segment) => {
let s = segment;
for (const [id, body] of defs) {
const ref = new RegExp('\\[\\^' + escapeRegExp(id) + '\\]', 'g');
s = s.replace(ref, `^[${escapeFootnoteBody(body)}]`);
}
return s;
});
line = rewriteRefsOutsideInlineCode(line, rewriteSegment);
out.push(line);
}
return out.join('\n');
}
/**
* Strip a single leading YAML front-matter block (`---\n…\n---`). Foreign files
* from Obsidian / Hugo / Jekyll / Notion — and Docmost's OWN git-sync page files
* — open with front-matter that the canonical parser does not consume, so
* without this it leaks into the body (and `title: Foo` above the closing `---`
* renders as a setext `<h2>` that `extractTitleAndRemoveHeading` can hijack as
* the page title). This mirrors the strip the retired `markdownToHtml` layer did
* (editor-ext marked.utils.ts). It is a no-op for front-matter-free input.
*/
const YAML_FRONT_MATTER_RE = /^\s*---[\s\S]*?---\s*/;
/**
* Normalize a foreign markdown string into Docmost's canonical markdown surface
* so the strict canonical parser accepts it losslessly. Currently this rewrites
* GFM reference footnotes into inline footnotes; add further fixture-driven
* foreign-surface cases here as they are found.
* so the strict canonical parser accepts it losslessly: strip a leading YAML
* front-matter block, then rewrite GFM reference footnotes into inline
* footnotes. Add further fixture-driven foreign-surface cases here as they are
* found.
*/
export function normalizeForeignMarkdown(markdown: string): string {
if (!markdown) return markdown;
return convertReferenceFootnotes(markdown);
const withoutFrontMatter = markdown.replace(YAML_FRONT_MATTER_RE, '').trimStart();
return convertReferenceFootnotes(withoutFrontMatter);
}