fix(#345): restore prom-client, harden normalizer against ReDoS, strip frontmatter (review round 1)
Addresses the round-1 review of #369: F1 [CRITICAL] Restore prom-client. The prior commit removed it as a 'stray dep', but metrics.registry.ts imports it unconditionally at startup (main.ts boot), so a clean frozen install had no prom-client -> server tsc TS2307 + boot crash. It was surviving only via hoisting from a warm store. Restored to apps/server dependencies + regenerated the lock (prom-client/tdigest/bintrees return), keeping the @docmost/prosemirror-markdown dep. Verified: clean frozen install -> require.resolve('prom-client') ok, server tsc EXIT 0. F2 [HIGH] Two quadratic ReDoS vectors in foreign-markdown.ts on untrusted import (runs synchronously on the request thread, 30MB cap): (a) pass-2 was O(lines x defs) — a per-def RegExp rebuilt and run over every line. Replaced with ONE precompiled alternation regex over all def ids, built once per document, with an id->body lookup in the replacer: O(text). (b) the inline-code split alternation backtracks quadratically on a long UNCLOSED backtick run. Lines over 8KB now skip the split (left untouched) — a real footnote line is never that long. F3 [WARNING] Restore the leading YAML front-matter strip that the retired markdownToHtml layer did. Without it, Obsidian/Hugo/Jekyll/git-sync files leak their front-matter into the body (and 'title:' renders as a setext heading that title extraction can hijack). F4 [WARNING] Extend the zip-import spec with an image (width+align) + callout fidelity assertion through the PM->HTML->PM hop (the one hop the package suite does not cover). F5/F6 Update AGENTS.md (apps/server is now a prosemirror-markdown consumer) and make the server pretest build prosemirror-markdown too. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
+115
-76
@@ -91,88 +91,127 @@ function chainable(result: any): any {
|
||||
return proxy;
|
||||
}
|
||||
|
||||
/**
|
||||
* Run one markdown file through the REAL zip-import pipeline
|
||||
* (`processGenericImport` -> `markdownToProseMirror` -> `jsonToHtml` ->
|
||||
* `processHTML`/`htmlToJson`) and return the persisted page `content`. This is
|
||||
* the server-specific PM->HTML->PM hop that the package's own PM<->MD tests do
|
||||
* NOT cover.
|
||||
*/
|
||||
async function runZipImport(markdown: string): Promise<any> {
|
||||
const extractDir = await fs.mkdtemp(path.join(os.tmpdir(), 'fit-canon-'));
|
||||
await fs.writeFile(path.join(extractDir, 'note.md'), markdown, 'utf-8');
|
||||
|
||||
const importService = new ImportService(
|
||||
{} as any,
|
||||
{} as any,
|
||||
{} as any,
|
||||
{} as any,
|
||||
);
|
||||
jest
|
||||
.spyOn(importService as any, 'createYdoc')
|
||||
.mockResolvedValue(Buffer.from([]) as any);
|
||||
|
||||
let captured: any = null;
|
||||
const trx = {
|
||||
insertInto: (table: string) => ({
|
||||
values: (v: any) => {
|
||||
if (table === 'pages') captured = v;
|
||||
return { execute: async () => {} };
|
||||
},
|
||||
}),
|
||||
};
|
||||
const db: any = {
|
||||
selectFrom: () => chainable({ slug: 'space-slug' }),
|
||||
transaction: () => ({ execute: (fn: any) => fn(trx) }),
|
||||
};
|
||||
|
||||
const importAttachmentService = {
|
||||
processAttachments: async ({ html }: any) => html,
|
||||
};
|
||||
const service = new FileImportTaskService(
|
||||
{} as any, // storageService
|
||||
importService as any,
|
||||
{ nextPagePosition: async () => 'a0' } as any,
|
||||
{ insertBacklink: jest.fn() } as any,
|
||||
db,
|
||||
importAttachmentService as any,
|
||||
{ emit: jest.fn() } as any,
|
||||
{ logBatchWithContext: jest.fn() } as any,
|
||||
);
|
||||
|
||||
const fileTask: any = {
|
||||
id: 'task-1',
|
||||
source: 'generic',
|
||||
spaceId: 'space-1',
|
||||
workspaceId: 'ws-1',
|
||||
creatorId: 'user-1',
|
||||
};
|
||||
|
||||
try {
|
||||
await service.processGenericImport({ extractDir, fileTask });
|
||||
expect(captured).toBeTruthy();
|
||||
return captured.content;
|
||||
} finally {
|
||||
await fs.rm(extractDir, { recursive: true, force: true });
|
||||
}
|
||||
}
|
||||
|
||||
/** Find the first node of a given type anywhere in a PM content tree. */
|
||||
function findFirst(node: any, type: string): any {
|
||||
if (!node || typeof node !== 'object') return null;
|
||||
if (node.type === type) return node;
|
||||
for (const child of node.content ?? []) {
|
||||
const hit = findFirst(child, type);
|
||||
if (hit) return hit;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
describe('FileImportTaskService.processGenericImport — footnote canonicalization (#228)', () => {
|
||||
it('orders footnotes by first reference, dedupes reuse, and drops orphans on zip import', async () => {
|
||||
const extractDir = await fs.mkdtemp(path.join(os.tmpdir(), 'fit-canon-'));
|
||||
await fs.writeFile(path.join(extractDir, 'note.md'), MARKDOWN, 'utf-8');
|
||||
|
||||
// Real ImportService for the html -> JSON conversion; stub the yjs encode.
|
||||
const importService = new ImportService(
|
||||
{} as any,
|
||||
{} as any,
|
||||
{} as any,
|
||||
{} as any,
|
||||
const content = await runZipImport(MARKDOWN);
|
||||
// Definitions ordered by FIRST REFERENCE (C, A, B), NOT the markdown
|
||||
// definition order (A, B, C). Ids are the parser's fresh `fn-*`, so pin
|
||||
// the BODIES.
|
||||
expect(footnoteListBodies(content)).toEqual(['note C', 'note A', 'note B']);
|
||||
// Orphan [^z] dropped; reused [^a] collapses to one definition; one list.
|
||||
expect(footnoteListBodies(content)).not.toContain('orphan note');
|
||||
const lists = (content.content ?? []).filter(
|
||||
(n: any) => n.type === 'footnotesList',
|
||||
);
|
||||
jest
|
||||
.spyOn(importService as any, 'createYdoc')
|
||||
.mockResolvedValue(Buffer.from([]) as any);
|
||||
expect(lists).toHaveLength(1);
|
||||
expect(
|
||||
footnoteListBodies(content).filter((b) => b === 'note A'),
|
||||
).toHaveLength(1);
|
||||
});
|
||||
|
||||
let captured: any = null;
|
||||
const trx = {
|
||||
insertInto: (table: string) => ({
|
||||
values: (v: any) => {
|
||||
if (table === 'pages') captured = v;
|
||||
return { execute: async () => {} };
|
||||
},
|
||||
}),
|
||||
};
|
||||
const db: any = {
|
||||
selectFrom: () => chainable({ slug: 'space-slug' }),
|
||||
transaction: () => ({ execute: (fn: any) => fn(trx) }),
|
||||
};
|
||||
// #345 F4: the zip path routes markdown through jsonToHtml -> processHTML ->
|
||||
// htmlToJson (the shared HTML attachment pipeline). #345's headline is LOSSLESS
|
||||
// image width/align via the `<!--img {...}-->` comment; a callout carries its
|
||||
// `type`. This asserts those survive the PM->HTML->PM hop — the one hop the
|
||||
// package's PM<->MD suite does not exercise.
|
||||
it('preserves image width/align and callout type through the PM->HTML->PM hop', async () => {
|
||||
const md = [
|
||||
'# Doc',
|
||||
'',
|
||||
' <!--img {"width":"320","align":"left"}-->',
|
||||
'',
|
||||
':::warning',
|
||||
'Careful now.',
|
||||
':::',
|
||||
].join('\n');
|
||||
|
||||
const importAttachmentService = {
|
||||
processAttachments: async ({ html }: any) => html,
|
||||
};
|
||||
const backlinkRepo = { insertBacklink: jest.fn() };
|
||||
const eventEmitter = { emit: jest.fn() };
|
||||
const auditService = { logBatchWithContext: jest.fn() };
|
||||
const content = await runZipImport(md);
|
||||
|
||||
const pageService = { nextPagePosition: async () => 'a0' };
|
||||
const image = findFirst(content, 'image');
|
||||
expect(image).toBeTruthy();
|
||||
// The lossless sizing/alignment must survive the HTML hop.
|
||||
expect(String(image.attrs?.width)).toBe('320');
|
||||
expect(image.attrs?.align).toBe('left');
|
||||
|
||||
const service = new FileImportTaskService(
|
||||
{} as any, // storageService
|
||||
importService as any,
|
||||
pageService as any,
|
||||
backlinkRepo as any,
|
||||
db,
|
||||
importAttachmentService as any,
|
||||
eventEmitter as any,
|
||||
auditService as any,
|
||||
);
|
||||
|
||||
const fileTask: any = {
|
||||
id: 'task-1',
|
||||
source: 'generic',
|
||||
spaceId: 'space-1',
|
||||
workspaceId: 'ws-1',
|
||||
creatorId: 'user-1',
|
||||
};
|
||||
|
||||
try {
|
||||
await service.processGenericImport({ extractDir, fileTask });
|
||||
|
||||
expect(captured).toBeTruthy();
|
||||
const content = captured.content;
|
||||
// Definitions ordered by FIRST REFERENCE (C, A, B), NOT the markdown
|
||||
// definition order (A, B, C). Ids are the parser's fresh `fn-*`, so pin
|
||||
// the BODIES.
|
||||
expect(footnoteListBodies(content)).toEqual([
|
||||
'note C',
|
||||
'note A',
|
||||
'note B',
|
||||
]);
|
||||
// Orphan [^z] dropped; reused [^a] collapses to one definition; one list.
|
||||
expect(footnoteListBodies(content)).not.toContain('orphan note');
|
||||
const lists = (content.content ?? []).filter(
|
||||
(n: any) => n.type === 'footnotesList',
|
||||
);
|
||||
expect(lists).toHaveLength(1);
|
||||
expect(
|
||||
footnoteListBodies(content).filter((b) => b === 'note A'),
|
||||
).toHaveLength(1);
|
||||
} finally {
|
||||
await fs.rm(extractDir, { recursive: true, force: true });
|
||||
}
|
||||
const callout = findFirst(content, 'callout');
|
||||
expect(callout).toBeTruthy();
|
||||
expect(callout.attrs?.type).toBe('warning');
|
||||
});
|
||||
});
|
||||
|
||||
@@ -83,6 +83,49 @@ describe('normalizeForeignMarkdown — GFM reference footnotes', () => {
|
||||
const callouts = ':::info\nHi\n:::\n\n> [!warning]\n> Careful';
|
||||
expect(normalizeForeignMarkdown(callouts)).toBe(callouts);
|
||||
});
|
||||
|
||||
it('strips a leading YAML front-matter block (Obsidian/Hugo/git-sync files)', () => {
|
||||
const out = normalizeForeignMarkdown(
|
||||
'---\ntitle: My Page\ntags: [a, b]\n---\n\n# Heading\n\nBody.',
|
||||
);
|
||||
expect(out).toBe('# Heading\n\nBody.');
|
||||
// The front-matter must not leak into the body as a setext heading.
|
||||
expect(out).not.toContain('title: My Page');
|
||||
expect(out).not.toContain('---');
|
||||
});
|
||||
|
||||
it('does not strip a horizontal rule that is not leading front-matter', () => {
|
||||
const md = 'Intro paragraph.\n\n---\n\nAfter the rule.';
|
||||
expect(normalizeForeignMarkdown(md)).toBe(md);
|
||||
});
|
||||
|
||||
it('is linear on a document with thousands of definitions (no quadratic blowup)', () => {
|
||||
// F2(a): the pass-2 rewrite must be O(text), not O(text × defs). Build a
|
||||
// pathological doc (many defs + many plain text lines) and assert it
|
||||
// completes well under a second — a quadratic implementation took ~14s.
|
||||
const N = 4000;
|
||||
const refs = Array.from({ length: N }, (_, i) => `line ${i} plain text`).join('\n');
|
||||
const defs = Array.from({ length: N }, (_, i) => `[^n${i}]: def ${i}`).join('\n');
|
||||
const doc = `start[^n0] and[^n${N - 1}] end\n\n${refs}\n\n${defs}`;
|
||||
const t0 = Date.now();
|
||||
const out = normalizeForeignMarkdown(doc);
|
||||
const elapsed = Date.now() - t0;
|
||||
expect(elapsed).toBeLessThan(2000);
|
||||
// Sanity: the two real references were still inlined.
|
||||
expect(out).toContain('^[def 0]');
|
||||
expect(out).toContain(`^[def ${N - 1}]`);
|
||||
});
|
||||
|
||||
it('is bounded on a long unclosed backtick run (no inline-split ReDoS)', () => {
|
||||
// F2(b): a huge unterminated backtick run must not cause quadratic
|
||||
// backtracking in the inline-code split. Oversized lines skip the split
|
||||
// entirely (left untouched), so this returns promptly.
|
||||
const line = 'x' + '`'.repeat(200000);
|
||||
const doc = `${line}\n\n[^1]: def`;
|
||||
const t0 = Date.now();
|
||||
normalizeForeignMarkdown(doc);
|
||||
expect(Date.now() - t0).toBeLessThan(2000);
|
||||
});
|
||||
});
|
||||
|
||||
describe('foreign markdown import acceptance (normalizer + canonical parser)', () => {
|
||||
|
||||
@@ -74,10 +74,26 @@ function escapeFootnoteBody(body: string): string {
|
||||
* We split the line on inline-code spans (paired backtick runs) and rewrite only
|
||||
* the non-code segments.
|
||||
*/
|
||||
// Above this length a single line is not split into inline-code spans (see
|
||||
// below). A genuine markdown line carrying a footnote reference is never tens of
|
||||
// KB; the cap only bypasses the inline-code protection for pathological lines.
|
||||
const INLINE_SPLIT_MAX_LINE = 8192;
|
||||
|
||||
function rewriteRefsOutsideInlineCode(
|
||||
line: string,
|
||||
replace: (text: string) => string,
|
||||
): string {
|
||||
// The inline-code split alternation `(`+)(?:(?!\1)[\s\S])*\1` backtracks
|
||||
// quadratically on a long UNCLOSED backtick run (its middle can consume the
|
||||
// rest of the line, then fail to find a closing run and retry from each
|
||||
// position). On an untrusted import this is a request-thread ReDoS. A real
|
||||
// footnote line is short, so for an oversized line we skip the inline-code
|
||||
// protection entirely and leave the line UNTOUCHED (rewriting it wholesale
|
||||
// could corrupt a `[^id]` that legitimately lives inside inline code). This is
|
||||
// a conservative bypass: an over-8KB line simply does not get its reference
|
||||
// footnotes inlined — acceptable for a pathological input.
|
||||
if (line.length > INLINE_SPLIT_MAX_LINE) return line;
|
||||
|
||||
// Alternation: an inline-code span (one or more backticks, then anything up to
|
||||
// the SAME run of backticks) OR a run of non-backtick text. Unterminated
|
||||
// backticks fall through as ordinary text (matched by the second branch on the
|
||||
@@ -161,6 +177,26 @@ function convertReferenceFootnotes(markdown: string): string {
|
||||
return markdown;
|
||||
}
|
||||
|
||||
// ONE precompiled alternation regex over ALL definition ids, built once per
|
||||
// document (not once per definition per line). This makes pass 2 O(total text)
|
||||
// instead of O(text × defs): a line with no reference pays a single failed
|
||||
// scan, and the replacer looks the matched id up in `defs`. The previous
|
||||
// per-def loop (`for (id) line.replace(new RegExp(...))`) was quadratic in the
|
||||
// definition count — a modest upload with thousands of defs could freeze the
|
||||
// request thread (and thus the whole instance, since import runs synchronously
|
||||
// on it). The ids are escaped and joined; `defs` is the id→body lookup.
|
||||
const refRe = new RegExp(
|
||||
'\\[\\^(' + [...defs.keys()].map(escapeRegExp).join('|') + ')\\]',
|
||||
'g',
|
||||
);
|
||||
const rewriteSegment = (segment: string): string =>
|
||||
segment.replace(refRe, (whole, id: string) => {
|
||||
const body = defs.get(id);
|
||||
// A ref whose id is not a real definition should not be reachable (the
|
||||
// alternation only contains real ids), but stay defensive: leave it as-is.
|
||||
return body === undefined ? whole : `^[${escapeFootnoteBody(body)}]`;
|
||||
});
|
||||
|
||||
// Pass 2: rewrite in-text references, skipping fenced code and dropped lines.
|
||||
const out: string[] = [];
|
||||
inFence = false;
|
||||
@@ -185,27 +221,33 @@ function convertReferenceFootnotes(markdown: string): string {
|
||||
continue;
|
||||
}
|
||||
|
||||
line = rewriteRefsOutsideInlineCode(line, (segment) => {
|
||||
let s = segment;
|
||||
for (const [id, body] of defs) {
|
||||
const ref = new RegExp('\\[\\^' + escapeRegExp(id) + '\\]', 'g');
|
||||
s = s.replace(ref, `^[${escapeFootnoteBody(body)}]`);
|
||||
}
|
||||
return s;
|
||||
});
|
||||
line = rewriteRefsOutsideInlineCode(line, rewriteSegment);
|
||||
out.push(line);
|
||||
}
|
||||
|
||||
return out.join('\n');
|
||||
}
|
||||
|
||||
/**
|
||||
* Strip a single leading YAML front-matter block (`---\n…\n---`). Foreign files
|
||||
* from Obsidian / Hugo / Jekyll / Notion — and Docmost's OWN git-sync page files
|
||||
* — open with front-matter that the canonical parser does not consume, so
|
||||
* without this it leaks into the body (and `title: Foo` above the closing `---`
|
||||
* renders as a setext `<h2>` that `extractTitleAndRemoveHeading` can hijack as
|
||||
* the page title). This mirrors the strip the retired `markdownToHtml` layer did
|
||||
* (editor-ext marked.utils.ts). It is a no-op for front-matter-free input.
|
||||
*/
|
||||
const YAML_FRONT_MATTER_RE = /^\s*---[\s\S]*?---\s*/;
|
||||
|
||||
/**
|
||||
* Normalize a foreign markdown string into Docmost's canonical markdown surface
|
||||
* so the strict canonical parser accepts it losslessly. Currently this rewrites
|
||||
* GFM reference footnotes into inline footnotes; add further fixture-driven
|
||||
* foreign-surface cases here as they are found.
|
||||
* so the strict canonical parser accepts it losslessly: strip a leading YAML
|
||||
* front-matter block, then rewrite GFM reference footnotes into inline
|
||||
* footnotes. Add further fixture-driven foreign-surface cases here as they are
|
||||
* found.
|
||||
*/
|
||||
export function normalizeForeignMarkdown(markdown: string): string {
|
||||
if (!markdown) return markdown;
|
||||
return convertReferenceFootnotes(markdown);
|
||||
const withoutFrontMatter = markdown.replace(YAML_FRONT_MATTER_RE, '').trimStart();
|
||||
return convertReferenceFootnotes(withoutFrontMatter);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user