From 80fc30633b44acaa976801d5382dccb6fb5e6309 Mon Sep 17 00:00:00 2001 From: agent_coder Date: Sun, 5 Jul 2026 05:18:44 +0300 Subject: [PATCH] fix(#345): replace id-alternation regex with a fixed generic scanner + line-anchor frontmatter (review round 2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit F7 [CRITICAL] The round-1 F2(a) fix built ONE alternation regex over all definition ids (`(id1|id2|...)`). On prefix-chain ids (a, aa, aaa, ...) V8's regex compiler blows its stack with a fatal, UNCATCHABLE 'RegExpCompiler Allocation failed' that kills the whole process — strictly worse than the original per-def thread-hang, and its match cost was still O(text x defs). Replaced with a single FIXED generic scanner `/\[\^([^\]]+)\]/g` plus a map lookup in the replacer: genuinely O(total text), no per-document regex compilation, cannot blow up. Output is identical (only real def ids are inlined). F8 [WARNING] The frontmatter strip regex was not line-anchored: it closed on the FIRST `---` anywhere, so a value containing a triple-dash (e.g. 'title: Q1 --- Q2') truncated the frontmatter and leaked the rest into the body. Replaced with the line-anchored shape the canonical parser already uses (page-file.ts): open on `---\n`, close on a `\n---` line. Adds tests: 4000 prefix-chain ids do not crash and stay fast; a frontmatter value containing '---' is stripped whole. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../import/utils/foreign-markdown.spec.ts | 29 ++++++++++++++ .../import/utils/foreign-markdown.ts | 40 +++++++++++-------- 2 files changed, 52 insertions(+), 17 deletions(-) diff --git a/apps/server/src/integrations/import/utils/foreign-markdown.spec.ts b/apps/server/src/integrations/import/utils/foreign-markdown.spec.ts index 0e50dd2c..c0d8dac4 100644 --- a/apps/server/src/integrations/import/utils/foreign-markdown.spec.ts +++ b/apps/server/src/integrations/import/utils/foreign-markdown.spec.ts @@ -126,6 +126,35 @@ describe('normalizeForeignMarkdown — GFM reference footnotes', () => { normalizeForeignMarkdown(doc); expect(Date.now() - t0).toBeLessThan(2000); }); + + it('does not crash or slow down on thousands of prefix-chain definition ids', () => { + // F7: the rewrite must use a FIXED generic scanner, not an alternation built + // from the ids. A `(a|aa|aaa|…)` alternation over prefix-chain ids blows the + // V8 regex compiler (FATAL RegExpCompiler Allocation failed — uncatchable, + // kills the process). A fixed scanner has no id-dependent compilation cost. + const N = 4000; + const ids = Array.from({ length: N }, (_, i) => 'a'.repeat(i + 1)); + const defs = ids.map((id) => `[^${id}]: body ${id.length}`).join('\n'); + const doc = `ref[^${ids[0]}] and[^${ids[N - 1]}] end\n\n${defs}`; + const t0 = Date.now(); + const out = normalizeForeignMarkdown(doc); + expect(Date.now() - t0).toBeLessThan(2000); + // Prefix disambiguation is correct: [^a] and [^aaaa...] inline their OWN body. + expect(out).toContain('^[body 1]'); + expect(out).toContain(`^[body ${N}]`); + }); + + it('strips front-matter whose value contains a triple-dash (line-anchored)', () => { + // F8: the block must close only on a `\n---` LINE, not the first inline + // `---`. A value like `title: Q1 --- Q2` must not truncate the front-matter + // and leak the rest (author/closing ---) into the body. + const out = normalizeForeignMarkdown( + '---\ntitle: Q1 --- Q2 results\nauthor: bob\n---\n\nReal body.', + ); + expect(out).toBe('Real body.'); + expect(out).not.toContain('author: bob'); + expect(out).not.toContain('Q2 results'); + }); }); describe('foreign markdown import acceptance (normalizer + canonical parser)', () => { diff --git a/apps/server/src/integrations/import/utils/foreign-markdown.ts b/apps/server/src/integrations/import/utils/foreign-markdown.ts index f0079aa0..ef253a61 100644 --- a/apps/server/src/integrations/import/utils/foreign-markdown.ts +++ b/apps/server/src/integrations/import/utils/foreign-markdown.ts @@ -177,23 +177,23 @@ function convertReferenceFootnotes(markdown: string): string { return markdown; } - // ONE precompiled alternation regex over ALL definition ids, built once per - // document (not once per definition per line). This makes pass 2 O(total text) - // instead of O(text × defs): a line with no reference pays a single failed - // scan, and the replacer looks the matched id up in `defs`. The previous - // per-def loop (`for (id) line.replace(new RegExp(...))`) was quadratic in the - // definition count — a modest upload with thousands of defs could freeze the - // request thread (and thus the whole instance, since import runs synchronously - // on it). The ids are escaped and joined; `defs` is the id→body lookup. - const refRe = new RegExp( - '\\[\\^(' + [...defs.keys()].map(escapeRegExp).join('|') + ')\\]', - 'g', - ); + // ONE fixed, generic scanner regex — NOT one built from the definition ids. + // It matches ANY `[^id]` shape, and the replacer decides per match via a map + // lookup whether that id is a real definition (replace) or not (leave as-is). + // This is genuinely O(total text) with no per-document regex compilation. + // + // Do NOT rebuild this as an alternation over `[...defs.keys()]`: a giant + // `(id1|id2|...)` alternation over thousands of ids can blow the V8 regex + // compiler's stack — a fatal, UNCATCHABLE "RegExpCompiler Allocation failed" + // on prefix-chain ids (`a`, `aa`, `aaa`, ...) that kills the whole process + // (worse than the earlier per-def thread-hang). A fixed scanner has no + // id-dependent compilation cost and cannot blow up. + const refRe = /\[\^([^\]]+)\]/g; const rewriteSegment = (segment: string): string => segment.replace(refRe, (whole, id: string) => { const body = defs.get(id); - // A ref whose id is not a real definition should not be reachable (the - // alternation only contains real ids), but stay defensive: leave it as-is. + // Only real definitions are inlined; an unknown id is left literal (same as + // the old per-def loop, which simply never matched it). return body === undefined ? whole : `^[${escapeFootnoteBody(body)}]`; }); @@ -234,10 +234,16 @@ function convertReferenceFootnotes(markdown: string): string { * — open with front-matter that the canonical parser does not consume, so * without this it leaks into the body (and `title: Foo` above the closing `---` * renders as a setext `

` that `extractTitleAndRemoveHeading` can hijack as - * the page title). This mirrors the strip the retired `markdownToHtml` layer did - * (editor-ext marked.utils.ts). It is a no-op for front-matter-free input. + * the page title). It is a no-op for front-matter-free input. + * + * LINE-ANCHORED (the same shape the canonical parser uses in + * prosemirror-markdown/page-file.ts): the block opens only on `---\n` at the + * very start and closes only on a `\n---` line. The retired `markdownToHtml` + * strip closed on the FIRST `---` ANYWHERE (an unanchored close), so a value + * containing a triple-dash (e.g. `title: Q1 --- Q2`) truncated the front-matter + * and leaked the rest into the body. An optional leading BOM is tolerated. */ -const YAML_FRONT_MATTER_RE = /^\s*---[\s\S]*?---\s*/; +const YAML_FRONT_MATTER_RE = /^\uFEFF?---\n[\s\S]*?\n---\n?/; /** * Normalize a foreign markdown string into Docmost's canonical markdown surface