diff --git a/apps/server/src/integrations/import/utils/foreign-markdown.spec.ts b/apps/server/src/integrations/import/utils/foreign-markdown.spec.ts index 0e50dd2c..c0d8dac4 100644 --- a/apps/server/src/integrations/import/utils/foreign-markdown.spec.ts +++ b/apps/server/src/integrations/import/utils/foreign-markdown.spec.ts @@ -126,6 +126,35 @@ describe('normalizeForeignMarkdown — GFM reference footnotes', () => { normalizeForeignMarkdown(doc); expect(Date.now() - t0).toBeLessThan(2000); }); + + it('does not crash or slow down on thousands of prefix-chain definition ids', () => { + // F7: the rewrite must use a FIXED generic scanner, not an alternation built + // from the ids. A `(a|aa|aaa|…)` alternation over prefix-chain ids blows the + // V8 regex compiler (FATAL RegExpCompiler Allocation failed — uncatchable, + // kills the process). A fixed scanner has no id-dependent compilation cost. + const N = 4000; + const ids = Array.from({ length: N }, (_, i) => 'a'.repeat(i + 1)); + const defs = ids.map((id) => `[^${id}]: body ${id.length}`).join('\n'); + const doc = `ref[^${ids[0]}] and[^${ids[N - 1]}] end\n\n${defs}`; + const t0 = Date.now(); + const out = normalizeForeignMarkdown(doc); + expect(Date.now() - t0).toBeLessThan(2000); + // Prefix disambiguation is correct: [^a] and [^aaaa...] inline their OWN body. + expect(out).toContain('^[body 1]'); + expect(out).toContain(`^[body ${N}]`); + }); + + it('strips front-matter whose value contains a triple-dash (line-anchored)', () => { + // F8: the block must close only on a `\n---` LINE, not the first inline + // `---`. A value like `title: Q1 --- Q2` must not truncate the front-matter + // and leak the rest (author/closing ---) into the body. + const out = normalizeForeignMarkdown( + '---\ntitle: Q1 --- Q2 results\nauthor: bob\n---\n\nReal body.', + ); + expect(out).toBe('Real body.'); + expect(out).not.toContain('author: bob'); + expect(out).not.toContain('Q2 results'); + }); }); describe('foreign markdown import acceptance (normalizer + canonical parser)', () => { diff --git a/apps/server/src/integrations/import/utils/foreign-markdown.ts b/apps/server/src/integrations/import/utils/foreign-markdown.ts index f0079aa0..ef253a61 100644 --- a/apps/server/src/integrations/import/utils/foreign-markdown.ts +++ b/apps/server/src/integrations/import/utils/foreign-markdown.ts @@ -177,23 +177,23 @@ function convertReferenceFootnotes(markdown: string): string { return markdown; } - // ONE precompiled alternation regex over ALL definition ids, built once per - // document (not once per definition per line). This makes pass 2 O(total text) - // instead of O(text × defs): a line with no reference pays a single failed - // scan, and the replacer looks the matched id up in `defs`. The previous - // per-def loop (`for (id) line.replace(new RegExp(...))`) was quadratic in the - // definition count — a modest upload with thousands of defs could freeze the - // request thread (and thus the whole instance, since import runs synchronously - // on it). The ids are escaped and joined; `defs` is the id→body lookup. - const refRe = new RegExp( - '\\[\\^(' + [...defs.keys()].map(escapeRegExp).join('|') + ')\\]', - 'g', - ); + // ONE fixed, generic scanner regex — NOT one built from the definition ids. + // It matches ANY `[^id]` shape, and the replacer decides per match via a map + // lookup whether that id is a real definition (replace) or not (leave as-is). + // This is genuinely O(total text) with no per-document regex compilation. + // + // Do NOT rebuild this as an alternation over `[...defs.keys()]`: a giant + // `(id1|id2|...)` alternation over thousands of ids can blow the V8 regex + // compiler's stack — a fatal, UNCATCHABLE "RegExpCompiler Allocation failed" + // on prefix-chain ids (`a`, `aa`, `aaa`, ...) that kills the whole process + // (worse than the earlier per-def thread-hang). A fixed scanner has no + // id-dependent compilation cost and cannot blow up. + const refRe = /\[\^([^\]]+)\]/g; const rewriteSegment = (segment: string): string => segment.replace(refRe, (whole, id: string) => { const body = defs.get(id); - // A ref whose id is not a real definition should not be reachable (the - // alternation only contains real ids), but stay defensive: leave it as-is. + // Only real definitions are inlined; an unknown id is left literal (same as + // the old per-def loop, which simply never matched it). return body === undefined ? whole : `^[${escapeFootnoteBody(body)}]`; }); @@ -234,10 +234,16 @@ function convertReferenceFootnotes(markdown: string): string { * — open with front-matter that the canonical parser does not consume, so * without this it leaks into the body (and `title: Foo` above the closing `---` * renders as a setext `