fix(#345): replace id-alternation regex with a fixed generic scanner + line-anchor frontmatter (review round 2)

F7 [CRITICAL] The round-1 F2(a) fix built ONE alternation regex over all definition ids (`(id1|id2|...)`). On prefix-chain ids (a, aa, aaa, ...) V8's regex compiler blows its stack with a fatal, UNCATCHABLE 'RegExpCompiler Allocation failed' that kills the whole process — strictly worse than the original per-def thread-hang, and its match cost was still O(text x defs). Replaced with a single FIXED generic scanner `/\[\^([^\]]+)\]/g` plus a map lookup in the replacer: genuinely O(total text), no per-document regex compilation, cannot blow up. Output is identical (only real def ids are inlined). F8 [WARNING] The frontmatter strip regex was not line-anchored: it closed on the FIRST `---` anywhere, so a value containing a triple-dash (e.g. 'title: Q1 --- Q2') truncated the frontmatter and leaked the rest into the body. Replaced with the line-anchored shape the canonical parser already uses (page-file.ts): open on `---\n`, close on a `\n---` line. Adds tests: 4000 prefix-chain ids do not crash and stay fast; a frontmatter value containing '---' is stripped whole. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-07-05 05:18:44 +03:00
parent e17d5bc060
commit 80fc30633b
2 changed files with 52 additions and 17 deletions
@@ -126,6 +126,35 @@ describe('normalizeForeignMarkdown — GFM reference footnotes', () => {
    normalizeForeignMarkdown(doc);
    expect(Date.now() - t0).toBeLessThan(2000);
  });
+
+  it('does not crash or slow down on thousands of prefix-chain definition ids', () => {
+    // F7: the rewrite must use a FIXED generic scanner, not an alternation built
+    // from the ids. A `(a|aa|aaa|…)` alternation over prefix-chain ids blows the
+    // V8 regex compiler (FATAL RegExpCompiler Allocation failed — uncatchable,
+    // kills the process). A fixed scanner has no id-dependent compilation cost.
+    const N = 4000;
+    const ids = Array.from({ length: N }, (_, i) => 'a'.repeat(i + 1));
+    const defs = ids.map((id) => `[^${id}]: body ${id.length}`).join('\n');
+    const doc = `ref[^${ids[0]}] and[^${ids[N - 1]}] end\n\n${defs}`;
+    const t0 = Date.now();
+    const out = normalizeForeignMarkdown(doc);
+    expect(Date.now() - t0).toBeLessThan(2000);
+    // Prefix disambiguation is correct: [^a] and [^aaaa...] inline their OWN body.
+    expect(out).toContain('^[body 1]');
+    expect(out).toContain(`^[body ${N}]`);
+  });
+
+  it('strips front-matter whose value contains a triple-dash (line-anchored)', () => {
+    // F8: the block must close only on a `\n---` LINE, not the first inline
+    // `---`. A value like `title: Q1 --- Q2` must not truncate the front-matter
+    // and leak the rest (author/closing ---) into the body.
+    const out = normalizeForeignMarkdown(
+      '---\ntitle: Q1 --- Q2 results\nauthor: bob\n---\n\nReal body.',
+    );
+    expect(out).toBe('Real body.');
+    expect(out).not.toContain('author: bob');
+    expect(out).not.toContain('Q2 results');
+  });
 });

 describe('foreign markdown import acceptance (normalizer + canonical parser)', () => {
@@ -177,23 +177,23 @@ function convertReferenceFootnotes(markdown: string): string {
    return markdown;
  }

-  // ONE precompiled alternation regex over ALL definition ids, built once per
-  // document (not once per definition per line). This makes pass 2 O(total text)
-  // instead of O(text × defs): a line with no reference pays a single failed
-  // scan, and the replacer looks the matched id up in `defs`. The previous
-  // per-def loop (`for (id) line.replace(new RegExp(...))`) was quadratic in the
-  // definition count — a modest upload with thousands of defs could freeze the
-  // request thread (and thus the whole instance, since import runs synchronously
-  // on it). The ids are escaped and joined; `defs` is the id→body lookup.
-  const refRe = new RegExp(
-    '\\[\\^(' + [...defs.keys()].map(escapeRegExp).join('|') + ')\\]',
-    'g',
-  );
+  // ONE fixed, generic scanner regex — NOT one built from the definition ids.
+  // It matches ANY `[^id]` shape, and the replacer decides per match via a map
+  // lookup whether that id is a real definition (replace) or not (leave as-is).
+  // This is genuinely O(total text) with no per-document regex compilation.
+  //
+  // Do NOT rebuild this as an alternation over `[...defs.keys()]`: a giant
+  // `(id1|id2|...)` alternation over thousands of ids can blow the V8 regex
+  // compiler's stack — a fatal, UNCATCHABLE "RegExpCompiler Allocation failed"
+  // on prefix-chain ids (`a`, `aa`, `aaa`, ...) that kills the whole process
+  // (worse than the earlier per-def thread-hang). A fixed scanner has no
+  // id-dependent compilation cost and cannot blow up.
+  const refRe = /\[\^([^\]]+)\]/g;
  const rewriteSegment = (segment: string): string =>
    segment.replace(refRe, (whole, id: string) => {
      const body = defs.get(id);
-      // A ref whose id is not a real definition should not be reachable (the
-      // alternation only contains real ids), but stay defensive: leave it as-is.
+      // Only real definitions are inlined; an unknown id is left literal (same as
+      // the old per-def loop, which simply never matched it).
      return body === undefined ? whole : `^[${escapeFootnoteBody(body)}]`;
    });

@@ -234,10 +234,16 @@ function convertReferenceFootnotes(markdown: string): string {
 * — open with front-matter that the canonical parser does not consume, so
 * without this it leaks into the body (and `title: Foo` above the closing `---`
 * renders as a setext `<h2>` that `extractTitleAndRemoveHeading` can hijack as
- * the page title). This mirrors the strip the retired `markdownToHtml` layer did
- * (editor-ext marked.utils.ts). It is a no-op for front-matter-free input.
+ * the page title). It is a no-op for front-matter-free input.
+ *
+ * LINE-ANCHORED (the same shape the canonical parser uses in
+ * prosemirror-markdown/page-file.ts): the block opens only on `---\n` at the
+ * very start and closes only on a `\n---` line. The retired `markdownToHtml`
+ * strip closed on the FIRST `---` ANYWHERE (an unanchored close), so a value
+ * containing a triple-dash (e.g. `title: Q1 --- Q2`) truncated the front-matter
+ * and leaked the rest into the body. An optional leading BOM is tolerated.
 */
-const YAML_FRONT_MATTER_RE = /^\s*---[\s\S]*?---\s*/;
+const YAML_FRONT_MATTER_RE = /^\uFEFF?---\n[\s\S]*?\n---\n?/;

 /**
 * Normalize a foreign markdown string into Docmost's canonical markdown surface