80fc30633b
F7 [CRITICAL] The round-1 F2(a) fix built ONE alternation regex over all definition ids (`(id1|id2|...)`). On prefix-chain ids (a, aa, aaa, ...) V8's regex compiler blows its stack with a fatal, UNCATCHABLE 'RegExpCompiler Allocation failed' that kills the whole process — strictly worse than the original per-def thread-hang, and its match cost was still O(text x defs). Replaced with a single FIXED generic scanner `/\[\^([^\]]+)\]/g` plus a map lookup in the replacer: genuinely O(total text), no per-document regex compilation, cannot blow up. Output is identical (only real def ids are inlined). F8 [WARNING] The frontmatter strip regex was not line-anchored: it closed on the FIRST `---` anywhere, so a value containing a triple-dash (e.g. 'title: Q1 --- Q2') truncated the frontmatter and leaked the rest into the body. Replaced with the line-anchored shape the canonical parser already uses (page-file.ts): open on `---\n`, close on a `\n---` line. Adds tests: 4000 prefix-chain ids do not crash and stay fast; a frontmatter value containing '---' is stripped whole. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
207 lines
8.5 KiB
TypeScript
207 lines
8.5 KiB
TypeScript
import {
|
|
convertProseMirrorToMarkdown,
|
|
markdownToProseMirror,
|
|
} from '@docmost/prosemirror-markdown';
|
|
import { normalizeForeignMarkdown } from './foreign-markdown';
|
|
|
|
/**
 * STEP 2 goldens for issue #345: the foreign-markdown normalizer that runs at the
 * import boundary BEFORE the strict canonical parser (`markdownToProseMirror`).
 *
 * Two layers:
 *  1. PURE string→string cases pinning the normalizer's own behavior (GFM
 *     reference footnotes → inline `^[…]`, plus stripping of a leading YAML
 *     front-matter block).
 *  2. END-TO-END acceptance: for a foreign corpus, `normalizeForeignMarkdown`
 *     then `markdownToProseMirror` then `convertProseMirrorToMarkdown` must leave
 *     NO literal `[^id]` / `:::` garbage in the document and must re-export in the
 *     canonical forms.
 */
|
|
|
|
describe('normalizeForeignMarkdown — GFM reference footnotes', () => {
  // Wall-clock ceiling (milliseconds) shared by the pathological-input cases.
  const TIME_BUDGET_MS = 2000;

  /**
   * Runs the normalizer exactly once and reports how long the call took.
   *
   * @param markdown - the raw foreign markdown handed to the normalizer
   * @returns the normalizer's output plus the elapsed wall-clock milliseconds
   */
  const timedNormalize = (markdown: string) => {
    const startedAt = Date.now();
    const output = normalizeForeignMarkdown(markdown);
    return { output, elapsedMs: Date.now() - startedAt };
  };

  it('inlines a single-line reference footnote and drops its definition', () => {
    const source = 'A note[^1] here.\n\n[^1]: The definition.';
    expect(normalizeForeignMarkdown(source)).toBe(
      'A note^[The definition.] here.\n',
    );
  });

  it('inlines every reference to a reused id (downstream dedups)', () => {
    const source = 'X[^a] and Y[^a].\n\n[^a]: shared.';
    expect(normalizeForeignMarkdown(source)).toBe(
      'X^[shared.] and Y^[shared.].\n',
    );
  });

  it('joins indented continuation lines of a definition with a space', () => {
    const source = 'See[^n].\n\n[^n]: line one\n line two';
    expect(normalizeForeignMarkdown(source)).toBe('See^[line one line two].\n');
  });

  it('never rewrites a reference inside a fenced code block', () => {
    const result = normalizeForeignMarkdown(
      '```\ncode[^1] here\n```\n\n[^1]: def.',
    );
    // The fenced reference stays literal...
    expect(result).toContain('code[^1] here');
    // ...while the definition line, now without any inlined use, is still dropped.
    expect(result).not.toContain('[^1]: def.');
  });

  it('never rewrites a reference inside an INLINE-code span (backticks)', () => {
    // Bug #1: originally only fenced blocks were protected. The backticked
    // `[^1]` is literal code and must come through verbatim, whereas the
    // reference in the surrounding prose is inlined.
    const source = 'Use `arr[^1]` in code but note[^1] in prose.\n\n[^1]: def.';
    expect(normalizeForeignMarkdown(source)).toBe(
      'Use `arr[^1]` in code but note^[def.] in prose.\n',
    );
  });

  it('escapes brackets in a body so an unbalanced ] cannot truncate the footnote', () => {
    // Bug #2: a stray `]` in a foreign definition body would terminate the
    // canonical `^[...]` too early and spill the remainder as plain text.
    // Backslash-escaping the body's brackets keeps the footnote in one piece.
    const result = normalizeForeignMarkdown(
      'Ref[^1] here.\n\n[^1]: see item ] and [more] later',
    );
    expect(result).toBe('Ref^[see item \\] and \\[more\\] later] here.\n');
    // Exactly one closing bracket remains unescaped: the one the normalizer added.
    const unescapedClosers = result.match(/(?<!\\)\]/g);
    expect(unescapedClosers).toHaveLength(1);
  });

  it('leaves a reference with no matching definition literal (no body to inline)', () => {
    const source = 'Dangling[^x] ref.';
    expect(normalizeForeignMarkdown(source)).toBe('Dangling[^x] ref.');
  });

  it('returns the input unchanged when there are no reference footnotes', () => {
    const plain = '# Title\n\nJust text with `inline code` and a [link](/x).';
    const result = normalizeForeignMarkdown(plain);
    expect(result).toBe(plain);
  });

  it('does NOT touch callout surfaces — the canonical parser handles them', () => {
    const calloutMarkdown = ':::info\nHi\n:::\n\n> [!warning]\n> Careful';
    const result = normalizeForeignMarkdown(calloutMarkdown);
    expect(result).toBe(calloutMarkdown);
  });

  it('strips a leading YAML front-matter block (Obsidian/Hugo/git-sync files)', () => {
    const result = normalizeForeignMarkdown(
      '---\ntitle: My Page\ntags: [a, b]\n---\n\n# Heading\n\nBody.',
    );
    expect(result).toBe('# Heading\n\nBody.');
    // Leaked front-matter would otherwise surface in the body as a setext heading.
    expect(result).not.toContain('title: My Page');
    expect(result).not.toContain('---');
  });

  it('does not strip a horizontal rule that is not leading front-matter', () => {
    const withRule = 'Intro paragraph.\n\n---\n\nAfter the rule.';
    const result = normalizeForeignMarkdown(withRule);
    expect(result).toBe(withRule);
  });

  it('is linear on a document with thousands of definitions (no quadratic blowup)', () => {
    // F2(a): pass 2 has to cost O(text) rather than O(text × defs). The doc
    // below pairs thousands of definitions with thousands of plain lines; it
    // must finish inside the time budget (a quadratic implementation took ~14s).
    const N = 4000;
    const indices = Array.from({ length: N }, (_, i) => i);
    const plainLines = indices.map((i) => `line ${i} plain text`).join('\n');
    const definitions = indices.map((i) => `[^n${i}]: def ${i}`).join('\n');
    const doc = `start[^n0] and[^n${N - 1}] end\n\n${plainLines}\n\n${definitions}`;

    const { output, elapsedMs } = timedNormalize(doc);

    expect(elapsedMs).toBeLessThan(TIME_BUDGET_MS);
    // Sanity check: both genuine references were inlined.
    expect(output).toContain('^[def 0]');
    expect(output).toContain(`^[def ${N - 1}]`);
  });

  it('is bounded on a long unclosed backtick run (no inline-split ReDoS)', () => {
    // F2(b): an enormous unterminated run of backticks must not trigger
    // quadratic backtracking in the inline-code split. Oversized lines bypass
    // the split altogether (left untouched), so the call returns promptly.
    const hugeLine = 'x' + '`'.repeat(200000);
    const doc = `${hugeLine}\n\n[^1]: def`;

    const { elapsedMs } = timedNormalize(doc);

    expect(elapsedMs).toBeLessThan(TIME_BUDGET_MS);
  });

  it('does not crash or slow down on thousands of prefix-chain definition ids', () => {
    // F7: the rewrite has to rely on a FIXED generic scanner instead of an
    // alternation assembled from the ids. An `(a|aa|aaa|…)` alternation over
    // prefix-chain ids blows the V8 regex compiler (FATAL RegExpCompiler
    // Allocation failed — uncatchable, kills the process). A fixed scanner
    // carries no id-dependent compilation cost.
    const N = 4000;
    const ids = Array.from({ length: N }, (_, i) => 'a'.repeat(i + 1));
    const shortestId = ids[0];
    const longestId = ids[N - 1];
    const definitions = ids
      .map((id) => `[^${id}]: body ${id.length}`)
      .join('\n');
    const doc = `ref[^${shortestId}] and[^${longestId}] end\n\n${definitions}`;

    const { output, elapsedMs } = timedNormalize(doc);

    expect(elapsedMs).toBeLessThan(TIME_BUDGET_MS);
    // Prefix ids are told apart correctly: [^a] and [^aaaa...] each inline their OWN body.
    expect(output).toContain('^[body 1]');
    expect(output).toContain(`^[body ${N}]`);
  });

  it('strips front-matter whose value contains a triple-dash (line-anchored)', () => {
    // F8: only a `\n---` LINE may close the block, never the first inline
    // `---`. A value such as `title: Q1 --- Q2` must not cut the front-matter
    // short and leak the remainder (author/closing ---) into the body.
    const result = normalizeForeignMarkdown(
      '---\ntitle: Q1 --- Q2 results\nauthor: bob\n---\n\nReal body.',
    );
    expect(result).toBe('Real body.');
    expect(result).not.toContain('author: bob');
    expect(result).not.toContain('Q2 results');
  });
});
|
|
|
|
describe('foreign markdown import acceptance (normalizer + canonical parser)', () => {
  // Foreign corpus mixing the surfaces the import pipeline must canonicalize:
  // GFM reference footnotes (out-of-order, one reused, one orphaned), a legacy
  // `:::` callout fence, and a pipe table.
  const FOREIGN = [
    '# Doc',
    '',
    'Body refs [^c] and [^a] and [^b] and again [^a].',
    '',
    ':::info',
    'A legacy callout.',
    ':::',
    '',
    '| h1 | h2 |',
    '| --- | --- |',
    '| 1 | 2 |',
    '',
    '[^a]: note A',
    '[^b]: note B',
    '[^c]: note C',
    '[^z]: orphan note',
  ].join('\n');

  it('leaves no literal [^id] or ::: in the imported doc and re-exports canonically', async () => {
    // Full pipeline: normalize → strict canonical parse → re-export to markdown.
    const normalized = normalizeForeignMarkdown(FOREIGN);
    const doc = await markdownToProseMirror(normalized);
    const reexport = convertProseMirrorToMarkdown(doc);

    // No foreign garbage leaks into the document.
    expect(reexport).not.toMatch(/\[\^/); // no reference footnote refs/defs
    expect(reexport).not.toContain(':::'); // no legacy callout fences

    // Canonical forms are present.
    expect(reexport).toContain('^[note C]');
    expect(reexport).toContain('> [!info]');
    expect(reexport).toContain('| h1 | h2 |');

    // Exactly one footnotes list must exist. Assert this BEFORE dereferencing
    // it, so a missing list fails with a readable expectation instead of a
    // TypeError on `undefined.content`.
    const footnoteLists = doc.content.filter(
      (n: any) => n.type === 'footnotesList',
    );
    expect(footnoteLists).toHaveLength(1);

    // Footnotes: ordered by first reference (C, A, B), reused [^a] deduped to one,
    // orphan [^z] dropped (it had no reference after normalization).
    const [list] = footnoteLists;
    const bodies = list.content.map(
      // First text node of the first child of each footnote definition.
      (d: any) => d.content[0].content[0].text,
    );
    expect(bodies).toEqual(['note C', 'note A', 'note B']);
    expect(bodies).not.toContain('orphan note');
  });
});
|