bfbd927866
mathInline serializes as `$LaTeX$` and mathBlock as an own-line `$$\n<latex>\n$$` fence (multi-line safe), closing hand-authoring gap A18. The LaTeX still lives in node.attrs.text; a literal `$` inside it is escaped `\$`. On the raw-HTML path (columns/cells) math keeps the schema-HTML `<span data-type="mathInline">` / `<div data-type="mathBlock">` form (markdown is not re-parsed inside raw HTML) — blockToHtml gets an explicit mathBlock case and inlineToHtml a mathInline case, sharing the mathInlineHtml/mathBlockHtml helpers with the fallbacks so the two forms cannot drift. Parse: mathInlineExtension (inline) + mathBlockExtension (block) are added to the SAME dedicated marked instance introduced for canon #7 (global singleton untouched). The inline extension uses a currency-safe PANDOC rule: an opening `$` must not be followed by whitespace, and the closing `$` must not be preceded by whitespace nor followed by a digit — so `$5`, `$5 and $10`, `a $5 b $6 c`, `100$` stay literal text while `$x^2$` is math. The block extension matches a `$$` fence line and captures multi-line LaTeX non-greedily up to the next `$$` line. The pandoc boundary rule lives ONCE in the new math-inline.ts (INLINE_MATH_SOURCE) and is shared by the import tokenizer (^-anchored) and the export prose escaper (global), so parse and serialize cannot disagree about what is math. escapeProseMath (case "text", non-code runs only) escapes ONLY the two delimiting `$` of a span the rule WOULD match, so a would-be-math prose span like `the set $A$` re-imports as literal text while currency `$5 and $10` is emitted CLEAN (zero backslash churn). marked decodes `\$`→`$` on re-parse, byte-stable. Fallbacks to the lossless schema-HTML form (all documented + tested): mathInline → <span> when empty / whitespace-edged / multi-line / pre-existing `\$` / trailing `\` / immediately before a digit-text sibling (renderInlineChildren guard, so `$…$5` can't lose the node); mathBlock → <div> when the LaTeX contains `$$`. 
Each fallback round-trips losslessly and byte-stably. Code safety (guards the canon #7 regression class): codeBlock reads raw child text and inline `code` runs are excluded from escapeProseMath, so `$5`/`$x$` in code stay literal with no math and no backslash corruption. ReDoS-checked on adversarial 40k-char inputs (0–1 ms). Tests: new math.test.ts (26 cases: serialize exactness, multi-line block, `\$` escaping, currency ×5 asserting no `\$`, prose escape, columns schema-HTML, inline-code/codeBlock safety, fail-open). Goldens in roundtrip / markdown-converter flipped top-level math to `$…$`/`$$…$$`; the escapeAttr-idempotence golden wraps math in a column (still exercises escapeAttr); columns/raw-HTML math assertions unchanged. package vitest: 585 passed; tsc clean. git-sync: 268 passed. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
263 lines
12 KiB
TypeScript
263 lines
12 KiB
TypeScript
import { describe, expect, it } from 'vitest';
|
|
// Import DIRECTLY from src so we exercise the real converter pair (the parser
|
|
// lives in markdown-to-prosemirror.ts; importing it mutates the global DOM via
|
|
// jsdom at module load, which @tiptap/html's generateJSON needs under Node).
|
|
import { convertProseMirrorToMarkdown } from '../src/lib/markdown-converter.js';
|
|
import { markdownToProseMirror } from '../src/lib/markdown-to-prosemirror.js';
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// #293 canon #6: math -> `$…$` (inline) and `$$…$$` (block).
|
|
//
|
|
// The CENTRAL correctness constraint is that a single/currency `$` is NEVER
|
|
// math (`$5`, `it costs $5 and $10` stay literal), and a would-be-math `$x$`
|
|
// span in PROSE round-trips as literal text (never a phantom math node). These
|
|
// tests pin the serialize forms, the pandoc currency rule, the low-churn prose
|
|
// escape, the columns/raw-HTML schema-HTML form, and codeBlock/inline-code
|
|
// safety, and assert byte-stable round-trips throughout.
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/** Wraps the given block nodes, in order, in a top-level `doc` node. */
const doc = (...nodes: any[]) => {
  return { type: 'doc', content: nodes };
};
|
|
/**
 * Builds a text node.
 *
 * `marks` is attached only when the caller passes a truthy value, so an
 * unmarked run carries no `marks` key at all (rather than `marks: undefined`).
 */
const text = (t: string, marks?: any[]) =>
  marks ? { type: 'text', text: t, marks } : { type: 'text', text: t };
|
|
/** Builds a paragraph node whose content is the given inline nodes, in order. */
const para = (...inline: any[]) => {
  const content = inline;
  return { type: 'paragraph', content };
};
|
|
|
|
/**
 * Runs export -> import -> export for a single top-level node.
 *
 * @param node - node to wrap in a `doc` and serialize to markdown.
 * @returns `md1` (the first export), `doc2` (the document re-imported from
 *   `md1`) and `md2` (the export of `doc2`). `md2` MUST equal `md1` for the
 *   git-sync data path to be byte-stable.
 */
async function roundTrip(node: any) {
  const md1 = convertProseMirrorToMarkdown(doc(node));
  const doc2 = await markdownToProseMirror(md1);
  const md2 = convertProseMirrorToMarkdown(doc2);
  return { md1, doc2, md2 };
}
|
|
|
|
/**
 * Depth-first search for the first node of a given type.
 *
 * The node itself is checked before its children, and children are visited in
 * array order, so the result is the first match in document order.
 *
 * @param n - root to search; anything that is not a non-null object yields
 *   `undefined`.
 * @param type - value to compare against each node's `type` property.
 * @returns the first matching node, or `undefined` when there is none.
 */
function findNode(n: any, type: string): any {
  if (!n || typeof n !== 'object') return undefined;
  if (n.type === type) return n;
  if (Array.isArray(n.content)) {
    for (const c of n.content) {
      const hit = findNode(c, type);
      if (hit) return hit;
    }
  }
  return undefined;
}
|
|
|
|
/**
 * Concatenates every text run under a node, in document order (used to assert
 * that text is preserved).
 *
 * @param n - root to collect from; anything that is not a non-null object
 *   contributes the empty string.
 * @returns the joined `text` of all `text` nodes; nodes of other types
 *   contribute only through their `content` children.
 */
function allText(n: any): string {
  if (!n || typeof n !== 'object') return '';
  if (n.type === 'text') return n.text || '';
  if (Array.isArray(n.content)) return n.content.map(allText).join('');
  return '';
}
|
|
|
|
// Pins the `$…$` serialized form of mathInline nodes, plus the cases that fall
// back to the schema-HTML <span> form; every case asserts md2 === md1.
describe('mathInline serialize + round-trip', () => {
  it('mathInline x^2 -> exact $x^2$ and re-imports as mathInline attrs.text x^2', async () => {
    const { md1, doc2, md2 } = await roundTrip(para({ type: 'mathInline', attrs: { text: 'x^2' } }));
    expect(md1).toBe('$x^2$');
    expect(md2).toBe(md1); // byte-stable
    const math = findNode(doc2, 'mathInline');
    expect(math).toBeDefined();
    expect(math.attrs.text).toBe('x^2');
    // No stray literal text, no math-shaped currency false positive.
    expect(allText(doc2)).toBe('');
  });

  it('mathInline surrounded by prose round-trips as math (not currency)', async () => {
    const { md1, doc2, md2 } = await roundTrip(
      para(text('let '), { type: 'mathInline', attrs: { text: 'x^2' } }, text(' be')),
    );
    expect(md1).toBe('let $x^2$ be');
    expect(md2).toBe(md1);
    expect(findNode(doc2, 'mathInline').attrs.text).toBe('x^2');
  });

  it('LaTeX containing a literal $ is escaped \\$ and round-trips exact', async () => {
    const { md1, doc2, md2 } = await roundTrip(para({ type: 'mathInline', attrs: { text: 'a$b' } }));
    expect(md1).toBe('$a\\$b$'); // inner $ escaped so it cannot close early
    expect(md2).toBe(md1);
    expect(findNode(doc2, 'mathInline').attrs.text).toBe('a$b');
  });

  it('empty mathInline falls back to the lossless schema-HTML <span> form', async () => {
    const { md1, doc2, md2 } = await roundTrip(para({ type: 'mathInline', attrs: { text: '' } }));
    // An empty `$$` would look like a block; the span form is lossless.
    expect(md1).toBe('<span data-type="mathInline" data-katex="true" text=""></span>');
    expect(md2).toBe(md1);
    expect(findNode(doc2, 'mathInline')).toBeDefined();
  });

  it('mathInline whose LaTeX carries a pre-existing \\$ takes the span fallback', async () => {
    // `\$` before escaping would make the `$`→`\$` escape ambiguous, so this
    // rare case uses the always-lossless schema-HTML form (documented fork).
    const { md1, doc2, md2 } = await roundTrip(para({ type: 'mathInline', attrs: { text: '\\$100' } }));
    expect(md1).toContain('<span data-type="mathInline"');
    expect(md1).not.toContain('$\\$100$');
    expect(md2).toBe(md1);
    expect(findNode(doc2, 'mathInline').attrs.text).toBe('\\$100');
  });

  it('mathInline immediately followed by a digit text run uses the span fallback (round-trips)', async () => {
    // `$x^2$5` would fail the pandoc closing rule (digit after `$`), so the math
    // node falls back to the lossless span form; the "5" stays literal text.
    const { md1, doc2, md2 } = await roundTrip(
      para({ type: 'mathInline', attrs: { text: 'x^2' } }, text('5')),
    );
    expect(md1).toBe('<span data-type="mathInline" data-katex="true" text="x^2"></span>5');
    expect(md1).not.toContain('$x^2$5');
    expect(md2).toBe(md1);
    expect(findNode(doc2, 'mathInline').attrs.text).toBe('x^2');
    expect(allText(doc2)).toBe('5');
  });
});
|
|
|
|
// Pins the `$$` fence form of mathBlock nodes (multi-line, single-line, empty)
// and the <div> fallback used when the LaTeX itself contains `$$`.
describe('mathBlock serialize + round-trip', () => {
  it('multi-line mathBlock -> $$ fence with LaTeX intact, byte-stable', async () => {
    const latex = '\\int_0^1 f\n= 1';
    const { md1, doc2, md2 } = await roundTrip({ type: 'mathBlock', attrs: { text: latex } });
    expect(md1).toBe('$$\n\\int_0^1 f\n= 1\n$$');
    expect(md2).toBe(md1);
    const math = findNode(doc2, 'mathBlock');
    expect(math).toBeDefined();
    expect(math.attrs.text).toBe(latex); // multi-line preserved
  });

  it('single-line mathBlock round-trips', async () => {
    const { md1, doc2, md2 } = await roundTrip({ type: 'mathBlock', attrs: { text: 'a^2+b^2' } });
    expect(md1).toBe('$$\na^2+b^2\n$$');
    expect(md2).toBe(md1);
    expect(findNode(doc2, 'mathBlock').attrs.text).toBe('a^2+b^2');
  });

  it('empty mathBlock round-trips as an empty $$ fence', async () => {
    const { md1, doc2, md2 } = await roundTrip({ type: 'mathBlock', attrs: { text: '' } });
    expect(md1).toBe('$$\n\n$$');
    expect(md2).toBe(md1);
    expect(findNode(doc2, 'mathBlock')).toBeDefined();
  });

  it('mathBlock whose LaTeX contains a $$ takes the lossless <div> fallback', async () => {
    const { md1, doc2, md2 } = await roundTrip({ type: 'mathBlock', attrs: { text: 'a $$ b' } });
    expect(md1).toContain('<div data-type="mathBlock"');
    expect(md1).not.toBe('$$\na $$ b\n$$');
    expect(md2).toBe(md1);
    expect(findNode(doc2, 'mathBlock').attrs.text).toBe('a $$ b');
  });
});
|
|
|
|
// Currency-shaped prose must export with no escaping and re-import with no
// math node; one test is generated per entry in `cases`.
describe('currency: a single/currency $ is NEVER math', () => {
  const cases = ['it costs $5', '$5 and $10', 'a $5 b $6 c', 'price is $5', 'pay $5 now'];
  for (const original of cases) {
    it(`"${original}" stays literal text with NO backslashes and NO math node`, async () => {
      const { md1, doc2, md2 } = await roundTrip(para(text(original)));
      // Emitted markdown carries NO escaping (currency has no valid closing $).
      expect(md1).toBe(original);
      expect(md1).not.toContain('\\$');
      expect(md2).toBe(md1);
      // No math node materialized; the text is preserved EXACTLY.
      expect(findNode(doc2, 'mathInline')).toBeUndefined();
      expect(allText(doc2)).toBe(original);
    });
  }

  it('a currency amount preserves the exact string across a round trip', async () => {
    const { doc2 } = await roundTrip(para(text('$5 and $10')));
    expect(allText(doc2)).toBe('$5 and $10');
    expect(findNode(doc2, 'mathInline')).toBeUndefined();
  });
});
|
|
|
|
// A prose span that the inline rule would read as math is exported with both
// delimiting `$` escaped, so it comes back as literal text.
describe('prose $x$ (would-be math) round-trips as literal text (escaped)', () => {
  it('the set $A$ -> \\$A\\$ and re-imports as literal text, no math node', async () => {
    const { md1, doc2, md2 } = await roundTrip(para(text('the set $A$ is closed')));
    expect(md1).toBe('the set \\$A\\$ is closed');
    expect(md2).toBe(md1); // byte-stable
    expect(findNode(doc2, 'mathInline')).toBeUndefined();
    // The literal text is preserved exactly (backslashes are a serialization
    // detail, decoded back on import).
    expect(allText(doc2)).toBe('the set $A$ is closed');
  });
});
|
|
|
|
// Inside a column, math must stay in the schema-HTML <div>/<span> form on both
// exports instead of the `$…$` / `$$…$$` markdown form.
describe('math inside a column keeps the schema-HTML form (NOT $…$)', () => {
  // Wraps a single child in a one-column `columns` node.
  const oneColumn = (child: any) => ({
    type: 'columns',
    content: [{ type: 'column', content: [child] }],
  });

  it('mathBlock in a column emits <div> (no $$ fence), round-trips', async () => {
    const { md1, doc2, md2 } = await roundTrip(
      oneColumn({ type: 'mathBlock', attrs: { text: 'a^2+b^2' } }),
    );
    expect(md1).toContain('<div data-type="mathBlock" data-katex="true" text="a^2+b^2"></div>');
    expect(md1).not.toContain('$$');
    // The schema-HTML math form survives the round trip (a re-imported column
    // gains a default data-layout, so we assert the math div, not full equality).
    expect(md2).toContain('<div data-type="mathBlock" data-katex="true" text="a^2+b^2"></div>');
    expect(md2).not.toContain('$$');
    expect(findNode(doc2, 'mathBlock').attrs.text).toBe('a^2+b^2');
  });

  it('mathInline in a column paragraph emits <span> (no $…$), round-trips', async () => {
    const { md1, doc2, md2 } = await roundTrip(
      oneColumn(para(text('eq: '), { type: 'mathInline', attrs: { text: 'x_i' } })),
    );
    expect(md1).toContain('<span data-type="mathInline" data-katex="true" text="x_i"></span>');
    expect(md1).not.toContain('$x_i$');
    expect(md2).toContain('<span data-type="mathInline" data-katex="true" text="x_i"></span>');
    expect(md2).not.toContain('$x_i$');
    expect(findNode(doc2, 'mathInline').attrs.text).toBe('x_i');
  });
});
|
|
|
|
// `$` inside inline code and fenced code blocks must come through verbatim:
// no math node on re-import and no `\$` in either export.
describe('code is never math (canon #7 codeBlock regression class)', () => {
  it('inline `code` span containing $x$ / $5 stays code, no math, no backslashes', async () => {
    const { md1, doc2, md2 } = await roundTrip(
      para(text('$x$ and $5', [{ type: 'code' }])),
    );
    // A code run is emitted verbatim in a backtick span — no `$` escaping, no math.
    expect(md1).toBe('`$x$ and $5`');
    expect(md1).not.toContain('\\$');
    expect(md2).toBe(md1);
    expect(findNode(doc2, 'mathInline')).toBeUndefined();
    const codeRun = findNode(doc2, 'text');
    expect(codeRun.marks?.some((m: any) => m.type === 'code')).toBe(true);
    expect(codeRun.text).toBe('$x$ and $5');
  });

  it('codeBlock containing $…$ and $5 stays code, no math, no backslash corruption', async () => {
    const code = 'cost = $5\nx = $y$';
    const { md1, doc2, md2 } = await roundTrip({
      type: 'codeBlock',
      attrs: { language: 'python' },
      content: [text(code)],
    });
    // Fenced code is literal: the `$` are verbatim, no escaping, no math node.
    expect(md1).toContain('cost = $5');
    expect(md1).toContain('x = $y$');
    expect(md1).not.toContain('\\$');
    expect(md2).toBe(md1);
    expect(findNode(doc2, 'mathInline')).toBeUndefined();
    expect(findNode(doc2, 'mathBlock')).toBeUndefined();
    // The `$` are preserved verbatim inside the fence (marked re-adds one
    // trailing newline the exporter strips again, so compare against that).
    const codeText = allText(findNode(doc2, 'codeBlock'));
    expect(codeText).toContain('cost = $5');
    expect(codeText).toContain('x = $y$');
    expect(codeText).not.toContain('\\$');
  });
});
|
|
|
|
// Import-only checks: each malformed source must parse without throwing and
// must not produce any math node. One test is generated per source string.
describe('fail-open: unbalanced / lone $ never crashes and stays literal', () => {
  for (const src of ['$', '$$', 'a $ b', '$ x $', 'unbalanced $x here']) {
    it(`"${src}" imports without crash and materializes no math node`, async () => {
      const doc2 = await markdownToProseMirror(src);
      expect(doc2).toBeDefined();
      expect(findNode(doc2, 'mathInline')).toBeUndefined();
      // A lone `$$` line has no closing `$$` fence line after it, so it is not
      // a math block either.
      expect(findNode(doc2, 'mathBlock')).toBeUndefined();
    });
  }
});
|