Files
gitmost/packages/git-sync/test/markdown-to-prosemirror-gaps.test.ts
claude code agent 227 8b14a2e7b1 test(git-sync): exhaustive converter coverage + fix 3 round-trip data-loss bugs
Coder↔reviewer design loop (9 rounds, reviewer verdict: exhaustive) produced
92 specs; implemented +123 tests (465 -> 588 passing). The new round-trip
coverage exposed three genuine data-loss bugs in the Markdown<->ProseMirror
converter, all now FIXED (round-trip is lossless for these):

1. pageBreak was lost on export (no converter case -> rendered to "" and the
   node vanished). Now emits <div data-type="pageBreak"></div>, which the schema
   parses back -> round-trips.
2. A block image between blocks left an empty <p> artifact after import-hoisting,
   producing a phantom blank-gap diff on every sync. markdownToProseMirror now
   strips content-less paragraphs after generateJSON — with a schema-validity
   guard that keeps the obligatory single empty paragraph in `content: "block+"`
   containers (tableCell/tableHeader/blockquote/column/callout/doc), so empty
   cells/quotes never become an invalid `content: []`.
3. The `code` mark combined with another mark was not byte-stable (emitted nested
   HTML that the schema's `code` `excludes:"_"` collapsed on import). The
   converter now emits code-only when `code` co-occurs, matching the editor.

New coverage spans media/diagram/details/columns/math/mention attribute
round-trips, converter emission branches, git error paths, and engine decision
branches. A dedicated test pins the empty-container schema validity (the review
catch on the bug-2 fix).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-24 13:43:40 +03:00

450 lines
21 KiB
TypeScript

import { describe, expect, it } from 'vitest';
// markdownToProseMirror lives next to the markdown->HTML preprocessors
// (preprocessCallouts, bridgeTaskLists). Those helpers are NOT exported, so we
// exercise them through the public entry point, which runs the full
// markdown -> preprocessCallouts -> marked -> bridgeTaskLists -> generateJSON
// pipeline. Importing this module mutates the global DOM via jsdom (required for
// @tiptap/html under Node) — expected, same as the property test.
import { markdownToProseMirror } from '../src/lib/markdown-to-prosemirror.js';
// The export side (ProseMirror -> markdown) is pulled in for the round-trip
// specs below (underline/sub/sup marks, heading levels, link title). Imported
// directly from src/lib (not the barrel) like the other converter unit tests.
import { convertProseMirrorToMarkdown } from '../src/lib/markdown-converter.js';
// Find every node of a given type anywhere in a ProseMirror doc tree.
const findAll = (node: any, type: string, acc: any[] = []): any[] => {
if (node && node.type === type) acc.push(node);
for (const child of node?.content || []) findAll(child, type, acc);
return acc;
};
// Concatenate all text within a subtree (order-preserving).
const allText = (node: any): string => {
if (node?.type === 'text') return node.text || '';
return (node?.content || []).map(allText).join('');
};
// ---------------------------------------------------------------------------
// 3. preprocessCallouts — two uncovered branches.
//
// (a) NESTED callouts: an inner `:::type ... :::` inside an outer callout body
// must be matched at its own nesting level (the depth counter) and emerge as
// a callout NESTED inside the outer callout — not flattened or mis-closed.
// (b) A `:::` line INSIDE a fenced code block must NOT be treated as a callout
// delimiter: the scanner tracks code fences and copies their lines verbatim,
// so the outer callout's matching `:::` is the one AFTER the fence closes.
// ---------------------------------------------------------------------------
describe('preprocessCallouts: nested callouts + code-fenced ":::"', () => {
it('(a) parses a callout nested inside another callout', async () => {
const md = [
':::info',
'outer text',
':::warning',
'inner text',
':::',
':::',
].join('\n');
const docNode = await markdownToProseMirror(md);
// Exactly two callouts, and one is nested inside the other.
const callouts = findAll(docNode, 'callout');
expect(callouts).toHaveLength(2);
const outer = docNode.content?.[0];
expect(outer?.type).toBe('callout');
expect(outer?.attrs?.type).toBe('info');
// The inner callout is a CHILD of the outer one (not a sibling at doc level).
const innerCallouts = (outer?.content || []).filter(
(n: any) => n.type === 'callout',
);
expect(innerCallouts).toHaveLength(1);
expect(innerCallouts[0].attrs?.type).toBe('warning');
// Both bodies kept their text.
expect(allText(outer)).toContain('outer text');
expect(allText(innerCallouts[0])).toContain('inner text');
});
it('(b) a ":::" line inside a fenced code block is NOT a callout delimiter', async () => {
// The inner ``` ... ``` fence contains a `:::` line. If preprocessCallouts
// treated it as the closing fence, the callout would terminate early and the
// code text would leak out. The correct behavior: the fence content survives
// verbatim in a codeBlock, and the callout closes at the LAST ":::".
const md = [
':::info',
'before code',
'```',
':::',
'still inside the code fence',
'```',
'after code',
':::',
].join('\n');
const docNode = await markdownToProseMirror(md);
// One callout wrapping everything (it did not close early on the fenced ":::")
const callouts = findAll(docNode, 'callout');
expect(callouts).toHaveLength(1);
const callout = callouts[0];
// The code block is a CHILD of the callout and still contains the ":::" line.
const codeBlocks = findAll(callout, 'codeBlock');
expect(codeBlocks).toHaveLength(1);
expect(allText(codeBlocks[0])).toContain(':::');
expect(allText(codeBlocks[0])).toContain('still inside the code fence');
// The text before and after the fence is part of the callout, not a stray
// top-level paragraph created by an early close.
expect(allText(callout)).toContain('before code');
expect(allText(callout)).toContain('after code');
});
});
// ---------------------------------------------------------------------------
// 4. bridgeTaskLists — numbered checklist + mixed-list negative.
//
// (a) A NUMBERED checklist (`1. [x] ...`) is rendered by marked as an <ol> of
// checkbox <li>s. The bridge must convert it to a taskList AND rename the
// <ol> to a <ul> so generateJSON does NOT also match the orderedList rule
// and emit a phantom empty orderedList beside the real taskList.
// (b) NEGATIVE: a MIXED list (some items have checkboxes, some don't) must NOT
// be converted — it stays an ordinary bullet/numbered list.
// ---------------------------------------------------------------------------
describe('bridgeTaskLists: numbered checklist + mixed-list negative', () => {
it('(a) a numbered <ol> checklist becomes a taskList with NO phantom orderedList', async () => {
const md = ['1. [x] done', '2. [ ] todo'].join('\n');
const docNode = await markdownToProseMirror(md);
// It became a taskList...
const taskLists = findAll(docNode, 'taskList');
expect(taskLists).toHaveLength(1);
const items = (taskLists[0].content || []).filter(
(n: any) => n.type === 'taskItem',
);
expect(items).toHaveLength(2);
expect(items[0].attrs?.checked).toBe(true);
expect(items[1].attrs?.checked).toBe(false);
expect(allText(items[0])).toContain('done');
expect(allText(items[1])).toContain('todo');
// ...and NO phantom (empty) orderedList survived the <ol> -> <ul> rename.
const orderedLists = findAll(docNode, 'orderedList');
expect(orderedLists).toHaveLength(0);
});
it('(b) a MIXED list (some items checkboxed, some not) is NOT converted to a taskList', async () => {
const md = ['- [x] checked item', '- plain item'].join('\n');
const docNode = await markdownToProseMirror(md);
// The bridge requires EVERY direct <li> to carry its own checkbox; one plain
// item disqualifies the whole list, so it stays a bulletList.
expect(findAll(docNode, 'taskList')).toHaveLength(0);
expect(findAll(docNode, 'taskItem')).toHaveLength(0);
const bulletLists = findAll(docNode, 'bulletList');
expect(bulletLists).toHaveLength(1);
const listItems = findAll(bulletLists[0], 'listItem');
expect(listItems).toHaveLength(2);
// Both items survive as ordinary list items (text preserved).
expect(allText(bulletLists[0])).toContain('checked item');
expect(allText(bulletLists[0])).toContain('plain item');
});
});
// Find the first mark of a given type on a text node anywhere in the tree.
const firstMark = (node: any, markType: string): any => {
if (node?.type === 'text') {
for (const m of node.marks || []) if (m.type === markType) return m;
}
for (const child of node?.content || []) {
const found = firstMark(child, markType);
if (found) return found;
}
return null;
};
// ---------------------------------------------------------------------------
// Spec 1. IMPORT-side color sanitization for the highlight + textStyle marks.
//
// The Highlight.extend / TextStyle parseHTML run attacker-controlled colors
// through sanitizeCssColor when generateJSON re-parses stored HTML. This is the
// real defense that strips a crafted color on IMPORT (the export-side emission
// is tested elsewhere; the parse path was not).
// ---------------------------------------------------------------------------
describe('import: highlight/textStyle color sanitization (parseHTML)', () => {
it('strips the unsafe "--x:1" declaration but keeps the safe "red" background-color', async () => {
const doc = await markdownToProseMirror(
'<mark style="background-color: red; --x:1">x</mark>',
);
const mark = firstMark(doc, 'highlight');
// The highlight mark IS present on the text run.
expect(mark).not.toBeNull();
expect(allText(doc)).toContain('x');
// NOTE(review): Spec 1 expected attrs.color === null for this input. The
// ACTUAL behavior is attrs.color === 'red': the schema's Highlight.extend
// reads the color via getStyleProperty(el, 'background-color'), which
// isolates the `background-color: red` declaration and DROPS the separate
// unsafe `--x:1` declaration. sanitizeCssColor('red') then accepts the bare
// named color. So the injection ('--x:1') is stripped (the defense holds)
// but the legitimate 'red' survives — color is 'red', not null. The
// color-dropped-to-null path is exercised by the data-color variant below,
// where the whole "red; --x:1" string reaches sanitizeCssColor and fails.
expect(mark.attrs.color).toBe('red');
});
it('drops a crafted color carried whole in data-color (sanitizeCssColor -> null)', async () => {
// Here the entire unsafe string is the candidate color (no per-declaration
// splitting), so sanitizeCssColor rejects it and the highlight color is null
// while the highlight mark itself is still applied.
const doc = await markdownToProseMirror(
'<mark data-color="red; --x:1">x</mark>',
);
const mark = firstMark(doc, 'highlight');
expect(mark).not.toBeNull();
expect(mark.attrs.color).toBeNull();
});
it("imports '#ff0000' as the highlight mark color verbatim", async () => {
const doc = await markdownToProseMirror(
'<mark style="background-color: #ff0000">x</mark>',
);
const mark = firstMark(doc, 'highlight');
expect(mark).not.toBeNull();
expect(mark.attrs.color).toBe('#ff0000');
});
it("imports a colored span as a textStyle mark with the sanitized color", async () => {
const doc = await markdownToProseMirror(
'<span style="color: rebeccapurple">y</span>',
);
const mark = firstMark(doc, 'textStyle');
expect(mark).not.toBeNull();
expect(mark.attrs.color).toBe('rebeccapurple');
// It is carried on a real text node containing the span's text.
expect(allText(doc)).toContain('y');
});
});
// ---------------------------------------------------------------------------
// Spec 2. Importing an unsupported callout fence clamps the type to 'info'.
//
// preprocessCallouts emits div[data-type=callout][data-callout-type=tip]; the
// schema's Callout.type parseHTML pipes 'tip' through clampCalloutType, which
// maps the unknown type to the 'info' default. End-to-end import-side clamp.
// ---------------------------------------------------------------------------
describe('import: unsupported callout fence clamps type to info', () => {
it("imports ':::tip' as a callout whose attrs.type === 'info'", async () => {
const doc = await markdownToProseMirror(':::tip\nhello\n:::');
const callouts = findAll(doc, 'callout');
expect(callouts).toHaveLength(1);
expect(callouts[0].attrs.type).toBe('info');
// The body paragraph survived inside the callout.
expect(allText(callouts[0])).toContain('hello');
const paras = findAll(callouts[0], 'paragraph');
expect(paras.length).toBeGreaterThanOrEqual(1);
});
});
// ---------------------------------------------------------------------------
// Spec 3. Importing a columns layout with a string data-width yields a numeric
// column width, and the columns wrapper carries its default layout/widthMode.
// ---------------------------------------------------------------------------
describe('import: columns layout with string data-width -> numeric width', () => {
it('parses data-width="33.5" to the number 33.5 and populates columns defaults', async () => {
const doc = await markdownToProseMirror(
'<div data-type="columns"><div data-type="column" data-width="33.5"><p>a</p></div></div>',
);
const columns = findAll(doc, 'columns');
expect(columns).toHaveLength(1);
// Columns default attrs are populated (not undefined).
expect(columns[0].attrs.widthMode).toBe('normal');
expect(columns[0].attrs.layout).not.toBeNull();
expect(columns[0].attrs.layout).toBe('two_equal');
const cols = findAll(columns[0], 'column');
expect(cols).toHaveLength(1);
// parseFloat('33.5') -> 33.5 as a NUMBER, not the string '33.5'.
expect(cols[0].attrs.width).toBe(33.5);
expect(typeof cols[0].attrs.width).toBe('number');
expect(allText(cols[0])).toContain('a');
});
});
// ---------------------------------------------------------------------------
// Spec 4. Comment mark resolved-attribute boolean coercion on import.
//
// The comment mark's resolved attr parseHTML compares
// el.getAttribute('data-resolved') === 'true', so a missing attribute yields
// false (default) and the literal 'true' yields boolean true.
// ---------------------------------------------------------------------------
describe('import: comment mark commentId + resolved boolean coercion', () => {
it("data-resolved='true' -> resolved:true with the parsed commentId", async () => {
const doc = await markdownToProseMirror(
'<span data-comment-id="c1" data-resolved="true">x</span>',
);
const mark = firstMark(doc, 'comment');
expect(mark).not.toBeNull();
expect(mark.attrs.commentId).toBe('c1');
expect(mark.attrs.resolved).toBe(true);
});
it('a missing data-resolved -> resolved:false (default)', async () => {
const doc = await markdownToProseMirror(
'<span data-comment-id="c2">y</span>',
);
const mark = firstMark(doc, 'comment');
expect(mark).not.toBeNull();
expect(mark.attrs.commentId).toBe('c2');
expect(mark.attrs.resolved).toBe(false);
});
});
// ---------------------------------------------------------------------------
// Spec 5. A NON-numeric truthy data-width reaches parseFloat and yields NaN.
//
// Column.width parseHTML is `value ? parseFloat(value) : null`; 'abc' is truthy
// so parseFloat('abc') -> NaN leaks through as the raw attribute value rather
// than falling back to the null default. (JSON.stringify would serialize NaN to
// null — see the assertion below — so the leak is invisible in serialized JSON.)
// ---------------------------------------------------------------------------
describe('import: malformed non-numeric data-width leaks NaN', () => {
it("data-width='abc' -> column width is NaN (typeof number), not null", async () => {
const doc = await markdownToProseMirror(
'<div data-type="columns"><div data-type="column" data-width="abc"><p>x</p></div></div>',
);
const width = doc.content[0].content[0].attrs.width;
expect(typeof width).toBe('number');
expect(Number.isNaN(width)).toBe(true);
// Document that the leak is masked by JSON serialization: NaN -> null.
expect(JSON.parse(JSON.stringify(doc)).content[0].content[0].attrs.width).toBeNull();
});
});
// ---------------------------------------------------------------------------
// Spec 6. A column with NO data-width attribute lands on the null default.
//
// The else branch of `value ? parseFloat(value) : null` (getAttribute -> null)
// must yield exactly null (not NaN/undefined), and the columns wrapper carries
// its layout/widthMode defaults.
// ---------------------------------------------------------------------------
describe('import: width-less column lands on null default', () => {
it('no data-width -> column width === null, columns defaults populated', async () => {
const doc = await markdownToProseMirror(
'<div data-type="columns"><div data-type="column"><p>y</p></div></div>',
);
expect(doc.content[0].content[0].attrs.width).toBe(null);
expect(doc.content[0].attrs.layout).toBe('two_equal');
expect(doc.content[0].attrs.widthMode).toBe('normal');
});
});
// ---------------------------------------------------------------------------
// Spec 7. A structural callout div with missing/empty data-callout-type clamps
// to 'info' via clampCalloutType (the parseHTML getAttrs fallback), with no icon.
// ---------------------------------------------------------------------------
describe('import: callout div with missing/empty data-callout-type clamps to info', () => {
it('a callout div with NO data-callout-type -> type:info, icon:null', async () => {
const doc = await markdownToProseMirror(
'<div data-type="callout"><p>z</p></div>',
);
expect(doc.content[0].type).toBe('callout');
expect(doc.content[0].attrs.type).toBe('info');
expect(doc.content[0].attrs.icon).toBeNull();
});
it('a callout div with EMPTY data-callout-type -> type:info, icon:null', async () => {
const doc = await markdownToProseMirror(
'<div data-type="callout" data-callout-type=""><p>w</p></div>',
);
expect(doc.content[0].type).toBe('callout');
expect(doc.content[0].attrs.type).toBe('info');
expect(doc.content[0].attrs.icon).toBeNull();
});
});
// ---------------------------------------------------------------------------
// Spec 8. A plain <td> with no align/colspan/rowspan/colwidth lands on the
// schema defaults (align null via the `||` fallback arm; spans default to 1).
// ---------------------------------------------------------------------------
describe('import: span/align-less table cell lands on defaults', () => {
it('a bare td -> align:null, colspan:1, rowspan:1, colwidth:null', async () => {
const doc = await markdownToProseMirror(
'<table><tbody><tr><td><p>c</p></td></tr></tbody></table>',
);
const cells = findAll(doc, 'tableCell');
expect(cells).toHaveLength(1);
const attrs = cells[0].attrs;
expect(attrs.align).toBeNull();
expect(attrs.colspan).toBe(1);
expect(attrs.rowspan).toBe(1);
expect(attrs.colwidth).toBeNull();
expect(allText(cells[0])).toContain('c');
});
});
// ---------------------------------------------------------------------------
// Spec 9. underline/subscript/superscript marks survive import and re-export.
// (inlineToHtml src 611-619 renders them back to <u>/<sub>/<sup>.)
// ---------------------------------------------------------------------------
describe('import+export: underline/subscript/superscript marks round-trip', () => {
it('<u>/<sub>/<sup> import to the right marks and re-export unchanged', async () => {
const doc = await markdownToProseMirror('<p><u>a</u><sub>b</sub><sup>c</sup></p>');
const para = findAll(doc, 'paragraph')[0];
const texts = (para.content || []).filter((n: any) => n.type === 'text');
expect(texts).toHaveLength(3);
expect(texts[0].text).toBe('a');
expect((texts[0].marks || []).map((m: any) => m.type)).toEqual(['underline']);
expect(texts[1].text).toBe('b');
expect((texts[1].marks || []).map((m: any) => m.type)).toEqual(['subscript']);
expect(texts[2].text).toBe('c');
expect((texts[2].marks || []).map((m: any) => m.type)).toEqual(['superscript']);
const md = convertProseMirrorToMarkdown(doc);
expect(md).toContain('<u>a</u>');
expect(md).toContain('<sub>b</sub>');
expect(md).toContain('<sup>c</sup>');
});
});
// ---------------------------------------------------------------------------
// Spec 10. Heading level attribute fidelity (h1/h2/h6) on import and re-export.
// ---------------------------------------------------------------------------
describe('import+export: heading levels 1/2/6 round-trip', () => {
it('parses # / ## / ###### to level 1/2/6 and re-emits them', async () => {
const doc = await markdownToProseMirror('# H1\n\n## H2\n\n###### H6');
const headings = findAll(doc, 'heading');
expect(headings).toHaveLength(3);
expect(headings[0].attrs.level).toBe(1);
expect(headings[1].attrs.level).toBe(2);
expect(headings[2].attrs.level).toBe(6);
const md = convertProseMirrorToMarkdown(doc);
const blocks = md.split('\n\n');
expect(blocks).toContain('# H1');
expect(blocks).toContain('## H2');
expect(blocks).toContain('###### H6');
});
});
// ---------------------------------------------------------------------------
// Spec 11. Link mark recovers BOTH href and title on import and round-trips.
// ---------------------------------------------------------------------------
describe('import+export: link mark href + title round-trip', () => {
it('parses [lbl](http://a "the title") with href+title and re-emits it', async () => {
const doc = await markdownToProseMirror('[lbl](http://a "the title")');
const mark = firstMark(doc, 'link');
expect(mark).not.toBeNull();
expect(mark.attrs.href).toBe('http://a');
expect(mark.attrs.title).toBe('the title');
expect(allText(doc)).toContain('lbl');
const md = convertProseMirrorToMarkdown(doc);
expect(md).toContain('[lbl](http://a "the title")');
});
});