diff --git a/apps/server/src/collaboration/extensions/persistence-store.spec.ts b/apps/server/src/collaboration/extensions/persistence-store.spec.ts index 8bc713bf..c8ac68b9 100644 --- a/apps/server/src/collaboration/extensions/persistence-store.spec.ts +++ b/apps/server/src/collaboration/extensions/persistence-store.spec.ts @@ -205,31 +205,69 @@ describe('PersistenceExtension.onStoreDocument — Approach-A boundary snapshot' expect(historyQueue.add).toHaveBeenCalledTimes(1); }); - // #206 persist-6 — RED (it.failing): a momentarily-empty live Y.Doc must not - // overwrite non-empty persisted content. `onStoreDocument` empty-guards the - // LOAD path but not the STORE path, so today an empty doc (a client/agent - // glitch, a bad merge, an emptying transclusion) is written straight over the - // page and the content is wiped silently. A store-side empty-guard is a real - // behaviour change (a deliberate "select-all + delete" is also empty), so it - // is left UNFIXED pending a product decision; this documents the data-loss - // path and flips to a normal passing test the moment the guard lands. - it.failing( - 'does NOT overwrite non-empty content with a momentarily-empty live doc (persist-6)', - async () => { - const emptyDoc = { type: 'doc', content: [{ type: 'paragraph' }] }; - const document = ydocFor(emptyDoc); - pageRepo.findById.mockResolvedValue({ - ...persistedHumanPage('IGNORED'), - content: doc('IMPORTANT RICH CONTENT'), - }); + // #206 persist-6 — FIXED: a momentarily-empty live Y.Doc must not overwrite + // non-empty persisted content. `onStoreDocument` empty-guarded the LOAD path + // but not the STORE path, so an empty doc (a client/agent glitch, a bad + // merge, an emptying transclusion) was written straight over the page and the + // content was wiped silently. The store-side empty-guard now skips the write + // when the incoming doc is empty and the stored page is non-empty, unless an + // explicit intentional-clear signal is present. + it('does NOT overwrite non-empty content with a momentarily-empty live doc (persist-6)', async () => { + const emptyDoc = { type: 'doc', content: [{ type: 'paragraph' }] }; + const document = ydocFor(emptyDoc); + pageRepo.findById.mockResolvedValue({ + ...persistedHumanPage('IGNORED'), + content: doc('IMPORTANT RICH CONTENT'), + }); - await ext.onStoreDocument(buildData(document, 'user') as any); + await ext.onStoreDocument(buildData(document, 'user') as any); - // Desired contract: the empty incoming doc is rejected and the rich page - // survives. Today updatePage is called with the empty content (data loss). - expect(pageRepo.updatePage).not.toHaveBeenCalled(); - }, - ); + // The empty incoming doc is rejected and the rich page survives. + expect(pageRepo.updatePage).not.toHaveBeenCalled(); + // No false-success side effects for a write that never happened. + expect((document as any).broadcastStateless).not.toHaveBeenCalled(); + expect(historyQueue.add).not.toHaveBeenCalled(); + }); + + // persist-6 — a legitimate clear is NOT broken: with the explicit + // intentional-clear signal, emptying a non-empty page still persists. + it('persists an intentional clear of a non-empty page (persist-6 escape hatch)', async () => { + const emptyDoc = { type: 'doc', content: [{ type: 'paragraph' }] }; + const document = ydocFor(emptyDoc); + pageRepo.findById.mockResolvedValue({ + ...persistedHumanPage('IGNORED'), + content: doc('IMPORTANT RICH CONTENT'), + }); + + await ext.onStoreDocument({ + documentName: `page.${PAGE_ID}`, + document, + context: { + user: { id: USER_ID, name: 'Alice' }, + actor: 'user', + intentionalClear: true, + }, + } as any); + + expect(pageRepo.updatePage).toHaveBeenCalledTimes(1); + }); + + // persist-6 — a brand-new / already-empty page is unaffected: an empty store + // over empty stored content is not blocked (it short-circuits as unchanged). + it('does not block an empty store over an already-empty page (persist-6)', async () => { + const emptyDoc = { type: 'doc', content: [{ type: 'paragraph' }] }; + const document = ydocFor(emptyDoc); + const normalized = TiptapTransformer.fromYdoc(document, 'default'); + pageRepo.findById.mockResolvedValue({ + ...persistedHumanPage('IGNORED'), + content: normalized, + }); + + await ext.onStoreDocument(buildData(document, 'user') as any); + + // Unchanged empty-over-empty: short-circuits, no spurious write, no error. + expect(pageRepo.updatePage).not.toHaveBeenCalled(); + }); // persist-1 — when every attempt fails the hook must NOT report a phantom // success: no "page.updated" badge broadcast and no history snapshot for diff --git a/apps/server/src/collaboration/extensions/persistence.extension.ts b/apps/server/src/collaboration/extensions/persistence.extension.ts index f802f229..47fadf8f 100644 --- a/apps/server/src/collaboration/extensions/persistence.extension.ts +++ b/apps/server/src/collaboration/extensions/persistence.extension.ts @@ -210,6 +210,33 @@ export class PersistenceExtension implements Extension { return; } + // #206 persist-6 — store-side empty-guard. A momentarily-empty live + // Y.Doc (a client/agent glitch, a bad merge, a transclusion that + // emptied) must NOT overwrite non-empty persisted content. The LOAD + // path already guards emptiness (onLoadDocument only hydrates from db + // when the live doc isEmpty); the STORE path did not, so an empty + // serialization was written straight over the page, wiping it + // silently. Skip the write when the incoming doc is an empty + // paragraph doc AND the stored page is non-empty — unless the writer + // sends an explicit intentional-clear signal (a deliberate + // select-all + delete), the one case where emptying is the user's + // intent. New/empty pages are unaffected (stored content is already + // empty), and an unchanged doc was already short-circuited above. + const intentionalClear = context?.intentionalClear === true; + if ( + !intentionalClear && + isEmptyParagraphDoc(tiptapJson as any) && + page.content && + !isEmptyParagraphDoc(page.content as any) + ) { + this.logger.warn( + `Skipping store for ${pageId}: empty live doc would overwrite ` + + `non-empty persisted content (no intentional-clear signal)`, + ); + page = null; + return; + } + let contributorIds = undefined; try { const existingContributors = page.contributorIds || []; diff --git a/packages/editor-ext/src/lib/markdown/utils/turndown.dataloss.test.ts b/packages/editor-ext/src/lib/markdown/utils/turndown.dataloss.test.ts index 431c92f2..98fdc03a 100644 --- a/packages/editor-ext/src/lib/markdown/utils/turndown.dataloss.test.ts +++ b/packages/editor-ext/src/lib/markdown/utils/turndown.dataloss.test.ts @@ -1,77 +1,111 @@ import { describe, it, expect } from "vitest"; import { htmlToMarkdown } from "./turndown.utils"; +import { markdownToHtml } from "./marked.utils"; /** - * #206 mdrt-2 — Markdown export must never SILENTLY drop a block. + * #206 mdrt-2 — Markdown export must never SILENTLY drop a block. (FIXED) * - * `htmlToMarkdown` (turndown) only registers rules for a fixed set of custom - * nodes (callout, taskItem, details, math, iframe, htmlEmbed, image, video, - * footnote). Any other custom node — `transclusionReference`, `pageBreak`, - * `mention`, `status` — falls through to turndown's default handling: an empty - * wrapper is "blank" and removed, so the block disappears from the exported - * Markdown with no trace. The invariant "never silently lose a block" is broken. + * `htmlToMarkdown` (turndown) historically only registered rules for a fixed + * set of custom nodes (callout, taskItem, details, math, iframe, htmlEmbed, + * image, video, footnote). Any other custom node — `transclusionReference`, + * `pageBreak`, `mention`, `status` — fell through to turndown's default + * handling: an empty wrapper is "blank" and removed, so the block disappeared + * from the exported Markdown with no trace, and `mention`/`status` collapsed to + * bare text, losing their identity (data-id / data-color). The invariant + * "never silently lose a block" was broken. * - * The `it.fails` cases assert the DESIRED contract (the block survives export in - * SOME form) and are RED today: they document the unfixed data loss and flip to - * green the moment a turndown rule (real syntax or a lossless HTML-comment - * placeholder) is added. A normal characterization `it` pins the exact current - * lossy output so the regression is unambiguous. + * The fix adds lossless turndown rules that re-emit each of these nodes as raw + * HTML carrying every `data-*` attribute. Plain-Markdown viewers ignore the + * inert tag; the import path round-trips it (`markdownToHtml` passes the raw + * HTML through and each node's `parseHTML` rebuilds the ProseMirror node). These + * tests assert the surviving contract (the block is preserved AND its identity + * round-trips back through import). */ -describe("htmlToMarkdown — custom nodes without a turndown rule (#206 mdrt-2)", () => { - const wrap = (inner: string) => - `

before

${inner}

after

`; +describe("htmlToMarkdown — custom nodes are preserved losslessly (#206 mdrt-2)", () => { + const wrap = (inner: string) => `

before

${inner}

after

`; - it("CURRENTLY drops a pageBreak entirely (data loss)", () => { + it("preserves a pageBreak block on Markdown export", () => { const md = htmlToMarkdown( wrap('
'), ); - // The page break vanishes: only the two paragraphs remain, nothing between. expect(md).toContain("before"); expect(md).toContain("after"); - expect(md).not.toMatch(/page-?break/i); - expect(md).not.toContain("---"); // not even a horizontal-rule fallback + // The break survives as an inert raw-HTML tag, not silently dropped. + expect(md).toMatch(/data-type="pageBreak"/); + expect(md).toMatch(/page-?break/i); }); - it("CURRENTLY drops a transclusionReference entirely (data loss)", () => { + it("preserves a transclusionReference's identity on Markdown export", () => { const md = htmlToMarkdown( wrap('
'), ); expect(md).toContain("before"); expect(md).toContain("after"); - // The data-id (the only thing that gives the reference identity) is gone. - expect(md).not.toContain("abc"); + // The data-id (the only thing that gives the reference identity) survives. + expect(md).toContain("abc"); + expect(md).toMatch(/data-type="transclusionReference"/); }); - it.fails( - "should NOT lose a pageBreak block on Markdown export", - () => { + it("preserves a mention's data-id (stable identity) on Markdown export", () => { + const md = htmlToMarkdown( + '

hi @Bob there

', + ); + // The mention keeps its stable identity (data-id), not just the text. + expect(md).toContain("u1"); + expect(md).toContain("Bob"); + expect(md).toMatch(/data-type="mention"/); + }); + + it("preserves a status chip's color on Markdown export", () => { + const md = htmlToMarkdown( + '

s Done

', + ); + // The chip's color (its identity) survives, not just the visible text. + expect(md).toContain("green"); + expect(md).toContain("Done"); + expect(md).toMatch(/data-type="status"/); + }); + + // The export form is only lossless if the import path can rebuild it. These + // assert the full MD -> HTML round-trip restores the node + its attributes, + // which is the marker <-> node contract each `parseHTML` relies on. + describe("import round-trip (markdownToHtml restores the node)", () => { + it("round-trips a pageBreak through export + import", async () => { const md = htmlToMarkdown( wrap('
'), ); - // Desired: the break survives in some form (e.g. a `---` rule or marker). - expect(md).toMatch(/(-{3,}|page-?break)/i); - }, - ); + const html = await markdownToHtml(md); + expect(html).toMatch(/]*data-type="pageBreak"[^>]*>/); + expect(html).toContain("before"); + expect(html).toContain("after"); + }); - it.fails( - "should NOT lose a transclusionReference's identity on Markdown export", - () => { + it("round-trips a transclusionReference (keeps data-id)", async () => { const md = htmlToMarkdown( wrap('
'), ); - // Desired: the referenced id survives so the block can be rebuilt. - expect(md).toContain("abc"); - }, - ); + const html = await markdownToHtml(md); + expect(html).toMatch(/]*data-type="transclusionReference"[^>]*>/); + expect(html).toContain("abc"); + }); - it.fails( - "should NOT lose a mention's data-id on Markdown export", - () => { + it("round-trips a mention (keeps data-id + data-label)", async () => { const md = htmlToMarkdown( '

hi @Bob there

', ); - // Desired: the mention keeps its stable identity (data-id), not just text. - expect(md).toContain("u1"); - }, - ); + const html = await markdownToHtml(md); + expect(html).toMatch(/]*data-type="mention"[^>]*>/); + expect(html).toContain("u1"); + expect(html).toContain("Bob"); + }); + + it("round-trips a status chip (keeps data-color)", async () => { + const md = htmlToMarkdown( + '

s Done

', + ); + const html = await markdownToHtml(md); + expect(html).toMatch(/]*data-type="status"[^>]*>/); + expect(html).toContain("green"); + }); + }); }); diff --git a/packages/editor-ext/src/lib/markdown/utils/turndown.utils.ts b/packages/editor-ext/src/lib/markdown/utils/turndown.utils.ts index 172786a3..6bb8e456 100644 --- a/packages/editor-ext/src/lib/markdown/utils/turndown.utils.ts +++ b/packages/editor-ext/src/lib/markdown/utils/turndown.utils.ts @@ -43,6 +43,54 @@ function fillEmptyFootnoteRefs(html: string): string { ); } +/** + * `pageBreak` and `transclusionReference` are childless atom
s. Like an + * empty footnote ref (see above), turndown treats a childless block as "blank" + * and replaces it with the blankRule BEFORE any custom rule can fire — so the + * node disappears from the export with no trace (#206 mdrt-2). Inject a + * zero-width space so the node is non-blank and our lossless rule runs; the + * rule rebuilds the tag from the element's attributes, so the injected char + * never reaches the output. + */ +function fillEmptyAtomBlocks(html: string): string { + return html.replace( + /]*\bdata-type="(?:pageBreak|transclusionReference)"[^>]*)>\s*<\/div>/gi, + (_m, attrs) => `
`, + ); +} + +/** HTML-escape an attribute value so a re-emitted raw-HTML tag is well-formed. */ +function escapeHtmlAttr(value: string): string { + return value + .replace(/&/g, '&') + .replace(/"/g, '"') + .replace(//g, '>'); +} + +/** HTML-escape text placed inside a re-emitted raw-HTML element. */ +function escapeHtmlText(value: string): string { + return value + .replace(/&/g, '&') + .replace(//g, '>'); +} + +/** + * Serialize ALL of an element's attributes back to a raw-HTML attribute string + * (leading space included). Generic on purpose: a custom node's identity lives + * entirely in its `data-*` attributes (data-id, data-color, data-source-page-id, + * data-transclusion-id, …), and serializing every attribute keeps the export + * lossless regardless of which attributes a given node carries. + */ +function serializeAttrs(node: any): string { + const attrs = node?.attributes; + if (!attrs) return ''; + return Array.from(attrs as ArrayLike<{ name: string; value: string }>) + .map((attr) => ` ${attr.name}="${escapeHtmlAttr(attr.value ?? '')}"`) + .join(''); +} + export function htmlToMarkdown(html: string): string { const turndownService = new TurndownService({ headingStyle: 'atx', @@ -69,12 +117,83 @@ export function htmlToMarkdown(html: string): string { video, footnoteReference, footnotesList, + pageBreak, + transclusionReference, + mention, + status, ]); return turndownService - .turndown(fillEmptyFootnoteRefs(html)) + .turndown(fillEmptyAtomBlocks(fillEmptyFootnoteRefs(html))) .replaceAll('
', ' '); } +/** + * Lossless export rules for custom nodes that have NO native Markdown syntax + * (#206 mdrt-2). Markdown cannot represent a page break, a transclusion + * reference, a mention's stable id, or a status chip's color — so rather than + * letting turndown silently drop them, each rule re-emits the node as raw HTML + * carrying every `data-*` attribute. Plain-Markdown viewers ignore the inert + * tag, and the import path round-trips it: `markdownToHtml` passes raw HTML + * through and each node's `parseHTML` (`div[data-type="…"]`, `span[…]`) rebuilds + * the ProseMirror node with its attributes intact. + */ +function pageBreak(turndownService: _TurndownService) { + turndownService.addRule('pageBreak', { + filter: function (node: HTMLInputElement) { + return ( + node.nodeName === 'DIV' && + node.getAttribute('data-type') === 'pageBreak' + ); + }, + replacement: function (_content: string, node: HTMLInputElement) { + return `\n\n\n\n`; + }, + }); +} + +function transclusionReference(turndownService: _TurndownService) { + turndownService.addRule('transclusionReference', { + filter: function (node: HTMLInputElement) { + return ( + node.nodeName === 'DIV' && + node.getAttribute('data-type') === 'transclusionReference' + ); + }, + replacement: function (_content: string, node: HTMLInputElement) { + return `\n\n\n\n`; + }, + }); +} + +function mention(turndownService: _TurndownService) { + turndownService.addRule('mention', { + filter: function (node: HTMLInputElement) { + return ( + node.nodeName === 'SPAN' && + node.getAttribute('data-type') === 'mention' + ); + }, + replacement: function (_content: string, node: HTMLInputElement) { + const text = escapeHtmlText(node.textContent || ''); + return `${text}`; + }, + }); +} + +function status(turndownService: _TurndownService) { + turndownService.addRule('status', { + filter: function (node: HTMLInputElement) { + return ( + node.nodeName === 'SPAN' && node.getAttribute('data-type') === 'status' + ); + }, + replacement: function (_content: string, node: HTMLInputElement) { + const text = escapeHtmlText(node.textContent || ''); + return `${text}`; + }, + }); +} + /** * Serialize the `htmlEmbed` node to Markdown. *