diff --git a/apps/server/src/collaboration/extensions/persistence-store.spec.ts b/apps/server/src/collaboration/extensions/persistence-store.spec.ts index 8bc713bf..519abc5b 100644 --- a/apps/server/src/collaboration/extensions/persistence-store.spec.ts +++ b/apps/server/src/collaboration/extensions/persistence-store.spec.ts @@ -205,31 +205,61 @@ describe('PersistenceExtension.onStoreDocument — Approach-A boundary snapshot' expect(historyQueue.add).toHaveBeenCalledTimes(1); }); - // #206 persist-6 — RED (it.failing): a momentarily-empty live Y.Doc must not - // overwrite non-empty persisted content. `onStoreDocument` empty-guards the - // LOAD path but not the STORE path, so today an empty doc (a client/agent - // glitch, a bad merge, an emptying transclusion) is written straight over the - // page and the content is wiped silently. A store-side empty-guard is a real - // behaviour change (a deliberate "select-all + delete" is also empty), so it - // is left UNFIXED pending a product decision; this documents the data-loss - // path and flips to a normal passing test the moment the guard lands. - it.failing( - 'does NOT overwrite non-empty content with a momentarily-empty live doc (persist-6)', - async () => { - const emptyDoc = { type: 'doc', content: [{ type: 'paragraph' }] }; - const document = ydocFor(emptyDoc); - pageRepo.findById.mockResolvedValue({ - ...persistedHumanPage('IGNORED'), - content: doc('IMPORTANT RICH CONTENT'), - }); + // #206 persist-6 — FIXED: a momentarily-empty live Y.Doc must not overwrite + // non-empty persisted content. `onStoreDocument` empty-guarded the LOAD path + // but not the STORE path, so an empty doc (a client/agent glitch, a bad + // merge, an emptying transclusion) was written straight over the page and the + // content was wiped silently. The store-side empty-guard now skips the write + // when the incoming doc is empty and the stored page is non-empty. A real + // intentional-clear UX is tracked separately in issue #251. + it('does NOT overwrite non-empty content with a momentarily-empty live doc (persist-6)', async () => { + const emptyDoc = { type: 'doc', content: [{ type: 'paragraph' }] }; + const document = ydocFor(emptyDoc); + pageRepo.findById.mockResolvedValue({ + ...persistedHumanPage('IGNORED'), + content: doc('IMPORTANT RICH CONTENT'), + }); - await ext.onStoreDocument(buildData(document, 'user') as any); + await ext.onStoreDocument(buildData(document, 'user') as any); - // Desired contract: the empty incoming doc is rejected and the rich page - // survives. Today updatePage is called with the empty content (data loss). - expect(pageRepo.updatePage).not.toHaveBeenCalled(); - }, - ); + // The empty incoming doc is rejected and the rich page survives. + expect(pageRepo.updatePage).not.toHaveBeenCalled(); + // No false-success side effects for a write that never happened. + expect((document as any).broadcastStateless).not.toHaveBeenCalled(); + expect(historyQueue.add).not.toHaveBeenCalled(); + }); + + // persist-6 — a legitimately-empty existing page must still be writable when + // the empty live doc actually DIFFERS from the stored content (so the + // unchanged short-circuit does NOT fire and execution reaches the empty-guard). + // This exercises the guard's third condition `!isEmptyParagraphDoc(page.content)`: + // because the stored page is ALSO empty, the guard must NOT block the write. + // The live doc normalizes to a paragraph carrying `attrs: { indent: 0 }` and no + // `content` key; the stored page is an empty paragraph with `content: []` — + // both empty per `isEmptyParagraphDoc`, but NOT `isDeepStrictEqual`, so the + // store passes the short-circuit (~line 208) and genuinely enters the guard + // (~line 229). If the `!isEmptyParagraphDoc(page.content)` condition were + // removed, the guard would block this write and updatePage would never run, + // failing this test. + it('does not block an empty store over an already-empty page (persist-6)', async () => { + const liveEmptyDoc = { type: 'doc', content: [{ type: 'paragraph' }] }; + const document = ydocFor(liveEmptyDoc); + // Stored content is empty per isEmptyParagraphDoc (paragraph with content:[]) + // but structurally NOT deep-equal to the normalized live doc — so execution + // skips the unchanged short-circuit and reaches the empty-guard. + const storedEmptyDoc = { type: 'doc', content: [{ type: 'paragraph', content: [] }] }; + pageRepo.findById.mockResolvedValue({ + ...persistedHumanPage('IGNORED'), + content: storedEmptyDoc, + }); + + await ext.onStoreDocument(buildData(document, 'user') as any); + + // Empty-over-empty reaches the guard, which must let the write through + // (the stored page is empty, so the empty-overwrite protection does not + // apply). updatePage IS called — proving `!isEmptyParagraphDoc(page.content)`. + expect(pageRepo.updatePage).toHaveBeenCalledTimes(1); + }); // persist-1 — when every attempt fails the hook must NOT report a phantom // success: no "page.updated" badge broadcast and no history snapshot for diff --git a/apps/server/src/collaboration/extensions/persistence.extension.ts b/apps/server/src/collaboration/extensions/persistence.extension.ts index f802f229..0739a8c6 100644 --- a/apps/server/src/collaboration/extensions/persistence.extension.ts +++ b/apps/server/src/collaboration/extensions/persistence.extension.ts @@ -210,6 +210,35 @@ export class PersistenceExtension implements Extension { return; } + // #206 persist-6 — store-side empty-guard. A momentarily-empty live + // Y.Doc (a client/agent glitch, a bad merge, a transclusion that + // emptied) must NOT overwrite non-empty persisted content. The LOAD + // path already guards emptiness (onLoadDocument only hydrates from db + // when the live doc isEmpty); the STORE path did not, so an empty + // serialization was written straight over the page, wiping it + // silently. Skip the write when the incoming doc is an empty + // paragraph doc AND the stored page is non-empty. New/empty pages are + // unaffected (stored content is already empty), and an unchanged doc + // was already short-circuited above. + // + // This unconditionally blocks empty-over-non-empty: a deliberate + // select-all + delete is currently indistinguishable from a glitch at + // this layer, so data-loss prevention wins. A real intentional-clear + // UX (a distinct signal threaded from the client) is tracked in issue + // #251; do not re-add an escape hatch here without that signal. + if ( + isEmptyParagraphDoc(tiptapJson as any) && + page.content && + !isEmptyParagraphDoc(page.content as any) + ) { + this.logger.warn( + `Skipping store for ${pageId}: empty live doc would overwrite ` + + `non-empty persisted content`, + ); + page = null; + return; + } + let contributorIds = undefined; try { const existingContributors = page.contributorIds || []; diff --git a/packages/editor-ext/src/lib/markdown/utils/turndown.dataloss.test.ts b/packages/editor-ext/src/lib/markdown/utils/turndown.dataloss.test.ts index 431c92f2..9bec4e78 100644 --- a/packages/editor-ext/src/lib/markdown/utils/turndown.dataloss.test.ts +++ b/packages/editor-ext/src/lib/markdown/utils/turndown.dataloss.test.ts @@ -1,77 +1,147 @@ import { describe, it, expect } from "vitest"; import { htmlToMarkdown } from "./turndown.utils"; +import { markdownToHtml } from "./marked.utils"; /** - * #206 mdrt-2 — Markdown export must never SILENTLY drop a block. + * #206 mdrt-2 — Markdown export must never SILENTLY drop a block. (FIXED) * - * `htmlToMarkdown` (turndown) only registers rules for a fixed set of custom - * nodes (callout, taskItem, details, math, iframe, htmlEmbed, image, video, - * footnote). Any other custom node — `transclusionReference`, `pageBreak`, - * `mention`, `status` — falls through to turndown's default handling: an empty - * wrapper is "blank" and removed, so the block disappears from the exported - * Markdown with no trace. The invariant "never silently lose a block" is broken. + * `htmlToMarkdown` (turndown) historically only registered rules for a fixed + * set of custom nodes (callout, taskItem, details, math, iframe, htmlEmbed, + * image, video, footnote). Any other custom node — `transclusionReference`, + * `pageBreak`, `mention`, `status` — fell through to turndown's default + * handling: an empty wrapper is "blank" and removed, so the block disappeared + * from the exported Markdown with no trace, and `mention`/`status` collapsed to + * bare text, losing their identity (data-id / data-color). The invariant + * "never silently lose a block" was broken. * - * The `it.fails` cases assert the DESIRED contract (the block survives export in - * SOME form) and are RED today: they document the unfixed data loss and flip to - * green the moment a turndown rule (real syntax or a lossless HTML-comment - * placeholder) is added. A normal characterization `it` pins the exact current - * lossy output so the regression is unambiguous. + * The fix adds lossless turndown rules that re-emit each of these nodes as raw + * HTML carrying every `data-*` attribute. Plain-Markdown viewers ignore the + * inert tag; the import path round-trips it (`markdownToHtml` passes the raw + * HTML through and each node's `parseHTML` rebuilds the ProseMirror node). These + * tests assert the surviving contract (the block is preserved AND its identity + * round-trips back through import). */ -describe("htmlToMarkdown — custom nodes without a turndown rule (#206 mdrt-2)", () => { - const wrap = (inner: string) => - `
before
${inner}after
`; +describe("htmlToMarkdown — custom nodes are preserved losslessly (#206 mdrt-2)", () => { + const wrap = (inner: string) => `before
${inner}after
`; - it("CURRENTLY drops a pageBreak entirely (data loss)", () => { + it("preserves a pageBreak block on Markdown export", () => { const md = htmlToMarkdown( wrap(''), ); - // The page break vanishes: only the two paragraphs remain, nothing between. expect(md).toContain("before"); expect(md).toContain("after"); - expect(md).not.toMatch(/page-?break/i); - expect(md).not.toContain("---"); // not even a horizontal-rule fallback + // The break survives as an inert raw-HTML tag, not silently dropped. + expect(md).toMatch(/data-type="pageBreak"/); + expect(md).toMatch(/page-?break/i); }); - it("CURRENTLY drops a transclusionReference entirely (data loss)", () => { + it("preserves a transclusionReference's identity on Markdown export", () => { const md = htmlToMarkdown( wrap(''), ); expect(md).toContain("before"); expect(md).toContain("after"); - // The data-id (the only thing that gives the reference identity) is gone. - expect(md).not.toContain("abc"); + // The data-id (the only thing that gives the reference identity) survives. + expect(md).toContain("abc"); + expect(md).toMatch(/data-type="transclusionReference"/); }); - it.fails( - "should NOT lose a pageBreak block on Markdown export", - () => { + it("preserves a mention's data-id (stable identity) on Markdown export", () => { + const md = htmlToMarkdown( + 'hi @Bob there
', + ); + // The mention keeps its stable identity (data-id), not just the text. + expect(md).toContain("u1"); + expect(md).toContain("Bob"); + expect(md).toMatch(/data-type="mention"/); + }); + + it("preserves a status chip's color on Markdown export", () => { + const md = htmlToMarkdown( + 's Done
', + ); + // The chip's color (its identity) survives, not just the visible text. + expect(md).toContain("green"); + expect(md).toContain("Done"); + expect(md).toMatch(/data-type="status"/); + }); + + // The export form is only lossless if the import path can rebuild it. These + // assert the full MD -> HTML round-trip restores the node + its attributes, + // which is the marker <-> node contract each `parseHTML` relies on. + describe("import round-trip (markdownToHtml restores the node)", () => { + it("round-trips a pageBreak through export + import", async () => { const md = htmlToMarkdown( wrap(''), ); - // Desired: the break survives in some form (e.g. a `---` rule or marker). - expect(md).toMatch(/(-{3,}|page-?break)/i); - }, - ); + const html = await markdownToHtml(md); + expect(html).toMatch(/hi @Bob there
', ); - // Desired: the mention keeps its stable identity (data-id), not just text. - expect(md).toContain("u1"); - }, - ); + const html = await markdownToHtml(md); + expect(html).toMatch(/]*data-type="mention"[^>]*>/); + expect(html).toContain("u1"); + expect(html).toContain("Bob"); + }); + + it("round-trips a status chip (keeps data-color)", async () => { + const md = htmlToMarkdown( + 's Done
', + ); + const html = await markdownToHtml(md); + expect(html).toMatch(/]*data-type="status"[^>]*>/); + expect(html).toContain("green"); + }); + + // HTML special chars in an attribute value or in a node's text must be + // ESCAPED when re-emitted as raw HTML, otherwise the exported tag is + // malformed and `markdownToHtml`'s parser cannot restore the original value + // (the same silent data loss this PR fixes). Dropping `<`/`>` escaping is the + // dangerous regression: a stray `<` or `>` corrupts the tag (or injects new + // markup), so the test data carries ALL of `&`, `"`, `<`, `>` in BOTH the + // data-label attribute and the visible text. That fully exercises + // escapeHtmlAttr's `&,",<,>` branches and escapeHtmlText's `&,<,>` branches + // (escapeHtmlText leaves `"` literal); the alphanumeric-only cases above hit + // none of them. + it("escapes HTML special chars (& \" < >) in attrs + text and round-trips them", async () => { + const md = htmlToMarkdown( + `hi @A & <B> "C" there
`, + ); + + // (a) The exported Markdown carries a WELL-FORMED, correctly-escaped tag: + // the attribute escapes `&`, `<`, `>` AND `"`; the text escapes `&`, `<`, + // `>` (a `"` inside text content is legal, so it stays literal). + expect(md).toContain('data-label="A & <B> "C""'); + expect(md).toContain('>@A & <B> "C"'); + // And explicitly NOT the raw, tag-corrupting forms: a literal `` (would + // mean `<`/`>` escaping was dropped in either the attr or the text)... + expect(md).not.toContain(""); + // ...nor the malformed attribute that an unescaped `"` would produce. + expect(md).not.toContain('data-label="A & <B> "C""'); + + // (b) Import restores the ORIGINAL (unescaped) values, attribute and text. + const html = await markdownToHtml(md); + const dom = new DOMParser().parseFromString(html as string, "text/html"); + const span = dom.querySelector('span[data-type="mention"]'); + expect(span).not.toBeNull(); + expect(span!.getAttribute("data-id")).toBe("u1"); + expect(span!.getAttribute("data-label")).toBe('A & "C"'); + expect(span!.textContent).toBe('@A & "C"'); + }); + }); }); diff --git a/packages/editor-ext/src/lib/markdown/utils/turndown.utils.ts b/packages/editor-ext/src/lib/markdown/utils/turndown.utils.ts index 172786a3..6bb8e456 100644 --- a/packages/editor-ext/src/lib/markdown/utils/turndown.utils.ts +++ b/packages/editor-ext/src/lib/markdown/utils/turndown.utils.ts @@ -43,6 +43,54 @@ function fillEmptyFootnoteRefs(html: string): string { ); } +/** + * `pageBreak` and `transclusionReference` are childless atom