From 3fd66b4245b88b56c4ad810dfaddd38e6ee1a872 Mon Sep 17 00:00:00 2001 From: a Date: Sat, 27 Jun 2026 22:17:15 +0300 Subject: [PATCH] fix(footnotes): don't canonicalize comment bodies (data loss); canonicalize only page write paths (#228) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Must-fix (REAL DATA LOSS): - markdownToProseMirror is reused for COMMENT bodies (createComment/updateComment). It unconditionally canonicalized, so a comment carrying a standalone footnote definition ([^1]: text with no matching reference) had its whole footnotesList stripped (referenceIds.length===0 -> stripFootnotesListsDeep) — the text vanished. Fix: markdownToProseMirror no longer canonicalizes (content-preserving primitive); a new markdownToProseMirrorCanonical wraps it for the PAGE write paths (markdown import via importPageMarkdown, update_page markdown via updatePageContentRealtime). Comment callers keep the non-canonicalizing primitive. Updated the now-false header comment and added create/update-comment inline notes. Added collaboration tests: comment path PRESERVES a reference-less definition; page path still drops it AND still reorders real footnotes. Updated the page-import canonicalization test to use the canonical variant. Suggestions / architecture: - #2: collapsed transforms.footnoteDefinition onto the shared makeFootnoteDefinition factory (adds only the inner paragraph block id); kept the dependency direction transforms -> footnote-authoring (no circular import, mirror stays pure). - #3: confirmed docmost_transform auto-canonicalization is documented (inline comment, tool description, CHANGELOG) — no code change. - #4: copyPageContent is a FULL-document write (replacePageContent of a type:"doc"); added a defensive canonicalizeFootnotes pass (no-op on already-canonical source). - CHANGELOG entry refined to list the FULL-document write paths (incl. copy_page_content) and to state canonicalization is NOT applied to comment bodies. Co-Authored-By: Claude Opus 4.8 (1M context) --- CHANGELOG.md | 15 ++++--- packages/mcp/build/client.js | 20 ++++++--- packages/mcp/build/lib/collaboration.js | 41 +++++++++++------ packages/mcp/build/lib/transforms.js | 14 +++--- packages/mcp/src/client.ts | 20 +++++++-- packages/mcp/src/lib/collaboration.ts | 44 +++++++++++++------ packages/mcp/src/lib/transforms.ts | 14 +++--- packages/mcp/test/unit/collaboration.test.mjs | 36 +++++++++++++++ .../test/unit/footnote-canonicalize.test.mjs | 12 ++--- 9 files changed, 158 insertions(+), 58 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 840f7cda..cb79f364 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -49,12 +49,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 edit the list, or desync, and a same-content note reuses one definition. Under the hood, the editor's footnote-integrity invariant (one trailing list, numbering by first reference, no orphans/duplicates, no raw `[^id]`) is now - enforced as a pure `canonicalizeFootnotes(doc)` on the write paths that bypass - the editor's plugins: server markdown/HTML import, `PageService` create and - full-document (`replace`) updates, the client markdown paste, and the MCP - `markdownToProseMirror` / `update_page_json` / `docmost_transform` / - `insert_footnote` paths. It is idempotent (a no-op once canonical) and is - deliberately NOT applied to append/prepend fragments. (#228) + enforced as a pure `canonicalizeFootnotes(doc)` on the FULL-document write paths + that bypass the editor's plugins: server markdown/HTML import, `PageService` + create and full-document (`replace`) updates, the client markdown paste, and the + MCP markdown page-import / `update_page` (markdown) / `update_page_json` / + `docmost_transform` / `insert_footnote` / `copy_page_content` paths. It is + idempotent (a no-op once canonical) and is deliberately NOT applied to + append/prepend fragments, nor to COMMENT bodies — a comment may legitimately + contain a standalone footnote definition, which canonicalization would drop. + (#228) ### Fixed diff --git a/packages/mcp/build/client.js b/packages/mcp/build/client.js index 6eba7ea1..7b8fc936 100644 --- a/packages/mcp/build/client.js +++ b/packages/mcp/build/client.js @@ -7,7 +7,7 @@ import { TiptapTransformer } from "@hocuspocus/transformer"; import * as Y from "yjs"; import WebSocket from "ws"; import { convertProseMirrorToMarkdown } from "./lib/markdown-converter.js"; -import { updatePageContentRealtime, replacePageContent, markdownToProseMirror, mutatePageContent, buildCollabWsUrl, assertYjsEncodable, applyDocToFragment, } from "./lib/collaboration.js"; +import { updatePageContentRealtime, replacePageContent, markdownToProseMirror, markdownToProseMirrorCanonical, mutatePageContent, buildCollabWsUrl, assertYjsEncodable, applyDocToFragment, } from "./lib/collaboration.js"; import { footnoteWarningsField } from "./lib/footnote-analyze.js"; import { buildPageTree } from "./lib/tree.js"; import { serializeDocmostMarkdown, parseDocmostMarkdown, } from "./lib/markdown-document.js"; @@ -1180,7 +1180,8 @@ export class DocmostClient { async importPageMarkdown(pageId, fullMarkdown) { await this.ensureAuthenticated(); const { meta, body, comments } = parseDocmostMarkdown(fullMarkdown); - const doc = await markdownToProseMirror(body); + // PAGE import: canonicalize footnotes (see markdownToProseMirrorCanonical). + const doc = await markdownToProseMirrorCanonical(body); const collabToken = await this.getCollabTokenWithReauth(); const mutation = await replacePageContent(pageId, doc, collabToken, this.apiUrl); // Collect distinct comment ids that actually became comment marks in the doc. @@ -1260,13 +1261,18 @@ export class DocmostClient { // uses, so copying never lands a javascript:/data: href/src on the target // (parity with updatePageJson; harmless for already-stored source content). this.validateDocUrls(content); + // Defense-in-depth (#228): this is a FULL-document write, so canonicalize + // footnotes before copying — a no-op on already-canonical source content, but + // it guarantees a copy can never propagate a non-canonical footnote topology + // to the target (parity with the other full-doc write paths). + const canonical = canonicalizeFootnotes(content); const collabToken = await this.getCollabTokenWithReauth(); - const mutation = await replacePageContent(targetPageId, content, collabToken, this.apiUrl); + const mutation = await replacePageContent(targetPageId, canonical, collabToken, this.apiUrl); return { success: true, sourcePageId, targetPageId, - copiedNodes: content.content.length, + copiedNodes: canonical.content.length, verify: mutation.verify, }; } @@ -1673,7 +1679,10 @@ export class DocmostClient { } } } - // Convert through the full Docmost schema (consistent with page paths) + // Convert through the full Docmost schema. Deliberately the NON-canonicalizing + // variant: a comment body may carry a footnote definition with no matching + // reference, and canonicalization would drop it (data loss). See + // markdownToProseMirror vs markdownToProseMirrorCanonical. const jsonContent = await markdownToProseMirror(content); const payload = { pageId, @@ -1761,6 +1770,7 @@ export class DocmostClient { } async updateComment(commentId, content) { await this.ensureAuthenticated(); + // NON-canonicalizing on purpose (comment body — see createComment). const jsonContent = await markdownToProseMirror(content); await this.client.post("/comments/update", { commentId, diff --git a/packages/mcp/build/lib/collaboration.js b/packages/mcp/build/lib/collaboration.js index 67942c6d..4504b8d0 100644 --- a/packages/mcp/build/lib/collaboration.js +++ b/packages/mcp/build/lib/collaboration.js @@ -347,24 +347,37 @@ function extractFootnotes(markdown) { /** * Convert markdown to a ProseMirror doc using the full Docmost schema. * - * NOTE: besides the page-import write paths, this is also reused for comment - * bodies (createComment / updateComment). For an ordinary comment the - * canonicalize call below is a no-op (a comment carries no footnotes), so the - * reuse is safe; the only theoretical effect is if footnote markup were ever - * authored INSIDE a comment — a narrow case where canonicalizing the comment's - * own (self-contained) footnotes is still the correct behaviour. + * This conversion does NOT canonicalize footnotes — it is the shared, content- + * preserving primitive used by BOTH page write paths and COMMENT bodies + * (createComment / updateComment). Canonicalization MUST NOT run on a comment + * body: a comment may legitimately contain a footnote-definition line + * (`[^1]: text`) with no matching reference, and the canonicalizer drops a + * reference-less footnotesList — which would silently delete the comment's text. + * + * Page write paths that DO need the canonical footnote topology call + * `markdownToProseMirrorCanonical` instead (markdown import, update_page markdown + * path). Keep this function reference-loss-free. */ export async function markdownToProseMirror(markdownContent) { const withCallouts = await preprocessCallouts(markdownContent); const { body, section } = extractFootnotes(withCallouts); const html = (await marked.parse(body)) + section; const bridged = bridgeTaskLists(html); - const json = generateJSON(bridged, docmostExtensions); - // Canonicalize footnotes on EVERY import: the section above is built in - // definition order, but numbering is derived from REFERENCE order — so without - // this the bottom list renders out of order (`1, 4, 2, 3, …`). Idempotent, so - // it is a no-op when the footnotes are already canonical. - return canonicalizeFootnotes(json); + return generateJSON(bridged, docmostExtensions); +} +/** + * Page-write variant of `markdownToProseMirror`: converts markdown then enforces + * the canonical footnote topology. The footnote `section` markdown is emitted in + * DEFINITION order, but numbering derives from REFERENCE order, so without this + * the bottom list renders out of order (`1, 4, 2, 3, …`); orphan definitions and + * duplicate lists are also normalized. Idempotent — a no-op once canonical, and a + * no-op for footnote-free content. + * + * Use this ONLY for full-document PAGE writes (never for comment bodies, where it + * would drop a reference-less footnote definition — see `markdownToProseMirror`). + */ +export async function markdownToProseMirrorCanonical(markdownContent) { + return canonicalizeFootnotes(await markdownToProseMirror(markdownContent)); } /** * Build the collaboration WebSocket URL from an API base URL: @@ -723,6 +736,8 @@ export async function replacePageContent(pageId, prosemirrorDoc, collabToken, ba * Tables and :::callout::: blocks survive thanks to the full schema. */ export async function updatePageContentRealtime(pageId, markdownContent, collabToken, baseUrl) { - const tiptapJson = await markdownToProseMirror(markdownContent); + // PAGE write: canonicalize footnotes (markdown import builds the bottom list in + // definition order; numbering is reference-ordered). + const tiptapJson = await markdownToProseMirrorCanonical(markdownContent); return await mutatePageContent(pageId, collabToken, baseUrl, () => tiptapJson); } diff --git a/packages/mcp/build/lib/transforms.js b/packages/mcp/build/lib/transforms.js index 9c5ecb7e..935ff33d 100644 --- a/packages/mcp/build/lib/transforms.js +++ b/packages/mcp/build/lib/transforms.js @@ -261,14 +261,16 @@ export function noteItem(inlineNodes) { * Wrap inline ProseMirror nodes in a real footnoteDefinition node keyed by id: * { type:"footnoteDefinition", attrs:{id}, content:[{ type:"paragraph", content }] } * (mirrors the editor-ext / docmost-schema FootnoteDefinition node). + * + * Built on the shared `makeFootnoteDefinition` factory (footnote-authoring.ts); + * the only extra is a fresh block id on the inner paragraph (Docmost stamps one, + * and the canonicalizer preserves attrs as-is). Single factory, one place to + * change the definition shape. */ export function footnoteDefinition(id, inlineNodes) { - const content = Array.isArray(inlineNodes) ? clone(inlineNodes) : []; - return { - type: "footnoteDefinition", - attrs: { id }, - content: [{ type: "paragraph", attrs: { id: freshId() }, content }], - }; + const node = makeFootnoteDefinition(id, inlineNodes); + node.content[0].attrs = { id: freshId() }; + return node; } /** * Replace every `[N]` body marker and `\u0000FN\u0000` comment placeholder in diff --git a/packages/mcp/src/client.ts b/packages/mcp/src/client.ts index 2b449924..2228b34e 100644 --- a/packages/mcp/src/client.ts +++ b/packages/mcp/src/client.ts @@ -17,6 +17,7 @@ import { updatePageContentRealtime, replacePageContent, markdownToProseMirror, + markdownToProseMirrorCanonical, mutatePageContent, buildCollabWsUrl, assertYjsEncodable, @@ -1487,7 +1488,8 @@ export class DocmostClient { async importPageMarkdown(pageId: string, fullMarkdown: string): Promise { await this.ensureAuthenticated(); const { meta, body, comments } = parseDocmostMarkdown(fullMarkdown); - const doc = await markdownToProseMirror(body); + // PAGE import: canonicalize footnotes (see markdownToProseMirrorCanonical). + const doc = await markdownToProseMirrorCanonical(body); const collabToken = await this.getCollabTokenWithReauth(); const mutation = await replacePageContent( pageId, @@ -1582,10 +1584,16 @@ export class DocmostClient { // (parity with updatePageJson; harmless for already-stored source content). this.validateDocUrls(content); + // Defense-in-depth (#228): this is a FULL-document write, so canonicalize + // footnotes before copying — a no-op on already-canonical source content, but + // it guarantees a copy can never propagate a non-canonical footnote topology + // to the target (parity with the other full-doc write paths). + const canonical = canonicalizeFootnotes(content); + const collabToken = await this.getCollabTokenWithReauth(); const mutation = await replacePageContent( targetPageId, - content, + canonical, collabToken, this.apiUrl, ); @@ -1594,7 +1602,7 @@ export class DocmostClient { success: true, sourcePageId, targetPageId, - copiedNodes: content.content.length, + copiedNodes: canonical.content.length, verify: mutation.verify, }; } @@ -2112,7 +2120,10 @@ export class DocmostClient { } } - // Convert through the full Docmost schema (consistent with page paths) + // Convert through the full Docmost schema. Deliberately the NON-canonicalizing + // variant: a comment body may carry a footnote definition with no matching + // reference, and canonicalization would drop it (data loss). See + // markdownToProseMirror vs markdownToProseMirrorCanonical. const jsonContent = await markdownToProseMirror(content); const payload: Record = { pageId, @@ -2215,6 +2226,7 @@ export class DocmostClient { async updateComment(commentId: string, content: string) { await this.ensureAuthenticated(); + // NON-canonicalizing on purpose (comment body — see createComment). const jsonContent = await markdownToProseMirror(content); await this.client.post("/comments/update", { commentId, diff --git a/packages/mcp/src/lib/collaboration.ts b/packages/mcp/src/lib/collaboration.ts index e6f57aa8..c8b1cf40 100644 --- a/packages/mcp/src/lib/collaboration.ts +++ b/packages/mcp/src/lib/collaboration.ts @@ -396,12 +396,16 @@ function extractFootnotes(markdown: string): { /** * Convert markdown to a ProseMirror doc using the full Docmost schema. * - * NOTE: besides the page-import write paths, this is also reused for comment - * bodies (createComment / updateComment). For an ordinary comment the - * canonicalize call below is a no-op (a comment carries no footnotes), so the - * reuse is safe; the only theoretical effect is if footnote markup were ever - * authored INSIDE a comment — a narrow case where canonicalizing the comment's - * own (self-contained) footnotes is still the correct behaviour. + * This conversion does NOT canonicalize footnotes — it is the shared, content- + * preserving primitive used by BOTH page write paths and COMMENT bodies + * (createComment / updateComment). Canonicalization MUST NOT run on a comment + * body: a comment may legitimately contain a footnote-definition line + * (`[^1]: text`) with no matching reference, and the canonicalizer drops a + * reference-less footnotesList — which would silently delete the comment's text. + * + * Page write paths that DO need the canonical footnote topology call + * `markdownToProseMirrorCanonical` instead (markdown import, update_page markdown + * path). Keep this function reference-loss-free. */ export async function markdownToProseMirror( markdownContent: string, @@ -410,12 +414,24 @@ export async function markdownToProseMirror( const { body, section } = extractFootnotes(withCallouts); const html = (await marked.parse(body)) + section; const bridged = bridgeTaskLists(html); - const json = generateJSON(bridged, docmostExtensions); - // Canonicalize footnotes on EVERY import: the section above is built in - // definition order, but numbering is derived from REFERENCE order — so without - // this the bottom list renders out of order (`1, 4, 2, 3, …`). Idempotent, so - // it is a no-op when the footnotes are already canonical. - return canonicalizeFootnotes(json); + return generateJSON(bridged, docmostExtensions); +} + +/** + * Page-write variant of `markdownToProseMirror`: converts markdown then enforces + * the canonical footnote topology. The footnote `section` markdown is emitted in + * DEFINITION order, but numbering derives from REFERENCE order, so without this + * the bottom list renders out of order (`1, 4, 2, 3, …`); orphan definitions and + * duplicate lists are also normalized. Idempotent — a no-op once canonical, and a + * no-op for footnote-free content. + * + * Use this ONLY for full-document PAGE writes (never for comment bodies, where it + * would drop a reference-less footnote definition — see `markdownToProseMirror`). + */ +export async function markdownToProseMirrorCanonical( + markdownContent: string, +): Promise { + return canonicalizeFootnotes(await markdownToProseMirror(markdownContent)); } /** @@ -816,7 +832,9 @@ export async function updatePageContentRealtime( collabToken: string, baseUrl: string, ): Promise { - const tiptapJson = await markdownToProseMirror(markdownContent); + // PAGE write: canonicalize footnotes (markdown import builds the bottom list in + // definition order; numbering is reference-ordered). + const tiptapJson = await markdownToProseMirrorCanonical(markdownContent); return await mutatePageContent( pageId, collabToken, diff --git a/packages/mcp/src/lib/transforms.ts b/packages/mcp/src/lib/transforms.ts index 639f6d9e..2bb96393 100644 --- a/packages/mcp/src/lib/transforms.ts +++ b/packages/mcp/src/lib/transforms.ts @@ -327,14 +327,16 @@ export function noteItem(inlineNodes: any[]): any { * Wrap inline ProseMirror nodes in a real footnoteDefinition node keyed by id: * { type:"footnoteDefinition", attrs:{id}, content:[{ type:"paragraph", content }] } * (mirrors the editor-ext / docmost-schema FootnoteDefinition node). + * + * Built on the shared `makeFootnoteDefinition` factory (footnote-authoring.ts); + * the only extra is a fresh block id on the inner paragraph (Docmost stamps one, + * and the canonicalizer preserves attrs as-is). Single factory, one place to + * change the definition shape. */ export function footnoteDefinition(id: string, inlineNodes: any[]): any { - const content = Array.isArray(inlineNodes) ? clone(inlineNodes) : []; - return { - type: "footnoteDefinition", - attrs: { id }, - content: [{ type: "paragraph", attrs: { id: freshId() }, content }], - }; + const node = makeFootnoteDefinition(id, inlineNodes); + node.content[0].attrs = { id: freshId() }; + return node; } /** diff --git a/packages/mcp/test/unit/collaboration.test.mjs b/packages/mcp/test/unit/collaboration.test.mjs index ab07a414..84801840 100644 --- a/packages/mcp/test/unit/collaboration.test.mjs +++ b/packages/mcp/test/unit/collaboration.test.mjs @@ -4,6 +4,7 @@ import assert from "node:assert/strict"; import { buildCollabWsUrl, markdownToProseMirror, + markdownToProseMirrorCanonical, } from "../../build/lib/collaboration.js"; /** Recursively find the first descendant node (or self) of the given type. */ @@ -124,3 +125,38 @@ test("markdownToProseMirror: an aligned GFM table maps header alignment", async ["left", "center", "right"], ); }); + +// Comment-body data-loss guard (#228 review #4): markdownToProseMirror is reused +// for COMMENT bodies (createComment/updateComment), so it must NOT canonicalize — +// a comment may legitimately carry a standalone footnote definition with no +// matching reference, and canonicalization would drop the whole list (the text +// would vanish). The page-write variant DOES canonicalize. +test("markdownToProseMirror (comment path) PRESERVES a reference-less footnote definition", async () => { + const md = "A comment.\n\n[^1]: a standalone footnote definition"; + const doc = await markdownToProseMirror(md); + const defs = findAll(doc, "footnoteDefinition"); + assert.equal(defs.length, 1, "the footnote definition must be preserved"); + assert.match( + JSON.stringify(doc), + /a standalone footnote definition/, + "the definition text must survive the comment write path", + ); +}); + +test("markdownToProseMirrorCanonical (page path) DROPS a reference-less footnote definition", async () => { + // Same input through the PAGE variant: with no reference, the canonical doc has + // no footnotesList (this is the page-side behavior the comment path must avoid). + const md = "A page.\n\n[^1]: a standalone footnote definition"; + const doc = await markdownToProseMirrorCanonical(md); + assert.equal(findAll(doc, "footnotesList").length, 0); + assert.equal(findAll(doc, "footnoteDefinition").length, 0); +}); + +test("markdownToProseMirrorCanonical still canonicalizes a real page footnote (order)", async () => { + // Page path must STILL canonicalize: refs b,a -> definitions reorder to b,a. + const md = "See[^b] then[^a].\n\n[^a]: alpha\n[^b]: bravo"; + const doc = await markdownToProseMirrorCanonical(md); + const defs = findAll(doc, "footnoteDefinition").map((d) => d.attrs.id); + assert.deepEqual(defs, ["b", "a"]); + assert.equal(findAll(doc, "footnotesList").length, 1); +}); diff --git a/packages/mcp/test/unit/footnote-canonicalize.test.mjs b/packages/mcp/test/unit/footnote-canonicalize.test.mjs index c4b68ce5..3052aad2 100644 --- a/packages/mcp/test/unit/footnote-canonicalize.test.mjs +++ b/packages/mcp/test/unit/footnote-canonicalize.test.mjs @@ -4,7 +4,7 @@ import assert from "node:assert/strict"; import { canonicalizeFootnotes } from "../../build/lib/footnote-canonicalize.js"; import { footnoteContentKey } from "../../build/lib/footnote-authoring.js"; import { insertInlineFootnote } from "../../build/lib/transforms.js"; -import { markdownToProseMirror } from "../../build/lib/collaboration.js"; +import { markdownToProseMirrorCanonical } from "../../build/lib/collaboration.js"; function findAll(node, type, acc = []) { if (!node || typeof node !== "object") return acc; @@ -190,10 +190,12 @@ test("insertInlineFootnote: codeBlock match is skipped, a later body paragraph s assert.equal(findAll(r.doc, "footnoteReference").length, 1); }); -test("markdown import: out-of-order definitions render as a reference-ordered list", async () => { +test("markdown import (page path): out-of-order definitions render as a reference-ordered list", async () => { // References appear b, a, c in the body; definitions are written in a, b, c - // order (the import order). After canonicalization the bottom list follows - // REFERENCE order so the numbers read 1, 2, 3 down the list. + // order (the import order). The PAGE import path (markdownToProseMirrorCanonical) + // canonicalizes so the bottom list follows REFERENCE order — numbers read 1, 2, + // 3 down the list. (The non-canonicalizing markdownToProseMirror, used for + // comment bodies, would keep the import order; see collaboration.test.mjs.) const md = [ "See[^b] then[^a] then[^c].", "", @@ -201,7 +203,7 @@ test("markdown import: out-of-order definitions render as a reference-ordered li "[^b]: bravo", "[^c]: charlie", ].join("\n"); - const json = await markdownToProseMirror(md); + const json = await markdownToProseMirrorCanonical(md); assert.deepEqual(defIds(json), ["b", "a", "c"]); assert.equal(findAll(json, "footnotesList").length, 1); });