Files
gitmost/packages/mcp/src/lib/footnote-canonicalize.ts
a c4ed4a4855 fix(footnotes): strip bare definitions on rebuild; MCP full-doc + zip-import canonicalize tests (#228)
Review #6 (approve-with-comments) follow-ups:
1. canonicalize step 7 now strips bare footnoteDefinitions at ANY depth
   (stripFootnoteDefinitionsDeep), not just footnotesList, in BOTH copies. A
   definition hand-authored outside a list (e.g. nested in a callout via a
   raw-JSON write path) was left in place while a copy was also added to the
   rebuilt list -> duplicate, idempotent, self-perpetuating. Runs only in the
   rebuild path (after the lists are stripped); the fast-path / placement-keep
   branch is untouched. Added a shared-corpus case (bare def nested in a callout)
   to pin it in both mirrors.
2. markdown-clipboard: removed the dead top-level footnoteReference check in
   canonicalizePastedFootnotes (an inline atom is never a top-level slice child;
   only the descendants scan can find it).

Test coverage:
4. New MCP binding tests (full-doc-write-canonicalize.test.mjs): update_page_json
   and copy_page_content canonicalize the persisted full doc, asserted via a new
   `replacePage` seam (symmetric to the existing `mutatePage` seam) so no live
   collab socket is needed. Routed both writers through the seam.
5. New server spec (file-import-task.service.footnote-canonicalize.spec.ts): the
   zip-import path (processGenericImport) canonicalizes footnotes — real
   markdown->HTML->JSON via a real ImportService over a temp-dir .md file, DB trx
   stubbed to capture the persisted page content. FileImportTaskService had no
   spec before.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-28 01:39:25 +03:00

226 lines
8.9 KiB
TypeScript

/**
* Server-side footnote canonicalizer (MCP mirror — PURE).
*
* `canonicalizeFootnotes(doc)` is a pure ProseMirror-JSON port of the editor's
* `footnoteSyncPlugin` end-state, identical in behaviour to
* `@docmost/editor-ext`'s `canonicalizeFootnotes`. It is mirrored here — rather
* than imported from editor-ext — for the SAME reason `footnote-lex.ts` and the
* `docmost-schema.ts` nodes are mirrored: the MCP package is deliberately
* decoupled from the browser/React-heavy editor barrel and operates on plain
* JSON. The editor-ext copy owns the golden test against the live plugin; this
* copy must stay behaviourally identical (a SHARED golden corpus, exercised by
* both test suites, pins that — see `test/unit/footnote-corpus.mjs`).
*
* This module is the pure MIRROR only. The inline-authoring helpers
* (`footnoteContentKey`, `makeFootnoteDefinition`, `generateFootnoteId`) used by
* `insertInlineFootnote` live in the sibling `footnote-authoring.ts`, so this
* file is compositionally symmetric to the editor-ext copy.
*
* Why it exists: every NON-editor write path (markdown import, update_page_json,
* docmost_transform, insert_footnote) builds ProseMirror JSON directly, so the
* editor's footnote plugins never run and the canonical topology (sequential
* numbering by first reference, one trailing list, no orphans, no raw `[^id]`)
* was never enforced. Running this at the end of every write path closes that
* gap; because it is idempotent, it is a no-op when the footnotes are already
* canonical (no spurious mutations / git-sync churn).
*
* ENFORCEMENT RULE (#228): any NEW FULL-document persist path MUST call
* `canonicalizeFootnotes(doc)` before writing — the current callers are
* `markdownToProseMirrorCanonical` (page markdown import/update; the plain
* `markdownToProseMirror` used for COMMENT bodies must NOT, or it would drop a
* reference-less definition), `update_page_json`, `docmost_transform`,
* `insert_footnote`, and `copy_page_content`. Append/prepend FRAGMENT writes MUST
* NOT canonicalize. This is deliberately per-call-site (the replace-vs-fragment
* and comment-vs-page nuances make a single naive wrapper unsafe).
*/
const FOOTNOTE_REFERENCE_NAME = "footnoteReference";
const FOOTNOTES_LIST_NAME = "footnotesList";
const FOOTNOTE_DEFINITION_NAME = "footnoteDefinition";
function cloneJson<T>(v: T): T {
if (typeof structuredClone === "function") return structuredClone(v);
return JSON.parse(JSON.stringify(v)) as T;
}
function isEmptyParagraph(node: any): boolean {
return (
!!node &&
node.type === "paragraph" &&
(!Array.isArray(node.content) || node.content.length === 0)
);
}
function collectReferenceIds(node: any, out: string[], seen: Set<string>): void {
if (!node || typeof node !== "object") return;
if (node.type === FOOTNOTE_REFERENCE_NAME) {
const id = node?.attrs?.id;
if (id && !seen.has(id)) {
seen.add(id);
out.push(id);
}
}
if (Array.isArray(node.content)) {
for (const child of node.content) collectReferenceIds(child, out, seen);
}
}
function collectDefinitions(node: any, out: any[]): void {
if (!node || typeof node !== "object") return;
if (node.type === FOOTNOTE_DEFINITION_NAME) out.push(node);
if (Array.isArray(node.content)) {
for (const child of node.content) collectDefinitions(child, out);
}
}
function emptyDefinition(id: string): any {
return {
type: FOOTNOTE_DEFINITION_NAME,
attrs: { id },
content: [{ type: "paragraph" }],
};
}
/**
* Deep equality over plain JSON: arrays are compared POSITIONALLY
* (order-SENSITIVE), object keys order-insensitively. The array order-sensitivity
* is required for correctness here — a reordered `footnotesList.content` must
* compare UNEQUAL so the canonical rebuild fires instead of leaving it in place.
*/
function deepEqualJson(a: any, b: any): boolean {
if (a === b) return true;
if (a == null || b == null || typeof a !== typeof b) return false;
if (Array.isArray(a) || Array.isArray(b)) {
if (!Array.isArray(a) || !Array.isArray(b) || a.length !== b.length) {
return false;
}
for (let i = 0; i < a.length; i++) {
if (!deepEqualJson(a[i], b[i])) return false;
}
return true;
}
if (typeof a === "object") {
const ka = Object.keys(a);
const kb = Object.keys(b);
if (ka.length !== kb.length) return false;
for (const k of ka) {
if (!Object.prototype.hasOwnProperty.call(b, k)) return false;
if (!deepEqualJson(a[k], b[k])) return false;
}
return true;
}
return false;
}
/**
* Canonicalize footnotes in a ProseMirror-JSON document. See the file header and
* the editor-ext twin for the full contract. Pure (deep-clones input,
* deterministic, idempotent).
*/
export function canonicalizeFootnotes<T = any>(doc: T): T {
if (
doc == null ||
typeof doc !== "object" ||
!Array.isArray((doc as any).content)
) {
return doc;
}
const out = cloneJson(doc) as any;
// 1) Distinct reference ids in document order (deep — refs can live in
// callouts, tables, list items, ...). The ordering/numbering truth.
const referenceIds: string[] = [];
collectReferenceIds(out, referenceIds, new Set<string>());
// 2) Every definition node in document order (deep).
const defNodes: any[] = [];
collectDefinitions(out, defNodes);
// 3) First definition per id wins; later duplicates carry the SAME id, so they
// cannot be referenced separately and would be orphans — they are dropped.
const defById = new Map<string, any>();
for (const d of defNodes) {
const id = d?.attrs?.id;
if (id && !defById.has(id)) defById.set(id, d);
}
// 4) Build the ordered definition list: one per referenced id, in REFERENCE
// order, reusing the existing node (shallow-copied, id normalized — `out` is
// already deep-cloned and the old lists are cut) or synthesizing an empty
// one. Definitions whose id is not referenced are orphans and never added.
const orderedDefs: any[] = [];
for (const id of referenceIds) {
const existing = defById.get(id);
if (existing) {
orderedDefs.push({
...existing,
attrs: { ...(existing.attrs ?? {}), id },
});
} else {
orderedDefs.push(emptyDefinition(id));
}
}
// 5) No references -> there must be NO list at all (at any depth).
if (referenceIds.length === 0) {
stripFootnotesListsDeep(out);
return out;
}
// 6) Placement parity with the live plugin: when the document is ALREADY in the
// canonical single-list state, leave that list exactly where it sits rather
// than cutting and re-inserting it at the end (the plugin never repositions a
// sole correct list, so moving it would silently reorder any content that
// follows the list on the first write).
const topLevelLists = out.content.filter(
(n: any) => n && n.type === FOOTNOTES_LIST_NAME,
);
if (
topLevelLists.length === 1 &&
defNodes.length === orderedDefs.length &&
deepEqualJson(topLevelLists[0].content, orderedDefs)
) {
return out;
}
// 7) Otherwise rebuild: strip every footnotesList AND every bare
// footnoteDefinition at ANY depth (collectDefinitions gathers defs
// recursively, so a list nested in a callout/blockquote — or a bare
// definition outside any list — would otherwise have its defs copied into the
// rebuilt list while the original survives in place → duplicates) and
// re-insert exactly one list after the last meaningful (non-empty paragraph)
// top-level block.
stripFootnotesListsDeep(out);
stripFootnoteDefinitionsDeep(out);
const top: any[] = out.content;
let insertAt = top.length;
while (insertAt > 0 && isEmptyParagraph(top[insertAt - 1])) insertAt--;
top.splice(insertAt, 0, { type: FOOTNOTES_LIST_NAME, content: orderedDefs });
out.content = top;
return out;
}
/** Remove every `footnotesList` node at ANY depth (mutates the given clone). */
function stripFootnotesListsDeep(node: any): void {
if (!node || typeof node !== "object" || !Array.isArray(node.content)) return;
node.content = node.content.filter(
(c: any) => !(c && c.type === FOOTNOTES_LIST_NAME),
);
for (const child of node.content) stripFootnotesListsDeep(child);
}
/**
* Remove every BARE `footnoteDefinition` node at ANY depth (mutates the given
* clone). Runs only in the rebuild path AFTER the lists are stripped, so it
* targets definitions that were sitting outside a list (e.g. hand-authored via a
* raw-JSON write path and nested in a callout); their content was already copied
* into the rebuilt list, so leaving the originals would duplicate them.
*/
function stripFootnoteDefinitionsDeep(node: any): void {
if (!node || typeof node !== "object" || !Array.isArray(node.content)) return;
node.content = node.content.filter(
(c: any) => !(c && c.type === FOOTNOTE_DEFINITION_NAME),
);
for (const child of node.content) stripFootnoteDefinitionsDeep(child);
}