refactor(footnotes): address PR #169 review

- footnote-sync: remove the now-dead `refReids` (CollisionPlan field, local, return, the 6a consumer loop) — references are never re-id'd under reuse, so it was dead structure on the hot reconciliation path. Rewrite the stale comments (plugin header, step 0, refOccurrences field) that still described the old "duplicates re-id'd so both survive" model so that they describe the reuse model.
- Shared footnote lexer: new packages/mcp/src/lib/footnote-lex.ts (lexFootnoteLines + forEachFootnoteReference). extractFootnotes (collaboration) and analyzeFootnotes now consume the SAME fence-aware lexer, so "the analyzer sees exactly what the importer keeps/strips" is a structural property rather than one kept in sync by comments. Removed the duplicated DEF_RE/fence state machine from both consumers.
- Tests: new mock test for the footnoteWarnings plumbing on createPage (problems -> field present; clean -> omitted); new paste-reuse case for TWO colliding pasted definitions (reservation -> distinct ids). Updated the derive-id golden test header (there is no MCP copy / parity test anymore).
- CHANGELOG: [Unreleased] entries for footnote reuse (Changed, supersedes 0.93.0) and footnoteWarnings (Added).

editor-ext 129, MCP 301, server roundtrip 2; client+server tsc clean.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-24 16:16:30 +03:00
parent 17e683a311
commit a0cc625dfe
11 changed files with 389 additions and 166 deletions
--- a/packages/mcp/build/lib/collaboration.js
+++ b/packages/mcp/build/lib/collaboration.js
@@ -10,6 +10,7 @@ import { JSDOM } from "jsdom";
 import { docmostExtensions, docmostSchema } from "./docmost-schema.js";
 import { withPageLock } from "./page-lock.js";
 import { sanitizeForYjs, findUnstorableAttr } from "./node-ops.js";
+import { lexFootnoteLines } from "./footnote-lex.js";
 import { summarizeChange } from "./diff.js";
 /**
 * Build the descriptive error for an opaque Yjs encode failure ("Unexpected
@@ -280,7 +281,8 @@ function bridgeTaskLists(html) {
 // Mirror of packages/editor-ext footnote markdown handling. A `[^id]` inline
 // marker becomes <sup data-footnote-ref data-id="id">, and `[^id]: text`
 // definition lines are collected into a single <section data-footnotes>.
-const FOOTNOTE_DEF_RE = /^\[\^([^\]\s]+)\]:[ \t]*(.*)$/;
+// Definition detection + fence handling are shared with analyzeFootnotes via
+// lexFootnoteLines (footnote-lex.js). FOOTNOTE_REF_RE is the inline tokenizer's.
 const FOOTNOTE_REF_RE = /\[\^([^\]\s]+)\]/;
 function escapeFootnoteAttr(value) {
    return String(value).replace(/&/g, "&amp;").replace(/"/g, "&quot;");
@@ -308,28 +310,17 @@ marked.use({ extensions: [footnoteRefMarkedExtension] });
 * <section data-footnotes> for them (or "" when there are none).
 */
 function extractFootnotes(markdown) {
-    const lines = markdown.split("\n");
    const bodyLines = [];
    const defs = [];
-    // Track fenced-code state so a `[^id]: ...` line shown inside a ``` / ~~~ code
-    // block is preserved verbatim and not treated as a footnote definition.
-    let fence = null;
-    for (const line of lines) {
-        const fenceMatch = /^(\s*)(`{3,}|~{3,})/.exec(line);
-        if (fenceMatch) {
-            const marker = fenceMatch[2][0];
-            if (fence === null)
-                fence = marker;
-            else if (marker === fence)
-                fence = null;
-            bodyLines.push(line);
-            continue;
-        }
-        const m = fence === null ? FOOTNOTE_DEF_RE.exec(line) : null;
-        if (m)
-            defs.push({ id: m[1], text: m[2] });
+    // Shared lexer (footnote-lex): a `[^id]: ...` line inside a ``` / ~~~ code
+    // block is inert and stays in the body verbatim; only real definition lines
+    // are pulled out. analyzeFootnotes() consumes the SAME lexer so its diagnostics
+    // match exactly what import keeps/strips (#166).
+    for (const tok of lexFootnoteLines(markdown)) {
+        if (!tok.inFence && tok.definition)
+            defs.push(tok.definition);
        else
-            bodyLines.push(line);
+            bodyLines.push(tok.line);
    }
    if (defs.length === 0)
        return { body: markdown, section: "" };
--- a/packages/mcp/build/lib/footnote-analyze.js
+++ b/packages/mcp/build/lib/footnote-analyze.js
@@ -16,24 +16,11 @@
 *    the line, trimmed, starts with `|`) — footnotes in table cells often do not
 *    render as expected.
 */
-/** Matches a footnote DEFINITION line: `[^id]: text` (id + text captured). */
-const DEF_RE = /^\[\^([^\]\s]+)\]:[ \t]*(.*)$/;
-/** Matches every footnote REFERENCE `[^id]` in a line (global; id captured). */
-const REF_RE_G = /\[\^([^\]\s]+)\]/g;
-/** Opening/closing fence marker (``` or ~~~). */
-const FENCE_RE = /^(\s*)(`{3,}|~{3,})/;
-/** Scan a line for every `[^id]` reference, invoking `onRef(id)` for each. */
-function forEachReference(line, onRef) {
-    REF_RE_G.lastIndex = 0;
-    let m;
-    while ((m = REF_RE_G.exec(line)) !== null)
-        onRef(m[1]);
-}
+import { lexFootnoteLines, forEachFootnoteReference, } from "./footnote-lex.js";
 /**
 * Analyze the footnotes in a Markdown string. Pure; safe to call on any body.
 */
 export function analyzeFootnotes(markdown) {
-    const lines = markdown.split("\n");
    // Distinct reference ids in first-appearance order, plus the set of ids seen
    // inside a table row.
    const refIds = [];
@@ -49,24 +36,13 @@ export function analyzeFootnotes(markdown) {
    };
    // Definition texts per id, in first-appearance order of the id.
    const defTextsById = new Map();
-    let fence = null;
-    for (const line of lines) {
-        const fenceMatch = FENCE_RE.exec(line);
-        if (fenceMatch) {
-            const marker = fenceMatch[2][0];
-            if (fence === null)
-                fence = marker;
-            else if (marker === fence)
-                fence = null;
+    // Same lexer the importer uses, so the analysis matches exactly what import
+    // keeps/strips (#166): fenced lines are inert, definition lines are pulled.
+    for (const tok of lexFootnoteLines(markdown)) {
+        if (tok.inFence)
            continue;
-        }
-        // Footnote syntax shown inside a code fence is not real markup.
-        if (fence !== null)
-            continue;
-        const defM = DEF_RE.exec(line);
-        if (defM) {
-            const id = defM[1];
-            const text = defM[2];
+        if (tok.definition) {
+            const { id, text } = tok.definition;
            const arr = defTextsById.get(id);
            if (arr)
                arr.push(text);
@@ -74,11 +50,11 @@ export function analyzeFootnotes(markdown) {
                defTextsById.set(id, [text]);
            // A definition's TEXT can itself reference another footnote (`[^a]: see
            // [^b]`); count those so such a `[^b]` is not falsely reported dangling.
-            forEachReference(text, (rid) => addRef(rid, false));
+            forEachFootnoteReference(text, (rid) => addRef(rid, false));
            continue;
        }
-        const inTable = line.trimStart().startsWith("|");
-        forEachReference(line, (id) => addRef(id, inTable));
+        const inTable = tok.line.trimStart().startsWith("|");
+        forEachFootnoteReference(tok.line, (id) => addRef(id, inTable));
    }
    const danglingReferences = refIds.filter((id) => !defTextsById.has(id));
    const duplicateDefinitions = [];
--- a/packages/mcp/build/lib/footnote-lex.js
+++ b/packages/mcp/build/lib/footnote-lex.js
@@ -0,0 +1,55 @@
+/**
+ * Shared, fence-aware line lexer for footnote markdown (MCP-internal).
+ *
+ * Both the importer (`extractFootnotes` in collaboration.ts, which strips
+ * definition lines and rebuilds a footnotes section) and the diagnostics
+ * (`analyzeFootnotes` in footnote-analyze.ts) must agree EXACTLY on which lines
+ * are definitions and which lines are inert (inside a code fence). Sharing one
+ * lexer makes "the analyzer sees what the importer leaves" a structural property
+ * instead of two hand-kept copies that can drift (#166 review).
+ *
+ * NOTE: this is deliberately NOT shared with editor-ext's
+ * `extractFootnoteDefinitions` — that lives in a different package and the
+ * decoupling between the editor and the MCP mirror is intentional.
+ */
+/** A footnote DEFINITION line: `[^id]: text` (id + text captured). */
+export const FOOTNOTE_DEF_RE = /^\[\^([^\]\s]+)\]:[ \t]*(.*)$/;
+/** Every footnote REFERENCE `[^id]` in a line (global; id captured). */
+export const FOOTNOTE_REF_RE_G = /\[\^([^\]\s]+)\]/g;
+/** Opening/closing code fence marker (``` or ~~~). */
+const FENCE_RE = /^(\s*)(`{3,}|~{3,})/;
+/** Classify every line of `markdown`, tracking fenced-code state. Pure. */
+export function lexFootnoteLines(markdown) {
+    const out = [];
+    let fence = null;
+    for (const line of markdown.split("\n")) {
+        const fenceMatch = FENCE_RE.exec(line);
+        if (fenceMatch) {
+            const marker = fenceMatch[2][0];
+            if (fence === null)
+                fence = marker; // opening fence
+            else if (marker === fence)
+                fence = null; // matching closing fence
+            out.push({ line, inFence: true, definition: null });
+            continue;
+        }
+        if (fence !== null) {
+            out.push({ line, inFence: true, definition: null });
+            continue;
+        }
+        const m = FOOTNOTE_DEF_RE.exec(line);
+        out.push({
+            line,
+            inFence: false,
+            definition: m ? { id: m[1], text: m[2] } : null,
+        });
+    }
+    return out;
+}
+/** Scan a line for every `[^id]` reference, invoking `onRef(id)` for each. */
+export function forEachFootnoteReference(line, onRef) {
+    FOOTNOTE_REF_RE_G.lastIndex = 0;
+    let m;
+    while ((m = FOOTNOTE_REF_RE_G.exec(line)) !== null)
+        onRef(m[1]);
+}
--- a/packages/mcp/src/lib/collaboration.ts
+++ b/packages/mcp/src/lib/collaboration.ts
@@ -10,6 +10,7 @@ import { JSDOM } from "jsdom";
 import { docmostExtensions, docmostSchema } from "./docmost-schema.js";
 import { withPageLock } from "./page-lock.js";
 import { sanitizeForYjs, findUnstorableAttr } from "./node-ops.js";
+import { lexFootnoteLines } from "./footnote-lex.js";
 import { summarizeChange, VerifyReport } from "./diff.js";

 /**
@@ -316,7 +317,8 @@ function bridgeTaskLists(html: string): string {
 // Mirror of packages/editor-ext footnote markdown handling. A `[^id]` inline
 // marker becomes <sup data-footnote-ref data-id="id">, and `[^id]: text`
 // definition lines are collected into a single <section data-footnotes>.
-const FOOTNOTE_DEF_RE = /^\[\^([^\]\s]+)\]:[ \t]*(.*)$/;
+// Definition detection + fence handling are shared with analyzeFootnotes via
+// lexFootnoteLines (footnote-lex.js). FOOTNOTE_REF_RE is the inline tokenizer's.
 const FOOTNOTE_REF_RE = /\[\^([^\]\s]+)\]/;

 function escapeFootnoteAttr(value: string): string {
@@ -353,24 +355,15 @@ function extractFootnotes(markdown: string): {
  body: string;
  section: string;
 } {
-  const lines = markdown.split("\n");
  const bodyLines: string[] = [];
  const defs: Array<{ id: string; text: string }> = [];
-  // Track fenced-code state so a `[^id]: ...` line shown inside a ``` / ~~~ code
-  // block is preserved verbatim and not treated as a footnote definition.
-  let fence: string | null = null;
-  for (const line of lines) {
-    const fenceMatch = /^(\s*)(`{3,}|~{3,})/.exec(line);
-    if (fenceMatch) {
-      const marker = fenceMatch[2][0];
-      if (fence === null) fence = marker;
-      else if (marker === fence) fence = null;
-      bodyLines.push(line);
-      continue;
-    }
-    const m = fence === null ? FOOTNOTE_DEF_RE.exec(line) : null;
-    if (m) defs.push({ id: m[1], text: m[2] });
-    else bodyLines.push(line);
+  // Shared lexer (footnote-lex): a `[^id]: ...` line inside a ``` / ~~~ code
+  // block is inert and stays in the body verbatim; only real definition lines
+  // are pulled out. analyzeFootnotes() consumes the SAME lexer so its diagnostics
+  // match exactly what import keeps/strips (#166).
+  for (const tok of lexFootnoteLines(markdown)) {
+    if (!tok.inFence && tok.definition) defs.push(tok.definition);
+    else bodyLines.push(tok.line);
  }
  if (defs.length === 0) return { body: markdown, section: "" };

--- a/packages/mcp/src/lib/footnote-analyze.ts
+++ b/packages/mcp/src/lib/footnote-analyze.ts
@@ -17,12 +17,10 @@
 *    render as expected.
 */

-/** Matches a footnote DEFINITION line: `[^id]: text` (id + text captured). */
-const DEF_RE = /^\[\^([^\]\s]+)\]:[ \t]*(.*)$/;
-/** Matches every footnote REFERENCE `[^id]` in a line (global; id captured). */
-const REF_RE_G = /\[\^([^\]\s]+)\]/g;
-/** Opening/closing fence marker (``` or ~~~). */
-const FENCE_RE = /^(\s*)(`{3,}|~{3,})/;
+import {
+  lexFootnoteLines,
+  forEachFootnoteReference,
+} from "./footnote-lex.js";

 export interface FootnoteDiagnostics {
  /** Reference ids (distinct, document order) with no matching definition. */
@@ -37,19 +35,10 @@ export interface FootnoteDiagnostics {
  warnings: string[];
 }

-/** Scan a line for every `[^id]` reference, invoking `onRef(id)` for each. */
-function forEachReference(line: string, onRef: (id: string) => void): void {
-  REF_RE_G.lastIndex = 0;
-  let m: RegExpExecArray | null;
-  while ((m = REF_RE_G.exec(line)) !== null) onRef(m[1]);
-}
-
 /**
 * Analyze the footnotes in a Markdown string. Pure; safe to call on any body.
 */
 export function analyzeFootnotes(markdown: string): FootnoteDiagnostics {
-  const lines = markdown.split("\n");
-
  // Distinct reference ids in first-appearance order, plus the set of ids seen
  // inside a table row.
  const refIds: string[] = [];
@@ -66,33 +55,22 @@ export function analyzeFootnotes(markdown: string): FootnoteDiagnostics {
  // Definition texts per id, in first-appearance order of the id.
  const defTextsById = new Map<string, string[]>();

-  let fence: string | null = null;
-  for (const line of lines) {
-    const fenceMatch = FENCE_RE.exec(line);
-    if (fenceMatch) {
-      const marker = fenceMatch[2][0];
-      if (fence === null) fence = marker;
-      else if (marker === fence) fence = null;
-      continue;
-    }
-    // Footnote syntax shown inside a code fence is not real markup.
-    if (fence !== null) continue;
-
-    const defM = DEF_RE.exec(line);
-    if (defM) {
-      const id = defM[1];
-      const text = defM[2];
+  // Same lexer the importer uses, so the analysis matches exactly what import
+  // keeps/strips (#166): fenced lines are inert, definition lines are pulled.
+  for (const tok of lexFootnoteLines(markdown)) {
+    if (tok.inFence) continue;
+    if (tok.definition) {
+      const { id, text } = tok.definition;
      const arr = defTextsById.get(id);
      if (arr) arr.push(text);
      else defTextsById.set(id, [text]);
      // A definition's TEXT can itself reference another footnote (`[^a]: see
      // [^b]`); count those so such a `[^b]` is not falsely reported dangling.
-      forEachReference(text, (rid) => addRef(rid, false));
+      forEachFootnoteReference(text, (rid) => addRef(rid, false));
      continue;
    }
-
-    const inTable = line.trimStart().startsWith("|");
-    forEachReference(line, (id) => addRef(id, inTable));
+    const inTable = tok.line.trimStart().startsWith("|");
+    forEachFootnoteReference(tok.line, (id) => addRef(id, inTable));
  }

  const danglingReferences = refIds.filter((id) => !defTextsById.has(id));
--- a/packages/mcp/src/lib/footnote-lex.ts
+++ b/packages/mcp/src/lib/footnote-lex.ts
@@ -0,0 +1,71 @@
+/**
+ * Shared, fence-aware line lexer for footnote markdown (MCP-internal).
+ *
+ * Both the importer (`extractFootnotes` in collaboration.ts, which strips
+ * definition lines and rebuilds a footnotes section) and the diagnostics
+ * (`analyzeFootnotes` in footnote-analyze.ts) must agree EXACTLY on which lines
+ * are definitions and which lines are inert (inside a code fence). Sharing one
+ * lexer makes "the analyzer sees what the importer leaves" a structural property
+ * instead of two hand-kept copies that can drift (#166 review).
+ *
+ * NOTE: this is deliberately NOT shared with editor-ext's
+ * `extractFootnoteDefinitions` — that lives in a different package and the
+ * decoupling between the editor and the MCP mirror is intentional.
+ */
+
+/** A footnote DEFINITION line: `[^id]: text` (id + text captured). */
+export const FOOTNOTE_DEF_RE = /^\[\^([^\]\s]+)\]:[ \t]*(.*)$/;
+/** Every footnote REFERENCE `[^id]` in a line (global; id captured). */
+export const FOOTNOTE_REF_RE_G = /\[\^([^\]\s]+)\]/g;
+/** Opening/closing code fence marker (``` or ~~~). */
+const FENCE_RE = /^(\s*)(`{3,}|~{3,})/;
+
+export interface FootnoteLine {
+  /** The raw line, verbatim. */
+  line: string;
+  /**
+   * True for a code-fence marker line AND every line inside a fence — footnote
+   * syntax on such lines is inert (example text, not real markup). The importer
+   * keeps these in the body; the analyzer skips them.
+   */
+  inFence: boolean;
+  /** The parsed definition, when this is a `[^id]: text` line OUTSIDE any fence. */
+  definition: { id: string; text: string } | null;
+}
+
+/** Classify every line of `markdown`, tracking fenced-code state. Pure. */
+export function lexFootnoteLines(markdown: string): FootnoteLine[] {
+  const out: FootnoteLine[] = [];
+  let fence: string | null = null;
+  for (const line of markdown.split("\n")) {
+    const fenceMatch = FENCE_RE.exec(line);
+    if (fenceMatch) {
+      const marker = fenceMatch[2][0];
+      if (fence === null) fence = marker; // opening fence
+      else if (marker === fence) fence = null; // matching closing fence
+      out.push({ line, inFence: true, definition: null });
+      continue;
+    }
+    if (fence !== null) {
+      out.push({ line, inFence: true, definition: null });
+      continue;
+    }
+    const m = FOOTNOTE_DEF_RE.exec(line);
+    out.push({
+      line,
+      inFence: false,
+      definition: m ? { id: m[1], text: m[2] } : null,
+    });
+  }
+  return out;
+}
+
+/** Scan a line for every `[^id]` reference, invoking `onRef(id)` for each. */
+export function forEachFootnoteReference(
+  line: string,
+  onRef: (id: string) => void,
+): void {
+  FOOTNOTE_REF_RE_G.lastIndex = 0;
+  let m: RegExpExecArray | null;
+  while ((m = FOOTNOTE_REF_RE_G.exec(line)) !== null) onRef(m[1]);
+}
--- a/packages/mcp/test/mock/footnote-warnings.test.mjs
+++ b/packages/mcp/test/mock/footnote-warnings.test.mjs
@@ -0,0 +1,110 @@
+// Mock-HTTP test for the footnoteWarnings plumbing (#166). createPage is the
+// representative path that is fully plain-HTTP (import + getPage) and so is
+// mockable here; updatePage / importPageMarkdown attach footnoteWarnings with the
+// IDENTICAL wiring (`analyzeFootnotes(...)` + spread-when-non-empty) but run their
+// mutation over the Hocuspocus collab WebSocket, which this plain-HTTP harness
+// does not stand up. The analyzer itself is unit-tested in footnote-analyze.test.
+import { test, after } from "node:test";
+import assert from "node:assert/strict";
+import http from "node:http";
+import { DocmostClient } from "../../build/client.js";
+
+function readBody(req) {
+  return new Promise((resolve) => {
+    let raw = "";
+    req.on("data", (c) => (raw += c));
+    req.on("end", () => resolve(raw));
+  });
+}
+
+function sendJson(res, status, obj, extraHeaders = {}) {
+  res.writeHead(status, { "Content-Type": "application/json", ...extraHeaders });
+  res.end(JSON.stringify(obj));
+}
+
+const openServers = [];
+function spawn(handler) {
+  return new Promise((resolve) => {
+    const server = http.createServer(handler);
+    openServers.push(server);
+    server.listen(0, "127.0.0.1", () => {
+      const { port } = server.address();
+      resolve(`http://127.0.0.1:${port}/api`);
+    });
+  });
+}
+
+after(async () => {
+  await Promise.all(
+    openServers.map((s) => new Promise((r) => s.close(r))),
+  );
+});
+
+// A handler that answers login, page import, the post-import title update, and
+// the page-info read used by getPage; everything else 404s (listSidebarPages fails gracefully inside getPage).
+function pageHandler() {
+  return async (req, res) => {
+    await readBody(req);
+    if (req.url === "/api/auth/login") {
+      sendJson(res, 200, { success: true }, {
+        "Set-Cookie": "authToken=t; Path=/; HttpOnly",
+      });
+      return;
+    }
+    if (req.url === "/api/pages/import") {
+      sendJson(res, 200, { data: { id: "new-1" } });
+      return;
+    }
+    if (req.url === "/api/pages/update") {
+      // The title-restore step after import.
+      sendJson(res, 200, { data: { id: "new-1" } });
+      return;
+    }
+    if (req.url === "/api/pages/info") {
+      sendJson(res, 200, {
+        data: {
+          id: "new-1",
+          slugId: "slug-1",
+          title: "T",
+          spaceId: "sp-1",
+          content: { type: "doc", content: [] },
+        },
+      });
+      return;
+    }
+    sendJson(res, 404, { message: "not found" });
+  };
+}
+
+test("createPage attaches footnoteWarnings when the content has footnote problems", async () => {
+  const baseURL = await spawn(pageHandler());
+  const client = new DocmostClient(baseURL, "user@example.com", "pw");
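+  // A dangling reference ([^missing]) + a duplicate definition ([^d]). NOTE(review): "| cell[^t] |" is mid-line, so it is not a table row (those must start with "|") and no table warning is asserted.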
+  const content = [
+    "Intro[^missing] and| cell[^t] |.",
+    "",
+    "[^d]: one",
+    "[^d]: two",
+    "[^t]: in table",
+  ].join("\n");
+  const result = await client.createPage("T", content, "sp-1");
+  assert.ok(Array.isArray(result.footnoteWarnings), "footnoteWarnings present");
+  const joined = result.footnoteWarnings.join("\n");
+  assert.match(joined, /no matching definition/); // dangling [^missing]
+  assert.match(joined, /defined more than once/); // duplicate [^d]
+  // The page itself is still returned.
+  assert.equal(result.success, true);
+});
+
+test("createPage omits footnoteWarnings when the content is clean", async () => {
+  const baseURL = await spawn(pageHandler());
+  const client = new DocmostClient(baseURL, "user@example.com", "pw");
+  const content = ["A[^a] and reuse[^a].", "", "[^a]: fine"].join("\n");
+  const result = await client.createPage("T", content, "sp-1");
+  assert.equal(
+    "footnoteWarnings" in result,
+    false,
+    "no footnoteWarnings field on clean input",
+  );
+  assert.equal(result.success, true);
+});