feat(footnotes): reuse semantics + import diagnostics (#166)
Footnotes were strict 1:1: a repeated `[^a]` reference was treated as a collision and re-id'd to `a__2`, and a reference with no definition synthesized its own empty one — so an agent-authored article with reused labels produced dozens of empty `kowiki__N` footnotes. Move to Pandoc REUSE semantics and add non-fatal import diagnostics. Reuse (core): - resolveCollisions (footnote-sync): repeated references sharing an id are REUSE (recorded once in document order, never re-id'd) — one number, one shared definition. Only a duplicate DEFINITION is re-id'd deterministically and, with no matching reference, dropped by the existing orphan policy (first-wins). CollisionPlan.refReids is now always empty (harmless no-op downstream). - extractFootnoteDefinitions (marked) and extractFootnotes (MCP): duplicate definition ids are FIRST-WINS (keep first, drop rest); reference markers are never rewritten. Removed the marker-rewriting and the now-dead deriveFootnoteId mirror + helpers from the MCP path. Import diagnostics: - New analyzeFootnotes() (MCP): fence-aware pure scan reporting dangling references, empty/duplicate definitions and `[^id]` markers inside table rows. - createPage / updatePage / importPageMarkdown now attach `footnoteWarnings` (only when non-empty) so an agent can fix its markup; the page is still created. Paste-reuse: - footnotePastePlugin remaps only ids the pasted slice DEFINES (a colliding definition); a pasted lone reference to an existing id keeps it (reuse). Tests: reuse/first-wins rewrites of footnote.test, footnote-markdown.test, footnote.marked.orphan.test and the MCP footnotes.test; new footnote-paste.test (editor-ext) and footnote-analyze.test (MCP). Deleted derive-id-parity.test.mjs (the MCP no longer derives ids; editor-ext's deriveFootnoteId keeps its own golden test). editor-ext 128, MCP 299, server roundtrip 2, client views 3, client+server tsc clean. Two review suggestions applied: corrected a stale "duplicated in MCP" comment and the dangling-reference warning wording. Note: the multi-backlink editor UI (a reused definition linking back to each of its references) is deferred to a follow-up — this PR delivers the data-integrity core (reuse + warnings + paste-reuse). Forward links and numbering already reuse correctly; the backlink currently targets the first reference. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -9,6 +9,7 @@ import WebSocket from "ws";
|
||||
import { convertProseMirrorToMarkdown } from "./lib/markdown-converter.js";
|
||||
import { updatePageContentRealtime, replacePageContent, markdownToProseMirror, mutatePageContent, buildCollabWsUrl, assertYjsEncodable, } from "./lib/collaboration.js";
|
||||
import { docmostExtensions } from "./lib/docmost-schema.js";
|
||||
import { analyzeFootnotes } from "./lib/footnote-analyze.js";
|
||||
import { buildPageTree } from "./lib/tree.js";
|
||||
import { serializeDocmostMarkdown, parseDocmostMarkdown, } from "./lib/markdown-document.js";
|
||||
import { replaceNodeById, deleteNodeById, insertNodeRelative, buildOutline, getNodeByRef, readTable, insertTableRow, deleteTableRow, updateTableCell, } from "./lib/node-ops.js";
|
||||
@@ -566,7 +567,9 @@ export class DocmostClient {
|
||||
// Always fetch subpages to provide context to the agent
|
||||
let subpages = [];
|
||||
try {
|
||||
subpages = await this.listSidebarPages(resultData.spaceId, pageId);
|
||||
// `pageId` may be a slugId, but the sidebar-pages endpoint requires the
|
||||
// UUID; `resultData.id` holds the resolved UUID returned by getPageRaw.
|
||||
subpages = await this.listSidebarPages(resultData.spaceId, resultData.id);
|
||||
}
|
||||
catch (e) {
|
||||
console.warn("Failed to fetch subpages:", e);
|
||||
@@ -814,7 +817,11 @@ export class DocmostClient {
|
||||
if (title) {
|
||||
await this.client.post("/pages/update", { pageId: newPageId, title });
|
||||
}
|
||||
return this.getPage(newPageId);
|
||||
const page = await this.getPage(newPageId);
|
||||
// Surface non-fatal footnote problems (dangling refs, empty/duplicate
|
||||
// definitions, markers in tables) so the agent can fix its markup (#166).
|
||||
const { warnings } = analyzeFootnotes(content);
|
||||
return warnings.length > 0 ? { ...page, footnoteWarnings: warnings } : page;
|
||||
}
|
||||
/**
|
||||
* Update a page's content from markdown and optionally its title.
|
||||
@@ -844,12 +851,15 @@ export class DocmostClient {
|
||||
}
|
||||
throw new Error(`Failed to update page content: ${error.message}`);
|
||||
}
|
||||
const { warnings } = analyzeFootnotes(content);
|
||||
return {
|
||||
success: true,
|
||||
modified: true,
|
||||
message: "Page updated successfully.",
|
||||
pageId: pageId,
|
||||
verify: mutation.verify,
|
||||
// Non-fatal footnote diagnostics (#166); omitted when there are none.
|
||||
...(warnings.length > 0 ? { footnoteWarnings: warnings } : {}),
|
||||
};
|
||||
}
|
||||
/**
|
||||
@@ -1119,6 +1129,11 @@ export class DocmostClient {
|
||||
if (meta?.pageId && meta.pageId !== pageId) {
|
||||
result.warning = `File was exported from page ${meta.pageId} but is being imported into ${pageId}.`;
|
||||
}
|
||||
// Non-fatal footnote diagnostics (#166), analyzed on the body (definitions
|
||||
// and references live there, not in the front-matter/comments sections).
|
||||
const { warnings } = analyzeFootnotes(body);
|
||||
if (warnings.length > 0)
|
||||
result.footnoteWarnings = warnings;
|
||||
return result;
|
||||
}
|
||||
/**
|
||||
@@ -2422,9 +2437,9 @@ export class DocmostClient {
|
||||
const raw = await this.getPageRaw(pageId);
|
||||
const current = raw.content || { type: "doc", content: [] };
|
||||
runTransform(current);
|
||||
// Exercise the same Yjs encoder the apply path uses, so the preview
|
||||
// fails with the SAME descriptive error when the doc is not encodable
|
||||
// instead of returning a misleadingly-green diff.
|
||||
// Run an independent Yjs-encodability check (same sanitize + schema as the
|
||||
// apply path), so the preview fails with the same descriptive error when
|
||||
// the doc is not encodable instead of returning a misleadingly-green diff.
|
||||
assertYjsEncodable(newDoc);
|
||||
return {
|
||||
pushed: false,
|
||||
|
||||
@@ -285,44 +285,6 @@ const FOOTNOTE_REF_RE = /\[\^([^\]\s]+)\]/;
|
||||
function escapeFootnoteAttr(value) {
|
||||
return String(value).replace(/&/g, "&").replace(/"/g, """);
|
||||
}
|
||||
function escapeFootnoteRegExp(value) {
|
||||
return value.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
|
||||
}
|
||||
/**
|
||||
* Derive a DETERMINISTIC unique footnote id for the k-th (k >= 2) occurrence of
|
||||
* an original id `X` during definition dedup.
|
||||
*
|
||||
* EXACT MIRROR of editor-ext `deriveFootnoteId`
|
||||
* (packages/editor-ext/src/lib/footnote/footnote-util.ts). These two copies MUST
|
||||
* STAY IN SYNC: the same markdown imported through the editor and through this
|
||||
* MCP path has to produce identical ids, and the sync plugin (which re-ids on
|
||||
* every collaborating client) relies on the same scheme to converge. NEVER use
|
||||
* Math.random()/Date.now()/uuid here — a random id would diverge across clients.
|
||||
*
|
||||
* Scheme: base candidate `${originalId}__${occurrence}` (e.g. `X__2`), bumped
|
||||
* with a stable alphabetic suffix (`X__2b`, `X__2c`, ...) until it is not in
|
||||
* `taken` (the set of ids already present / already minted — pure doc state).
|
||||
*/
|
||||
function deriveFootnoteId(originalId, occurrence, taken) {
|
||||
let candidate = `${originalId}__${occurrence}`;
|
||||
let n = 0;
|
||||
while (taken.has(candidate)) {
|
||||
n += 1;
|
||||
candidate = `${originalId}__${occurrence}${footnoteSuffix(n)}`;
|
||||
}
|
||||
return candidate;
|
||||
}
|
||||
/** Map 1 -> "b", 2 -> "c", ... (mirror of editor-ext `suffix`). */
|
||||
function footnoteSuffix(n) {
|
||||
let out = "";
|
||||
let x = n;
|
||||
while (x > 0) {
|
||||
const rem = (x - 1) % 25;
|
||||
out = String.fromCharCode(98 + rem) + out; // 98 = 'b'
|
||||
x = Math.floor((x - 1) / 25);
|
||||
}
|
||||
return out;
|
||||
}
|
||||
const footnoteRefMarkedExtension = {
|
||||
name: "footnoteRef",
|
||||
level: "inline",
|
||||
@@ -371,43 +333,22 @@ function extractFootnotes(markdown) {
|
||||
}
|
||||
if (defs.length === 0)
|
||||
return { body: markdown, section: "" };
|
||||
// De-duplicate colliding definition ids (mirror of editor-ext
|
||||
// extractFootnoteDefinitions). Two definitions sharing an id would otherwise
|
||||
// collapse into one footnote downstream; rename each colliding id to a
|
||||
// DETERMINISTIC derived one (NOT random) and rewrite the corresponding `[^id]`
|
||||
// marker so the (reference, definition) pairing stays 1:1. Determinism lets
|
||||
// the same markdown imported here and via the editor produce identical ids.
|
||||
let dedupedBody = bodyLines.join("\n");
|
||||
const taken = new Set(defs.map((d) => d.id));
|
||||
const seenDefIds = new Map();
|
||||
// Duplicate definition ids: FIRST WINS, the rest are DROPPED (mirror of
|
||||
// editor-ext extractFootnoteDefinitions). Reference markers are left untouched
|
||||
// so repeated `[^a]` references reuse the single footnote (Pandoc semantics,
|
||||
// #166). The dropped duplicate is surfaced to the caller via analyzeFootnotes
|
||||
// (`duplicateDefinitions`), not silently lost. MUST stay in sync with the
|
||||
// editor-ext mirror.
|
||||
const firstById = new Map(); // id -> first definition text
|
||||
for (const def of defs) {
|
||||
const originalId = def.id;
|
||||
const count = seenDefIds.get(originalId) ?? 0;
|
||||
seenDefIds.set(originalId, count + 1);
|
||||
if (count === 0)
|
||||
continue; // first definition keeps its id
|
||||
const newId = deriveFootnoteId(originalId, count + 1, taken);
|
||||
taken.add(newId);
|
||||
def.id = newId;
|
||||
// Remaining `[^originalId]` matches: index 0 = keeper's marker (left alone),
|
||||
// index 1 = this duplicate's marker. Rewrite index 1.
|
||||
let occurrence = 0;
|
||||
let rewritten = false;
|
||||
const re = new RegExp(`\\[\\^${escapeFootnoteRegExp(originalId)}\\]`, "g");
|
||||
dedupedBody = dedupedBody.replace(re, (match) => {
|
||||
const idx = occurrence++;
|
||||
if (!rewritten && idx === 1) {
|
||||
rewritten = true;
|
||||
return `[^${newId}]`;
|
||||
}
|
||||
return match;
|
||||
});
|
||||
if (!firstById.has(def.id))
|
||||
firstById.set(def.id, def.text);
|
||||
}
|
||||
const inner = defs
|
||||
.map((d) => `<div data-footnote-def data-id="${escapeFootnoteAttr(d.id)}"><p>${marked.parseInline(d.text || "")}</p></div>`)
|
||||
const inner = [...firstById.entries()]
|
||||
.map(([id, text]) => `<div data-footnote-def data-id="${escapeFootnoteAttr(id)}"><p>${marked.parseInline(text || "")}</p></div>`)
|
||||
.join("");
|
||||
return {
|
||||
body: dedupedBody,
|
||||
body: bodyLines.join("\n"),
|
||||
section: `<section data-footnotes>${inner}</section>`,
|
||||
};
|
||||
}
|
||||
|
||||
115
packages/mcp/build/lib/footnote-analyze.js
Normal file
115
packages/mcp/build/lib/footnote-analyze.js
Normal file
@@ -0,0 +1,115 @@
|
||||
/**
|
||||
* Footnote diagnostics for imported Markdown (issue #166).
|
||||
*
|
||||
* A PURE, fence-aware text scan (independent of the Markdown->ProseMirror
|
||||
* conversion path, so it reports the same problems for `create_page`,
|
||||
* `update_page` and `import_page_markdown`). It never changes the document — the
|
||||
* importer still creates the page; this only surfaces footnote problems to the
|
||||
* caller so an agent can fix its own markup instead of shipping broken footnotes.
|
||||
*
|
||||
* Detected problems:
|
||||
* - danglingReferences: a `[^id]` reference with no `[^id]:` definition.
|
||||
* - emptyDefinitions: a `[^id]:` whose (kept) text is empty/whitespace.
|
||||
* - duplicateDefinitions: an id defined by two or more `[^id]:` lines (only the
|
||||
* first is kept on import — first-wins; see extractFootnotes).
|
||||
* - referencesInTables: a `[^id]` marker found in a GFM table row (heuristic:
|
||||
* the line, trimmed, starts with `|`) — footnotes in table cells often do not
|
||||
* render as expected.
|
||||
*/
|
||||
/** Matches a footnote DEFINITION line: `[^id]: text` (id + text captured). */
|
||||
const DEF_RE = /^\[\^([^\]\s]+)\]:[ \t]*(.*)$/;
|
||||
/** Matches every footnote REFERENCE `[^id]` in a line (global; id captured). */
|
||||
const REF_RE_G = /\[\^([^\]\s]+)\]/g;
|
||||
/** Opening/closing fence marker (``` or ~~~). */
|
||||
const FENCE_RE = /^(\s*)(`{3,}|~{3,})/;
|
||||
/** Scan a line for every `[^id]` reference, invoking `onRef(id)` for each. */
|
||||
function forEachReference(line, onRef) {
|
||||
REF_RE_G.lastIndex = 0;
|
||||
let m;
|
||||
while ((m = REF_RE_G.exec(line)) !== null)
|
||||
onRef(m[1]);
|
||||
}
|
||||
/**
|
||||
* Analyze the footnotes in a Markdown string. Pure; safe to call on any body.
|
||||
*/
|
||||
export function analyzeFootnotes(markdown) {
|
||||
const lines = markdown.split("\n");
|
||||
// Distinct reference ids in first-appearance order, plus the set of ids seen
|
||||
// inside a table row.
|
||||
const refIds = [];
|
||||
const refIdSet = new Set();
|
||||
const referencesInTables = new Set();
|
||||
const addRef = (id, inTable) => {
|
||||
if (!refIdSet.has(id)) {
|
||||
refIdSet.add(id);
|
||||
refIds.push(id);
|
||||
}
|
||||
if (inTable)
|
||||
referencesInTables.add(id);
|
||||
};
|
||||
// Definition texts per id, in first-appearance order of the id.
|
||||
const defTextsById = new Map();
|
||||
let fence = null;
|
||||
for (const line of lines) {
|
||||
const fenceMatch = FENCE_RE.exec(line);
|
||||
if (fenceMatch) {
|
||||
const marker = fenceMatch[2][0];
|
||||
if (fence === null)
|
||||
fence = marker;
|
||||
else if (marker === fence)
|
||||
fence = null;
|
||||
continue;
|
||||
}
|
||||
// Footnote syntax shown inside a code fence is not real markup.
|
||||
if (fence !== null)
|
||||
continue;
|
||||
const defM = DEF_RE.exec(line);
|
||||
if (defM) {
|
||||
const id = defM[1];
|
||||
const text = defM[2];
|
||||
const arr = defTextsById.get(id);
|
||||
if (arr)
|
||||
arr.push(text);
|
||||
else
|
||||
defTextsById.set(id, [text]);
|
||||
// A definition's TEXT can itself reference another footnote (`[^a]: see
|
||||
// [^b]`); count those so such a `[^b]` is not falsely reported dangling.
|
||||
forEachReference(text, (rid) => addRef(rid, false));
|
||||
continue;
|
||||
}
|
||||
const inTable = line.trimStart().startsWith("|");
|
||||
forEachReference(line, (id) => addRef(id, inTable));
|
||||
}
|
||||
const danglingReferences = refIds.filter((id) => !defTextsById.has(id));
|
||||
const duplicateDefinitions = [];
|
||||
const emptyDefinitions = [];
|
||||
for (const [id, texts] of defTextsById) {
|
||||
if (texts.length >= 2)
|
||||
duplicateDefinitions.push(id);
|
||||
// First-wins: the kept definition is the first one; flag it if it is blank.
|
||||
if ((texts[0] ?? "").trim().length === 0)
|
||||
emptyDefinitions.push(id);
|
||||
}
|
||||
const tableRefs = [...referencesInTables];
|
||||
const warnings = [];
|
||||
const list = (ids) => ids.map((id) => `[^${id}]`).join(", ");
|
||||
if (danglingReferences.length > 0) {
|
||||
warnings.push(`Footnote reference(s) with no matching definition: ${list(danglingReferences)} (each will render as an empty footnote in the editor).`);
|
||||
}
|
||||
if (emptyDefinitions.length > 0) {
|
||||
warnings.push(`Footnote definition(s) with empty text: ${list(emptyDefinitions)}.`);
|
||||
}
|
||||
if (duplicateDefinitions.length > 0) {
|
||||
warnings.push(`Footnote id(s) defined more than once (only the first definition was kept): ${list(duplicateDefinitions)}.`);
|
||||
}
|
||||
if (tableRefs.length > 0) {
|
||||
warnings.push(`Footnote marker(s) inside a table row (footnotes in table cells may not render as expected): ${list(tableRefs)}.`);
|
||||
}
|
||||
return {
|
||||
danglingReferences,
|
||||
emptyDefinitions,
|
||||
duplicateDefinitions,
|
||||
referencesInTables: tableRefs,
|
||||
warnings,
|
||||
};
|
||||
}
|
||||
@@ -23,6 +23,7 @@ import {
|
||||
MutationResult,
|
||||
} from "./lib/collaboration.js";
|
||||
import { docmostExtensions } from "./lib/docmost-schema.js";
|
||||
import { analyzeFootnotes } from "./lib/footnote-analyze.js";
|
||||
import { buildPageTree } from "./lib/tree.js";
|
||||
import {
|
||||
serializeDocmostMarkdown,
|
||||
@@ -1054,7 +1055,11 @@ export class DocmostClient {
|
||||
await this.client.post("/pages/update", { pageId: newPageId, title });
|
||||
}
|
||||
|
||||
return this.getPage(newPageId);
|
||||
const page = await this.getPage(newPageId);
|
||||
// Surface non-fatal footnote problems (dangling refs, empty/duplicate
|
||||
// definitions, markers in tables) so the agent can fix its markup (#166).
|
||||
const { warnings } = analyzeFootnotes(content);
|
||||
return warnings.length > 0 ? { ...page, footnoteWarnings: warnings } : page;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -1095,12 +1100,15 @@ export class DocmostClient {
|
||||
throw new Error(`Failed to update page content: ${error.message}`);
|
||||
}
|
||||
|
||||
const { warnings } = analyzeFootnotes(content);
|
||||
return {
|
||||
success: true,
|
||||
modified: true,
|
||||
message: "Page updated successfully.",
|
||||
pageId: pageId,
|
||||
verify: mutation.verify,
|
||||
// Non-fatal footnote diagnostics (#166); omitted when there are none.
|
||||
...(warnings.length > 0 ? { footnoteWarnings: warnings } : {}),
|
||||
};
|
||||
}
|
||||
|
||||
@@ -1416,6 +1424,10 @@ export class DocmostClient {
|
||||
if (meta?.pageId && meta.pageId !== pageId) {
|
||||
result.warning = `File was exported from page ${meta.pageId} but is being imported into ${pageId}.`;
|
||||
}
|
||||
// Non-fatal footnote diagnostics (#166), analyzed on the body (definitions
|
||||
// and references live there, not in the front-matter/comments sections).
|
||||
const { warnings } = analyzeFootnotes(body);
|
||||
if (warnings.length > 0) result.footnoteWarnings = warnings;
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
@@ -323,51 +323,6 @@ function escapeFootnoteAttr(value: string): string {
|
||||
return String(value).replace(/&/g, "&").replace(/"/g, """);
|
||||
}
|
||||
|
||||
function escapeFootnoteRegExp(value: string): string {
|
||||
return value.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
|
||||
}
|
||||
|
||||
/**
|
||||
* Derive a DETERMINISTIC unique footnote id for the k-th (k >= 2) occurrence of
|
||||
* an original id `X` during definition dedup.
|
||||
*
|
||||
* EXACT MIRROR of editor-ext `deriveFootnoteId`
|
||||
* (packages/editor-ext/src/lib/footnote/footnote-util.ts). These two copies MUST
|
||||
* STAY IN SYNC: the same markdown imported through the editor and through this
|
||||
* MCP path has to produce identical ids, and the sync plugin (which re-ids on
|
||||
* every collaborating client) relies on the same scheme to converge. NEVER use
|
||||
* Math.random()/Date.now()/uuid here — a random id would diverge across clients.
|
||||
*
|
||||
* Scheme: base candidate `${originalId}__${occurrence}` (e.g. `X__2`), bumped
|
||||
* with a stable alphabetic suffix (`X__2b`, `X__2c`, ...) until it is not in
|
||||
* `taken` (the set of ids already present / already minted — pure doc state).
|
||||
*/
|
||||
function deriveFootnoteId(
|
||||
originalId: string,
|
||||
occurrence: number,
|
||||
taken: Set<string>,
|
||||
): string {
|
||||
let candidate = `${originalId}__${occurrence}`;
|
||||
let n = 0;
|
||||
while (taken.has(candidate)) {
|
||||
n += 1;
|
||||
candidate = `${originalId}__${occurrence}${footnoteSuffix(n)}`;
|
||||
}
|
||||
return candidate;
|
||||
}
|
||||
|
||||
/** Map 1 -> "b", 2 -> "c", ... (mirror of editor-ext `suffix`). */
|
||||
function footnoteSuffix(n: number): string {
|
||||
let out = "";
|
||||
let x = n;
|
||||
while (x > 0) {
|
||||
const rem = (x - 1) % 25;
|
||||
out = String.fromCharCode(98 + rem) + out; // 98 = 'b'
|
||||
x = Math.floor((x - 1) / 25);
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
const footnoteRefMarkedExtension = {
|
||||
name: "footnoteRef",
|
||||
level: "inline" as const,
|
||||
@@ -419,48 +374,27 @@ function extractFootnotes(markdown: string): {
|
||||
}
|
||||
if (defs.length === 0) return { body: markdown, section: "" };
|
||||
|
||||
// De-duplicate colliding definition ids (mirror of editor-ext
|
||||
// extractFootnoteDefinitions). Two definitions sharing an id would otherwise
|
||||
// collapse into one footnote downstream; rename each colliding id to a
|
||||
// DETERMINISTIC derived one (NOT random) and rewrite the corresponding `[^id]`
|
||||
// marker so the (reference, definition) pairing stays 1:1. Determinism lets
|
||||
// the same markdown imported here and via the editor produce identical ids.
|
||||
let dedupedBody = bodyLines.join("\n");
|
||||
const taken = new Set<string>(defs.map((d) => d.id));
|
||||
const seenDefIds = new Map<string, number>();
|
||||
// Duplicate definition ids: FIRST WINS, the rest are DROPPED (mirror of
|
||||
// editor-ext extractFootnoteDefinitions). Reference markers are left untouched
|
||||
// so repeated `[^a]` references reuse the single footnote (Pandoc semantics,
|
||||
// #166). The dropped duplicate is surfaced to the caller via analyzeFootnotes
|
||||
// (`duplicateDefinitions`), not silently lost. MUST stay in sync with the
|
||||
// editor-ext mirror.
|
||||
const firstById = new Map<string, string>(); // id -> first definition text
|
||||
for (const def of defs) {
|
||||
const originalId = def.id;
|
||||
const count = seenDefIds.get(originalId) ?? 0;
|
||||
seenDefIds.set(originalId, count + 1);
|
||||
if (count === 0) continue; // first definition keeps its id
|
||||
const newId = deriveFootnoteId(originalId, count + 1, taken);
|
||||
taken.add(newId);
|
||||
def.id = newId;
|
||||
// Remaining `[^originalId]` matches: index 0 = keeper's marker (left alone),
|
||||
// index 1 = this duplicate's marker. Rewrite index 1.
|
||||
let occurrence = 0;
|
||||
let rewritten = false;
|
||||
const re = new RegExp(`\\[\\^${escapeFootnoteRegExp(originalId)}\\]`, "g");
|
||||
dedupedBody = dedupedBody.replace(re, (match) => {
|
||||
const idx = occurrence++;
|
||||
if (!rewritten && idx === 1) {
|
||||
rewritten = true;
|
||||
return `[^${newId}]`;
|
||||
}
|
||||
return match;
|
||||
});
|
||||
if (!firstById.has(def.id)) firstById.set(def.id, def.text);
|
||||
}
|
||||
|
||||
const inner = defs
|
||||
const inner = [...firstById.entries()]
|
||||
.map(
|
||||
(d) =>
|
||||
([id, text]) =>
|
||||
`<div data-footnote-def data-id="${escapeFootnoteAttr(
|
||||
d.id,
|
||||
)}"><p>${marked.parseInline(d.text || "")}</p></div>`,
|
||||
id,
|
||||
)}"><p>${marked.parseInline(text || "")}</p></div>`,
|
||||
)
|
||||
.join("");
|
||||
return {
|
||||
body: dedupedBody,
|
||||
body: bodyLines.join("\n"),
|
||||
section: `<section data-footnotes>${inner}</section>`,
|
||||
};
|
||||
}
|
||||
|
||||
138
packages/mcp/src/lib/footnote-analyze.ts
Normal file
138
packages/mcp/src/lib/footnote-analyze.ts
Normal file
@@ -0,0 +1,138 @@
|
||||
/**
|
||||
* Footnote diagnostics for imported Markdown (issue #166).
|
||||
*
|
||||
* A PURE, fence-aware text scan (independent of the Markdown->ProseMirror
|
||||
* conversion path, so it reports the same problems for `create_page`,
|
||||
* `update_page` and `import_page_markdown`). It never changes the document — the
|
||||
* importer still creates the page; this only surfaces footnote problems to the
|
||||
* caller so an agent can fix its own markup instead of shipping broken footnotes.
|
||||
*
|
||||
* Detected problems:
|
||||
* - danglingReferences: a `[^id]` reference with no `[^id]:` definition.
|
||||
* - emptyDefinitions: a `[^id]:` whose (kept) text is empty/whitespace.
|
||||
* - duplicateDefinitions: an id defined by two or more `[^id]:` lines (only the
|
||||
* first is kept on import — first-wins; see extractFootnotes).
|
||||
* - referencesInTables: a `[^id]` marker found in a GFM table row (heuristic:
|
||||
* the line, trimmed, starts with `|`) — footnotes in table cells often do not
|
||||
* render as expected.
|
||||
*/
|
||||
|
||||
/** Matches a footnote DEFINITION line: `[^id]: text` (id + text captured). */
|
||||
const DEF_RE = /^\[\^([^\]\s]+)\]:[ \t]*(.*)$/;
|
||||
/** Matches every footnote REFERENCE `[^id]` in a line (global; id captured). */
|
||||
const REF_RE_G = /\[\^([^\]\s]+)\]/g;
|
||||
/** Opening/closing fence marker (``` or ~~~). */
|
||||
const FENCE_RE = /^(\s*)(`{3,}|~{3,})/;
|
||||
|
||||
export interface FootnoteDiagnostics {
|
||||
/** Reference ids (distinct, document order) with no matching definition. */
|
||||
danglingReferences: string[];
|
||||
/** Definition ids whose first (kept) text is empty/whitespace. */
|
||||
emptyDefinitions: string[];
|
||||
/** Ids defined by two or more `[^id]:` lines (only the first is kept). */
|
||||
duplicateDefinitions: string[];
|
||||
/** Reference ids found inside a GFM table row (heuristic). */
|
||||
referencesInTables: string[];
|
||||
/** Human-readable warning lines for the tool result (one per problem class). */
|
||||
warnings: string[];
|
||||
}
|
||||
|
||||
/** Scan a line for every `[^id]` reference, invoking `onRef(id)` for each. */
|
||||
function forEachReference(line: string, onRef: (id: string) => void): void {
|
||||
REF_RE_G.lastIndex = 0;
|
||||
let m: RegExpExecArray | null;
|
||||
while ((m = REF_RE_G.exec(line)) !== null) onRef(m[1]);
|
||||
}
|
||||
|
||||
/**
|
||||
* Analyze the footnotes in a Markdown string. Pure; safe to call on any body.
|
||||
*/
|
||||
export function analyzeFootnotes(markdown: string): FootnoteDiagnostics {
|
||||
const lines = markdown.split("\n");
|
||||
|
||||
// Distinct reference ids in first-appearance order, plus the set of ids seen
|
||||
// inside a table row.
|
||||
const refIds: string[] = [];
|
||||
const refIdSet = new Set<string>();
|
||||
const referencesInTables = new Set<string>();
|
||||
const addRef = (id: string, inTable: boolean) => {
|
||||
if (!refIdSet.has(id)) {
|
||||
refIdSet.add(id);
|
||||
refIds.push(id);
|
||||
}
|
||||
if (inTable) referencesInTables.add(id);
|
||||
};
|
||||
|
||||
// Definition texts per id, in first-appearance order of the id.
|
||||
const defTextsById = new Map<string, string[]>();
|
||||
|
||||
let fence: string | null = null;
|
||||
for (const line of lines) {
|
||||
const fenceMatch = FENCE_RE.exec(line);
|
||||
if (fenceMatch) {
|
||||
const marker = fenceMatch[2][0];
|
||||
if (fence === null) fence = marker;
|
||||
else if (marker === fence) fence = null;
|
||||
continue;
|
||||
}
|
||||
// Footnote syntax shown inside a code fence is not real markup.
|
||||
if (fence !== null) continue;
|
||||
|
||||
const defM = DEF_RE.exec(line);
|
||||
if (defM) {
|
||||
const id = defM[1];
|
||||
const text = defM[2];
|
||||
const arr = defTextsById.get(id);
|
||||
if (arr) arr.push(text);
|
||||
else defTextsById.set(id, [text]);
|
||||
// A definition's TEXT can itself reference another footnote (`[^a]: see
|
||||
// [^b]`); count those so such a `[^b]` is not falsely reported dangling.
|
||||
forEachReference(text, (rid) => addRef(rid, false));
|
||||
continue;
|
||||
}
|
||||
|
||||
const inTable = line.trimStart().startsWith("|");
|
||||
forEachReference(line, (id) => addRef(id, inTable));
|
||||
}
|
||||
|
||||
const danglingReferences = refIds.filter((id) => !defTextsById.has(id));
|
||||
const duplicateDefinitions: string[] = [];
|
||||
const emptyDefinitions: string[] = [];
|
||||
for (const [id, texts] of defTextsById) {
|
||||
if (texts.length >= 2) duplicateDefinitions.push(id);
|
||||
// First-wins: the kept definition is the first one; flag it if it is blank.
|
||||
if ((texts[0] ?? "").trim().length === 0) emptyDefinitions.push(id);
|
||||
}
|
||||
const tableRefs = [...referencesInTables];
|
||||
|
||||
const warnings: string[] = [];
|
||||
const list = (ids: string[]) => ids.map((id) => `[^${id}]`).join(", ");
|
||||
if (danglingReferences.length > 0) {
|
||||
warnings.push(
|
||||
`Footnote reference(s) with no matching definition: ${list(danglingReferences)} (each will render as an empty footnote in the editor).`,
|
||||
);
|
||||
}
|
||||
if (emptyDefinitions.length > 0) {
|
||||
warnings.push(
|
||||
`Footnote definition(s) with empty text: ${list(emptyDefinitions)}.`,
|
||||
);
|
||||
}
|
||||
if (duplicateDefinitions.length > 0) {
|
||||
warnings.push(
|
||||
`Footnote id(s) defined more than once (only the first definition was kept): ${list(duplicateDefinitions)}.`,
|
||||
);
|
||||
}
|
||||
if (tableRefs.length > 0) {
|
||||
warnings.push(
|
||||
`Footnote marker(s) inside a table row (footnotes in table cells may not render as expected): ${list(tableRefs)}.`,
|
||||
);
|
||||
}
|
||||
|
||||
return {
|
||||
danglingReferences,
|
||||
emptyDefinitions,
|
||||
duplicateDefinitions,
|
||||
referencesInTables: tableRefs,
|
||||
warnings,
|
||||
};
|
||||
}
|
||||
@@ -1,134 +0,0 @@
|
||||
import { test } from "node:test";
|
||||
import assert from "node:assert/strict";
|
||||
|
||||
import { markdownToProseMirror } from "../../build/lib/collaboration.js";
|
||||
|
||||
/**
|
||||
* CROSS-PACKAGE DRIFT GUARD for the footnote id derivation scheme.
|
||||
*
|
||||
* `deriveFootnoteId` is duplicated in two places that MUST behave identically:
|
||||
* - packages/editor-ext/src/lib/footnote/footnote-util.ts (exported)
|
||||
* - packages/mcp/src/lib/collaboration.ts (internal helper)
|
||||
* so the same markdown imported through the editor and through the MCP path
|
||||
* derives identical footnote ids.
|
||||
*
|
||||
* The mcp copy is NOT exported from the compiled build (it is an internal helper
|
||||
* of collaboration.js), and production source must not be modified to export it.
|
||||
* So this test exercises the REAL compiled `deriveFootnoteId` *indirectly*, the
|
||||
* same way production does: through `markdownToProseMirror`, which runs
|
||||
* extractFootnotes -> deriveFootnoteId during duplicate-id dedup. We craft the
|
||||
* `taken` set via literal pre-existing definition ids and read back the derived
|
||||
* footnoteDefinition ids.
|
||||
*
|
||||
* GOLDEN below mirrors DERIVE_GOLDEN in
|
||||
* packages/editor-ext/src/lib/footnote/footnote-util.derive-id.test.ts
|
||||
* (asserted there by a DIRECT call). Same (originalId, occurrence, taken) ->
|
||||
* same expected id. If the two copies drift, one of the two suites goes red.
|
||||
*/
|
||||
|
||||
/** The 25 single-letter suffixes the scheme uses (n=1..25): b, c, ..., z. */
|
||||
function singleLetterSuffixes() {
|
||||
return Array.from({ length: 25 }, (_, i) => String.fromCharCode(98 + i));
|
||||
}
|
||||
|
||||
// Identical matrix + expected values to the editor-ext golden table.
|
||||
const GOLDEN = [
|
||||
{ originalId: "d", occurrence: 2, taken: [], expected: "d__2" },
|
||||
{ originalId: "d", occurrence: 3, taken: [], expected: "d__3" },
|
||||
{ originalId: "d", occurrence: 2, taken: ["d__2"], expected: "d__2b" },
|
||||
{ originalId: "d", occurrence: 2, taken: ["d__2", "d__2b"], expected: "d__2c" },
|
||||
{
|
||||
originalId: "d",
|
||||
occurrence: 2,
|
||||
taken: ["d__2", "d__2b", "d__2c", "d__2d"],
|
||||
expected: "d__2e",
|
||||
},
|
||||
{
|
||||
originalId: "d",
|
||||
occurrence: 2,
|
||||
taken: ["d__2", ...singleLetterSuffixes().map((s) => `d__2${s}`)],
|
||||
expected: "d__2bb",
|
||||
},
|
||||
];
|
||||
|
||||
/** Recursively collect every node of `type`. */
|
||||
function findAll(node, type, acc = []) {
|
||||
if (!node || typeof node !== "object") return acc;
|
||||
if (node.type === type) acc.push(node);
|
||||
if (Array.isArray(node.content)) for (const c of node.content) findAll(c, type, acc);
|
||||
return acc;
|
||||
}
|
||||
|
||||
/**
|
||||
* Build markdown that drives the real `deriveFootnoteId(originalId, occurrence,
|
||||
* taken)`:
|
||||
* - `occurrence` duplicate definitions of `[^originalId]` so the dedup walk
|
||||
* reaches the requested occurrence (occurrence=2 -> 1 keeper + 1 duplicate;
|
||||
* occurrence=3 -> keeper + 2 duplicates, of which the LAST is the one whose
|
||||
* id we read);
|
||||
* - one literal pre-existing definition for every id in `taken`, each with its
|
||||
* own reference marker so it is a real (non-orphan) definition. Those ids are
|
||||
* reserved up-front in the dedup `taken` set, exactly forcing the bump.
|
||||
*
|
||||
* Returns the derived id of the FINAL duplicate of `originalId`.
|
||||
*/
|
||||
async function deriveViaMarkdown(originalId, occurrence, takenIds) {
|
||||
// References: one [^originalId] per definition (keeper + duplicates) so each
|
||||
// duplicate has a marker to pair with, plus one marker per taken id.
|
||||
const dupCount = occurrence; // keeper + (occurrence-1) duplicates = `occurrence` defs
|
||||
const refMarkers = [];
|
||||
for (let i = 0; i < dupCount; i++) refMarkers.push(`[^${originalId}]`);
|
||||
for (const id of takenIds) refMarkers.push(`[^${id}]`);
|
||||
const refLine = `Body ${refMarkers.join(" ")}.`;
|
||||
|
||||
// Definitions: `occurrence` copies of [^originalId]: ... then the taken ids.
|
||||
const defLines = [];
|
||||
for (let i = 0; i < dupCount; i++) {
|
||||
defLines.push(`[^${originalId}]: copy ${i}`);
|
||||
}
|
||||
for (const id of takenIds) {
|
||||
defLines.push(`[^${id}]: reserved ${id}`);
|
||||
}
|
||||
|
||||
const md = [refLine, "", ...defLines].join("\n");
|
||||
const json = await markdownToProseMirror(md);
|
||||
const defIds = findAll(json, "footnoteDefinition").map((d) => d.attrs.id);
|
||||
|
||||
// The derived id we want is the one that is neither the keeper (originalId),
|
||||
// nor any reserved taken id, nor a lower-occurrence derived id. For
|
||||
// occurrence=2 that is the single bumped id; for occurrence=3 it is the
|
||||
// highest `${originalId}__3...` id. Compute it generically: among the def ids
|
||||
// that start with `${originalId}__${occurrence}`, the expected one is present.
|
||||
return { defIds, json };
|
||||
}
|
||||
|
||||
for (const row of GOLDEN) {
|
||||
test(`parity: derive("${row.originalId}", ${row.occurrence}, {${row.taken.join(",")}}) -> "${row.expected}"`, async () => {
|
||||
const { defIds } = await deriveViaMarkdown(
|
||||
row.originalId,
|
||||
row.occurrence,
|
||||
row.taken,
|
||||
);
|
||||
// The real compiled deriveFootnoteId must have minted exactly the golden id.
|
||||
assert.ok(
|
||||
defIds.includes(row.expected),
|
||||
`expected derived id "${row.expected}" among def ids ${JSON.stringify(defIds)}`,
|
||||
);
|
||||
// And every id is distinct: nothing collapsed.
|
||||
assert.equal(new Set(defIds).size, defIds.length, "all def ids distinct");
|
||||
});
|
||||
}
|
||||
|
||||
test("parity: the simple keeper+two-duplicate case mints d, d__2, d__3", async () => {
|
||||
// The canonical no-collision path, asserted as a whole set for clarity.
|
||||
const md = [
|
||||
"See[^d] one[^d] two[^d].",
|
||||
"",
|
||||
"[^d]: first",
|
||||
"[^d]: second",
|
||||
"[^d]: third",
|
||||
].join("\n");
|
||||
const json = await markdownToProseMirror(md);
|
||||
const defIds = findAll(json, "footnoteDefinition").map((d) => d.attrs.id);
|
||||
assert.deepEqual([...defIds].sort(), ["d", "d__2", "d__3"]);
|
||||
});
|
||||
106
packages/mcp/test/unit/footnote-analyze.test.mjs
Normal file
106
packages/mcp/test/unit/footnote-analyze.test.mjs
Normal file
@@ -0,0 +1,106 @@
|
||||
import { test } from "node:test";
|
||||
import assert from "node:assert/strict";
|
||||
|
||||
import { analyzeFootnotes } from "../../build/lib/footnote-analyze.js";
|
||||
|
||||
test("clean footnotes produce no diagnostics", () => {
|
||||
const md = ["A[^a] and B[^b].", "", "[^a]: first", "[^b]: second"].join("\n");
|
||||
const d = analyzeFootnotes(md);
|
||||
assert.deepEqual(d.danglingReferences, []);
|
||||
assert.deepEqual(d.emptyDefinitions, []);
|
||||
assert.deepEqual(d.duplicateDefinitions, []);
|
||||
assert.deepEqual(d.referencesInTables, []);
|
||||
assert.deepEqual(d.warnings, []);
|
||||
});
|
||||
|
||||
test("reuse (repeated references to one definition) is NOT a warning", () => {
|
||||
const md = ["A[^a] B[^a] C[^a].", "", "[^a]: shared"].join("\n");
|
||||
const d = analyzeFootnotes(md);
|
||||
assert.deepEqual(d.danglingReferences, []);
|
||||
assert.deepEqual(d.warnings, []);
|
||||
});
|
||||
|
||||
test("dangling reference (no definition) is reported", () => {
|
||||
const md = ["See[^missing] and[^a].", "", "[^a]: defined"].join("\n");
|
||||
const d = analyzeFootnotes(md);
|
||||
assert.deepEqual(d.danglingReferences, ["missing"]);
|
||||
assert.equal(d.warnings.length, 1);
|
||||
assert.match(d.warnings[0], /no matching definition/);
|
||||
assert.match(d.warnings[0], /\[\^missing\]/);
|
||||
});
|
||||
|
||||
test("empty definition text is reported", () => {
|
||||
const md = ["See[^a].", "", "[^a]: "].join("\n");
|
||||
const d = analyzeFootnotes(md);
|
||||
assert.deepEqual(d.emptyDefinitions, ["a"]);
|
||||
assert.match(d.warnings.join("\n"), /empty text/);
|
||||
});
|
||||
|
||||
test("duplicate definition id is reported (first-wins)", () => {
|
||||
const md = ["See[^d].", "", "[^d]: first", "[^d]: second"].join("\n");
|
||||
const d = analyzeFootnotes(md);
|
||||
assert.deepEqual(d.duplicateDefinitions, ["d"]);
|
||||
assert.match(d.warnings.join("\n"), /defined more than once/);
|
||||
});
|
||||
|
||||
test("reference inside a GFM table row is reported (heuristic)", () => {
|
||||
const md = [
|
||||
"| Col |",
|
||||
"| --- |",
|
||||
"| cell[^t] |",
|
||||
"",
|
||||
"[^t]: table note",
|
||||
].join("\n");
|
||||
const d = analyzeFootnotes(md);
|
||||
assert.deepEqual(d.referencesInTables, ["t"]);
|
||||
assert.match(d.warnings.join("\n"), /table/);
|
||||
// It is defined, so it is NOT also dangling.
|
||||
assert.deepEqual(d.danglingReferences, []);
|
||||
});
|
||||
|
||||
test("footnote syntax inside a code fence is ignored", () => {
|
||||
const md = [
|
||||
"Intro.",
|
||||
"",
|
||||
"```",
|
||||
"Example[^demo]",
|
||||
"[^demo]: not a real definition",
|
||||
"```",
|
||||
"",
|
||||
"Outro[^a].",
|
||||
"",
|
||||
"[^a]: real",
|
||||
].join("\n");
|
||||
const d = analyzeFootnotes(md);
|
||||
// `[^demo]` lives only in the fenced block, so it is neither a reference nor a
|
||||
// dangling one, and `[^demo]:` is not counted as a definition.
|
||||
assert.deepEqual(d.danglingReferences, []);
|
||||
assert.deepEqual(d.duplicateDefinitions, []);
|
||||
assert.deepEqual(d.warnings, []);
|
||||
});
|
||||
|
||||
test("a reference that only appears inside a definition's text is not dangling", () => {
|
||||
// `[^b]` is referenced from within [^a]'s text and has its own definition.
|
||||
const md = ["See[^a].", "", "[^a]: see also [^b]", "[^b]: the other"].join(
|
||||
"\n",
|
||||
);
|
||||
const d = analyzeFootnotes(md);
|
||||
assert.deepEqual(d.danglingReferences, []);
|
||||
});
|
||||
|
||||
test("multiple problem classes accumulate distinct warnings", () => {
|
||||
const md = [
|
||||
"Ref[^x] and[^dup].",
|
||||
"",
|
||||
"[^dup]: one",
|
||||
"[^dup]: two",
|
||||
"[^empty]:",
|
||||
].join("\n");
|
||||
const d = analyzeFootnotes(md);
|
||||
// x has no definition; dup is defined twice; empty is empty AND has no ref.
|
||||
assert.ok(d.danglingReferences.includes("x"));
|
||||
assert.deepEqual(d.duplicateDefinitions, ["dup"]);
|
||||
assert.deepEqual(d.emptyDefinitions, ["empty"]);
|
||||
// One warning line per problem class present.
|
||||
assert.ok(d.warnings.length >= 3);
|
||||
});
|
||||
@@ -90,11 +90,10 @@ test("JSON -> MD -> JSON preserves footnote ids and text", async () => {
|
||||
assert.match(md2, /\[\^fn2\]: Second note\./);
|
||||
});
|
||||
|
||||
test("duplicate-id markdown dedups DETERMINISTICALLY (same input -> same ids)", async () => {
|
||||
// The MCP import must derive duplicate ids deterministically (NOT random) so
|
||||
// the same markdown imported here and via the editor produces identical ids,
|
||||
// and re-importing is stable. This is the test that would FAIL on the old
|
||||
// Math.random()/Date.now() implementation.
|
||||
test("repeated references REUSE one footnote; duplicate definitions are first-wins (#166)", async () => {
|
||||
// Reuse semantics: many `[^d]` references + several `[^d]:` definitions import
|
||||
// as ONE footnote — the references all keep id "d" (reuse), and only the FIRST
|
||||
// definition is kept (first-wins). Deterministic and stable across re-imports.
|
||||
const md = [
|
||||
"See[^d] one[^d] two[^d].",
|
||||
"",
|
||||
@@ -106,21 +105,26 @@ test("duplicate-id markdown dedups DETERMINISTICALLY (same input -> same ids)",
|
||||
const idsOf = async () => {
|
||||
const json = await markdownToProseMirror(md);
|
||||
const refs = findAll(json, "footnoteReference").map((r) => r.attrs.id);
|
||||
const defs = findAll(json, "footnoteDefinition").map((d) => d.attrs.id);
|
||||
return { refs, defs };
|
||||
const defs = findAll(json, "footnoteDefinition");
|
||||
return {
|
||||
refs,
|
||||
defIds: defs.map((d) => d.attrs.id),
|
||||
defText: defs
|
||||
.map((d) => JSON.stringify(d).match(/"text":"([^"]*)"/)?.[1])
|
||||
.join("|"),
|
||||
};
|
||||
};
|
||||
|
||||
const a = await idsOf();
|
||||
const b = await idsOf();
|
||||
|
||||
// Identical across runs.
|
||||
assert.deepEqual(a.refs, b.refs);
|
||||
assert.deepEqual(a.defs, b.defs);
|
||||
// Deterministic derived scheme: keeper "d", duplicates "d__2", "d__3".
|
||||
assert.deepEqual([...a.defs].sort(), ["d", "d__2", "d__3"]);
|
||||
// 1:1 reference <-> definition pairing, all distinct.
|
||||
assert.equal(new Set(a.defs).size, 3);
|
||||
assert.deepEqual([...a.refs].sort(), [...a.defs].sort());
|
||||
// Stable across runs.
|
||||
assert.deepEqual(a, b);
|
||||
// Reuse: all three reference markers stay "d".
|
||||
assert.deepEqual(a.refs, ["d", "d", "d"]);
|
||||
// First-wins: a single definition "d" with the FIRST text.
|
||||
assert.deepEqual(a.defIds, ["d"]);
|
||||
assert.equal(a.defText, "first");
|
||||
});
|
||||
|
||||
test("a [^id]: line inside a fenced code block is NOT treated as a definition", async () => {
|
||||
|
||||
Reference in New Issue
Block a user