edit_page_text (applyTextEdits) now matches at the inline-block level instead of
per text node, so a find/replace may cross bold/italic/link boundaries; the
replacement inherits marks from the unchanged common prefix/suffix via a diff
splice. Atom (non-text inline) slots can never be part of a match, making the
U+FFFC placeholder collision-safe, and inserted text never inherits an atom's
marks.
The edit batch is no longer all-or-nothing: applyTextEdits returns
{ doc, results, failed } and applies what it can; editPageText writes only on a
real change (no spurious history version for a no-op) and throws an aggregated,
actionable error only when nothing applied.
The AI-chat insert_node / patch_node / update_page_json tools now JSON.parse a
node/content argument that arrives as a string, matching the standalone MCP
server (this is what made insert_node fail under OpenAI tool calls).
Tool descriptions gain concrete ProseMirror examples and reflect the new
edit_page_text behavior. Adds/updates json-edit unit tests (183 pass).
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
313 lines
12 KiB
JavaScript
313 lines
12 KiB
JavaScript
/**
|
|
* Surgical text edits on a ProseMirror document without re-importing it.
|
|
*
|
|
* Each edit replaces an exact substring of a block's inline text, preserving
|
|
* every node id, mark and attribute around it. Matching works at the
|
|
* INLINE-CONTAINER (block) level: a block's text nodes are flattened into a
|
|
* per-character array, so a `find` may freely cross bold/italic/link
|
|
* boundaries (separate text nodes). The replacement inherits marks from the
|
|
* unchanged common prefix/suffix of the match, so editing plain text next to a
|
|
* bold word keeps the bold word bold, and editing the inside of a bold word
|
|
* keeps the inserted text bold. This is the safe alternative to a full markdown
|
|
* re-import for small wording fixes.
|
|
*/
|
|
/** Placeholder code unit standing in for one opaque (non-text) inline node. */
|
|
const ATOM_PLACEHOLDER = ""; // OBJECT REPLACEMENT CHARACTER
|
|
/**
|
|
* Find every VALID occurrence of `needle` in a block's flattened slots.
|
|
*
|
|
* A candidate occurrence at slot range [start, start+needle.length) is valid
|
|
* ONLY IF none of the slots in that range are atoms (non-text inline nodes).
|
|
* This makes atom matching collision-safe against the U+FFFC placeholder: an
|
|
* atom slot can never be part of a match, while a real text node containing a
|
|
* literal U+FFFC code unit still matches normally (its slot has no `.atom`).
|
|
*
|
|
* Overlapping candidates that touch an atom are skipped (not counted, not
|
|
* spliced); the scan resumes one code unit past the rejected start so a valid
|
|
* match that begins just after an atom is not missed.
|
|
*/
|
|
function findValidMatches(chars, plain, needle) {
|
|
if (!needle)
|
|
return [];
|
|
const positions = [];
|
|
let idx = plain.indexOf(needle);
|
|
while (idx !== -1) {
|
|
const end = idx + needle.length;
|
|
let hasAtom = false;
|
|
for (let i = idx; i < end; i++) {
|
|
if (chars[i] && chars[i].atom) {
|
|
hasAtom = true;
|
|
break;
|
|
}
|
|
}
|
|
if (!hasAtom) {
|
|
positions.push(idx);
|
|
// Non-overlapping: skip past this match.
|
|
idx = plain.indexOf(needle, end);
|
|
}
|
|
else {
|
|
// This candidate crosses an atom: reject it and resume one unit later so
|
|
// an overlapping valid match starting after the atom is still found.
|
|
idx = plain.indexOf(needle, idx + 1);
|
|
}
|
|
}
|
|
return positions;
|
|
}
|
|
/** Order-sensitive deep-equality of two marks arrays. */
|
|
function marksEqual(a, b) {
|
|
if (a === b)
|
|
return true;
|
|
if (a.length !== b.length)
|
|
return false;
|
|
for (let i = 0; i < a.length; i++) {
|
|
if (JSON.stringify(a[i]) !== JSON.stringify(b[i]))
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
/** A block is any node that DIRECTLY contains at least one inline text child. */
|
|
function isInlineBlock(node) {
|
|
return (Array.isArray(node?.content) &&
|
|
node.content.some((child) => child && child.type === "text"));
|
|
}
|
|
/** Flatten a block's inline content into a per-code-unit slot array. */
|
|
function flattenBlock(node) {
|
|
const chars = [];
|
|
for (const child of node.content || []) {
|
|
if (child && child.type === "text" && typeof child.text === "string") {
|
|
const marks = child.marks || [];
|
|
// Iterate by UTF-16 code unit so indices align with String.indexOf.
|
|
for (let i = 0; i < child.text.length; i++) {
|
|
chars.push({ ch: child.text[i], marks });
|
|
}
|
|
}
|
|
else {
|
|
// Any non-text inline node becomes one opaque slot.
|
|
chars.push({
|
|
ch: ATOM_PLACEHOLDER,
|
|
marks: (child && child.marks) || [],
|
|
atom: child,
|
|
});
|
|
}
|
|
}
|
|
return chars;
|
|
}
|
|
/** Re-tokenize a slot array back into ProseMirror inline nodes. */
|
|
function tokenizeChars(chars) {
|
|
const out = [];
|
|
let buffer = "";
|
|
let bufferMarks = null;
|
|
const flush = () => {
|
|
if (buffer.length === 0)
|
|
return;
|
|
const textNode = { type: "text", text: buffer };
|
|
if (bufferMarks && bufferMarks.length > 0)
|
|
textNode.marks = bufferMarks;
|
|
out.push(textNode);
|
|
buffer = "";
|
|
bufferMarks = null;
|
|
};
|
|
for (const slot of chars) {
|
|
if (slot.atom) {
|
|
flush();
|
|
out.push(slot.atom);
|
|
continue;
|
|
}
|
|
if (bufferMarks !== null && !marksEqual(bufferMarks, slot.marks)) {
|
|
flush();
|
|
}
|
|
if (bufferMarks === null)
|
|
bufferMarks = slot.marks;
|
|
buffer += slot.ch;
|
|
}
|
|
flush();
|
|
return out;
|
|
}
|
|
/** Longest common prefix length of two strings. */
|
|
function commonPrefixLen(a, b) {
|
|
const max = Math.min(a.length, b.length);
|
|
let i = 0;
|
|
while (i < max && a[i] === b[i])
|
|
i++;
|
|
return i;
|
|
}
|
|
/** Longest common suffix length of two strings, capped so it can't overlap. */
|
|
function commonSuffixLen(a, b, cap) {
|
|
const max = Math.min(a.length, b.length, cap);
|
|
let i = 0;
|
|
while (i < max && a[a.length - 1 - i] === b[b.length - 1 - i])
|
|
i++;
|
|
return i;
|
|
}
|
|
/**
|
|
* Apply one edit to one block's flattened slot array.
|
|
*
|
|
* The caller passes only VALID (atom-free) match positions (see
|
|
* findValidMatches), so no match range can overlap an atom slot here.
|
|
*/
|
|
function applyEditToChars(chars, edit, matchPositions) {
|
|
// Pre-compute the diff slices once (find/replace are constant per edit).
|
|
const p = commonPrefixLen(edit.find, edit.replace);
|
|
const s = commonSuffixLen(edit.find, edit.replace, Math.min(edit.find.length, edit.replace.length) - p);
|
|
const insertText = edit.replace.slice(p, edit.replace.length - s);
|
|
// Rebuild the slot array in a single left-to-right pass, splicing at each
|
|
// match start. Offsets into `chars` stay valid because we copy through.
|
|
const newChars = [];
|
|
let cursor = 0;
|
|
let spliced = 0;
|
|
for (const mStart of matchPositions) {
|
|
const mEnd = mStart + edit.find.length;
|
|
const changedStart = mStart + p;
|
|
const changedEnd = mEnd - s;
|
|
// Copy through everything up to the changed region (incl. the prefix).
|
|
for (; cursor < changedStart; cursor++)
|
|
newChars.push(chars[cursor]);
|
|
const removed = chars.slice(changedStart, changedEnd);
|
|
// Choose the marks for the inserted characters.
|
|
let chosenMarks = [];
|
|
if (removed.length > 0 &&
|
|
removed.every((r) => marksEqual(r.marks, removed[0].marks))) {
|
|
// Uniform removed region: inherit its marks directly.
|
|
chosenMarks = removed[0].marks;
|
|
}
|
|
else {
|
|
// Empty or non-uniform removed region: inherit from the nearest TEXT
|
|
// neighbour, skipping atom slots (an atom carries marks that do not
|
|
// belong on inserted text). Scan left first, then right; fall back to [].
|
|
let inherited = null;
|
|
for (let i = changedStart - 1; i >= 0; i--) {
|
|
if (!chars[i].atom) {
|
|
inherited = chars[i].marks;
|
|
break;
|
|
}
|
|
}
|
|
if (inherited === null) {
|
|
for (let i = changedEnd; i < chars.length; i++) {
|
|
if (!chars[i].atom) {
|
|
inherited = chars[i].marks;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
chosenMarks = inherited === null ? [] : inherited;
|
|
}
|
|
// Emit the inserted text (one slot per code unit).
|
|
for (let i = 0; i < insertText.length; i++) {
|
|
newChars.push({ ch: insertText[i], marks: chosenMarks });
|
|
}
|
|
// Skip the removed region.
|
|
cursor = changedEnd;
|
|
spliced++;
|
|
}
|
|
// Copy through the tail.
|
|
for (; cursor < chars.length; cursor++)
|
|
newChars.push(chars[cursor]);
|
|
return { newChars, spliced };
|
|
}
|
|
/**
|
|
* Apply text edits to a ProseMirror doc (operates on a deep copy, returns it).
|
|
*
|
|
* Returns { doc, results, failed }:
|
|
* - results: edits that applied (replacements >= 1).
|
|
* - failed: edits that matched zero times, were ambiguous (multi-match
|
|
* without replaceAll), or whose changed region crosses a non-text inline
|
|
* node. These do NOT throw — they are recorded so the caller can surface an
|
|
* actionable message and still keep the edits that did apply.
|
|
*
|
|
* Edits apply IN ORDER to the same working copy, so a later edit can target
|
|
* text produced by an earlier one. The input doc is never mutated. The only
|
|
* thrown error is for invalid input (an empty `edit.find`).
|
|
*/
|
|
export function applyTextEdits(doc, edits) {
|
|
const copy = JSON.parse(JSON.stringify(doc));
|
|
const results = [];
|
|
const failed = [];
|
|
for (const edit of edits) {
|
|
if (!edit.find)
|
|
throw new Error("edit.find must be a non-empty string");
|
|
// Gather every inline block in document order (recurse the whole tree so
|
|
// nested containers — callouts, list items, table cells, blockquotes — are
|
|
// all covered).
|
|
const blocks = [];
|
|
(function collect(node) {
|
|
if (isInlineBlock(node))
|
|
blocks.push(node);
|
|
for (const child of node.content || [])
|
|
collect(child);
|
|
})(copy);
|
|
// Find every VALID (atom-free) occurrence per block. A candidate whose slot
|
|
// range overlaps a non-text inline atom is never a match (collision-safe vs
|
|
// the U+FFFC placeholder), so it is excluded from both the uniqueness count
|
|
// and the splicing.
|
|
const blockChars = blocks.map((b) => flattenBlock(b));
|
|
const blockPlain = blockChars.map((chars) => chars.map((c) => c.ch).join(""));
|
|
const validPerBlock = blockChars.map((chars, b) => findValidMatches(chars, blockPlain[b], edit.find));
|
|
let total = 0;
|
|
for (const positions of validPerBlock)
|
|
total += positions.length;
|
|
if (total === 0) {
|
|
// Distinguish "the text exists but only across an atom" from a plain
|
|
// not-found: if a raw substring scan (atoms included) WOULD have hit,
|
|
// the only thing blocking the edit is the atom, so report that.
|
|
const existsAcrossAtom = blockPlain.some((plain) => plain.indexOf(edit.find) !== -1);
|
|
failed.push({
|
|
find: edit.find,
|
|
reason: existsAcrossAtom
|
|
? "match crosses a non-text inline node (image/break/mention); use update_page_json for structural changes."
|
|
: "text not found in the document.",
|
|
});
|
|
continue;
|
|
}
|
|
if (total > 1 && !edit.replaceAll) {
|
|
failed.push({
|
|
find: edit.find,
|
|
reason: `matches ${total} times. Provide a longer, unique fragment or set replaceAll: true.`,
|
|
});
|
|
continue;
|
|
}
|
|
// Plan the splices from the valid positions. For a non-replaceAll edit we
|
|
// splice only the first valid match (left-to-right across blocks); for
|
|
// replaceAll we splice every valid match.
|
|
const plannedPerBlock = blockChars.map(() => []);
|
|
let takenFirst = false;
|
|
for (let b = 0; b < validPerBlock.length; b++) {
|
|
for (const idx of validPerBlock[b]) {
|
|
if (edit.replaceAll) {
|
|
plannedPerBlock[b].push(idx);
|
|
}
|
|
else if (!takenFirst) {
|
|
plannedPerBlock[b].push(idx);
|
|
takenFirst = true;
|
|
break;
|
|
}
|
|
else {
|
|
break;
|
|
}
|
|
}
|
|
if (!edit.replaceAll && takenFirst)
|
|
break;
|
|
}
|
|
// Apply the splices block-by-block and re-tokenize changed blocks.
|
|
let spliced = 0;
|
|
for (let b = 0; b < blocks.length; b++) {
|
|
if (plannedPerBlock[b].length === 0)
|
|
continue;
|
|
const { newChars, spliced: n } = applyEditToChars(blockChars[b], edit, plannedPerBlock[b]);
|
|
spliced += n;
|
|
blocks[b].content = tokenizeChars(newChars);
|
|
}
|
|
results.push({ find: edit.find, replacements: spliced });
|
|
}
|
|
// Safety net: drop any empty text nodes (ProseMirror forbids them). The
|
|
// re-tokenizer never emits empty text nodes, but untouched blocks could in
|
|
// principle carry one in from upstream.
|
|
(function prune(node) {
|
|
if (Array.isArray(node.content)) {
|
|
node.content = node.content.filter((child) => !(child.type === "text" && child.text === ""));
|
|
for (const child of node.content)
|
|
prune(child);
|
|
}
|
|
})(copy);
|
|
return { doc: copy, results, failed };
|
|
}
|