feat(git-sync): vendor pure converter + engine into @docmost/git-sync (Phase A.1)
First step of docs/git-sync-plan.md. New workspace package @docmost/git-sync vendoring the PURE parts from docmost-sync (HEAD b03eb35): - lib: markdown-converter, markdown-document, canonicalize, docmost-schema, node-ops, diff, and an extracted markdown-to-prosemirror (only the pure marked->HTML->generateJSON path from upstream collaboration.ts; no websocket). - engine (pure, no IO): reconcile, layout, sanitize, stabilize, loop-guard. Ported the upstream pure-module + round-trip corpus tests (vitest): 314 pass, 3 expected upstream known-limitation fails. tsc clean. No server wiring yet. docmost-schema inlines getStyleProperty (as packages/mcp does — @tiptap/core 3.20.4 doesn't export it). IO engine (pull/push/git/settings) deferred to later Phase A/B steps; the editor-ext idempotency gate (plan §13.1) is the next step. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
319
packages/git-sync/src/lib/diff.ts
Normal file
319
packages/git-sync/src/lib/diff.ts
Normal file
@@ -0,0 +1,319 @@
|
||||
/**
|
||||
* Headless, Docmost-equivalent document diff.
|
||||
*
|
||||
* Docmost's history editor computes a change set with the exact pipeline below
|
||||
* (recreateTransform -> ChangeSet.addSteps -> simplifyChanges) and renders it as
|
||||
* editor decorations. This module runs the SAME computation but serializes the
|
||||
* result to text + integrity counts instead of decorations, so a diff can be
|
||||
* previewed without a browser.
|
||||
*
|
||||
* recreateTransform here comes from @fellow/prosemirror-recreate-transform, the
|
||||
* maintained published fork of the MIT prosemirror-recreate-steps source that
|
||||
* Docmost vendors in @docmost/editor-ext; it exposes the identical
|
||||
* recreateTransform(fromDoc, toDoc, { complexSteps, wordDiffs, simplifyDiff })
|
||||
* signature.
|
||||
*
|
||||
* If recreateTransform / the changeset throws on a pathological document pair,
|
||||
* we fall back to a coarse block-level text diff so the tool never hard-fails.
|
||||
*/
|
||||
|
||||
import { getSchema } from "@tiptap/core";
|
||||
import { Node } from "@tiptap/pm/model";
|
||||
import { ChangeSet, simplifyChanges } from "@tiptap/pm/changeset";
|
||||
import { recreateTransform } from "@fellow/prosemirror-recreate-transform";
|
||||
import { docmostExtensions } from "./docmost-schema.js";
|
||||
|
||||
/** A single inserted/deleted change with its containing-block context. */
|
||||
export interface DiffChange {
|
||||
op: "insert" | "delete";
|
||||
/** Lead (plain) text of the block that contains the change, for context. */
|
||||
block: string;
|
||||
/** The inserted or deleted text. */
|
||||
text: string;
|
||||
}
|
||||
|
||||
/** Integrity counts as [old, new] tuples; footnoteMarkers as [oldList, newList]. */
|
||||
export interface DiffIntegrity {
|
||||
images: [number, number];
|
||||
links: [number, number];
|
||||
tables: [number, number];
|
||||
callouts: [number, number];
|
||||
footnoteMarkers: [number[], number[]];
|
||||
}
|
||||
|
||||
export interface DiffResult {
|
||||
summary: { inserted: number; deleted: number; blocksChanged: number };
|
||||
integrity: DiffIntegrity;
|
||||
changes: DiffChange[];
|
||||
/** Human-readable unified-ish summary. */
|
||||
markdown: string;
|
||||
}
|
||||
|
||||
/** Build the schema once; it is pure and reused across calls. */
|
||||
const schema = getSchema(docmostExtensions);
|
||||
|
||||
/** Recursively concatenate the plain text of a JSON node. */
|
||||
function plainText(node: any): string {
|
||||
if (!node || typeof node !== "object") return "";
|
||||
let out = "";
|
||||
if (typeof node.text === "string") out += node.text;
|
||||
if (Array.isArray(node.content)) {
|
||||
for (const child of node.content) out += plainText(child);
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
/** Count nodes in a JSON doc that satisfy `pred` (recursive). */
|
||||
function countNodes(doc: any, pred: (node: any) => boolean): number {
|
||||
let n = 0;
|
||||
const visit = (node: any): void => {
|
||||
if (!node || typeof node !== "object") return;
|
||||
if (pred(node)) n++;
|
||||
if (Array.isArray(node.content)) for (const c of node.content) visit(c);
|
||||
};
|
||||
visit(doc);
|
||||
return n;
|
||||
}
|
||||
|
||||
/**
|
||||
* Count UNIQUE links in a JSON doc by their `href`. A single link can be split
|
||||
* across several adjacent text runs (e.g. a "link+bold" run followed by a "link"
|
||||
* run); counting link-bearing runs would over-count it. Walking the tree and
|
||||
* collecting hrefs into a Set keys each distinct link once. Link marks with a
|
||||
* missing/empty href are bucketed under a single "" key so a malformed link is
|
||||
* still counted as one.
|
||||
*/
|
||||
function countUniqueLinks(doc: any): number {
|
||||
const hrefs = new Set<string>();
|
||||
const visit = (node: any): void => {
|
||||
if (!node || typeof node !== "object") return;
|
||||
if (node.type === "text" && Array.isArray(node.marks)) {
|
||||
for (const m of node.marks) {
|
||||
if (m && m.type === "link") {
|
||||
const href = m.attrs && typeof m.attrs.href === "string" ? m.attrs.href : "";
|
||||
hrefs.add(href);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (Array.isArray(node.content)) for (const c of node.content) visit(c);
|
||||
};
|
||||
visit(doc);
|
||||
return hrefs.size;
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse the ordered list of integers from `[N]` footnote markers found in the
|
||||
* BODY only (every top-level block before the first "Примечания..." notes
|
||||
* heading; if no such heading, the whole doc). Returned in reading order.
|
||||
*/
|
||||
function footnoteMarkers(doc: any, notesHeading: string): number[] {
|
||||
const top: any[] = Array.isArray(doc?.content) ? doc.content : [];
|
||||
const notesIdx = top.findIndex(
|
||||
(n) =>
|
||||
n &&
|
||||
n.type === "heading" &&
|
||||
plainText(n).trim() === notesHeading,
|
||||
);
|
||||
const bodyBlocks = notesIdx >= 0 ? top.slice(0, notesIdx) : top;
|
||||
const markers: number[] = [];
|
||||
const re = /\[(\d+)\]/g;
|
||||
for (const block of bodyBlocks) {
|
||||
const text = plainText(block);
|
||||
let m: RegExpExecArray | null;
|
||||
re.lastIndex = 0;
|
||||
while ((m = re.exec(text)) !== null) {
|
||||
markers.push(Number(m[1]));
|
||||
}
|
||||
}
|
||||
return markers;
|
||||
}
|
||||
|
||||
/** Compute the [old,new] integrity tuples for two JSON docs. */
|
||||
function computeIntegrity(
|
||||
oldDoc: any,
|
||||
newDoc: any,
|
||||
notesHeading: string,
|
||||
): DiffIntegrity {
|
||||
const images: [number, number] = [
|
||||
countNodes(oldDoc, (n) => n.type === "image"),
|
||||
countNodes(newDoc, (n) => n.type === "image"),
|
||||
];
|
||||
const links: [number, number] = [
|
||||
countUniqueLinks(oldDoc),
|
||||
countUniqueLinks(newDoc),
|
||||
];
|
||||
const tables: [number, number] = [
|
||||
countNodes(oldDoc, (n) => n.type === "table"),
|
||||
countNodes(newDoc, (n) => n.type === "table"),
|
||||
];
|
||||
const callouts: [number, number] = [
|
||||
countNodes(oldDoc, (n) => n.type === "callout"),
|
||||
countNodes(newDoc, (n) => n.type === "callout"),
|
||||
];
|
||||
const fns: [number[], number[]] = [
|
||||
footnoteMarkers(oldDoc, notesHeading),
|
||||
footnoteMarkers(newDoc, notesHeading),
|
||||
];
|
||||
return { images, links, tables, callouts, footnoteMarkers: fns };
|
||||
}
|
||||
|
||||
/**
|
||||
* Resolve the lead text of the top-level block in a ProseMirror Node that
|
||||
* contains the given document position. Returns "" when out of range.
|
||||
*/
|
||||
function blockContextAt(node: Node, pos: number): string {
|
||||
try {
|
||||
const clamped = Math.max(0, Math.min(pos, node.content.size));
|
||||
const $pos = node.resolve(clamped);
|
||||
// depth 1 is the top-level block in a doc node.
|
||||
const block = $pos.depth >= 1 ? $pos.node(1) : $pos.node(0);
|
||||
const text = block.textContent || "";
|
||||
return text.length > 80 ? text.slice(0, 77) + "..." : text;
|
||||
} catch {
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
/** Truncate a string for the markdown summary. */
|
||||
function truncate(s: string, n = 120): string {
|
||||
return s.length > n ? s.slice(0, n - 3) + "..." : s;
|
||||
}
|
||||
|
||||
/**
|
||||
* Coarse fallback: a block-by-block plain-text diff. Used only when the precise
|
||||
* changeset pipeline throws, so the tool degrades gracefully instead of failing.
|
||||
*/
|
||||
function coarseDiff(oldDoc: any, newDoc: any): DiffChange[] {
|
||||
const oldBlocks: any[] = Array.isArray(oldDoc?.content) ? oldDoc.content : [];
|
||||
const newBlocks: any[] = Array.isArray(newDoc?.content) ? newDoc.content : [];
|
||||
const oldTexts = oldBlocks.map(plainText);
|
||||
const newTexts = newBlocks.map(plainText);
|
||||
const oldSet = new Set(oldTexts);
|
||||
const newSet = new Set(newTexts);
|
||||
const changes: DiffChange[] = [];
|
||||
for (const t of oldTexts) {
|
||||
if (!newSet.has(t) && t.trim() !== "") {
|
||||
changes.push({ op: "delete", block: truncate(t, 80), text: t });
|
||||
}
|
||||
}
|
||||
for (const t of newTexts) {
|
||||
if (!oldSet.has(t) && t.trim() !== "") {
|
||||
changes.push({ op: "insert", block: truncate(t, 80), text: t });
|
||||
}
|
||||
}
|
||||
return changes;
|
||||
}
|
||||
|
||||
/** Build the human-readable unified-ish markdown summary. */
|
||||
function renderMarkdown(
|
||||
result: Omit<DiffResult, "markdown">,
|
||||
fellBack: boolean,
|
||||
): string {
|
||||
const lines: string[] = [];
|
||||
const { summary, integrity, changes } = result;
|
||||
lines.push(
|
||||
`# Diff: ${summary.inserted} inserted / ${summary.deleted} deleted (${summary.blocksChanged} blocks changed)`,
|
||||
);
|
||||
if (fellBack) {
|
||||
lines.push("");
|
||||
lines.push("> note: precise diff failed; coarse block-level diff shown.");
|
||||
}
|
||||
lines.push("");
|
||||
lines.push("## Integrity (old -> new)");
|
||||
lines.push(`- images: ${integrity.images[0]} -> ${integrity.images[1]}`);
|
||||
lines.push(`- links: ${integrity.links[0]} -> ${integrity.links[1]}`);
|
||||
lines.push(`- tables: ${integrity.tables[0]} -> ${integrity.tables[1]}`);
|
||||
lines.push(`- callouts: ${integrity.callouts[0]} -> ${integrity.callouts[1]}`);
|
||||
lines.push(
|
||||
`- footnoteMarkers: [${integrity.footnoteMarkers[0].join(", ")}] -> [${integrity.footnoteMarkers[1].join(", ")}]`,
|
||||
);
|
||||
lines.push("");
|
||||
lines.push("## Changes");
|
||||
if (changes.length === 0) {
|
||||
lines.push("(no textual changes)");
|
||||
} else {
|
||||
for (const c of changes) {
|
||||
const sign = c.op === "insert" ? "+" : "-";
|
||||
const ctx = c.block ? ` @ ${truncate(c.block, 60)}` : "";
|
||||
lines.push(`${sign} ${truncate(c.text)}${ctx}`);
|
||||
}
|
||||
}
|
||||
return lines.join("\n");
|
||||
}
|
||||
|
||||
/**
|
||||
* Diff two ProseMirror JSON documents the way Docmost's history editor does and
|
||||
* serialize the result to text + integrity counts.
|
||||
*
|
||||
* @param oldDocJson the earlier document
|
||||
* @param newDocJson the later document
|
||||
* @param notesHeading heading delimiting body from notes for footnote counting
|
||||
*/
|
||||
export function diffDocs(
|
||||
oldDocJson: any,
|
||||
newDocJson: any,
|
||||
notesHeading: string = "Примечания переводчика",
|
||||
): DiffResult {
|
||||
const integrity = computeIntegrity(oldDocJson, newDocJson, notesHeading);
|
||||
|
||||
let changes: DiffChange[] = [];
|
||||
let inserted = 0;
|
||||
let deleted = 0;
|
||||
let fellBack = false;
|
||||
const changedBlocks = new Set<string>();
|
||||
|
||||
try {
|
||||
const oldNode = Node.fromJSON(schema, oldDocJson);
|
||||
const newNode = Node.fromJSON(schema, newDocJson);
|
||||
const tr = recreateTransform(oldNode, newNode, {
|
||||
complexSteps: false,
|
||||
wordDiffs: true,
|
||||
simplifyDiff: true,
|
||||
});
|
||||
const changeSet = ChangeSet.create(oldNode).addSteps(
|
||||
tr.doc,
|
||||
tr.mapping.maps,
|
||||
[],
|
||||
);
|
||||
const simplified = simplifyChanges(changeSet.changes, newNode);
|
||||
|
||||
for (const change of simplified) {
|
||||
// Deleted text lives in the OLD doc coordinate range [fromA, toA).
|
||||
if (change.toA > change.fromA) {
|
||||
const text = oldNode.textBetween(change.fromA, change.toA, "\n", " ");
|
||||
if (text.length > 0) {
|
||||
deleted += text.length;
|
||||
const block = blockContextAt(oldNode, change.fromA);
|
||||
changes.push({ op: "delete", block, text });
|
||||
if (block) changedBlocks.add("d:" + block);
|
||||
}
|
||||
}
|
||||
// Inserted text lives in the NEW doc coordinate range [fromB, toB).
|
||||
if (change.toB > change.fromB) {
|
||||
const text = newNode.textBetween(change.fromB, change.toB, "\n", " ");
|
||||
if (text.length > 0) {
|
||||
inserted += text.length;
|
||||
const block = blockContextAt(newNode, change.fromB);
|
||||
changes.push({ op: "insert", block, text });
|
||||
if (block) changedBlocks.add("i:" + block);
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch {
|
||||
// Pathological pair: degrade to a coarse block-level diff so we never throw.
|
||||
fellBack = true;
|
||||
changes = coarseDiff(oldDocJson, newDocJson);
|
||||
for (const c of changes) {
|
||||
if (c.op === "insert") inserted += c.text.length;
|
||||
else deleted += c.text.length;
|
||||
if (c.block) changedBlocks.add(c.op[0] + ":" + c.block);
|
||||
}
|
||||
}
|
||||
|
||||
const partial: Omit<DiffResult, "markdown"> = {
|
||||
summary: { inserted, deleted, blocksChanged: changedBlocks.size },
|
||||
integrity,
|
||||
changes,
|
||||
};
|
||||
return { ...partial, markdown: renderMarkdown(partial, fellBack) };
|
||||
}
|
||||
Reference in New Issue
Block a user