// docmost-sync/packages/docmost-client/src/lib/diff.ts

/**
 * Headless, Docmost-equivalent document diff.
 *
 * Docmost's history editor computes a change set with the exact pipeline below
 * (recreateTransform -> ChangeSet.addSteps -> simplifyChanges) and renders it as
 * editor decorations. This module runs the SAME computation but serializes the
 * result to text + integrity counts instead of decorations, so a diff can be
 * previewed without a browser.
 *
 * recreateTransform here comes from @fellow/prosemirror-recreate-transform, the
 * maintained published fork of the MIT prosemirror-recreate-steps source that
 * Docmost vendors in @docmost/editor-ext; it exposes the identical
 * recreateTransform(fromDoc, toDoc, { complexSteps, wordDiffs, simplifyDiff })
 * signature.
 *
 * If recreateTransform / the changeset throws on a pathological document pair,
 * we fall back to a coarse block-level text diff so the tool never hard-fails.
 */

import { getSchema } from "@tiptap/core";
import { Node } from "@tiptap/pm/model";
import { ChangeSet, simplifyChanges } from "@tiptap/pm/changeset";
import { recreateTransform } from "@fellow/prosemirror-recreate-transform";
import { docmostExtensions } from "./docmost-schema.js";

/**
 * One textual change reported by the diff: a run of text that was either
 * inserted into the new document or deleted from the old one, together with
 * the context of the block it belongs to.
 */
export interface DiffChange {
  /** Whether `text` was added ("insert") or removed ("delete"). */
  op: "insert" | "delete";
  /**
   * Lead (plain) text of the block that contains the change, for context.
   * May be empty when no context could be resolved.
   */
  block: string;
  /** The inserted or deleted text. */
  text: string;
}

/**
 * Structural counts for both documents. Every count is an [old, new] tuple so
 * a gained or lost structure is visible at a glance; footnoteMarkers is an
 * [oldList, newList] pair of marker numbers instead of a pair of counts.
 */
export interface DiffIntegrity {
  /** Number of `image` nodes. */
  images: [number, number];
  /** Number of distinct link hrefs (not the number of link-bearing text runs). */
  links: [number, number];
  /** Number of `table` nodes. */
  tables: [number, number];
  /** Number of `callout` nodes. */
  callouts: [number, number];
  /** Numbers parsed from `[N]` markers in the document body, in reading order. */
  footnoteMarkers: [number[], number[]];
}

/** The complete result returned by diffDocs for one pair of documents. */
export interface DiffResult {
  /**
   * Totals over `changes`: `inserted` / `deleted` are the summed lengths of the
   * inserted / deleted texts; `blocksChanged` is the number of distinct
   * (operation, non-empty block context) pairs.
   */
  summary: { inserted: number; deleted: number; blocksChanged: number };
  /** Structural [old, new] counts for the two documents. */
  integrity: DiffIntegrity;
  /** The individual insertions and deletions. */
  changes: DiffChange[];
  /** Human-readable unified-ish summary. */
  markdown: string;
}

/**
 * ProseMirror schema derived from the Docmost extension list. It is built once
 * when the module loads and reused by every diffDocs call to parse the JSON
 * documents into nodes.
 */
const schema = getSchema(docmostExtensions);

/**
 * Flatten a JSON node to plain text: the node's own `text` (when it is a
 * string) followed by the plain text of each child in `content`, depth-first.
 * Anything that is not an object yields "".
 */
function plainText(node: any): string {
  if (node === null || typeof node !== "object") return "";
  const ownText: string = typeof node.text === "string" ? node.text : "";
  if (!Array.isArray(node.content)) return ownText;
  return node.content.reduce(
    (acc: string, child: any) => acc + plainText(child),
    ownText,
  );
}

/**
 * Count how many nodes of a JSON doc satisfy `pred`. The doc itself and every
 * descendant reachable through `content` arrays are tested, parents before
 * their children; non-object entries are skipped.
 */
function countNodes(doc: any, pred: (node: any) => boolean): number {
  const tally = (node: any): number => {
    if (node === null || typeof node !== "object") return 0;
    let total = pred(node) ? 1 : 0;
    if (Array.isArray(node.content)) {
      for (const child of node.content) total += tally(child);
    }
    return total;
  };
  return tally(doc);
}

/**
 * Count the DISTINCT link targets in a JSON doc.
 *
 * One link is often stored as several adjacent text runs (for instance a
 * "link+bold" run next to a plain "link" run), so counting runs would inflate
 * the figure. Instead every `link` mark found on a text node contributes its
 * `href` to a Set, and the Set's size is returned. A link mark whose href is
 * missing or not a string is recorded under the key "", so malformed links
 * collectively count as one.
 */
function countUniqueLinks(doc: any): number {
  const seen = new Set<string>();
  const walk = (node: any): void => {
    if (node === null || typeof node !== "object") return;
    // Only text nodes carry link marks that matter here.
    const marks: any[] =
      node.type === "text" && Array.isArray(node.marks) ? node.marks : [];
    for (const mark of marks) {
      if (!mark || mark.type !== "link") continue;
      const href = mark.attrs?.href;
      seen.add(typeof href === "string" ? href : "");
    }
    if (Array.isArray(node.content)) {
      for (const child of node.content) walk(child);
    }
  };
  walk(doc);
  return seen.size;
}

/**
 * Collect the integers of every `[N]` footnote marker in the document BODY, in
 * reading order.
 *
 * The body is every top-level block that precedes the first `heading` whose
 * trimmed plain text equals `notesHeading`; when no such heading exists the
 * whole document counts as body.
 */
function footnoteMarkers(doc: any, notesHeading: string): number[] {
  const blocks: any[] = Array.isArray(doc?.content) ? doc.content : [];
  const cut = blocks.findIndex(
    (b) => b && b.type === "heading" && plainText(b).trim() === notesHeading,
  );
  const body = cut < 0 ? blocks : blocks.slice(0, cut);
  return body.flatMap((b) =>
    Array.from(plainText(b).matchAll(/\[(\d+)\]/g), (hit) => Number(hit[1])),
  );
}

/**
 * Measure both documents the same way and pair the results as [old, new]:
 * image / table / callout node counts, distinct link count, and the body
 * footnote marker lists (delimited by `notesHeading`).
 */
function computeIntegrity(
  oldDoc: any,
  newDoc: any,
  notesHeading: string,
): DiffIntegrity {
  // Apply one measurement to the old doc, then the new doc.
  const pairOf = <T>(measure: (doc: any) => T): [T, T] => [
    measure(oldDoc),
    measure(newDoc),
  ];
  const ofType =
    (type: string) =>
    (doc: any): number =>
      countNodes(doc, (n) => n.type === type);

  return {
    images: pairOf(ofType("image")),
    links: pairOf(countUniqueLinks),
    tables: pairOf(ofType("table")),
    callouts: pairOf(ofType("callout")),
    footnoteMarkers: pairOf((doc) => footnoteMarkers(doc, notesHeading)),
  };
}

/**
 * Resolve the lead text (at most 80 characters) of the top-level block that a
 * document position belongs to.
 *
 * The position is clamped into the document first. When it lies inside a
 * top-level block, that block is used. When it lies BETWEEN top-level blocks
 * (depth 0, the typical start of a whole-block insertion or deletion), the
 * block that starts at the position is used, or the block that ends there if
 * the position is the end of the document. Using the doc node itself in that
 * case would report the opening text of the entire document as context.
 *
 * @param node the document node to resolve the position in
 * @param pos document position of the change
 * @returns the block's lead text, or "" when the document has no block at the
 *   position or the position cannot be resolved
 */
function blockContextAt(node: Node, pos: number): string {
  try {
    const clamped = Math.max(0, Math.min(pos, node.content.size));
    const $pos = node.resolve(clamped);
    const block =
      $pos.depth >= 1
        ? $pos.node(1) // ancestor at depth 1 is the enclosing top-level block
        : ($pos.nodeAfter ?? $pos.nodeBefore); // on a block boundary
    const text = block?.textContent ?? "";
    return truncate(text, 80);
  } catch {
    // Best effort: context is optional, so an unresolvable position yields none.
    return "";
  }
}

/**
 * Shorten a string for display. Strings of at most `n` UTF-16 code units are
 * returned unchanged; longer ones are cut to `n - 3` code units and suffixed
 * with "...".
 *
 * The cut never lands inside a surrogate pair (an emoji or other astral
 * character is dropped whole rather than halved), and a limit below 3 yields
 * just "..." instead of slicing from the end of the string.
 *
 * @param s the text to shorten
 * @param n maximum length of the result, ellipsis included (default 120)
 * @returns `s` itself, or its truncated form ending in "..."
 */
function truncate(s: string, n = 120): string {
  if (s.length <= n) return s;
  let cut = Math.max(0, n - 3);
  if (cut > 0) {
    const last = s.charCodeAt(cut - 1);
    // A high surrogate at the end means its low half was cut off: drop it too.
    if (last >= 0xd800 && last <= 0xdbff) cut -= 1;
  }
  return s.slice(0, cut) + "...";
}

/**
 * Coarse fallback diff over top-level blocks, compared by plain text only.
 * Every non-blank old block whose text appears nowhere in the new document is
 * reported as a deletion, then every non-blank new block whose text appears
 * nowhere in the old document is reported as an insertion. Intended for use
 * only when the precise changeset pipeline throws.
 */
function coarseDiff(oldDoc: any, newDoc: any): DiffChange[] {
  const blockTexts = (doc: any): string[] => {
    const blocks: any[] = Array.isArray(doc?.content) ? doc.content : [];
    return blocks.map((b) => plainText(b));
  };
  const before = blockTexts(oldDoc);
  const after = blockTexts(newDoc);

  // Texts present on one side but absent from the other, blank blocks ignored.
  const onlyIn = (
    texts: string[],
    other: Set<string>,
    op: DiffChange["op"],
  ): DiffChange[] =>
    texts
      .filter((t) => !other.has(t) && t.trim() !== "")
      .map((t) => ({ op, block: truncate(t, 80), text: t }));

  return [
    ...onlyIn(before, new Set(after), "delete"),
    ...onlyIn(after, new Set(before), "insert"),
  ];
}

/**
 * Build the human-readable unified-ish markdown summary: a headline with the
 * totals, the integrity section, then one `+` / `-` line per change.
 *
 * Change text that spans several blocks contains line breaks; they are shown
 * as a literal `\n` so each change stays on a single signed line instead of
 * spilling onto unsigned continuation lines.
 *
 * @param result the diff data to render (everything except `markdown`)
 * @param fellBack true when `changes` holds the coarse block-level diff; adds
 *   a note saying the precise diff failed
 * @returns the report as one newline-joined string
 */
function renderMarkdown(
  result: Omit<DiffResult, "markdown">,
  fellBack: boolean,
): string {
  const { summary, integrity, changes } = result;
  const oneLine = (s: string): string => s.replace(/\r?\n/g, "\\n");

  const lines: string[] = [
    `# Diff: ${summary.inserted} inserted / ${summary.deleted} deleted (${summary.blocksChanged} blocks changed)`,
  ];
  if (fellBack) {
    lines.push("", "> note: precise diff failed; coarse block-level diff shown.");
  }
  lines.push(
    "",
    "## Integrity (old -> new)",
    `- images: ${integrity.images[0]} -> ${integrity.images[1]}`,
    `- links: ${integrity.links[0]} -> ${integrity.links[1]}`,
    `- tables: ${integrity.tables[0]} -> ${integrity.tables[1]}`,
    `- callouts: ${integrity.callouts[0]} -> ${integrity.callouts[1]}`,
    `- footnoteMarkers: [${integrity.footnoteMarkers[0].join(", ")}] -> [${integrity.footnoteMarkers[1].join(", ")}]`,
    "",
    "## Changes",
  );
  if (changes.length === 0) {
    lines.push("(no textual changes)");
  } else {
    for (const c of changes) {
      const sign = c.op === "insert" ? "+" : "-";
      const ctx = c.block ? ` @ ${truncate(oneLine(c.block), 60)}` : "";
      lines.push(`${sign} ${truncate(oneLine(c.text))}${ctx}`);
    }
  }
  return lines.join("\n");
}

/**
 * Run the precise pipeline (recreateTransform -> ChangeSet.addSteps ->
 * simplifyChanges) and serialize every simplified change to DiffChange
 * entries. Throws whatever the pipeline throws; the returned list is only
 * produced when the whole pipeline succeeded.
 */
function computePreciseChanges(oldDocJson: any, newDocJson: any): DiffChange[] {
  const oldNode = Node.fromJSON(schema, oldDocJson);
  const newNode = Node.fromJSON(schema, newDocJson);
  const tr = recreateTransform(oldNode, newNode, {
    complexSteps: false,
    wordDiffs: true,
    simplifyDiff: true,
  });
  const changeSet = ChangeSet.create(oldNode).addSteps(
    tr.doc,
    tr.mapping.maps,
    [],
  );
  const simplified = simplifyChanges(changeSet.changes, newNode);

  const changes: DiffChange[] = [];
  for (const change of simplified) {
    // Deleted text lives in the OLD doc coordinate range [fromA, toA).
    if (change.toA > change.fromA) {
      const text = oldNode.textBetween(change.fromA, change.toA, "\n", " ");
      if (text.length > 0) {
        changes.push({
          op: "delete",
          block: blockContextAt(oldNode, change.fromA),
          text,
        });
      }
    }
    // Inserted text lives in the NEW doc coordinate range [fromB, toB).
    if (change.toB > change.fromB) {
      const text = newNode.textBetween(change.fromB, change.toB, "\n", " ");
      if (text.length > 0) {
        changes.push({
          op: "insert",
          block: blockContextAt(newNode, change.fromB),
          text,
        });
      }
    }
  }
  return changes;
}

/**
 * Diff two ProseMirror JSON documents the way Docmost's history editor does and
 * serialize the result to text + integrity counts.
 *
 * If the precise pipeline throws, the result is built from a coarse
 * block-level diff instead and the markdown carries a note saying so. The
 * summary is always tallied from the final `changes` list, so a pipeline that
 * fails halfway cannot leak partial counts into the fallback result.
 *
 * @param oldDocJson the earlier document
 * @param newDocJson the later document
 * @param notesHeading heading delimiting body from notes for footnote counting
 * @returns the changes, their totals (text lengths and distinct changed block
 *   contexts), the integrity counts, and a markdown rendering of all of it
 */
export function diffDocs(
  oldDocJson: any,
  newDocJson: any,
  notesHeading: string = "Примечания переводчика",
): DiffResult {
  const integrity = computeIntegrity(oldDocJson, newDocJson, notesHeading);

  let changes: DiffChange[];
  let fellBack = false;
  try {
    changes = computePreciseChanges(oldDocJson, newDocJson);
  } catch {
    // Pathological pair: degrade to a coarse block-level diff so we never throw.
    fellBack = true;
    changes = coarseDiff(oldDocJson, newDocJson);
  }

  let inserted = 0;
  let deleted = 0;
  const changedBlocks = new Set<string>();
  for (const c of changes) {
    if (c.op === "insert") inserted += c.text.length;
    else deleted += c.text.length;
    // Key by operation + context so the old and new side count separately.
    if (c.block) changedBlocks.add((c.op === "insert" ? "i:" : "d:") + c.block);
  }

  const partial: Omit<DiffResult, "markdown"> = {
    summary: { inserted, deleted, blocksChanged: changedBlocks.size },
    integrity,
    changes,
  };
  return { ...partial, markdown: renderMarkdown(partial, fellBack) };
}