// Reconstructed from the patch entries for
// packages/git-sync/build/engine/layout.d.ts and layout.js: the declarations
// and the implementation are merged into one typed module.
/**
 * Pure page-tree -> vault path mapping (SPEC §12).
 *
 * Given the flat list of page nodes for a space (as returned by
 * `listAllSpacePages`), compute for every page a deterministic, collision-free
 * destination: a folder path (root -> leaf ancestors) plus a file stem (the
 * page's own name, no extension). This module performs no I/O; its only
 * dependency is the sanitization helpers. The names are COSMETIC; identity
 * lives in each file's meta block (pageId / slugId).
 */
import { sanitizeTitle, disambiguate } from "./sanitize.js";

/** Flat page node as returned by `listAllSpacePages` (no content). */
export interface PageNode {
  id: string;
  title?: string;
  slugId?: string;
  parentPageId?: string | null;
  hasChildren?: boolean;
}

/** A page's resolved vault destination: folder path + file stem. */
export interface VaultEntry {
  /** Folder path, root -> leaf (the page's ancestors). Empty for a root page. */
  segments: string[];
  /** The page's own file name without extension. */
  stem: string;
}

/**
 * Sibling-bucket key shared by real root pages AND orphans (pages whose parent
 * is not part of the input). Both end up with `segments: []`, so they must be
 * disambiguated against each other.
 */
const ROOT_BUCKET = "__root__";

/**
 * Return the first candidate name that `isTaken` rejects, escalating in a
 * fixed, deterministic order:
 *
 *   1. `base`
 *   2. `base ~<sanitized slugId>` (sanitized id when there is no slugId)
 *   3. the previous candidate with ` ~<sanitized id>` appended
 *   4. the previous candidate with ` ~2`, ` ~3`, ... appended
 *
 * Every suffix goes through `sanitizeTitle`, because slugId/id is untrusted
 * data that must never inject a path separator. Step 4 exists because steps
 * 2-3 are not guaranteed to be free: a sibling may literally be titled
 * "Title ~slug", and two different ids can sanitize to the same text. The
 * counter loop terminates because it produces a new distinct candidate on
 * every iteration while the set of taken names is finite.
 */
function pickUnusedName(
  base: string,
  node: PageNode,
  isTaken: (candidate: string) => boolean,
): string {
  if (!isTaken(base)) return base;

  let candidate = disambiguate(base, sanitizeTitle(node.slugId ?? node.id));
  if (!isTaken(candidate)) return candidate;

  candidate = disambiguate(candidate, sanitizeTitle(node.id));
  const stemBeforeCounter = candidate;
  for (let n = 2; isTaken(candidate); n++) {
    candidate = disambiguate(stemBeforeCounter, String(n));
  }
  return candidate;
}

/**
 * Build the full vault layout for a space.
 *
 * Returns a Map keyed by pageId -> `{ segments, stem }`. The result is
 * deterministic for a given input and every full destination path
 * (`[...segments, stem].join("/")`) is unique, so no page can silently
 * overwrite another.
 *
 * Disambiguation is layered:
 *  1. Name pass: each page gets a name that is unique among its siblings
 *     (see `nameForNode`). Names are final before any `segments` are built,
 *     so a folder segment can never drift away from its page's file name.
 *  2. Path pass: a safety net that re-checks the full path and re-stems the
 *     LATER page if a collision is still found (e.g. on malformed parent
 *     cycles).
 *
 * Entries that are falsy or have no `id` are skipped; for duplicate ids the
 * first occurrence wins.
 *
 * @param pages Flat list of page nodes of one space.
 * @returns Map of pageId -> vault destination, in first-occurrence order.
 * @throws Error if an indexed page has no resolved name (internal invariant).
 */
export function buildVaultLayout(pages: PageNode[]): Map<string, VaultEntry> {
  // Index pages by id so the parent chain can be walked (first id wins).
  const byId = new Map<string, PageNode>();
  for (const p of pages) {
    if (p && p.id && !byId.has(p.id)) byId.set(p.id, p);
  }

  // Name pass. The bucket is the parent id ONLY when that parent is present
  // in `byId`; null parents and orphans share the root bucket.
  const usedBySibling = new Map<string, Set<string>>();
  const nameById = new Map<string, string>();
  for (const node of byId.values()) {
    const parentKey =
      node.parentPageId && byId.has(node.parentPageId)
        ? node.parentPageId
        : ROOT_BUCKET;
    nameById.set(node.id, nameForNode(node, parentKey, usedBySibling));
  }

  // Throw instead of silently recomputing a DIFFERENT, non-disambiguated
  // name, which would desync a folder segment from its target file.
  const nameOf = (id: string): string => {
    const name = nameById.get(id);
    if (name === undefined) {
      throw new Error(`buildVaultLayout: no resolved name for page id ${id}`);
    }
    return name;
  };

  // Ancestor names, root first. `visited` stops the walk on a parent cycle.
  const folderSegmentsFor = (node: PageNode): string[] => {
    const ancestors: string[] = [];
    const visited = new Set<string>();
    let current = node.parentPageId ? byId.get(node.parentPageId) : undefined;
    while (current && current.id && !visited.has(current.id)) {
      visited.add(current.id);
      ancestors.unshift(nameOf(current.id));
      current = current.parentPageId
        ? byId.get(current.parentPageId)
        : undefined;
    }
    return ancestors;
  };

  // Path pass: build each entry and make sure its full path is unused.
  const layout = new Map<string, VaultEntry>();
  const usedPaths = new Set<string>();
  for (const node of byId.values()) {
    const segments = folderSegmentsFor(node);
    const pathOf = (stem: string): string => [...segments, stem].join("/");
    const stem = pickUnusedName(nameOf(node.id), node, (candidate) =>
      usedPaths.has(pathOf(candidate)),
    );
    usedPaths.add(pathOf(stem));
    layout.set(node.id, { segments, stem });
  }
  return layout;
}

/**
 * Compute a deterministic name for a node that is unique among its SIBLINGS.
 *
 * `usedBySibling` maps a parent key -> set of names already taken under that
 * parent; the chosen name is recorded there before returning. `parentKey` is
 * supplied by the caller. On a collision the name is escalated through
 * `pickUnusedName` (` ~slugId`, then ` ~id`, then a counter), so the returned
 * name is never one that an earlier sibling already holds. The name is
 * COSMETIC; identity lives in the meta block.
 */
function nameForNode(
  node: PageNode,
  parentKey: string,
  usedBySibling: Map<string, Set<string>>,
): string {
  let used = usedBySibling.get(parentKey);
  if (!used) {
    used = new Set<string>();
    usedBySibling.set(parentKey, used);
  }
  const taken = used;
  const name = pickUnusedName(sanitizeTitle(node.title ?? ""), node, (c) =>
    taken.has(c),
  );
  taken.add(name);
  return name;
}
// Reconstructed from the patch entries for
// packages/git-sync/build/engine/loop-guard.d.ts and loop-guard.js.
/**
 * Loop-guard primitives (SPEC §10). The sync engine must never re-pull its OWN
 * write as if it were a remote edit: after a push, the next poll will see the
 * page it just wrote with a fresh `updatedAt`. To suppress that, two signals
 * are recorded per page at push time: the body HASH of what was pushed (this
 * module) and the `updatedAt` returned by the write.
 *
 * This module only PRODUCES the hash; comparing it on the pull side is a
 * future increment.
 */
import { createHash } from "node:crypto";

/**
 * Stable hash of a page's markdown BODY (SPEC §10, "body hash").
 *
 * The string is hashed exactly as passed in (encoded as UTF-8) with SHA-256
 * and returned as lowercase hex. No normalization is applied: a caller that
 * wants equal hashes across cosmetic-only differences must pass a canonical
 * representation.
 *
 * @param markdownBody The exact text whose identity should be fingerprinted.
 * @returns 64-character lowercase hexadecimal SHA-256 digest.
 */
export function bodyHash(markdownBody: string): string {
  const hasher = createHash("sha256");
  hasher.update(markdownBody, "utf8");
  return hasher.digest("hex");
}
// Reconstructed from the patch entries for
// packages/git-sync/build/engine/reconcile.d.ts and reconcile.js: the
// declarations and the implementation are merged into one typed module.
/**
 * Pure reconciliation planner (SPEC §5/§6/§8).
 *
 * Given the desired live set of files and the set of files currently tracked
 * in the vault, compute what to write, what moved, and what to delete.
 * Identity is `pageId`: a page that keeps its pageId but changes relPath is a
 * MOVE, not delete+add; a tracked pageId missing from the live set is a
 * DELETE. Nothing here performs IO.
 */

/** A page that SHOULD exist in the vault at a given path. */
export interface LiveEntry {
  pageId: string;
  /** Vault-relative path (forward-slash), e.g. `Space/Parent/Child.md`. */
  relPath: string;
}

/** A page currently tracked in the vault (pageId parsed from its meta). */
export interface ExistingEntry {
  pageId: string;
  /** Vault-relative path (forward-slash) of the tracked file. */
  relPath: string;
}

/** A page to (re)write at its destination path. */
export interface WriteEntry {
  pageId: string;
  relPath: string;
}

/** A page that moved: written at its NEW relPath, with the OLD path removed. */
export interface MovedEntry {
  pageId: string;
  fromRelPath: string;
  toRelPath: string;
  /**
   * Whether `fromRelPath` is SAFE to remove. False when some live page will
   * (re)write that exact path: removing it would destroy real data, so the
   * caller must skip the removal. The move itself is still recorded.
   */
  removeOldPath: boolean;
}

/** The full reconciliation plan. */
export interface ReconciliationPlan {
  /** One entry per `live` page: covers add, in-place update and move. */
  toWrite: WriteEntry[];
  /**
   * Paths whose tracked pageId is ABSENT from `live`. Old paths of moved pages
   * are never listed here; they are reported through `moved`.
   */
  toDelete: string[];
  /** Tracked pages whose relPath differs from their live relPath. */
  moved: MovedEntry[];
}

/** Why absence-based deletions were (or were not) applied this cycle. */
export type DeletionDecision =
  | { apply: true }
  | { apply: false; reason: "incomplete-fetch" | "empty-live" | "mass-delete" };

/**
 * Compute the reconciliation plan.
 *
 * - Every `live` page yields a `toWrite` entry.
 * - A tracked row whose pageId is live under a different path yields a `moved`
 *   entry; `removeOldPath` is false when the old path is a live target.
 * - A tracked row whose pageId is not live yields a `toDelete` path, unless
 *   that path is a live target (a write owns it). Paths are de-duplicated.
 * - `existing` may repeat a pageId; every row is evaluated independently.
 * - If `live` repeats a pageId, the LAST relPath is the one compared against.
 */
export function planReconciliation(
  live: LiveEntry[],
  existing: ExistingEntry[],
): ReconciliationPlan {
  const targetByPage = new Map<string, string>();
  const ownedPaths = new Set<string>();
  const toWrite: WriteEntry[] = [];
  for (const { pageId, relPath } of live) {
    targetByPage.set(pageId, relPath);
    ownedPaths.add(relPath);
    toWrite.push({ pageId, relPath });
  }

  const moved: MovedEntry[] = [];
  // A Set, so one path reported by several tracked rows is queued once.
  const absentPaths = new Set<string>();
  for (const tracked of existing) {
    const target = targetByPage.get(tracked.pageId);
    const pathIsOwned = ownedPaths.has(tracked.relPath);

    if (target === undefined) {
      // Page left the live tree. Keep the file if a live page writes there.
      if (!pathIsOwned) absentPaths.add(tracked.relPath);
    } else if (target !== tracked.relPath) {
      moved.push({
        pageId: tracked.pageId,
        fromRelPath: tracked.relPath,
        toRelPath: target,
        removeOldPath: !pathIsOwned,
      });
    }
    // target === tracked.relPath: in-place update, already in `toWrite`.
  }

  return { toWrite, toDelete: Array.from(absentPaths), moved };
}

/**
 * Below this many tracked files the mass-delete fraction guard is not applied
 * (in a tiny vault deleting "most" files is normal, e.g. 1-of-2).
 */
export const MASS_DELETE_MIN_EXISTING = 4;

/** Fraction of tracked files above which a delete plan is a suspected wipe. */
export const MASS_DELETE_FRACTION = 0.5;

/**
 * Decide whether the ABSENCE-based deletions (`plan.toDelete`) may be applied.
 *
 * Checks, in order:
 *  1. No tracked files or nothing to delete -> apply (nothing is at stake).
 *  2. `treeComplete` is false -> suppress ("incomplete-fetch").
 *  3. `liveCount` is 0 -> suppress ("empty-live").
 *  4. `existingCount >= MASS_DELETE_MIN_EXISTING` and `deleteCount` is
 *     strictly greater than `existingCount * MASS_DELETE_FRACTION`
 *     -> suppress ("mass-delete").
 *  5. Otherwise -> apply.
 *
 * Moved pages are not covered by this decision.
 */
export function decideAbsenceDeletions(args: {
  treeComplete: boolean;
  liveCount: number;
  existingCount: number;
  deleteCount: number;
}): DeletionDecision {
  if (args.existingCount === 0 || args.deleteCount === 0) {
    return { apply: true };
  }

  let reason: "incomplete-fetch" | "empty-live" | "mass-delete" | undefined;
  if (!args.treeComplete) {
    reason = "incomplete-fetch";
  } else if (args.liveCount === 0) {
    reason = "empty-live";
  } else if (
    args.existingCount >= MASS_DELETE_MIN_EXISTING &&
    args.deleteCount > args.existingCount * MASS_DELETE_FRACTION
  ) {
    reason = "mass-delete";
  }

  return reason === undefined ? { apply: true } : { apply: false, reason };
}
// Reconstructed from the patch entries for
// packages/git-sync/build/engine/roundtrip-helpers.d.ts and
// roundtrip-helpers.js.
/**
 * Pure helpers extracted from the docmost-sync Phase-0 idempotency harness
 * (`src/roundtrip.ts`). Only the IO-free comparison utilities live here.
 */

/**
 * Recursively strip every `attrs.id` from a ProseMirror node tree, so two
 * documents can be compared without their regenerated block ids (SPEC §11).
 *
 * Arrays are mapped element by element; objects are copied key by key (own
 * enumerable string keys); an object-valued `attrs` is copied without its `id`
 * property; every other value is returned unchanged.
 *
 * The signature keeps `any`, as published in the package's declarations.
 *
 * @param node Any JSON-like value (node, node array, attribute value).
 * @returns A NEW tree; the input is not mutated.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any
export function stripBlockIds(node: any): any {
  if (Array.isArray(node)) {
    return node.map(stripBlockIds);
  }
  if (node && typeof node === "object") {
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const out: Record<string, any> = {};
    for (const key of Object.keys(node)) {
      if (key === "attrs" && node.attrs && typeof node.attrs === "object") {
        // Drop the `id` attr; keep (and recurse into) every other attribute.
        const { id, ...rest } = node.attrs;
        void id;
        out.attrs = stripBlockIds(rest);
      } else {
        out[key] = stripBlockIds(node[key]);
      }
    }
    return out;
  }
  return node;
}

/**
 * Find the first divergence between two values via a recursive deep compare.
 *
 * Two values are equal when they are `===`, when both are `NaN` (so a
 * not-a-number attribute present on both sides is not reported as a
 * difference), or when they are arrays/objects whose elements/keys are
 * pairwise equal. Object keys are taken from the union of both sides, so a
 * key that is missing on one side and `undefined` on the other is NOT a
 * divergence. Arrays of different length diverge at `<path>.length`.
 *
 * The signature keeps `any`, as published in the package's declarations.
 *
 * @param a Left value.
 * @param b Right value.
 * @param path Path prefix used in the report (defaults to `"$"`).
 * @returns `{ path, a, b }` for the first difference found, or `null`.
 */
export function firstDivergence(
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  a: any,
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  b: any,
  path = "$",
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
): { path: string; a: any; b: any } | null {
  if (a === b) return null;
  // NaN !== NaN, but two NaNs are the same value for comparison purposes.
  if (Number.isNaN(a) && Number.isNaN(b)) return null;

  const ta = typeof a;
  const tb = typeof b;
  if (ta !== tb || a === null || b === null) {
    return { path, a, b };
  }
  if (ta !== "object") {
    return { path, a, b };
  }

  const aIsArr = Array.isArray(a);
  const bIsArr = Array.isArray(b);
  if (aIsArr !== bIsArr) return { path, a, b };

  if (aIsArr) {
    if (a.length !== b.length) {
      return { path: `${path}.length`, a: a.length, b: b.length };
    }
    for (let i = 0; i < a.length; i++) {
      const d = firstDivergence(a[i], b[i], `${path}[${i}]`);
      if (d) return d;
    }
    return null;
  }

  const keys = new Set([...Object.keys(a), ...Object.keys(b)]);
  for (const k of keys) {
    const d = firstDivergence(a[k], b[k], `${path}.${k}`);
    if (d) return d;
  }
  return null;
}
// Reconstructed from the patch entries for
// packages/git-sync/build/engine/sanitize.d.ts and sanitize.js: the
// declarations and the implementation are merged into one typed module.
/**
 * Deterministic filename strategy (SPEC §12).
 *
 * The file name is COSMETIC: the source of truth for the file<->page link is
 * `pageId` / `slugId` inside the meta block, so renaming a file is safe. These
 * functions are dependency-free and pure.
 */

// Printable characters forbidden in file names on common filesystems (mainly
// Windows): / \ < > : " | ? *. Each match is replaced with a single "-".
// Spaces are NOT in this set; whitespace is normalized separately.
const FORBIDDEN_PRINTABLE_RE = /[/\\<>:"|?*]/g;

// Runs of whitespace collapse to a single space. Tabs and newlines never reach
// this step: they are ASCII control characters and have already been turned
// into "-" by `stripControlChars`.
const WHITESPACE_RUN_RE = /\s+/g;

// Reserved Windows device names (compared in lower case). A bare match (with
// or without an extension) is unusable as a file name, so it gets a "_" prefix.
const RESERVED_WINDOWS_NAMES = new Set<string>([
  "con",
  "prn",
  "aux",
  "nul",
  "com1",
  "com2",
  "com3",
  "com4",
  "com5",
  "com6",
  "com7",
  "com8",
  "com9",
  "lpt1",
  "lpt2",
  "lpt3",
  "lpt4",
  "lpt5",
  "lpt6",
  "lpt7",
  "lpt8",
  "lpt9",
]);

// Cap on the sanitized length, in UTF-16 code units, leaving room for an
// extension and a disambiguation suffix.
const MAX_LENGTH = 120;

/**
 * Replace every ASCII control character (code points 0..31) with "-". Done by
 * scanning code units rather than with a control-range regex literal, so the
 * source file carries no embedded control bytes.
 */
function stripControlChars(input: string): string {
  let out = "";
  for (let i = 0; i < input.length; i++) {
    out += input.charCodeAt(i) < 32 ? "-" : input[i];
  }
  return out;
}

/**
 * Cut `text` to at most `maxUnits` UTF-16 code units without splitting a
 * surrogate pair: if the cut would keep a high surrogate and drop its low
 * surrogate, the high surrogate is dropped too. Otherwise a plain `slice`.
 */
function truncateWithoutSplittingPair(text: string, maxUnits: number): string {
  let end = Math.min(maxUnits, text.length);
  if (end > 0 && end < text.length) {
    const lastKept = text.charCodeAt(end - 1);
    const firstDropped = text.charCodeAt(end);
    const splitsPair =
      lastKept >= 0xd800 &&
      lastKept <= 0xdbff &&
      firstDropped >= 0xdc00 &&
      firstDropped <= 0xdfff;
    if (splitsPair) end -= 1;
  }
  return text.slice(0, end);
}

/**
 * Sanitize a page title into a safe file-name component (WITHOUT extension).
 *
 * Steps, in order:
 *  1. Replace ASCII control characters (including tab and newline) with "-".
 *  2. Replace the forbidden printable characters with "-".
 *  3. Collapse remaining whitespace runs to one space and trim.
 *  4. Cap the length at `MAX_LENGTH` code units (never splitting a surrogate
 *     pair, which would leave an unpaired surrogate in the name) and trim.
 *  5. Prefix "_" when the result is empty, consists only of dots, or its part
 *     before the first dot is a reserved Windows device name. The all-dots
 *     case matters for safety: an unprefixed ".." would be a parent-directory
 *     segment.
 *
 * @param title Raw page title; `null`/`undefined` is treated as "".
 * @returns A non-empty name containing none of `/ \ < > : " | ? *`.
 */
export function sanitizeTitle(title: string): string {
  let name = stripControlChars(title ?? "")
    .replace(FORBIDDEN_PRINTABLE_RE, "-")
    .replace(WHITESPACE_RUN_RE, " ")
    .trim();

  if (name.length > MAX_LENGTH) {
    name = truncateWithoutSplittingPair(name, MAX_LENGTH).trim();
  }

  // Compare the part before the first dot, so "CON" and "con.md" both match.
  const base = name.split(".")[0]?.toLowerCase() ?? "";
  if (
    name.length === 0 ||
    /^\.+$/.test(name) ||
    RESERVED_WINDOWS_NAMES.has(base)
  ) {
    name = "_" + name;
  }
  return name;
}

/**
 * Disambiguate a sanitized name by appending a suffix: returns
 * `"<name> ~<slugId>"` (SPEC §12: `Title ~slugId`). The suffix is used exactly
 * as given; callers that pass untrusted text must sanitize it first.
 */
export function disambiguate(name: string, slugId: string): string {
  return `${name} ~${slugId}`;
}
/**
 * Build the `.md` file text for a page in its normalized form.
 *
 * Pipeline (one export -> import -> export round-trip, then serialization):
 *
 *   firstExport = convertProseMirrorToMarkdown(content)
 *   reimported  = await markdownToProseMirror(firstExport)
 *   body        = convertProseMirrorToMarkdown(reimported)
 *   file        = serializeDocmostMarkdownBody(meta, body)
 *
 * Persisting the re-exported body rather than the first export is what keeps
 * a later re-export of the same page from producing different bytes.
 *
 * @param content raw ProseMirror document JSON
 * @param meta identity meta, passed to the serializer unchanged
 * @returns a promise resolving to whatever `serializeDocmostMarkdownBody` returns
 */
export async function stabilizePageFile(content, meta) {
    const firstExport = convertProseMirrorToMarkdown(content);
    const reimported = await markdownToProseMirror(firstExport);
    const body = convertProseMirrorToMarkdown(reimported);
    return serializeDocmostMarkdownBody(meta, body);
}
Server integration (GitmostDataSource, orchestrator, + * VaultGit, pull/push) is added in later steps. + */ +export { serializeDocmostMarkdown, serializeDocmostMarkdownBody, parseDocmostMarkdown, convertProseMirrorToMarkdown, markdownToProseMirror, canonicalizeContent, docsCanonicallyEqual, } from "./lib/index.js"; +export type { DocmostMdMeta } from "./lib/index.js"; +export { planReconciliation, decideAbsenceDeletions, MASS_DELETE_MIN_EXISTING, MASS_DELETE_FRACTION, } from "./engine/reconcile.js"; +export type { LiveEntry, ExistingEntry, WriteEntry, MovedEntry, ReconciliationPlan, DeletionDecision, } from "./engine/reconcile.js"; +export { buildVaultLayout } from "./engine/layout.js"; +export type { PageNode, VaultEntry } from "./engine/layout.js"; +export { sanitizeTitle, disambiguate } from "./engine/sanitize.js"; +export { stabilizePageFile } from "./engine/stabilize.js"; +export type { PageMeta } from "./engine/stabilize.js"; +export { bodyHash } from "./engine/loop-guard.js"; diff --git a/packages/git-sync/build/index.js b/packages/git-sync/build/index.js new file mode 100644 index 00000000..45bc04fe --- /dev/null +++ b/packages/git-sync/build/index.js @@ -0,0 +1,16 @@ +/** + * Public surface of `@docmost/git-sync`. + * + * Phase A (plan §12.A) vendors only the PURE converter + pure engine modules + * from docmost-sync. Server integration (GitmostDataSource, orchestrator, + * VaultGit, pull/push) is added in later steps. + */ +// Pure converter (markdown <-> ProseMirror, file envelope, canonicalization). +export { serializeDocmostMarkdown, serializeDocmostMarkdownBody, parseDocmostMarkdown, convertProseMirrorToMarkdown, markdownToProseMirror, canonicalizeContent, docsCanonicallyEqual, } from "./lib/index.js"; +// Pure engine (no IO): reconcile planner, vault layout, sanitize, stabilize, +// loop-guard body hash. 
+export { planReconciliation, decideAbsenceDeletions, MASS_DELETE_MIN_EXISTING, MASS_DELETE_FRACTION, } from "./engine/reconcile.js"; +export { buildVaultLayout } from "./engine/layout.js"; +export { sanitizeTitle, disambiguate } from "./engine/sanitize.js"; +export { stabilizePageFile } from "./engine/stabilize.js"; +export { bodyHash } from "./engine/loop-guard.js"; diff --git a/packages/git-sync/build/lib/canonicalize.d.ts b/packages/git-sync/build/lib/canonicalize.d.ts new file mode 100644 index 00000000..b899a7a3 --- /dev/null +++ b/packages/git-sync/build/lib/canonicalize.d.ts @@ -0,0 +1,41 @@ +/** + * docmost-sync ADDITION (not present in docmost-mcp). + * + * Semantic canonicalization of ProseMirror/TipTap documents for the Phase-0 + * round-trip idempotency check (SPEC §11, "Задача №0", option (б): compare a + * CANONICALIZED form rather than raw bytes). + * + * `markdownToProseMirror` reconstructs schema DEFAULT attributes (e.g. + * `indent: null` where the source omitted it) and regenerates per-block ids on + * every import. A raw deep-equal of the source doc against the re-imported doc + * therefore diverges even when the two are semantically identical. This module + * normalizes a document so that two semantically-equal docs compare deep-equal + * regardless of block ids and absent-vs-explicit-default-null attributes. + * + * This file is intentionally a NEW, self-contained module so it is trivial to + * backport into docmost-mcp without touching existing code. + */ +/** + * Return a DEEP COPY of a ProseMirror node tree, canonicalized so that two + * semantically-equal documents compare deep-equal. Rules (applied recursively + * to the node, its `content`, and its `marks`): + * + * 1. Remove node-level `attrs.id` (regenerated on import). Mark attrs are NOT + * touched for `id` (marks carry no block id; only their meaningful attrs). + * 2. 
In any `attrs` object (node OR mark) drop keys whose value is `null`/ + * `undefined` (absent ≡ explicit default null) OR equals that node/mark + * type's known non-null schema default (absent ≡ explicit default). + * Keep every non-default value. The type is passed into the attrs + * normalizer so it can look up `KNOWN_DEFAULTS`. + * 3. If an `attrs` object becomes empty after pruning, drop the `attrs` key. + * 4. Preserve `marks` (including the `comment` mark and its `commentId` — a + * meaningful anchor per SPEC §3; never strip it). + * 5. Preserve `text`, `type`, and `content` order exactly. + * 6. Never mutate the input. + */ +export declare function canonicalizeContent(node: any): any; +/** + * True when two ProseMirror documents are semantically equal: equal after + * canonicalization (block ids stripped, absent-vs-default-null normalized). + */ +export declare function docsCanonicallyEqual(a: any, b: any): boolean; diff --git a/packages/git-sync/build/lib/canonicalize.js b/packages/git-sync/build/lib/canonicalize.js new file mode 100644 index 00000000..5a6c0bbc --- /dev/null +++ b/packages/git-sync/build/lib/canonicalize.js @@ -0,0 +1,248 @@ +/** + * docmost-sync ADDITION (not present in docmost-mcp). + * + * Semantic canonicalization of ProseMirror/TipTap documents for the Phase-0 + * round-trip idempotency check (SPEC §11, "Задача №0", option (б): compare a + * CANONICALIZED form rather than raw bytes). + * + * `markdownToProseMirror` reconstructs schema DEFAULT attributes (e.g. + * `indent: null` where the source omitted it) and regenerates per-block ids on + * every import. A raw deep-equal of the source doc against the re-imported doc + * therefore diverges even when the two are semantically identical. This module + * normalizes a document so that two semantically-equal docs compare deep-equal + * regardless of block ids and absent-vs-explicit-default-null attributes. 
+ * + * This file is intentionally a NEW, self-contained module so it is trivial to + * backport into docmost-mcp without touching existing code. + */ +/** + * Known NON-NULL schema defaults that `markdownToProseMirror` materializes on + * import, keyed by node/mark type → { attr: defaultValue }. + * + * Why this exists: `canonicalizeAttrs` already treats an absent attr as + * equivalent to an explicit `null`/`undefined`. But several Docmost schema + * attributes default to a NON-null value, so import fills them in even when the + * source omitted them — making "attr absent" diverge from "attr at its default + * value" under a raw deep-equal. To keep "absent ≡ explicit-default", we ALSO + * drop any attr whose value equals its known schema default. A non-default + * value (e.g. `orderedList.start: 5`) is NOT a default, so it is KEPT. + * + * Every entry below was read from `packages/docmost-client/src/lib/ + * docmost-schema.ts` (the line refs are the exact `default:` declarations) and + * confirmed to be materialized by an export→import→export round-trip: + * - mark `link` target / rel — DocmostAttributes + StarterKit link. + * StarterKit's link extension defaults `target: "_blank"` and + * `rel: "noopener noreferrer nofollow"`; both materialize on import + * (empirically confirmed) even when the source had only `href`. + * - mark `comment` resolved — docmost-schema.ts L213-214 (`default: false`). + * - node `orderedList` start — provided by StarterKit's orderedList + * (`default: 1`); materializes on import (empirically confirmed). + * - node `drawio`/`excalidraw`/`video`/`youtube`/`embed` align — the diagram + * attribute set and the media nodes declare `align: { default: "center" }` + * (docmost-schema.ts L745-750 diagramAttributes; L564 video; L626 youtube; + * L667 embed). The diagram `align` is the one the round-trip materializes + * (docmost-schema.ts L745); the media/embed entries normalize the SAME + * `align` default for consistency. 
/**
 * Known NON-null schema defaults, keyed by node/mark type -> { attr: default }.
 *
 * `canonicalizeAttrs` drops an attribute whose value strictly equals the entry
 * listed here for its type, so "attribute absent" and "attribute present at its
 * default" canonicalize to the same form. Attributes whose default is `null`
 * need no entry: the null-drop rule already covers them.
 */
const KNOWN_DEFAULTS = {
    // mark types
    link: {
        target: "_blank",
        rel: "noopener noreferrer nofollow",
    },
    comment: {
        resolved: false,
    },
    // node types
    orderedList: {
        start: 1,
    },
    drawio: {
        align: "center",
    },
    excalidraw: {
        align: "center",
    },
    video: {
        align: "center",
    },
    youtube: {
        align: "center",
    },
    embed: {
        align: "center",
    },
};
/**
 * Build a pruned copy of an `attrs` object (the input is not mutated).
 *
 * Dropped keys:
 *   - `id`, when `dropId` is true (used for node attrs, not mark attrs);
 *   - any key whose value is `null` or `undefined`;
 *   - any key whose value strictly equals the `KNOWN_DEFAULTS` entry for
 *     `type`.
 * Every other key is kept. Keys are emitted in sorted order so the canonical
 * form does not depend on the input's key order.
 *
 * Both `KNOWN_DEFAULTS` lookups use own-property checks. `type` and the attr
 * keys come from document data; a plain `KNOWN_DEFAULTS[type]` / `key in
 * defaults` would also match inherited members (for type "constructor" the
 * "defaults" would be the `Object` function, and `length: 1` would be dropped
 * as if it were a schema default).
 *
 * @param attrs the attrs object to prune
 * @param dropId whether to drop the `id` key
 * @param type node/mark type name used for the defaults lookup, or `undefined`
 * @returns the pruned attrs, or `undefined` when no key survives
 */
function canonicalizeAttrs(attrs, dropId, type) {
    const hasOwn = Object.prototype.hasOwnProperty;
    const defaults = type && hasOwn.call(KNOWN_DEFAULTS, type)
        ? KNOWN_DEFAULTS[type]
        : undefined;
    const out = {};
    for (const key of Object.keys(attrs).sort()) {
        if (dropId && key === "id")
            continue;
        const value = attrs[key];
        // Absent is treated the same as an explicit null/undefined.
        if (value === null || value === undefined)
            continue;
        // Absent is treated the same as an explicit known default. A non-default
        // value (e.g. orderedList.start = 5) does not match and is kept.
        if (defaults && hasOwn.call(defaults, key) && value === defaults[key])
            continue;
        out[key] = value;
    }
    return Object.keys(out).length > 0 ? out : undefined;
}
/**
 * Return a canonicalized deep copy of a ProseMirror JSON value.
 *
 * - Arrays are mapped element by element.
 * - Primitives and `null` are returned as-is.
 * - For an object, every key is copied recursively, except:
 *     - `attrs` holding an object: pruned through `canonicalizeAttrs` with
 *       `dropId = true` and the node's string `type`; the key is omitted when
 *       nothing survives;
 *     - `marks` holding an array: each mark goes through `canonicalizeMark`;
 *       the key is omitted when the array is empty.
 *   Key order of the copy follows the input's key order.
 *
 * The input is never mutated.
 *
 * @param node a document, node, array of nodes, or primitive leaf
 * @returns the canonical copy
 */
export function canonicalizeContent(node) {
    if (Array.isArray(node)) {
        return node.map((child) => canonicalizeContent(child));
    }
    // Primitive leaf (string/number/boolean/null/undefined).
    if (node === null || typeof node !== "object") {
        return node;
    }
    const typeName = typeof node.type === "string" ? node.type : undefined;
    const result = {};
    for (const [key, value] of Object.entries(node)) {
        if (key === "attrs" && value && typeof value === "object") {
            // Node-level attrs: the block id is dropped here (dropId = true).
            const pruned = canonicalizeAttrs(value, true, typeName);
            if (pruned !== undefined) {
                result.attrs = pruned;
            }
            continue;
        }
        if (key === "marks" && Array.isArray(value)) {
            // Marks keep their `id` attr; an empty list is the same as no marks.
            const marks = value.map((mark) => canonicalizeMark(mark));
            if (marks.length > 0) {
                result.marks = marks;
            }
            continue;
        }
        result[key] = canonicalizeContent(value);
    }
    return result;
}
/**
 * Canonicalize a single mark.
 *
 * A non-object mark is returned unchanged. Otherwise every key is copied via
 * `canonicalizeContent`, except an object-valued `attrs`, which is pruned by
 * `canonicalizeAttrs` with `dropId = false` (a mark's `id` attr is kept) and
 * omitted entirely when nothing survives.
 *
 * @param mark the mark JSON
 * @returns the canonical copy of the mark
 */
function canonicalizeMark(mark) {
    if (mark === null || typeof mark !== "object") {
        return mark;
    }
    const typeName = typeof mark.type === "string" ? mark.type : undefined;
    const result = {};
    for (const key of Object.keys(mark)) {
        const isAttrsObject = key === "attrs" && mark.attrs && typeof mark.attrs === "object";
        if (!isAttrsObject) {
            result[key] = canonicalizeContent(mark[key]);
            continue;
        }
        const pruned = canonicalizeAttrs(mark.attrs, false, typeName);
        if (pruned !== undefined) {
            result.attrs = pruned;
        }
    }
    return result;
}
/**
 * Key-order-insensitive deep structural equality.
 *
 * Identical values are equal. Otherwise both sides must be non-null objects of
 * the same kind: arrays compare by length and position; plain objects compare
 * by own enumerable key count, key presence and value.
 *
 * @param a first value
 * @param b second value
 * @returns true when the two values are structurally equal
 */
function deepEqual(a, b) {
    if (a === b) {
        return true;
    }
    // Two distinct values can only be equal if both are non-null objects.
    if (a === null || b === null || typeof a !== "object" || typeof b !== "object") {
        return false;
    }
    const leftIsArray = Array.isArray(a);
    if (leftIsArray !== Array.isArray(b)) {
        return false;
    }
    if (leftIsArray) {
        if (a.length !== b.length) {
            return false;
        }
        for (const [index, item] of a.entries()) {
            if (!deepEqual(item, b[index])) {
                return false;
            }
        }
        return true;
    }
    const leftKeys = Object.keys(a);
    if (leftKeys.length !== Object.keys(b).length) {
        return false;
    }
    return leftKeys.every((key) => Object.prototype.hasOwnProperty.call(b, key) && deepEqual(a[key], b[key]));
}
/**
 * Report whether two ProseMirror documents are equal after canonicalization.
 *
 * Both inputs are passed through `canonicalizeContent` and the canonical
 * copies are compared with `deepEqual`; neither input is compared raw.
 *
 * @param a first document JSON
 * @param b second document JSON
 * @returns true when the canonical forms are structurally equal
 */
export function docsCanonicallyEqual(a, b) {
    const left = canonicalizeContent(a);
    const right = canonicalizeContent(b);
    return deepEqual(left, right);
}
*/ +export interface DiffIntegrity { + images: [number, number]; + links: [number, number]; + tables: [number, number]; + callouts: [number, number]; + footnoteMarkers: [number[], number[]]; +} +export interface DiffResult { + summary: { + inserted: number; + deleted: number; + blocksChanged: number; + }; + integrity: DiffIntegrity; + changes: DiffChange[]; + /** Human-readable unified-ish summary. */ + markdown: string; +} +/** + * Diff two ProseMirror JSON documents the way Docmost's history editor does and + * serialize the result to text + integrity counts. + * + * @param oldDocJson the earlier document + * @param newDocJson the later document + * @param notesHeading heading delimiting body from notes for footnote counting + */ +export declare function diffDocs(oldDocJson: any, newDocJson: any, notesHeading?: string): DiffResult; diff --git a/packages/git-sync/build/lib/diff.js b/packages/git-sync/build/lib/diff.js new file mode 100644 index 00000000..5205aff1 --- /dev/null +++ b/packages/git-sync/build/lib/diff.js @@ -0,0 +1,273 @@ +/** + * Headless, Docmost-equivalent document diff. + * + * Docmost's history editor computes a change set with the exact pipeline below + * (recreateTransform -> ChangeSet.addSteps -> simplifyChanges) and renders it as + * editor decorations. This module runs the SAME computation but serializes the + * result to text + integrity counts instead of decorations, so a diff can be + * previewed without a browser. + * + * recreateTransform here comes from @fellow/prosemirror-recreate-transform, the + * maintained published fork of the MIT prosemirror-recreate-steps source that + * Docmost vendors in @docmost/editor-ext; it exposes the identical + * recreateTransform(fromDoc, toDoc, { complexSteps, wordDiffs, simplifyDiff }) + * signature. + * + * If recreateTransform / the changeset throws on a pathological document pair, + * we fall back to a coarse block-level text diff so the tool never hard-fails. 
+ */ +import { getSchema } from "@tiptap/core"; +import { Node } from "@tiptap/pm/model"; +import { ChangeSet, simplifyChanges } from "@tiptap/pm/changeset"; +import { recreateTransform } from "@fellow/prosemirror-recreate-transform"; +import { docmostExtensions } from "./docmost-schema.js"; +/** Build the schema once; it is pure and reused across calls. */ +const schema = getSchema(docmostExtensions); +/** Recursively concatenate the plain text of a JSON node. */ +function plainText(node) { + if (!node || typeof node !== "object") + return ""; + let out = ""; + if (typeof node.text === "string") + out += node.text; + if (Array.isArray(node.content)) { + for (const child of node.content) + out += plainText(child); + } + return out; +} +/** Count nodes in a JSON doc that satisfy `pred` (recursive). */ +function countNodes(doc, pred) { + let n = 0; + const visit = (node) => { + if (!node || typeof node !== "object") + return; + if (pred(node)) + n++; + if (Array.isArray(node.content)) + for (const c of node.content) + visit(c); + }; + visit(doc); + return n; +} +/** + * Count UNIQUE links in a JSON doc by their `href`. A single link can be split + * across several adjacent text runs (e.g. a "link+bold" run followed by a "link" + * run); counting link-bearing runs would over-count it. Walking the tree and + * collecting hrefs into a Set keys each distinct link once. Link marks with a + * missing/empty href are bucketed under a single "" key so a malformed link is + * still counted as one. + */ +function countUniqueLinks(doc) { + const hrefs = new Set(); + const visit = (node) => { + if (!node || typeof node !== "object") + return; + if (node.type === "text" && Array.isArray(node.marks)) { + for (const m of node.marks) { + if (m && m.type === "link") { + const href = m.attrs && typeof m.attrs.href === "string" ? 
m.attrs.href : ""; + hrefs.add(href); + } + } + } + if (Array.isArray(node.content)) + for (const c of node.content) + visit(c); + }; + visit(doc); + return hrefs.size; +} +/** + * Parse the ordered list of integers from `[N]` footnote markers found in the + * BODY only (every top-level block before the first "Примечания..." notes + * heading; if no such heading, the whole doc). Returned in reading order. + */ +function footnoteMarkers(doc, notesHeading) { + const top = Array.isArray(doc?.content) ? doc.content : []; + const notesIdx = top.findIndex((n) => n && + n.type === "heading" && + plainText(n).trim() === notesHeading); + const bodyBlocks = notesIdx >= 0 ? top.slice(0, notesIdx) : top; + const markers = []; + const re = /\[(\d+)\]/g; + for (const block of bodyBlocks) { + const text = plainText(block); + let m; + re.lastIndex = 0; + while ((m = re.exec(text)) !== null) { + markers.push(Number(m[1])); + } + } + return markers; +} +/** Compute the [old,new] integrity tuples for two JSON docs. */ +function computeIntegrity(oldDoc, newDoc, notesHeading) { + const images = [ + countNodes(oldDoc, (n) => n.type === "image"), + countNodes(newDoc, (n) => n.type === "image"), + ]; + const links = [ + countUniqueLinks(oldDoc), + countUniqueLinks(newDoc), + ]; + const tables = [ + countNodes(oldDoc, (n) => n.type === "table"), + countNodes(newDoc, (n) => n.type === "table"), + ]; + const callouts = [ + countNodes(oldDoc, (n) => n.type === "callout"), + countNodes(newDoc, (n) => n.type === "callout"), + ]; + const fns = [ + footnoteMarkers(oldDoc, notesHeading), + footnoteMarkers(newDoc, notesHeading), + ]; + return { images, links, tables, callouts, footnoteMarkers: fns }; +} +/** + * Resolve the lead text of the top-level block in a ProseMirror Node that + * contains the given document position. Returns "" when out of range. 
+ */ +function blockContextAt(node, pos) { + try { + const clamped = Math.max(0, Math.min(pos, node.content.size)); + const $pos = node.resolve(clamped); + // depth 1 is the top-level block in a doc node. + const block = $pos.depth >= 1 ? $pos.node(1) : $pos.node(0); + const text = block.textContent || ""; + return text.length > 80 ? text.slice(0, 77) + "..." : text; + } + catch { + return ""; + } +} +/** Truncate a string for the markdown summary. */ +function truncate(s, n = 120) { + return s.length > n ? s.slice(0, n - 3) + "..." : s; +} +/** + * Coarse fallback: a block-by-block plain-text diff. Used only when the precise + * changeset pipeline throws, so the tool degrades gracefully instead of failing. + */ +function coarseDiff(oldDoc, newDoc) { + const oldBlocks = Array.isArray(oldDoc?.content) ? oldDoc.content : []; + const newBlocks = Array.isArray(newDoc?.content) ? newDoc.content : []; + const oldTexts = oldBlocks.map(plainText); + const newTexts = newBlocks.map(plainText); + const oldSet = new Set(oldTexts); + const newSet = new Set(newTexts); + const changes = []; + for (const t of oldTexts) { + if (!newSet.has(t) && t.trim() !== "") { + changes.push({ op: "delete", block: truncate(t, 80), text: t }); + } + } + for (const t of newTexts) { + if (!oldSet.has(t) && t.trim() !== "") { + changes.push({ op: "insert", block: truncate(t, 80), text: t }); + } + } + return changes; +} +/** Build the human-readable unified-ish markdown summary. 
/**
 * Build the human-readable markdown summary of a diff result.
 *
 * Layout: a "# Diff" headline with the summary counts, an optional fallback
 * note, an "## Integrity (old -> new)" list, then a "## Changes" list with one
 * "+"/"-" line per change (or a placeholder when there are none). Change text
 * is shortened by `truncate` (default limit) and block context to 60
 * characters.
 *
 * @param result object with `summary`, `integrity` and `changes`
 * @param fellBack whether the coarse fallback diff was used
 * @returns the summary as a single newline-joined string
 */
function renderMarkdown(result, fellBack) {
    const { summary, integrity, changes } = result;
    const countLine = (label) => `- ${label}: ${integrity[label][0]} -> ${integrity[label][1]}`;
    const [oldMarkers, newMarkers] = integrity.footnoteMarkers;
    const fallbackNote = fellBack
        ? ["", "> note: precise diff failed; coarse block-level diff shown."]
        : [];
    const changeLines = changes.length === 0
        ? ["(no textual changes)"]
        : changes.map((change) => {
            const sign = change.op === "insert" ? "+" : "-";
            const context = change.block ? ` @ ${truncate(change.block, 60)}` : "";
            return `${sign} ${truncate(change.text)}${context}`;
        });
    return [
        `# Diff: ${summary.inserted} inserted / ${summary.deleted} deleted (${summary.blocksChanged} blocks changed)`,
        ...fallbackNote,
        "",
        "## Integrity (old -> new)",
        countLine("images"),
        countLine("links"),
        countLine("tables"),
        countLine("callouts"),
        `- footnoteMarkers: [${oldMarkers.join(", ")}] -> [${newMarkers.join(", ")}]`,
        "",
        "## Changes",
        ...changeLines,
    ].join("\n");
}
/**
 * Diff two ProseMirror JSON documents and serialize the result to text plus
 * integrity counts.
 *
 * Precise path: both docs are parsed with the module `schema`, a transform is
 * rebuilt with `recreateTransform`, fed into a `ChangeSet`, simplified, and
 * each change is read back as deleted text (old doc range) and/or inserted
 * text (new doc range).
 *
 * Fallback path: if anything in the precise path throws, the result is
 * recomputed from scratch with `coarseDiff`. Counters and the changed-block
 * set are RESET first: the precise loop may already have accumulated partial
 * results before the throw, and mixing those into the coarse totals would make
 * `summary` disagree with `changes`.
 *
 * @param oldDocJson the earlier document
 * @param newDocJson the later document
 * @param notesHeading heading delimiting body from notes for footnote counting
 * @returns `{ summary, integrity, changes, markdown }`
 */
export function diffDocs(oldDocJson, newDocJson, notesHeading = "Примечания переводчика") {
    const integrity = computeIntegrity(oldDocJson, newDocJson, notesHeading);
    let changes = [];
    let inserted = 0;
    let deleted = 0;
    let fellBack = false;
    const changedBlocks = new Set();
    try {
        const oldNode = Node.fromJSON(schema, oldDocJson);
        const newNode = Node.fromJSON(schema, newDocJson);
        const tr = recreateTransform(oldNode, newNode, {
            complexSteps: false,
            wordDiffs: true,
            simplifyDiff: true,
        });
        const changeSet = ChangeSet.create(oldNode).addSteps(tr.doc, tr.mapping.maps, []);
        const simplified = simplifyChanges(changeSet.changes, newNode);
        for (const change of simplified) {
            // Deleted text lives in the OLD doc coordinate range [fromA, toA).
            if (change.toA > change.fromA) {
                const text = oldNode.textBetween(change.fromA, change.toA, "\n", " ");
                if (text.length > 0) {
                    deleted += text.length;
                    const block = blockContextAt(oldNode, change.fromA);
                    changes.push({ op: "delete", block, text });
                    if (block)
                        changedBlocks.add("d:" + block);
                }
            }
            // Inserted text lives in the NEW doc coordinate range [fromB, toB).
            if (change.toB > change.fromB) {
                const text = newNode.textBetween(change.fromB, change.toB, "\n", " ");
                if (text.length > 0) {
                    inserted += text.length;
                    const block = blockContextAt(newNode, change.fromB);
                    changes.push({ op: "insert", block, text });
                    if (block)
                        changedBlocks.add("i:" + block);
                }
            }
        }
    }
    catch {
        // Pathological pair: degrade to a coarse block-level diff so we never
        // throw. Discard whatever the precise loop accumulated before failing.
        fellBack = true;
        inserted = 0;
        deleted = 0;
        changedBlocks.clear();
        changes = coarseDiff(oldDocJson, newDocJson);
        for (const c of changes) {
            if (c.op === "insert")
                inserted += c.text.length;
            else
                deleted += c.text.length;
            // Same "i:" / "d:" key prefixes as the precise path.
            if (c.block)
                changedBlocks.add(c.op[0] + ":" + c.block);
        }
    }
    const partial = {
        summary: { inserted, deleted, blocksChanged: changedBlocks.size },
        integrity,
        changes,
    };
    return { ...partial, markdown: renderMarkdown(partial, fellBack) };
}

automatically. + * StarterKit v3 already bundles the link extension, configured here. + */ +export declare const docmostExtensions: (Node | Mark | Extension | Extension | Node | Node | Node | Mark | Mark)[]; diff --git a/packages/git-sync/build/lib/docmost-schema.js b/packages/git-sync/build/lib/docmost-schema.js new file mode 100644 index 00000000..97cdcafd --- /dev/null +++ b/packages/git-sync/build/lib/docmost-schema.js @@ -0,0 +1,999 @@ +/** + * Full TipTap extension set matching the real Docmost document schema. + * + * The default StarterKit-only schema silently destroys Docmost-specific + * nodes (callout, table) and drops attributes it does not know about + * (node ids, image sizing, link targets). Every code path that converts + * to or from ProseMirror JSON must use THIS set, otherwise a round-trip + * loses content. + */ +import StarterKit from "@tiptap/starter-kit"; +import Image from "@tiptap/extension-image"; +import TaskList from "@tiptap/extension-task-list"; +import TaskItem from "@tiptap/extension-task-item"; +import Highlight from "@tiptap/extension-highlight"; +import Subscript from "@tiptap/extension-subscript"; +import Superscript from "@tiptap/extension-superscript"; +import { Node, Extension, Mark } from "@tiptap/core"; +// Inlined from @tiptap/core's getStyleProperty (added after 3.20.x) so this +// package can stay on the same @tiptap/core version as the editor and avoid a +// duplicate-tiptap version split in the monorepo. Reads a single declaration +// from an element's inline `style` attribute, last-wins, case-insensitive. 
+function getStyleProperty(element, propertyName) { + const styleAttr = element.getAttribute("style"); + if (!styleAttr) { + return null; + } + const decls = styleAttr.split(";").map((decl) => decl.trim()).filter(Boolean); + const target = propertyName.toLowerCase(); + for (let i = decls.length - 1; i >= 0; i -= 1) { + const decl = decls[i]; + const colonIndex = decl.indexOf(":"); + if (colonIndex === -1) { + continue; + } + const prop = decl.slice(0, colonIndex).trim().toLowerCase(); + if (prop === target) { + return decl.slice(colonIndex + 1).trim(); + } + } + return null; +} +/** Allowed Docmost callout types; anything else falls back to "info". */ +const CALLOUT_TYPES = ["info", "warning", "danger", "success"]; +export const clampCalloutType = (value) => value && CALLOUT_TYPES.includes(value.toLowerCase()) + ? value.toLowerCase() + : "info"; +/** + * Allowlist guard for CSS color values imported from HTML. + * + * Docmost interpolates stored mark colors straight into an inline style + * attribute (e.g. style="background-color: ${color}" / "color: ${color}"). + * An unsanitized value such as `red; --x: url(...)` or `red">