feat(git-sync): vendor pure converter + engine into @docmost/git-sync (Phase A.1)

First step of docs/git-sync-plan.md. New workspace package @docmost/git-sync
vendoring the PURE parts from docmost-sync (HEAD b03eb35):
- lib: markdown-converter, markdown-document, canonicalize, docmost-schema,
  node-ops, diff, and an extracted markdown-to-prosemirror (only the pure
  marked->HTML->generateJSON path from upstream collaboration.ts; no websocket).
- engine (pure, no IO): reconcile, layout, sanitize, stabilize, loop-guard.
Ported the upstream pure-module + round-trip corpus tests (vitest): 314 pass,
3 expected upstream known-limitation fails. tsc clean. No server wiring yet.

docmost-schema inlines getStyleProperty (as packages/mcp does — @tiptap/core
3.20.4 doesn't export it). IO engine (pull/push/git/settings) deferred to later
Phase A/B steps; the editor-ext idempotency gate (plan §13.1) is the next step.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
claude code agent 227
2026-06-21 13:55:23 +03:00
parent 406921ac6a
commit c0dbc97fe2
61 changed files with 9729 additions and 1817 deletions

View File

@@ -1,170 +0,0 @@
/**
* Pure page-tree -> vault path mapping (SPEC §12).
*
* Given the flat list of page nodes for a space (as returned by
* `listAllSpacePages`), compute for every page a deterministic, collision-free
* destination: a folder path (root -> leaf ancestors) plus a file stem (the
* page's own name, no extension). This module is intentionally PURE and
* dependency-free apart from the sanitization helpers, so the whole tree ->
* path logic is unit-testable without any I/O. The names are COSMETIC; identity
* lives in each file's meta block (pageId / slugId).
*/
import { sanitizeTitle, disambiguate } from "./sanitize.js";
/**
* Build the full vault layout for a space.
*
* Returns a Map keyed by pageId -> `{ segments, stem }`. The result is
* deterministic for a given input and guarantees every full destination path
* (`[...segments, stem].join("/")`) is unique, so no page can silently overwrite
* another.
*
* Disambiguation is layered:
* 1. Sibling collisions (same sanitized title under the same parent) are
* resolved with a stable ` ~<slugId>` suffix (the suffix is itself
* sanitized, since slugId/id is untrusted data that must never inject a
* path separator).
* 2. A final full-path pass catches residual collisions that sibling-scoping
* cannot see — e.g. two pages whose parents are BOTH outside the input set
* both bucket at the root with `segments: []`.
*/
export function buildVaultLayout(pages) {
// Index pages by id so the parent chain can be walked. Guard against
// duplicate ids in the input (first one wins).
const byId = new Map();
for (const p of pages) {
if (p && p.id && !byId.has(p.id))
byId.set(p.id, p);
}
// Resolve each node's display name once, deterministically, tracking sibling
// collisions per parent. `usedBySibling` maps a parent key -> set of names
// already taken under that parent. The bucket key is the node's parent ONLY
// when that parent is actually present in `byId`; otherwise (null parent, or
// an orphan whose parent is outside the input set) the node buckets at
// `"__root__"`. This is critical: orphans land at the vault root (see
// `folderSegmentsFor`), so they MUST share the root bucket with real root
// pages to be disambiguated against each other here — making `nameById` final
// before any `segments` are computed, so no ancestor name can drift later.
const usedBySibling = new Map();
const nameById = new Map();
for (const p of pages) {
if (p && p.id && !nameById.has(p.id)) {
const parentKey = p.parentPageId && byId.has(p.parentPageId) ? p.parentPageId : "__root__";
nameById.set(p.id, nameForNode(p, parentKey, usedBySibling));
}
}
// Every id we index above MUST get a resolved name; this helper returns it
// and THROWS if it is somehow absent, rather than silently recomputing a
// DIFFERENT, non-disambiguated name (which would desync a folder segment from
// its target file).
const nameOf = (id) => {
const name = nameById.get(id);
if (name === undefined) {
throw new Error(`buildVaultLayout: no resolved name for page id ${id}`);
}
return name;
};
// Build the folder path for a page by walking parentPageId to the root. The
// page's OWN name is the file stem; its ancestors become folders. A `visited`
// guard prevents an infinite loop on a malformed parent cycle.
const folderSegmentsFor = (node) => {
const ancestors = [];
const visited = new Set();
let current = node.parentPageId
? byId.get(node.parentPageId)
: undefined;
while (current && current.id && !visited.has(current.id)) {
visited.add(current.id);
ancestors.unshift(nameOf(current.id));
current = current.parentPageId
? byId.get(current.parentPageId)
: undefined;
}
return ancestors;
};
// First pass: compute the provisional { segments, stem } for every node.
const layout = new Map();
for (const p of pages) {
if (!p || !p.id || layout.has(p.id))
continue;
layout.set(p.id, {
segments: folderSegmentsFor(p),
stem: nameOf(p.id),
});
}
// FOLDER-NOTE transform (native-Obsidian layout): a page WITH CHILDREN lives at
// `<…>/<stem>/<stem>.md` — its body is the folder-note INSIDE its own folder
// (LostPaul Folder Notes convention), and its children sit alongside it in that
// folder. A leaf stays `<…>/<stem>.md`. Children's segments already point into
// the parent's folder (folderSegmentsFor walks ancestor NAMES), so only the
// parent's own file relocates here; the sibling name pass above already made
// the parent name unique, so folder == file name stays consistent.
for (const p of pages) {
if (!p || !p.id)
continue;
const entry = layout.get(p.id);
if (entry && p.hasChildren) {
entry.segments = [...entry.segments, entry.stem];
}
}
// Final full-path uniqueness pass — a belt-and-suspenders safety net. Note
// that cross-bucket (orphan/root) collisions are now resolved in the name pass
// above (orphans share the "__root__" bucket), so ancestor names are final
// before `segments` are built and this pass should rarely/never re-stem an
// ancestor. It only re-stems the colliding LATER leaf via the sanitized
// slugId/id, then (if still colliding) appends the id.
//
// Process FOLDER-NOTES (pages with children) FIRST so a parent claims its
// canonical `<name>/<name>.md` before a same-named CHILD — the child (a leaf)
// is the one that disambiguates, never the folder-note.
const usedPaths = new Set();
const seenIds = new Set();
const pathKey = (e) => [...e.segments, e.stem].join("/");
const ordered = pages
.filter((p) => Boolean(p && p.id))
.sort((a, b) => Number(Boolean(b.hasChildren)) - Number(Boolean(a.hasChildren)));
for (const p of ordered) {
if (seenIds.has(p.id))
continue;
seenIds.add(p.id);
const entry = layout.get(p.id);
if (!entry)
continue;
if (usedPaths.has(pathKey(entry))) {
// First attempt: disambiguate the stem with the sanitized slugId (or id).
entry.stem = disambiguate(entry.stem, sanitizeTitle(p.slugId ?? p.id));
if (usedPaths.has(pathKey(entry))) {
// Still colliding: append the (sanitized) id as a last resort. The id
// is globally unique, so this always resolves the collision.
entry.stem = disambiguate(entry.stem, sanitizeTitle(p.id));
}
}
usedPaths.add(pathKey(entry));
}
return layout;
}
/**
* Compute a deterministic, collision-free name for a node among its SIBLINGS.
* `usedBySibling` maps a parent key -> set of names already taken, so two
* siblings that sanitize to the same name get a stable ` ~slugId` suffix
* (SPEC §12). The suffix is itself passed through `sanitizeTitle`, because the
* slugId/id is a second untrusted-data channel that must never leak a path
* separator into the name. `parentKey` is supplied by the caller (it resolves
* to `"__root__"` for root pages AND for orphans whose parent is outside the
* input set, so they share one bucket). The name is COSMETIC; identity lives in
* the meta block.
*/
function nameForNode(node, parentKey, usedBySibling) {
let used = usedBySibling.get(parentKey);
if (!used) {
used = new Set();
usedBySibling.set(parentKey, used);
}
let name = sanitizeTitle(node.title ?? "");
if (used.has(name)) {
// Sibling collision: disambiguate with the stable, sanitized slugId (fall
// back to the sanitized pageId if no slugId is present).
name = disambiguate(name, sanitizeTitle(node.slugId ?? node.id));
}
used.add(name);
return name;
}

View File

@@ -1,21 +0,0 @@
/**
* Pure, IO-free comparison helpers for the idempotency round-trip checks. The
* round-trip harness that drives these lives in the package's tests, not in the
* engine.
*/
/**
* Recursively strip every `attrs.id` from a ProseMirror node tree. Block ids
* are regenerated by `markdownToProseMirror` (SPEC §11), so they must be
* ignored when comparing the semantic shape of two documents. Returns a NEW
* tree; the input is not mutated.
*/
export declare function stripBlockIds(node: any): any;
/**
* Find the first divergence between two values via a recursive deep compare.
* Returns a short path + the two differing values, or null if they are equal.
*/
export declare function firstDivergence(a: any, b: any, path?: string): {
path: string;
a: any;
b: any;
} | null;

View File

@@ -1,70 +0,0 @@
/**
* Pure, IO-free comparison helpers for the idempotency round-trip checks. The
* round-trip harness that drives these lives in the package's tests, not in the
* engine.
*/
/**
* Recursively strip every `attrs.id` from a ProseMirror node tree. Block ids
* are regenerated by `markdownToProseMirror` (SPEC §11), so they must be
* ignored when comparing the semantic shape of two documents. Returns a NEW
* tree; the input is not mutated.
*/
export function stripBlockIds(node) {
if (Array.isArray(node)) {
return node.map(stripBlockIds);
}
if (node && typeof node === "object") {
const out = {};
for (const key of Object.keys(node)) {
if (key === "attrs" && node.attrs && typeof node.attrs === "object") {
// Drop the `id` attr; keep every other attribute.
const { id, ...rest } = node.attrs;
void id;
out.attrs = stripBlockIds(rest);
}
else {
out[key] = stripBlockIds(node[key]);
}
}
return out;
}
return node;
}
/**
* Find the first divergence between two values via a recursive deep compare.
* Returns a short path + the two differing values, or null if they are equal.
*/
export function firstDivergence(a, b, path = "$") {
if (a === b)
return null;
const ta = typeof a;
const tb = typeof b;
if (ta !== tb || a === null || b === null) {
return { path, a, b };
}
if (ta !== "object") {
return { path, a, b };
}
const aIsArr = Array.isArray(a);
const bIsArr = Array.isArray(b);
if (aIsArr !== bIsArr)
return { path, a, b };
if (aIsArr) {
if (a.length !== b.length) {
return { path: `${path}.length`, a: a.length, b: b.length };
}
for (let i = 0; i < a.length; i++) {
const d = firstDivergence(a[i], b[i], `${path}[${i}]`);
if (d)
return d;
}
return null;
}
const keys = new Set([...Object.keys(a), ...Object.keys(b)]);
for (const k of keys) {
const d = firstDivergence(a[k], b[k], `${path}.${k}`);
if (d)
return d;
}
return null;
}

View File

@@ -1,41 +0,0 @@
/**
* Meta object as `exportPageBody` builds it (SPEC §4). Kept byte-for-byte
* compatible so files produced here match `exportPageBody`'s output exactly.
*/
export interface PageMeta {
version: 1;
pageId: string;
slugId: string;
title: string;
spaceId: string;
parentPageId: string | null;
}
/**
* Produce the self-contained `.md` file text for a page from its raw
* ProseMirror `content` + identity meta, in the verified fixpoint form.
*
* md1 = convertProseMirrorToMarkdown(content)
* doc2 = markdownToProseMirror(md1) // one import...
* stableBody = convertProseMirrorToMarkdown(doc2) // ...and re-export
* file = serializeDocmostMarkdownBody(meta, stableBody)
*
* The single export->import->export pass is the verified fixpoint (SPEC §11):
* idempotent for already-stable content, and the convergence point for the
* known converter asymmetries.
*/
export declare function stabilizePageFile(content: unknown, meta: PageMeta): Promise<string>;
/**
* The fixpoint markdown BODY for a page's ProseMirror `content`, WITHOUT any meta
* envelope:
*
* md1 = convertProseMirrorToMarkdown(content) // export...
* doc2 = markdownToProseMirror(md1) // ...import...
* stableBody = convertProseMirrorToMarkdown(doc2) // ...re-export
*
* The single export->import->export pass is the verified fixpoint (SPEC §11):
* idempotent for already-stable content, and the convergence point for the known
* converter asymmetries. The native-Obsidian writer (`serializePageFile`) wraps
* this body with a minimal `gitmost_id` frontmatter; determinism here is what
* keeps re-pulls of an unchanged page byte-identical (no churn, loop-guard).
*/
export declare function stabilizePageBody(content: unknown): Promise<string>;

View File

@@ -1,52 +0,0 @@
/**
* Normalize-on-write helper (SPEC §11 "Резолюция").
*
* git diffs byte-for-byte, so writing a page in a NON-fixpoint markdown form
* would make the next pull re-export it to a slightly different (but stable)
* form and produce a phantom diff -> churny commits. The converter has a couple
* of known one-pass asymmetries (a block image after a paragraph adds an empty
* paragraph; a diagram materializes `data-align`), all of which converge to a
* fixpoint after ONE `export -> import -> export` round-trip.
*
* So at write time we run exactly that one pass and persist the fixpoint form.
* Already-stable content is unaffected (the pass is idempotent), so re-pulls of
* unchanged pages produce identical bytes and git sees no diff.
*/
import { convertProseMirrorToMarkdown, markdownToProseMirror, serializeDocmostMarkdownBody, } from "../lib/index.js";
/**
* Produce the self-contained `.md` file text for a page from its raw
* ProseMirror `content` + identity meta, in the verified fixpoint form.
*
* md1 = convertProseMirrorToMarkdown(content)
* doc2 = markdownToProseMirror(md1) // one import...
* stableBody = convertProseMirrorToMarkdown(doc2) // ...and re-export
* file = serializeDocmostMarkdownBody(meta, stableBody)
*
* The single export->import->export pass is the verified fixpoint (SPEC §11):
* idempotent for already-stable content, and the convergence point for the
* known converter asymmetries.
*/
export async function stabilizePageFile(content, meta) {
// The meta shape is exactly what `exportPageBody` writes; cast to the lib's
// DocmostMdMeta (a superset with optional fields) for the serializer.
return serializeDocmostMarkdownBody(meta, await stabilizePageBody(content));
}
/**
* The fixpoint markdown BODY for a page's ProseMirror `content`, WITHOUT any meta
* envelope:
*
* md1 = convertProseMirrorToMarkdown(content) // export...
* doc2 = markdownToProseMirror(md1) // ...import...
* stableBody = convertProseMirrorToMarkdown(doc2) // ...re-export
*
* The single export->import->export pass is the verified fixpoint (SPEC §11):
* idempotent for already-stable content, and the convergence point for the known
* converter asymmetries. The native-Obsidian writer (`serializePageFile`) wraps
* this body with a minimal `gitmost_id` frontmatter; determinism here is what
* keeps re-pulls of an unchanged page byte-identical (no churn, loop-guard).
*/
export async function stabilizePageBody(content) {
const md1 = convertProseMirrorToMarkdown(content);
const doc2 = await markdownToProseMirror(md1);
return convertProseMirrorToMarkdown(doc2);
}