feat(git-sync): vendor pure converter + engine into @docmost/git-sync (Phase A.1)
First step of docs/git-sync-plan.md. New workspace package @docmost/git-sync vendoring the PURE parts from docmost-sync (HEAD b03eb35): - lib: markdown-converter, markdown-document, canonicalize, docmost-schema, node-ops, diff, and an extracted markdown-to-prosemirror (only the pure marked->HTML->generateJSON path from upstream collaboration.ts; no websocket). - engine (pure, no IO): reconcile, layout, sanitize, stabilize, loop-guard. Ported the upstream pure-module + round-trip corpus tests (vitest): 314 pass, 3 expected upstream known-limitation fails. tsc clean. No server wiring yet. docmost-schema inlines getStyleProperty (as packages/mcp does — @tiptap/core 3.20.4 doesn't export it). IO engine (pull/push/git/settings) deferred to later Phase A/B steps; the editor-ext idempotency gate (plan §13.1) is the next step. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
177
packages/git-sync/src/engine/layout.ts
Normal file
177
packages/git-sync/src/engine/layout.ts
Normal file
@@ -0,0 +1,177 @@
|
||||
/**
|
||||
* Pure page-tree -> vault path mapping (SPEC §12).
|
||||
*
|
||||
* Given the flat list of page nodes for a space (as returned by
|
||||
* `listAllSpacePages`), compute for every page a deterministic, collision-free
|
||||
* destination: a folder path (root -> leaf ancestors) plus a file stem (the
|
||||
* page's own name, no extension). This module is intentionally PURE and
|
||||
* dependency-free apart from the sanitization helpers, so the whole tree ->
|
||||
* path logic is unit-testable without any I/O. The names are COSMETIC; identity
|
||||
* lives in each file's meta block (pageId / slugId).
|
||||
*/
|
||||
|
||||
import { sanitizeTitle, disambiguate } from "./sanitize.js";
|
||||
|
||||
/** Flat page node as returned by `listAllSpacePages` (no content). */
|
||||
export interface PageNode {
|
||||
id: string;
|
||||
title?: string;
|
||||
slugId?: string;
|
||||
parentPageId?: string | null;
|
||||
hasChildren?: boolean;
|
||||
}
|
||||
|
||||
/** A page's resolved vault destination: folder path + file stem. */
|
||||
export interface VaultEntry {
|
||||
/** Folder path, root -> leaf (the page's ancestors). Empty for a root page. */
|
||||
segments: string[];
|
||||
/** The page's own file name without extension. */
|
||||
stem: string;
|
||||
}
|
||||
|
||||
/**
|
||||
* Build the full vault layout for a space.
|
||||
*
|
||||
* Returns a Map keyed by pageId -> `{ segments, stem }`. The result is
|
||||
* deterministic for a given input and guarantees every full destination path
|
||||
* (`[...segments, stem].join("/")`) is unique, so no page can silently overwrite
|
||||
* another.
|
||||
*
|
||||
* Disambiguation is layered:
|
||||
* 1. Sibling collisions (same sanitized title under the same parent) are
|
||||
* resolved with a stable ` ~<slugId>` suffix (the suffix is itself
|
||||
* sanitized, since slugId/id is untrusted data that must never inject a
|
||||
* path separator).
|
||||
* 2. A final full-path pass catches residual collisions that sibling-scoping
|
||||
* cannot see — e.g. two pages whose parents are BOTH outside the input set
|
||||
* both bucket at the root with `segments: []`.
|
||||
*/
|
||||
export function buildVaultLayout(pages: PageNode[]): Map<string, VaultEntry> {
|
||||
// Index pages by id so the parent chain can be walked. Guard against
|
||||
// duplicate ids in the input (first one wins).
|
||||
const byId = new Map<string, PageNode>();
|
||||
for (const p of pages) {
|
||||
if (p && p.id && !byId.has(p.id)) byId.set(p.id, p);
|
||||
}
|
||||
|
||||
// Resolve each node's display name once, deterministically, tracking sibling
|
||||
// collisions per parent. `usedBySibling` maps a parent key -> set of names
|
||||
// already taken under that parent. The bucket key is the node's parent ONLY
|
||||
// when that parent is actually present in `byId`; otherwise (null parent, or
|
||||
// an orphan whose parent is outside the input set) the node buckets at
|
||||
// `"__root__"`. This is critical: orphans land at the vault root (see
|
||||
// `folderSegmentsFor`), so they MUST share the root bucket with real root
|
||||
// pages to be disambiguated against each other here — making `nameById` final
|
||||
// before any `segments` are computed, so no ancestor name can drift later.
|
||||
const usedBySibling = new Map<string, Set<string>>();
|
||||
const nameById = new Map<string, string>();
|
||||
for (const p of pages) {
|
||||
if (p && p.id && !nameById.has(p.id)) {
|
||||
const parentKey =
|
||||
p.parentPageId && byId.has(p.parentPageId) ? p.parentPageId : "__root__";
|
||||
nameById.set(p.id, nameForNode(p, parentKey, usedBySibling));
|
||||
}
|
||||
}
|
||||
|
||||
// Every id we index above MUST get a resolved name; this helper returns it
|
||||
// and THROWS if it is somehow absent, rather than silently recomputing a
|
||||
// DIFFERENT, non-disambiguated name (which would desync a folder segment from
|
||||
// its target file).
|
||||
const nameOf = (id: string): string => {
|
||||
const name = nameById.get(id);
|
||||
if (name === undefined) {
|
||||
throw new Error(`buildVaultLayout: no resolved name for page id ${id}`);
|
||||
}
|
||||
return name;
|
||||
};
|
||||
|
||||
// Build the folder path for a page by walking parentPageId to the root. The
|
||||
// page's OWN name is the file stem; its ancestors become folders. A `visited`
|
||||
// guard prevents an infinite loop on a malformed parent cycle.
|
||||
const folderSegmentsFor = (node: PageNode): string[] => {
|
||||
const ancestors: string[] = [];
|
||||
const visited = new Set<string>();
|
||||
let current: PageNode | undefined = node.parentPageId
|
||||
? byId.get(node.parentPageId)
|
||||
: undefined;
|
||||
while (current && current.id && !visited.has(current.id)) {
|
||||
visited.add(current.id);
|
||||
ancestors.unshift(nameOf(current.id));
|
||||
current = current.parentPageId
|
||||
? byId.get(current.parentPageId)
|
||||
: undefined;
|
||||
}
|
||||
return ancestors;
|
||||
};
|
||||
|
||||
// First pass: compute the provisional { segments, stem } for every node.
|
||||
const layout = new Map<string, VaultEntry>();
|
||||
for (const p of pages) {
|
||||
if (!p || !p.id || layout.has(p.id)) continue;
|
||||
layout.set(p.id, {
|
||||
segments: folderSegmentsFor(p),
|
||||
stem: nameOf(p.id),
|
||||
});
|
||||
}
|
||||
|
||||
// Final full-path uniqueness pass — a belt-and-suspenders safety net. Note
|
||||
// that cross-bucket (orphan/root) collisions are now resolved in the name pass
|
||||
// above (orphans share the "__root__" bucket), so ancestor names are final
|
||||
// before `segments` are built and this pass should rarely/never re-stem an
|
||||
// ancestor. It only re-stems the colliding LATER leaf via the sanitized
|
||||
// slugId/id, then (if still colliding) appends the id.
|
||||
const usedPaths = new Set<string>();
|
||||
const seenIds = new Set<string>();
|
||||
const pathKey = (e: VaultEntry): string => [...e.segments, e.stem].join("/");
|
||||
for (const p of pages) {
|
||||
if (!p || !p.id || seenIds.has(p.id)) continue;
|
||||
seenIds.add(p.id);
|
||||
const entry = layout.get(p.id);
|
||||
if (!entry) continue;
|
||||
|
||||
if (usedPaths.has(pathKey(entry))) {
|
||||
// First attempt: disambiguate the stem with the sanitized slugId (or id).
|
||||
entry.stem = disambiguate(entry.stem, sanitizeTitle(p.slugId ?? p.id));
|
||||
if (usedPaths.has(pathKey(entry))) {
|
||||
// Still colliding: append the (sanitized) id as a last resort. The id
|
||||
// is globally unique, so this always resolves the collision.
|
||||
entry.stem = disambiguate(entry.stem, sanitizeTitle(p.id));
|
||||
}
|
||||
}
|
||||
usedPaths.add(pathKey(entry));
|
||||
}
|
||||
|
||||
return layout;
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute a deterministic, collision-free name for a node among its SIBLINGS.
|
||||
* `usedBySibling` maps a parent key -> set of names already taken, so two
|
||||
* siblings that sanitize to the same name get a stable ` ~slugId` suffix
|
||||
* (SPEC §12). The suffix is itself passed through `sanitizeTitle`, because the
|
||||
* slugId/id is a second untrusted-data channel that must never leak a path
|
||||
* separator into the name. `parentKey` is supplied by the caller (it resolves
|
||||
* to `"__root__"` for root pages AND for orphans whose parent is outside the
|
||||
* input set, so they share one bucket). The name is COSMETIC; identity lives in
|
||||
* the meta block.
|
||||
*/
|
||||
function nameForNode(
|
||||
node: PageNode,
|
||||
parentKey: string,
|
||||
usedBySibling: Map<string, Set<string>>,
|
||||
): string {
|
||||
let used = usedBySibling.get(parentKey);
|
||||
if (!used) {
|
||||
used = new Set<string>();
|
||||
usedBySibling.set(parentKey, used);
|
||||
}
|
||||
|
||||
let name = sanitizeTitle(node.title ?? "");
|
||||
if (used.has(name)) {
|
||||
// Sibling collision: disambiguate with the stable, sanitized slugId (fall
|
||||
// back to the sanitized pageId if no slugId is present).
|
||||
name = disambiguate(name, sanitizeTitle(node.slugId ?? node.id));
|
||||
}
|
||||
used.add(name);
|
||||
return name;
|
||||
}
|
||||
Reference in New Issue
Block a user