From 531b320776f6ba62966f2b9e0a8a72c7166e3bc0 Mon Sep 17 00:00:00 2001 From: vvzvlad Date: Tue, 16 Jun 2026 23:57:50 +0300 Subject: [PATCH] =?UTF-8?q?feat(sync):=20add=20git=20vault=20layer=20(?= =?UTF-8?q?=C2=A75)=20and=20the=20Docmost->vault=20pull=20cycle=20(=C2=A76?= =?UTF-8?q?)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Turn the read-only mirror into a git-backed pull cycle. Read-only toward Docmost. - git.ts (VaultGit): system-git wrapper, all ops cwd=vaultPath (vault is its own repo under data/vault, never the source repo); ensureRepo/branches main+docmost, commit with provenance (author/committer identity + Docmost-Sync-Source trailer, §7.3), merge with conflict surfacing (no auto-resolve, §9), isMergeInProgress; GIT_DIR/GIT_WORK_TREE stripped from env (§12 cwd isolation) - stabilize.ts: normalize-on-write (one export->import->export fixpoint pass, §11) - reconcile.ts: pure planReconciliation (add/update/move/delete by pageId) + decideAbsenceDeletions gate - pull.ts: write/commit on docmost -> merge into main; listSpaceTree completeness signal suppresses absence-deletions on a partial fetch (§8); mass-delete guard; merge-in-progress guard makes re-runs converge (§12); move old-path removal only on successful write - docmost-client: listSpaceTree({pages, complete}) without touching the 1:1-copied enumerateSpacePages - tests: reconcile planner + decideAbsenceDeletions, VaultGit incl. real temp-repo merge conflict, listSpaceTree completeness (586 green) Push to a git remote and the FS->Docmost direction are deferred to the next increment. 
/**
 * Completeness-tracking variant of the space tree walk (SPEC §8).
 *
 * Performs the same iterative breadth-first walk as the private
 * `enumerateSpacePages` (which is kept 1:1 with upstream for backport), but it
 * does NOT silently swallow partial fetches. It returns `{ pages, complete }`,
 * where `complete` is `false` when:
 *   - the seed level could not be fetched, or did not come back as an array;
 *   - ANY node's children fetch threw (that branch is skipped, the walk
 *     continues with the rest);
 *   - the hard `MAX_NODES` cap was reached while work was still queued.
 *
 * Callers use this signal to SUPPRESS absence-based deletions on an incomplete
 * fetch: per SPEC §8, a page missing from a partial tree is NOT proof that it
 * was deleted (deletion detection is an exact query, not an inference that a
 * pageId "vanished from the tree").
 *
 * @param spaceId    Space whose sidebar tree is walked.
 * @param rootPageId Optional starting page; passed through to
 *                   `listSidebarPages` to seed the walk.
 * @returns The visited nodes in breadth-first order and the completeness flag.
 */
async listSpaceTree(
  spaceId: string,
  rootPageId?: string,
): Promise<{ pages: any[]; complete: boolean }> {
  const MAX_NODES = 10000;
  const result: any[] = [];
  const visited = new Set<string>();
  let complete = true;

  // Seed the queue with the starting level (subtree children or roots). A
  // failure to fetch even the seed level — or a malformed (non-array) seed —
  // means the result is incomplete, never "an empty but complete space".
  let queue: any[];
  try {
    const seed = await this.listSidebarPages(spaceId, rootPageId);
    if (!Array.isArray(seed)) return { pages: result, complete: false };
    // Copy: the walk appends to the queue and must not mutate the array that
    // `listSidebarPages` handed back.
    queue = [...seed];
  } catch {
    return { pages: result, complete: false };
  }

  // Cursor-based FIFO: `Array.prototype.shift()` is O(n) per call, which makes
  // a large walk quadratic. Advancing an index keeps it linear.
  let head = 0;
  while (head < queue.length && result.length < MAX_NODES) {
    const node = queue[head++];
    if (!node || typeof node !== "object" || !node.id) continue;

    // Skip already-seen ids to guard against cycles / duplicate references.
    if (visited.has(node.id)) continue;
    visited.add(node.id);

    result.push(node);

    if (node.hasChildren) {
      try {
        const children = await this.listSidebarPages(spaceId, node.id);
        for (const child of children) queue.push(child);
      } catch {
        // A failure fetching one node's children must not abort the whole
        // walk: skip this branch and keep enumerating the rest, but RECORD
        // that the tree we return is incomplete (SPEC §8).
        complete = false;
      }
    }
  }

  // If we stopped because the node cap was hit while the queue still had
  // work, the tree is incomplete too.
  if (head < queue.length && result.length >= MAX_NODES) {
    complete = false;
  }

  return { pages: result, complete };
}
/**
 * Thin async wrapper over the system `git` binary (SPEC §5: state store = git).
 *
 * IMPORTANT — VAULT-SCOPED: every operation here runs with `cwd = vaultPath`,
 * which is the vault's OWN git repository (default `data/vault`), SEPARATE from
 * the docmost-sync source repo. This module MUST NEVER run git against the
 * source repo. `data/` is gitignored by the source repo, so a nested repo under
 * `data/vault` is safe. The pull cycle is READ-ONLY toward Docmost; this module
 * only touches the local vault git, never a git remote (push is deferred, see
 * SPEC §7).
 *
 * Implementation notes:
 *  - We shell out via `node:child_process` `execFile` (promisified), passing
 *    ARGS AS AN ARRAY — no shell, so there is no command injection surface even
 *    if a page title / branch name contains shell metacharacters.
 *  - Every invocation prepends `--no-pager` so git never blocks on a pager.
 *  - "nothing to commit" is treated as a graceful no-op, not an error.
 *  - The vault counts as "initialized" only when `vaultPath` is the TOP LEVEL
 *    of a work tree. Git discovers repositories by walking up from `cwd`, so a
 *    not-yet-initialized `data/vault` inside the source checkout would
 *    otherwise be mistaken for the source repo itself.
 */
import { execFile } from "node:child_process";
import { mkdir } from "node:fs/promises";
import { promisify } from "node:util";

const execFileAsync = promisify(execFile);

/** Bot identity used for engine-authored vault commits (SPEC §7.3). */
export const BOT_AUTHOR_NAME = "Docmost Sync";
export const BOT_AUTHOR_EMAIL = "docmost-sync@local";

/** Default branch the vault repo is initialized on. */
export const DEFAULT_BRANCH = "main";

/**
 * Generous stdout/stderr buffer: `git status --porcelain` / file listings on a
 * large vault can be sizable.
 */
const MAX_BUFFER = 64 * 1024 * 1024;

/**
 * Environment variables that relocate the repository, work tree, index or
 * object store. Any of them, when inherited (e.g. when the engine is started
 * from inside a git hook), would make git operate somewhere other than `cwd`.
 */
const REPO_REDIRECTING_ENV_VARS = [
  "GIT_DIR",
  "GIT_WORK_TREE",
  "GIT_INDEX_FILE",
  "GIT_OBJECT_DIRECTORY",
  "GIT_ALTERNATE_OBJECT_DIRECTORIES",
  "GIT_COMMON_DIR",
  "GIT_PREFIX",
] as const;

/** Result of a `merge`: whether it succeeded cleanly or left conflict markers. */
export interface MergeResult {
  /** True when the merge applied cleanly (fast-forward or clean 3-way). */
  ok: boolean;
  /** True when the merge stopped on conflicts (markers left in the worktree). */
  conflict: boolean;
  /** Raw combined stdout+stderr, for logging/diagnostics. */
  output: string;
}

/** Options for an engine-authored commit (provenance, SPEC §7.3). */
export interface CommitOptions {
  authorName: string;
  authorEmail: string;
  /**
   * Trailer lines appended to the commit message body (e.g.
   * `Docmost-Sync-Source: docmost`). These are the machine-readable provenance
   * the loop-guard keys on (SPEC §12, "commit-attribution").
   */
  trailers?: string[];
}

/**
 * A git wrapper bound to a single vault path. Construct once per vault; every
 * method runs git with `cwd = vaultPath`.
 */
export class VaultGit {
  constructor(private readonly vaultPath: string) {}

  /**
   * Run `git --no-pager <args>` in the vault.
   *
   * @returns Trimmed stdout.
   * @throws Error (including stderr) on a non-zero exit or a spawn failure.
   */
  private async run(args: string[]): Promise<string> {
    try {
      const { stdout } = await execFileAsync("git", ["--no-pager", ...args], {
        cwd: this.vaultPath,
        maxBuffer: MAX_BUFFER,
        env: vaultGitEnv(),
      });
      return stdout.trim();
    } catch (err: unknown) {
      const e = err as { stderr?: string; stdout?: string; message?: string };
      const detail = (e.stderr || e.stdout || e.message || "").toString().trim();
      throw new Error(`git ${args.join(" ")} failed: ${detail}`);
    }
  }

  /**
   * Like `run`, but returns the full exit info instead of throwing on a
   * non-zero exit. Used where a non-zero exit is an expected, meaningful state
   * (e.g. a merge conflict, or a porcelain diff that "fails" deliberately).
   *
   * A failure without a numeric exit code (for example a spawn error) is
   * reported as `code: 1` with whatever output was captured.
   */
  private async runRaw(
    args: string[],
  ): Promise<{ code: number; stdout: string; stderr: string }> {
    try {
      const { stdout, stderr } = await execFileAsync(
        "git",
        ["--no-pager", ...args],
        { cwd: this.vaultPath, maxBuffer: MAX_BUFFER, env: vaultGitEnv() },
      );
      return { code: 0, stdout, stderr };
    } catch (err: unknown) {
      const e = err as { code?: unknown; stdout?: string; stderr?: string };
      return {
        code: typeof e.code === "number" ? e.code : 1,
        stdout: e.stdout ?? "",
        stderr: e.stderr ?? "",
      };
    }
  }

  /**
   * Ensure the vault directory exists and is an initialized git repo on `main`
   * with an initial (empty) commit so branches exist. Idempotent: safe to call
   * on every run. Sets a LOCAL bot identity for the vault repo if none is set
   * (so engine commits never fall back to a global/unset identity).
   */
  async ensureRepo(): Promise<void> {
    await mkdir(this.vaultPath, { recursive: true });

    if (!(await this.isRepo())) {
      // `git init -b <name>` only exists on Git >= 2.28. Probe it without
      // throwing and fall back to a plain `init` on older binaries; the
      // `checkout -B` before the first commit then fixes the branch name.
      const init = await this.runRaw(["init", "-b", DEFAULT_BRANCH]);
      if (init.code !== 0) {
        await this.run(["init"]);
      }
    }

    // Set a local identity for the vault repo if unset, so engine commits have
    // a deterministic committer even on a machine with no global git config.
    if (!(await this.hasLocalConfig("user.name"))) {
      await this.run(["config", "user.name", BOT_AUTHOR_NAME]);
    }
    if (!(await this.hasLocalConfig("user.email"))) {
      await this.run(["config", "user.email", BOT_AUTHOR_EMAIL]);
    }

    // Create the initial empty commit on `main` if the repo has no commits yet,
    // so both `main` and (later) `docmost` branches have a common base.
    if (!(await this.hasAnyCommit())) {
      // Make sure we are on the default branch before the first commit (covers
      // the older-git case where `init -b` was not available).
      await this.run(["checkout", "-B", DEFAULT_BRANCH]);
      await this.commitRaw("init vault", {
        authorName: BOT_AUTHOR_NAME,
        authorEmail: BOT_AUTHOR_EMAIL,
        allowEmpty: true,
      });
    }
  }

  /**
   * True if the vault directory is the TOP LEVEL of its own git work tree.
   *
   * `--is-inside-work-tree` alone is not enough: it is also true when the
   * vault is merely a subdirectory of some enclosing repository (the default
   * `data/vault` lives inside the source checkout). In that case `init` would
   * be skipped and every later command would hit the enclosing repo. An empty
   * `--show-prefix` proves that `cwd` is the work-tree root.
   */
  private async isRepo(): Promise<boolean> {
    const r = await this.runRaw([
      "rev-parse",
      "--is-inside-work-tree",
      "--show-prefix",
    ]);
    if (r.code !== 0) return false;
    // Output is two lines: "true"/"false", then the prefix ("" at the root).
    const [inside = "", prefix = ""] = r.stdout.split("\n");
    return inside.trim() === "true" && prefix.trim() === "";
  }

  /** True if a LOCAL git config key is set in the vault repo. */
  private async hasLocalConfig(key: string): Promise<boolean> {
    const r = await this.runRaw(["config", "--local", "--get", key]);
    return r.code === 0 && r.stdout.trim().length > 0;
  }

  /** True if the repo has at least one commit (HEAD resolves). */
  private async hasAnyCommit(): Promise<boolean> {
    const r = await this.runRaw(["rev-parse", "--verify", "HEAD"]);
    return r.code === 0;
  }

  /** True if a branch with the given name exists. */
  async branchExists(name: string): Promise<boolean> {
    const r = await this.runRaw([
      "rev-parse",
      "--verify",
      `refs/heads/${name}`,
    ]);
    return r.code === 0;
  }

  /**
   * Create `name` from `fromBranch` if it does not already exist. No-op (and no
   * checkout) when the branch is already present.
   */
  async ensureBranch(name: string, fromBranch: string): Promise<void> {
    if (await this.branchExists(name)) return;
    await this.run(["branch", name, fromBranch]);
  }

  /** Name of the currently checked-out branch. */
  async currentBranch(): Promise<string> {
    return this.run(["rev-parse", "--abbrev-ref", "HEAD"]);
  }

  /** Check out an existing branch. */
  async checkout(name: string): Promise<void> {
    await this.run(["checkout", name]);
  }

  /** Stage everything (adds, modifications, deletions). */
  async stageAll(): Promise<void> {
    await this.run(["add", "-A"]);
  }

  /**
   * True if the vault is mid-merge (an unresolved merge from a previous run,
   * SPEC §9 / §12). Detected via a `MERGE_HEAD` ref OR any unmerged
   * (conflicted) index entries (`git ls-files -u`). The pull cycle checks this
   * BEFORE any checkout so a left-over merge produces a clear, actionable
   * message instead of a raw "you need to resolve your current index first"
   * failure deep inside `checkout`. This is what makes re-runs converge
   * (resumability, SPEC §12).
   */
  async isMergeInProgress(): Promise<boolean> {
    // MERGE_HEAD exists exactly while a merge is in progress.
    const mergeHead = await this.runRaw([
      "rev-parse",
      "--verify",
      "--quiet",
      "MERGE_HEAD",
    ]);
    if (mergeHead.code === 0 && mergeHead.stdout.trim().length > 0) return true;
    // Fallback / belt-and-suspenders: any unmerged index entries also mean the
    // working tree is mid-conflict and a checkout would refuse.
    const unmerged = await this.runRaw(["ls-files", "-u"]);
    return unmerged.code === 0 && unmerged.stdout.trim().length > 0;
  }

  /**
   * Commit the currently STAGED changes with an explicit author/committer
   * identity and the given trailers appended to the message body (SPEC §7.3
   * provenance). The caller is expected to have staged its changes first
   * (e.g. via `stageAll`).
   *
   * @returns `true` if a commit was made, `false` if there was nothing to
   *          commit (graceful no-op).
   */
  async commit(message: string, opts: CommitOptions): Promise<boolean> {
    // Nothing staged -> nothing to commit. Treat as a no-op (SPEC §11: a
    // deterministic re-pull of unchanged pages produces identical bytes, so
    // git sees no diff and we must not error).
    const staged = await this.runRaw(["diff", "--cached", "--quiet"]);
    // `diff --cached --quiet` exits 0 when the index matches HEAD (nothing
    // staged), 1 when there are staged changes.
    if (staged.code === 0) return false;

    await this.commitRaw(message, opts);
    return true;
  }

  /**
   * Low-level commit used by both `commit` and `ensureRepo`'s initial commit.
   * Builds the full message with appended trailers and sets author + committer
   * identity via env vars (so the committer matches the author, not the repo
   * default).
   *
   * @throws Error carrying git's stderr when the commit fails.
   */
  private async commitRaw(
    message: string,
    opts: CommitOptions & { allowEmpty?: boolean },
  ): Promise<void> {
    const args = ["commit", "-m", buildCommitMessage(message, opts.trailers)];
    if (opts.allowEmpty) args.push("--allow-empty");

    try {
      await execFileAsync("git", ["--no-pager", ...args], {
        cwd: this.vaultPath,
        maxBuffer: MAX_BUFFER,
        env: vaultGitEnv({
          GIT_AUTHOR_NAME: opts.authorName,
          GIT_AUTHOR_EMAIL: opts.authorEmail,
          GIT_COMMITTER_NAME: opts.authorName,
          GIT_COMMITTER_EMAIL: opts.authorEmail,
        }),
      });
    } catch (err: unknown) {
      const e = err as { stderr?: string; message?: string };
      throw new Error(
        `git commit failed: ${(e.stderr || e.message || "").toString().trim()}`,
      );
    }
  }

  /**
   * Merge `fromBranch` into the current branch (`git merge --no-edit`).
   * Fast-forwards when possible; performs a real 3-way merge otherwise. Conflict
   * state is SURFACED (returned), NOT auto-resolved (SPEC §9): the conflict
   * markers are left in the worktree for manual resolution by a later increment,
   * and — critically — nothing is pushed to Docmost (we never write to Docmost
   * anyway).
   */
  async merge(fromBranch: string): Promise<MergeResult> {
    const r = await this.runRaw(["merge", "--no-edit", fromBranch]);
    const output = `${r.stdout}\n${r.stderr}`.trim();
    if (r.code === 0) {
      return { ok: true, conflict: false, output };
    }
    // A non-zero exit on merge most commonly means a conflict. Confirm by
    // checking for unmerged paths (porcelain "U" status) so we don't mislabel
    // an unrelated failure as a conflict.
    const conflict = await this.hasUnmergedPaths();
    return { ok: false, conflict, output };
  }

  /** True if the index has any unmerged (conflicted) paths. */
  private async hasUnmergedPaths(): Promise<boolean> {
    const r = await this.runRaw(["diff", "--name-only", "--diff-filter=U"]);
    return r.code === 0 && r.stdout.trim().length > 0;
  }

  /**
   * List tracked files on the current branch (paths relative to the vault
   * root, forward-slash separated). An optional glob (a git pathspec) narrows
   * the listing, e.g. `"*.md"`.
   *
   * Uses NUL-delimited output (`-z`): without it git C-quotes and
   * octal-escapes any path containing non-ASCII or special characters (the
   * default `core.quotePath`), so a page with a non-Latin title would come
   * back as a string that does not name the file on disk.
   *
   * @throws Error (including stderr) when `git ls-files` fails.
   */
  async listTrackedFiles(glob?: string): Promise<string[]> {
    const args = ["ls-files", "-z"];
    // `--` keeps a pathspec that happens to start with `-` from being parsed
    // as an option.
    if (glob) args.push("--", glob);
    const r = await this.runRaw(args);
    if (r.code !== 0) {
      const detail = (r.stderr || r.stdout).trim();
      throw new Error(`git ${args.join(" ")} failed: ${detail}`);
    }
    return r.stdout.split("\0").filter((p) => p.length > 0);
  }
}

/**
 * Build the environment for a vault git invocation (SPEC §12 cwd-isolation).
 *
 * cwd-isolation is this module's central safety guarantee: every git command
 * MUST operate on the vault repo at `cwd: vaultPath` and nothing else. An
 * inherited `GIT_DIR` / `GIT_WORK_TREE` (or index / object-store override) in
 * `process.env` would silently redirect the operation away from `cwd` (e.g. to
 * the source repo or another checkout), defeating that guarantee. So they are
 * always stripped, regardless of whatever else the caller adds
 * (author/committer identity, etc.) — even when passed in `extra`.
 *
 * @param extra Additional variables layered over `process.env`.
 * @returns A fresh environment object; `process.env` itself is not modified.
 */
function vaultGitEnv(extra?: Record<string, string>): NodeJS.ProcessEnv {
  const env: NodeJS.ProcessEnv = { ...process.env, ...extra };
  for (const name of REPO_REDIRECTING_ENV_VARS) {
    delete env[name];
  }
  return env;
}

/**
 * Build a commit message body with trailer lines appended (SPEC §7.3). The
 * trailers are separated from the subject by a blank line so
 * `git interpret-trailers` / `git log --format=%(trailers)` parse them as
 * trailers. Exported for unit testing.
 *
 * @param subject  Commit subject line.
 * @param trailers Optional `Key: value` trailer lines.
 * @returns The subject alone when there are no trailers, otherwise the subject,
 *          a blank line, and the trailers one per line.
 */
export function buildCommitMessage(
  subject: string,
  trailers?: string[],
): string {
  if (!trailers || trailers.length === 0) return subject;
  return `${subject}\n\n${trailers.join("\n")}`;
}
/**
 * Pull cycle — Docmost -> vault (SPEC §6 "Docmost -> FS").
 *
 * This increment turns the read-only mirror into the git-backed pull cycle:
 *
 *   1. ensureRepo(vault); refuse if a merge is in progress (SPEC §9/§12);
 *      ensureBranch("docmost", "main") (SPEC §5 branches)
 *   2. checkout docmost
 *   3. fetch the live tree (listSpaceTree -> {pages, complete}) -> compute the
 *      desired `live` files (relPath via the pure sanitize/disambiguation layout)
 *   4. parse `existing` tracked .md files (pageId + relPath from docmost:meta)
 *   5. plan = planReconciliation(live, existing) (pure, SPEC §5/§8); toDelete
 *      is absence-only, moves are separate
 *   6. decideAbsenceDeletions: SUPPRESS absence deletions on an incomplete tree
 *      fetch (SPEC §8) and behind the mass-delete guard (defense in depth)
 *   7. write each live page in its fixpoint form (normalize-on-write, SPEC §11);
 *      apply moved-old-path removals (only when the move write SUCCEEDED) and
 *      absence-delete removals (only when the decision allowed them)
 *   8. stageAll + commit on `docmost` with the provenance trailer (SPEC §7.3)
 *   9. checkout main + merge docmost (conflicts are surfaced, NOT auto-resolved,
 *      SPEC §9); push is deferred (SPEC §7)
 *  10. one-line summary
 *
 * DIRECTION IS Docmost -> vault ONLY. Nothing here ever writes to Docmost
 * (read-only: listSpaceTree + getPageJson). All git operations run against
 * the vault repo (`cwd = vaultPath`), never the source repo (see ./git.ts).
 *
 * Requires a `.env` with real Docmost credentials. This file must COMPILE and
 * be correct, but is NOT expected to be run without live access.
 *
 * Run via: npm run pull   (-> node build/pull.js)
 */
import { mkdir, readFile, rm, writeFile } from "node:fs/promises";
import { dirname, isAbsolute, join, relative, sep } from "node:path";
import { pathToFileURL } from "node:url";
import { DocmostClient, parseDocmostMarkdown } from "docmost-client";
import { loadSettings } from "./settings.js";
import { buildVaultLayout, type PageNode } from "./layout.js";
import {
  VaultGit,
  BOT_AUTHOR_NAME,
  BOT_AUTHOR_EMAIL,
  DEFAULT_BRANCH,
} from "./git.js";
import {
  planReconciliation,
  decideAbsenceDeletions,
  type LiveEntry,
} from "./reconcile.js";
import { stabilizePageFile, type PageMeta } from "./stabilize.js";

// Engine-only mirror branch (SPEC §5): the engine writes here, humans never do.
const DOCMOST_BRANCH = "docmost";
// Machine-readable provenance the loop-guard keys on (SPEC §7.3 / §12).
const SOURCE_TRAILER = "Docmost-Sync-Source: docmost";

// Number of pages fetched/stabilized concurrently. Bounded so a large space
// does not open thousands of simultaneous requests/conversions at once.
const CONCURRENCY = 6;
// How often to log incremental progress (every N completed pages).
const PROGRESS_EVERY = 25;

/**
 * Convert a vault-relative path (forward-slash) to an absolute FS path.
 *
 * The result feeds both file writes and `rm`, and the relative path is derived
 * from remote page titles, so it is checked to stay inside the vault: a path
 * whose `..` segments would climb out of `vaultRoot` is rejected rather than
 * silently resolved.
 *
 * @param vaultRoot Root directory of the vault.
 * @param relPath   Forward-slash separated path relative to `vaultRoot`.
 * @returns `vaultRoot` joined with the path's segments.
 * @throws Error when the resulting path would lie outside `vaultRoot`.
 */
function relToAbs(vaultRoot: string, relPath: string): string {
  const abs = join(vaultRoot, ...relPath.split("/"));
  const fromRoot = relative(vaultRoot, abs);
  const escapes =
    fromRoot === ".." ||
    fromRoot.startsWith(`..${sep}`) ||
    isAbsolute(fromRoot);
  if (escapes) {
    throw new Error(`path escapes the vault: ${relPath}`);
  }
  return abs;
}

/**
 * Build a vault-relative, forward-slash path from a page's ancestor segments
 * and its file stem (the `.md` extension is appended here).
 */
function segmentsToRelPath(segments: string[], stem: string): string {
  return [...segments, `${stem}.md`].join("/");
}

// readExisting(): read every tracked .md file in the vault and parse its
// `docmost:meta` to recover `{ pageId, relPath }`.
Files without a parseable pageId in meta are + * skipped (they are not engine-tracked pages — e.g. a stray hand-written file). + */ +async function readExisting( + git: VaultGit, + vaultRoot: string, +): Promise<{ pageId: string; relPath: string }[]> { + const tracked = await git.listTrackedFiles("*.md"); + const existing: { pageId: string; relPath: string }[] = []; + for (const relPath of tracked) { + // git ls-files always emits forward-slash paths; normalize just in case. + const rel = relPath.split(sep).join("/"); + let text: string; + try { + text = await readFile(relToAbs(vaultRoot, rel), "utf8"); + } catch { + // Tracked but missing on disk (mid-operation race) — skip; the next pull + // converges. + continue; + } + let pageId: string | undefined; + try { + const { meta } = parseDocmostMarkdown(text); + pageId = meta?.pageId; + } catch { + // Unparseable meta — not engine-tracked; leave it alone. + pageId = undefined; + } + if (pageId) existing.push({ pageId, relPath: rel }); + } + return existing; +} + async function main(): Promise<void> { const s = loadSettings(); const client = new DocmostClient( @@ -40,75 +116,244 @@ async function main(): Promise<void> { const spaceId = s.docmostSpaceId; const vaultRoot = s.vaultPath; - const pages: PageNode[] = await client.listAllSpacePages(spaceId); + // 1. Ensure the vault git repo exists with main + an initial commit, and the + // engine-only `docmost` branch exists, branched from main. + const git = new VaultGit(vaultRoot); + await git.ensureRepo(); + + // 1b. Refuse to run on top of an unresolved merge (SPEC §9 / §12). A previous + // conflicting pull leaves the vault mid-merge; the next `checkout` would + // fail with a raw "you need to resolve your current index first". Detect + // it BEFORE any checkout and exit cleanly with an actionable message so + // re-runs converge once the human resolves (or aborts) the merge. 
+ if (await git.isMergeInProgress()) { + console.error( + `vault has an unresolved merge at ${vaultRoot} — resolve it (or ` + + `'git merge --abort') and re-run (SPEC §9)`, + ); + process.exitCode = 1; + return; + } + + await git.ensureBranch(DOCMOST_BRANCH, DEFAULT_BRANCH); + + // 2. Work on the docmost mirror branch. + await git.checkout(DOCMOST_BRANCH); + + // 3. Fetch the live tree and compute the desired files (relPath via the pure + // sanitize + disambiguation layout). `listSpaceTree` reports completeness: + // if ANY branch's children fetch failed or the node cap was hit, the tree + // is PARTIAL and absence-based deletions must be suppressed this cycle + // (SPEC §8) — a missing pageId in a partial tree is NOT proof of deletion. + const { pages: rawPages, complete: treeComplete } = + await client.listSpaceTree(spaceId); + const pages = rawPages as PageNode[]; const layout = buildVaultLayout(pages); - const total = pages.length; + const live: LiveEntry[] = []; + const liveNodeByPageId = new Map<string, PageNode>(); + for (const p of pages) { + if (!p || !p.id) continue; + const entry = layout.get(p.id); + if (!entry) continue; + live.push({ + pageId: p.id, + relPath: segmentsToRelPath(entry.segments, entry.stem), + }); + liveNodeByPageId.set(p.id, p); + } + + // 4. Parse the existing tracked .md files (pageId + relPath). + const existing = await readExisting(git, vaultRoot); + + // 5. Plan reconciliation (pure). `plan.toDelete` is ABSENCE-based only; + // `plan.moved` carries move old-path removals separately. + const plan = planReconciliation(live, existing); + + // 6. Decide whether the ABSENCE-based deletions (`plan.toDelete`) may be + // applied this cycle (SPEC §8). The pure helper folds in BOTH the + // incomplete-fetch suppression (a partial tree must not look like + // deletions) AND the mass-delete guard (defense in depth). 
Moves are NOT + // governed by this — a moved page is present in `live`, so its old-path + // removal is real and applied unconditionally (subject only to its write + // succeeding). + const deleteDecision = decideAbsenceDeletions({ + treeComplete, + liveCount: live.length, + existingCount: existing.length, + deleteCount: plan.toDelete.length, + }); + if (!deleteDecision.apply) { + if (deleteDecision.reason === "incomplete-fetch") { + console.warn( + "pull: tree fetch incomplete — deletions suppressed this cycle (SPEC §8)", + ); + } else if (deleteDecision.reason === "empty-live") { + console.warn( + `pull: live fetch returned 0 pages but ${existing.length} file(s) are ` + + `tracked — deletions suppressed this cycle (SPEC §8). Re-run when ` + + `Docmost is reachable.`, + ); + } else { + console.warn( + `pull: plan would delete ${plan.toDelete.length} of ${existing.length} ` + + `tracked file(s) (mass-delete guard) — deletions suppressed this ` + + `cycle (SPEC §8). Verify the live Docmost tree, then re-run.`, + ); + } + } + + // 7. Write each live page in its fixpoint form (normalize-on-write, SPEC §11), + // then apply move-old-path + absence-delete removals. let written = 0; let failed = 0; let completed = 0; let nextIndex = 0; + // pageIds whose write FAILED. A moved page whose new-path write failed must + // NOT have its old path removed (otherwise the page vanishes entirely). + const failedPageIds = new Set<string>(); - // Pull + write a single page. Each call is wrapped so one bad page (network - // error, page deleted between the walk and the fetch, body conversion - // failure) NEVER aborts the whole pull — it is counted as a failure and the - // pool moves on. Mirrors the deliberately fault-tolerant enumerateSpacePages. - const pullOne = async (page: PageNode): Promise<void> => { - if (!page || !page.id) return; - const entry = layout.get(page.id); - if (!entry) return; // no layout entry (e.g. 
duplicate/skipped id) + const writeOne = async (w: { pageId: string; relPath: string }): Promise<void> => { + const node = liveNodeByPageId.get(w.pageId); + if (!node) return; try { - const dir = join(vaultRoot, ...entry.segments); - await mkdir(dir, { recursive: true }); - // Body + meta only (no comments block) — SPEC §3. - const md = await client.exportPageBody(page.id); - await writeFile(join(dir, `${entry.stem}.md`), md, "utf8"); + const page = await client.getPageJson(w.pageId); + const meta: PageMeta = { + version: 1, + pageId: page.id, + slugId: page.slugId, + title: page.title, + spaceId: page.spaceId, + parentPageId: page.parentPageId ?? null, + }; + const text = await stabilizePageFile(page.content, meta); + const abs = relToAbs(vaultRoot, w.relPath); + await mkdir(dirname(abs), { recursive: true }); + await writeFile(abs, text, "utf8"); written++; } catch (err) { failed++; + failedPageIds.add(w.pageId); console.error( - `pull: failed page ${page.id}:`, + `pull: failed page ${w.pageId}:`, err instanceof Error ? err.message : String(err), ); } finally { completed++; if (completed % PROGRESS_EVERY === 0) { - console.log(`pulled ${completed}/${total}`); + console.log(`pulled ${completed}/${plan.toWrite.length}`); } } }; - // A small dependency-free bounded-concurrency pool: a fixed set of runners - // each pull the next index until the list is exhausted. + // Bounded-concurrency pool (dependency-free): a fixed set of runners each + // take the next index until the write list is exhausted. One bad page never + // aborts the whole pull (mirrors the fault-tolerant tree walk). 
const runner = async (): Promise<void> => { while (true) { const i = nextIndex++; - if (i >= pages.length) return; - await pullOne(pages[i]); + if (i >= plan.toWrite.length) return; + await writeOne(plan.toWrite[i]); + } + }; + await Promise.all( + Array.from( + { length: Math.min(CONCURRENCY, plan.toWrite.length) || 1 }, + () => runner(), + ), + ); + + // Helper: `rm` with force:true is a no-op if the file is already gone. + const removePath = async (rel: string, what: string): Promise<boolean> => { + try { + await rm(relToAbs(vaultRoot, rel), { force: true }); + return true; + } catch (err) { + console.error( + `pull: failed to ${what} ${rel}:`, + err instanceof Error ? err.message : String(err), + ); + return false; } }; - const runners = Array.from( - { length: Math.min(CONCURRENCY, pages.length) }, - () => runner(), - ); - await Promise.all(runners); + // 7a. Apply MOVE old-path removals. A moved page IS present in `live`, so its + // old path is genuinely stale — this is NOT subject to the incomplete- + // fetch suppression. BUT only remove the old path when (a) the planner + // marked it removable (not reused by another live page) AND (b) the new- + // path write actually SUCCEEDED — otherwise we would delete the only copy + // of a page whose move-write failed. + let movedApplied = 0; + for (const m of plan.moved) { + if (!m.removeOldPath) continue; + if (failedPageIds.has(m.pageId)) { + console.warn( + `pull: move write for ${m.pageId} failed — keeping old path ` + + `${m.fromRelPath} (SPEC §8)`, + ); + continue; + } + if (await removePath(m.fromRelPath, "remove moved old path")) movedApplied++; + } + // 7b. Apply ABSENCE-based deletions — ONLY if the decision allowed them + // (incomplete-fetch suppression + mass-delete guard, SPEC §8). + let deleted = 0; + if (deleteDecision.apply) { + for (const rel of plan.toDelete) { + if (await removePath(rel, "delete")) deleted++; + } + } + + // 8. Stage + commit on `docmost` (only if there is something to commit). 
+ // Deterministic stabilized output means unchanged pages produce identical + // bytes -> git sees no diff -> no churn (SPEC §11). The subject reflects the + // ACTUAL work applied (pages written + files deleted), not the planned size, + // so a run with failures does not over-report (SPEC §5 nit). + const subject = + deleted > 0 + ? `docmost: sync ${written} page(s), ${deleted} deleted` + : `docmost: sync ${written} page(s)`; + await git.stageAll(); + const committed = await git.commit(subject, { + authorName: BOT_AUTHOR_NAME, + authorEmail: BOT_AUTHOR_EMAIL, + trailers: [SOURCE_TRAILER], + }); + + // 9. Merge docmost -> main. Conflicts are surfaced and left in git (SPEC §9); + // we never push to Docmost. Push to a git remote is deferred (SPEC §7). + await git.checkout(DEFAULT_BRANCH); + const merge = await git.merge(DOCMOST_BRANCH); + if (merge.conflict) { + console.error( + "pull: merge of docmost -> main CONFLICTED. Conflict markers were left " + + "in the vault for manual resolution (SPEC §9). Nothing is pushed to " + + "Docmost (read-only). Resolve locally, then re-run.", + ); + } else if (!merge.ok) { + console.error(`pull: merge of docmost -> main failed: ${merge.output}`); + } + console.log("pull: git push to remote is DEFERRED in this increment (SPEC §7)."); + + // 10. One-line summary. console.log( - `pull complete: ${written} page(s) written, ${failed} failed, ` + - `out of ${total} from space ${spaceId} into ${vaultRoot}`, + `pull complete: ${written} written, ${movedApplied} moved, ` + + `${deleted} deleted, committed=${committed}, ` + + `merge=${merge.conflict ? "CONFLICT" : merge.ok ? "ok" : "failed"} ` + + `(${failed} page failures) from space ${spaceId} into ${vaultRoot}`, ); - // Signal a partial mirror so callers/CI can react. Use process.exitCode (not - // a hard process.exit) so any buffered output is flushed cleanly. - if (failed > 0) { + // Signal a partial mirror / conflict so callers/CI can react. 
Use + // process.exitCode (not a hard exit) so buffered output flushes cleanly. + if (failed > 0 || merge.conflict || !merge.ok) { process.exitCode = 1; } } // Only auto-run when invoked directly as the CLI entrypoint, not when this // module is imported (e.g. by a unit test), so the import does not trigger -// loadSettings() + process.exit. +// loadSettings() + git/network access. const invokedDirectly = typeof process.argv[1] === "string" && import.meta.url === pathToFileURL(process.argv[1]).href; diff --git a/src/reconcile.ts b/src/reconcile.ts new file mode 100644 index 0000000..ef8ec11 --- /dev/null +++ b/src/reconcile.ts @@ -0,0 +1,200 @@ +/** + * Pure reconciliation planner (SPEC §5/§6/§8). + * + * Given the desired live set of files (computed from the current Docmost tree) + * and the set of files currently tracked in the vault, compute what to write, + * what to move (old path to remove), and what to delete. Identity is `pageId` + * (the stable file<->page anchor, SPEC §4): a page that keeps its pageId but + * changes relPath is a MOVE, not delete+add; a tracked pageId that is gone from + * the live tree is a DELETE. + * + * This module is intentionally PURE (no IO, no git) so the whole plan is + * unit-testable. The actual file writing / git operations happen in pull.ts. + */ + +/** A page that SHOULD exist in the vault at a given path. */ +export interface LiveEntry { + pageId: string; + /** Vault-relative path (forward-slash), e.g. `Space/Parent/Child.md`. */ + relPath: string; +} + +/** A page currently tracked in the vault (pageId parsed from its meta). */ +export interface ExistingEntry { + pageId: string; + /** Vault-relative path (forward-slash) of the tracked file. */ + relPath: string; +} + +/** A page to (re)write at its destination path. */ +export interface WriteEntry { + pageId: string; + relPath: string; +} + +/** A page that moved: written at its NEW relPath, with the OLD path removed. 
*/ +export interface MovedEntry { + pageId: string; + fromRelPath: string; + toRelPath: string; + /** + * Whether the old path (`fromRelPath`) is SAFE to remove. False when another + * live page will (re)write that exact path (path reuse): removing it would + * destroy real data, so the caller must skip the removal. The move itself is + * still recorded (the new path is written regardless). + */ + removeOldPath: boolean; +} + +/** The full reconciliation plan. */ +export interface ReconciliationPlan { + /** + * Pages present in `live` -> (re)write at their relPath. This naturally + * covers add, content-update (same path) AND move (same pageId, new path), + * since every live page is (re)written regardless of whether it existed. + */ + toWrite: WriteEntry[]; + /** + * Vault-relative paths to delete because their tracked pageId is ABSENT from + * `live` (page removed/trashed). This set is ONLY absence-based deletions — + * the OLD paths of moved pages are NOT here (they live in `moved` and are + * applied separately by the caller). Keeping the two apart lets pull.ts gate + * absence deletions behind the incomplete-fetch suppression + mass-delete + * guard (SPEC §8) while still applying real moves. + */ + toDelete: string[]; + /** + * Tracked pages whose relPath changed. The caller writes the page at + * `toRelPath`, then removes `fromRelPath` — but ONLY after the new-path write + * succeeded. The old path is NOT in `toDelete`. + */ + moved: MovedEntry[]; +} + +/** + * Compute the reconciliation plan. + * + * Rules: + * - Every `live` page is written at its relPath (covers add + update + move). + * - A tracked pageId present in `live` whose relPath changed is `moved`; its + * OLD relPath goes into `moved` ONLY (the caller removes it after the new + * path is written) and is NEVER added to `toDelete`. + * - A tracked pageId NOT present in `live` is an ABSENCE delete; its relPath + * is added to `toDelete`. 
+ * + * Notes: + * - Safety filter (no data loss): no path that is a live TARGET path of any + * page is ever deleted/removed (a write owns it). This applies to BOTH the + * absence `toDelete` set AND a moved page's old-path removal — if a moved + * page's OLD path is reused by ANOTHER live page, the move records no old + * path to remove, because that path will be (re)written. + * - `existing` may legitimately contain duplicate pageIds (two stray files + * carrying the same meta pageId); each such file that is not the live target + * path is removed (as an absence/move) so the vault converges to exactly the + * live set. + */ +export function planReconciliation( + live: LiveEntry[], + existing: ExistingEntry[], +): ReconciliationPlan { + // Desired path for each live pageId. + const liveByPageId = new Map<string, string>(); + // Set of all paths that WILL be written (never delete/remove one of these). + const liveTargetPaths = new Set<string>(); + for (const e of live) { + liveByPageId.set(e.pageId, e.relPath); + liveTargetPaths.add(e.relPath); + } + + const toWrite: WriteEntry[] = live.map((e) => ({ + pageId: e.pageId, + relPath: e.relPath, + })); + + const moved: MovedEntry[] = []; + // Absence-based deletions ONLY (tracked pageId absent from `live`). Use a Set + // so the same path coming from multiple existing rows is queued only once. + const toDeleteSet = new Set<string>(); + + for (const ex of existing) { + const liveRel = liveByPageId.get(ex.pageId); + if (liveRel === undefined) { + // Tracked page is gone from the live tree -> absence delete. + // Never queue a path a live page will (re)write (path reuse -> no loss). + if (!liveTargetPaths.has(ex.relPath)) toDeleteSet.add(ex.relPath); + continue; + } + if (liveRel !== ex.relPath) { + // Same pageId, different path -> a MOVE. Record it so the caller can write + // the new path first, then remove the old one. 
If the old path is itself a + // live target (reused by another page), it must NOT be removed — the write + // owns it — so flag `removeOldPath: false` (move still recorded). + moved.push({ + pageId: ex.pageId, + fromRelPath: ex.relPath, + toRelPath: liveRel, + removeOldPath: !liveTargetPaths.has(ex.relPath), + }); + } + // liveRel === ex.relPath -> content-update in place; nothing extra to do + // (the write above re-emits the file; identical bytes => git no-op). + } + + const toDelete = [...toDeleteSet]; + + return { toWrite, toDelete, moved }; +} + +/** + * Below this many tracked files the mass-delete fraction guard is not applied + * (a tiny vault where deleting "most" files is normal, e.g. 1-of-2). + */ +export const MASS_DELETE_MIN_EXISTING = 4; +/** Fraction of tracked files above which a delete plan is a suspected wipe. */ +export const MASS_DELETE_FRACTION = 0.5; + +/** Why absence-based deletions were (or were not) applied this cycle. */ +export type DeletionDecision = + | { apply: true } + | { apply: false; reason: "incomplete-fetch" | "empty-live" | "mass-delete" }; + +/** + * Pure decision: should the ABSENCE-based deletions (`plan.toDelete`) be applied + * this cycle? Encapsulates the SPEC §8 safety invariants so they are unit- + * testable without live creds or git: + * + * - `treeComplete === false` (a partial Docmost tree fetch) -> SUPPRESS. A page + * missing from a partial tree is NOT proof of deletion (SPEC §8); we must not + * delete merely-absent files this cycle. (Writes/updates/moves still happen.) + * - The live fetch returned 0 pages while files are tracked -> SUPPRESS + * (almost always a failed fetch, never a real "delete everything"). + * - The plan would delete more than `MASS_DELETE_FRACTION` of a non-trivial + * vault -> SUPPRESS as a mass-deletion guard (defense in depth). + * + * Moves are NOT governed by this decision: a moved page IS present in `live`, so + * its old-path removal is real (handled by the caller separately). 
+ */ +export function decideAbsenceDeletions(args: { + treeComplete: boolean; + liveCount: number; + existingCount: number; + deleteCount: number; +}): DeletionDecision { + const { treeComplete, liveCount, existingCount, deleteCount } = args; + + // No tracked files, or nothing to delete -> trivially fine to "apply". + if (existingCount === 0 || deleteCount === 0) return { apply: true }; + + if (!treeComplete) return { apply: false, reason: "incomplete-fetch" }; + + if (liveCount === 0) return { apply: false, reason: "empty-live" }; + + if ( + existingCount >= MASS_DELETE_MIN_EXISTING && + deleteCount > existingCount * MASS_DELETE_FRACTION + ) { + return { apply: false, reason: "mass-delete" }; + } + + return { apply: true }; +} diff --git a/src/stabilize.ts b/src/stabilize.ts new file mode 100644 index 0000000..240d0be --- /dev/null +++ b/src/stabilize.ts @@ -0,0 +1,58 @@ +/** + * Normalize-on-write helper (SPEC §11 "Резолюция"). + * + * git diffs byte-for-byte, so writing a page in a NON-fixpoint markdown form + * would make the next pull re-export it to a slightly different (but stable) + * form and produce a phantom diff -> churny commits. The converter has a couple + * of known one-pass asymmetries (a block image after a paragraph adds an empty + * paragraph; a diagram materializes `data-align`), all of which converge to a + * fixpoint after ONE `export -> import -> export` round-trip. + * + * So at write time we run exactly that one pass and persist the fixpoint form. + * Already-stable content is unaffected (the pass is idempotent), so re-pulls of + * unchanged pages produce identical bytes and git sees no diff. + */ +import { + convertProseMirrorToMarkdown, + markdownToProseMirror, + serializeDocmostMarkdownBody, + type DocmostMdMeta, +} from "docmost-client"; + +/** + * Meta object as `exportPageBody` builds it (SPEC §4). Kept byte-for-byte + * compatible so files produced here match `exportPageBody`'s output exactly. 
+ */ +export interface PageMeta { + version: 1; + pageId: string; + slugId: string; + title: string; + spaceId: string; + parentPageId: string | null; +} + +/** + * Produce the self-contained `.md` file text for a page from its raw + * ProseMirror `content` + identity meta, in the verified fixpoint form. + * + * md1 = convertProseMirrorToMarkdown(content) + * doc2 = markdownToProseMirror(md1) // one import... + * stableBody = convertProseMirrorToMarkdown(doc2) // ...and re-export + * file = serializeDocmostMarkdownBody(meta, stableBody) + * + * The single export->import->export pass is the verified fixpoint (SPEC §11): + * idempotent for already-stable content, and the convergence point for the + * known converter asymmetries. + */ +export async function stabilizePageFile( + content: unknown, + meta: PageMeta, +): Promise<string> { + const md1 = convertProseMirrorToMarkdown(content); + const doc2 = await markdownToProseMirror(md1); + const stableBody = convertProseMirrorToMarkdown(doc2); + // The meta shape is exactly what `exportPageBody` writes; cast to the lib's + // DocmostMdMeta (a superset with optional fields) for the serializer. + return serializeDocmostMarkdownBody(meta as DocmostMdMeta, stableBody); +} diff --git a/test/client-rest.test.ts b/test/client-rest.test.ts index 4957687..8002c07 100644 --- a/test/client-rest.test.ts +++ b/test/client-rest.test.ts @@ -442,6 +442,105 @@ describe('checkNewComments', () => { }); }); +// --------------------------------------------------------------------------- +// listSpaceTree — completeness signal (SPEC §8) +// --------------------------------------------------------------------------- +describe('listSpaceTree (completeness)', () => { + // The walk seeds from /pages/sidebar-pages with only { spaceId } (roots), then + // fetches each hasChildren node's children with { spaceId, pageId }. We route + // by the presence of `pageId` in the request body. 
+ it('returns complete:true and every node for a fully-fetched tree', async () => { + const client = new DocmostClient(BASE_URL, 'a@b.c', 'pw'); + stubLoginSuccess(globalAxiosMock()); + const imock = instanceMock(client); + + imock.onPost('/pages/sidebar-pages').reply((config) => { + const body = JSON.parse(config.data); + if (!body.pageId) { + // Root level: one parent with children + one leaf. + return [ + 200, + { + data: { + items: [ + { id: 'root', title: 'Root', hasChildren: true }, + { id: 'leaf', title: 'Leaf', hasChildren: false }, + ], + }, + }, + ]; + } + if (body.pageId === 'root') { + return [ + 200, + { data: { items: [{ id: 'child', title: 'Child', hasChildren: false }] } }, + ]; + } + return [200, { data: { items: [] } }]; + }); + + const { pages, complete } = await client.listSpaceTree('space-1'); + expect(complete).toBe(true); + expect(new Set(pages.map((p: any) => p.id))).toEqual( + new Set(['root', 'leaf', 'child']), + ); + }); + + it('returns complete:false but still the other nodes when a branch fetch THROWS', async () => { + const client = new DocmostClient(BASE_URL, 'a@b.c', 'pw'); + stubLoginSuccess(globalAxiosMock()); + const imock = instanceMock(client); + + imock.onPost('/pages/sidebar-pages').reply((config) => { + const body = JSON.parse(config.data); + if (!body.pageId) { + // Two parents, both claim children; one of them will fail to expand. + return [ + 200, + { + data: { + items: [ + { id: 'ok', title: 'Ok', hasChildren: true }, + { id: 'boom', title: 'Boom', hasChildren: true }, + ], + }, + }, + ]; + } + if (body.pageId === 'ok') { + return [ + 200, + { data: { items: [{ id: 'okchild', title: 'OkChild', hasChildren: false }] } }, + ]; + } + // The 'boom' branch fails -> walk must continue, completeness must drop. + return [500, {}]; + }); + + const { pages, complete } = await client.listSpaceTree('space-1'); + // The failed branch flips completeness to false... 
+ expect(complete).toBe(false); + // ...but the rest of the tree is still collected (no abort, no wipe signal). + expect(new Set(pages.map((p: any) => p.id))).toEqual( + new Set(['ok', 'boom', 'okchild']), + ); + }); + + it('returns complete:false and no nodes when the seed (root) fetch fails', async () => { + const client = new DocmostClient(BASE_URL, 'a@b.c', 'pw'); + stubLoginSuccess(globalAxiosMock()); + const imock = instanceMock(client); + + // Every sidebar-pages call fails -> listSidebarPages itself throws on the + // seed, so the walk returns empty + incomplete (never "0 pages, complete"). + imock.onPost('/pages/sidebar-pages').reply(500, {}); + + const { pages, complete } = await client.listSpaceTree('space-1'); + expect(complete).toBe(false); + expect(pages).toEqual([]); + }); +}); + // --------------------------------------------------------------------------- // AUTH: 401 interceptor + re-login dedup + getCollabTokenWithReauth // --------------------------------------------------------------------------- diff --git a/test/git.test.ts b/test/git.test.ts new file mode 100644 index 0000000..7b56b30 --- /dev/null +++ b/test/git.test.ts @@ -0,0 +1,292 @@ +import { execFile } from 'node:child_process'; +import { mkdtemp, rm, writeFile } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import { promisify } from 'node:util'; +import { afterEach, beforeAll, describe, expect, it } from 'vitest'; +import { + VaultGit, + BOT_AUTHOR_NAME, + BOT_AUTHOR_EMAIL, + buildCommitMessage, +} from '../src/git.js'; + +const execFileAsync = promisify(execFile); + +/** True if a usable `git` binary is on PATH (skip the suite otherwise). */ +async function gitAvailable(): Promise<boolean> { + try { + await execFileAsync('git', ['--version']); + return true; + } catch { + return false; + } +} + +/** Read the full commit message of HEAD (subject + body) in a repo dir. 
*/ +async function headMessage(dir: string): Promise<string> { + const { stdout } = await execFileAsync( + 'git', + ['--no-pager', 'log', '-1', '--pretty=%B'], + { cwd: dir }, + ); + return stdout.trim(); +} + +/** Read the author "Name <email>" of HEAD in a repo dir. */ +async function headAuthor(dir: string): Promise<string> { + const { stdout } = await execFileAsync( + 'git', + ['--no-pager', 'log', '-1', '--pretty=%an <%ae>'], + { cwd: dir }, + ); + return stdout.trim(); +} + +describe('buildCommitMessage (pure)', () => { + it('returns the bare subject when there are no trailers', () => { + expect(buildCommitMessage('subject')).toBe('subject'); + expect(buildCommitMessage('subject', [])).toBe('subject'); + }); + + it('appends trailers separated from the subject by a blank line', () => { + expect(buildCommitMessage('subject', ['Docmost-Sync-Source: docmost'])).toBe( + 'subject\n\nDocmost-Sync-Source: docmost', + ); + }); +}); + +describe('VaultGit (integration; temp repo)', () => { + let available = false; + let dir: string; + + beforeAll(async () => { + available = await gitAvailable(); + }); + + afterEach(async () => { + if (dir) { + await rm(dir, { recursive: true, force: true }); + } + }); + + /** Make a fresh temp dir for one test (under the OS tmpdir, NOT the repo). */ + async function freshDir(): Promise<string> { + dir = await mkdtemp(join(tmpdir(), 'docmost-vault-')); + return dir; + } + + it('ensureRepo creates .git + main + an initial commit', async () => { + if (!available) return; // skip gracefully when git is unavailable + const vault = await freshDir(); + const git = new VaultGit(vault); + await git.ensureRepo(); + + // It is a git work-tree now. + const { stdout: insideWt } = await execFileAsync( + 'git', + ['rev-parse', '--is-inside-work-tree'], + { cwd: vault }, + ); + expect(insideWt.trim()).toBe('true'); + + // On `main`. + expect(await git.currentBranch()).toBe('main'); + + // Has the initial commit. 
+ expect(await headMessage(vault)).toBe('init vault'); + + // Idempotent: calling again does not create a second commit. + await git.ensureRepo(); + const { stdout: count } = await execFileAsync( + 'git', + ['rev-list', '--count', 'HEAD'], + { cwd: vault }, + ); + expect(count.trim()).toBe('1'); + }); + + it('ensureBranch creates the docmost branch from main', async () => { + if (!available) return; + const vault = await freshDir(); + const git = new VaultGit(vault); + await git.ensureRepo(); + + expect(await git.branchExists('docmost')).toBe(false); + await git.ensureBranch('docmost', 'main'); + expect(await git.branchExists('docmost')).toBe(true); + + // Idempotent. + await git.ensureBranch('docmost', 'main'); + expect(await git.branchExists('docmost')).toBe(true); + }); + + it('commit writes a commit with the provenance trailer and the bot identity', async () => { + if (!available) return; + const vault = await freshDir(); + const git = new VaultGit(vault); + await git.ensureRepo(); + + await writeFile(join(vault, 'page.md'), 'hello\n', 'utf8'); + await git.stageAll(); + const made = await git.commit('docmost: sync 1 page(s)', { + authorName: BOT_AUTHOR_NAME, + authorEmail: BOT_AUTHOR_EMAIL, + trailers: ['Docmost-Sync-Source: docmost'], + }); + expect(made).toBe(true); + + const msg = await headMessage(vault); + expect(msg).toContain('docmost: sync 1 page(s)'); + expect(msg).toContain('Docmost-Sync-Source: docmost'); + + const author = await headAuthor(vault); + expect(author).toBe(`${BOT_AUTHOR_NAME} <${BOT_AUTHOR_EMAIL}>`); + + // The trailer is parseable by git itself. 
+ const { stdout: trailers } = await execFileAsync( + 'git', + ['--no-pager', 'log', '-1', '--pretty=%(trailers:key=Docmost-Sync-Source,valueonly)'], + { cwd: vault }, + ); + expect(trailers.trim()).toBe('docmost'); + }); + + it('commit is a no-op when there is nothing to commit', async () => { + if (!available) return; + const vault = await freshDir(); + const git = new VaultGit(vault); + await git.ensureRepo(); + + await git.stageAll(); // nothing changed since the init commit + const made = await git.commit('docmost: sync 0 page(s)', { + authorName: BOT_AUTHOR_NAME, + authorEmail: BOT_AUTHOR_EMAIL, + trailers: ['Docmost-Sync-Source: docmost'], + }); + expect(made).toBe(false); + + // Still exactly one commit (the init one). + const { stdout: count } = await execFileAsync( + 'git', + ['rev-list', '--count', 'HEAD'], + { cwd: vault }, + ); + expect(count.trim()).toBe('1'); + }); + + it('merge fast-forwards main to docmost', async () => { + if (!available) return; + const vault = await freshDir(); + const git = new VaultGit(vault); + await git.ensureRepo(); + await git.ensureBranch('docmost', 'main'); + + // Commit a file on docmost. + await git.checkout('docmost'); + await writeFile(join(vault, 'a.md'), 'a\n', 'utf8'); + await git.stageAll(); + await git.commit('docmost: sync 1 page(s)', { + authorName: BOT_AUTHOR_NAME, + authorEmail: BOT_AUTHOR_EMAIL, + trailers: ['Docmost-Sync-Source: docmost'], + }); + + // main has not diverged, so the merge is a clean fast-forward. + await git.checkout('main'); + const res = await git.merge('docmost'); + expect(res.ok).toBe(true); + expect(res.conflict).toBe(false); + + // main now contains the file and the docmost commit. 
+ const tracked = await git.listTrackedFiles(); + expect(tracked).toContain('a.md'); + expect(await headMessage(vault)).toContain('docmost: sync 1 page(s)'); + }); + + it('merge surfaces a conflict distinctly (no auto-resolve)', async () => { + if (!available) return; + const vault = await freshDir(); + const git = new VaultGit(vault); + await git.ensureRepo(); + await git.ensureBranch('docmost', 'main'); + + // Divergent edits to the SAME file on both branches -> real conflict. + await git.checkout('docmost'); + await writeFile(join(vault, 'c.md'), 'from docmost\n', 'utf8'); + await git.stageAll(); + await git.commit('docmost edit', { + authorName: BOT_AUTHOR_NAME, + authorEmail: BOT_AUTHOR_EMAIL, + }); + + await git.checkout('main'); + await writeFile(join(vault, 'c.md'), 'from main\n', 'utf8'); + await git.stageAll(); + await git.commit('main edit', { + authorName: 'Human', + authorEmail: 'human@local', + }); + + const res = await git.merge('docmost'); + expect(res.ok).toBe(false); + expect(res.conflict).toBe(true); + }); + + it('isMergeInProgress is false on a clean repo and true mid-merge', async () => { + if (!available) return; + const vault = await freshDir(); + const git = new VaultGit(vault); + await git.ensureRepo(); + await git.ensureBranch('docmost', 'main'); + + // Clean repo, no merge in progress. + expect(await git.isMergeInProgress()).toBe(false); + + // Create a REAL conflict: divergent edits to the same file on both branches. + await git.checkout('docmost'); + await writeFile(join(vault, 'c.md'), 'from docmost\n', 'utf8'); + await git.stageAll(); + await git.commit('docmost edit', { + authorName: BOT_AUTHOR_NAME, + authorEmail: BOT_AUTHOR_EMAIL, + }); + + await git.checkout('main'); + await writeFile(join(vault, 'c.md'), 'from main\n', 'utf8'); + await git.stageAll(); + await git.commit('main edit', { + authorName: 'Human', + authorEmail: 'human@local', + }); + + // Merge conflicts -> the repo is now left mid-merge. 
+ const res = await git.merge('docmost'); + expect(res.conflict).toBe(true); + expect(await git.isMergeInProgress()).toBe(true); + + // Aborting the merge clears the in-progress state again. + await execFileAsync('git', ['--no-pager', 'merge', '--abort'], { cwd: vault }); + expect(await git.isMergeInProgress()).toBe(false); + }); + + it('listTrackedFiles supports a glob and returns forward-slash paths', async () => { + if (!available) return; + const vault = await freshDir(); + const git = new VaultGit(vault); + await git.ensureRepo(); + + await writeFile(join(vault, 'keep.md'), 'k\n', 'utf8'); + await writeFile(join(vault, 'note.txt'), 't\n', 'utf8'); + await git.stageAll(); + await git.commit('add files', { + authorName: BOT_AUTHOR_NAME, + authorEmail: BOT_AUTHOR_EMAIL, + }); + + const md = await git.listTrackedFiles('*.md'); + expect(md).toEqual(['keep.md']); + const all = await git.listTrackedFiles(); + expect(new Set(all)).toEqual(new Set(['keep.md', 'note.txt'])); + }); +}); diff --git a/test/reconcile.test.ts b/test/reconcile.test.ts new file mode 100644 index 0000000..e9be702 --- /dev/null +++ b/test/reconcile.test.ts @@ -0,0 +1,238 @@ +import { describe, expect, it } from 'vitest'; +import { + planReconciliation, + decideAbsenceDeletions, + type ExistingEntry, + type LiveEntry, +} from '../src/reconcile.js'; + +describe('planReconciliation', () => { + it('ADD: a new live page (not tracked) is written, nothing deleted', () => { + const live: LiveEntry[] = [{ pageId: 'p1', relPath: 'Space/New.md' }]; + const existing: ExistingEntry[] = []; + const plan = planReconciliation(live, existing); + expect(plan.toWrite).toEqual([{ pageId: 'p1', relPath: 'Space/New.md' }]); + expect(plan.toDelete).toEqual([]); + expect(plan.moved).toEqual([]); + }); + + it('CONTENT-UPDATE: tracked page at the SAME path is rewritten, not moved/deleted', () => { + const live: LiveEntry[] = [{ pageId: 'p1', relPath: 'Space/Doc.md' }]; + const existing: ExistingEntry[] = [{ pageId: 'p1', 
relPath: 'Space/Doc.md' }]; + const plan = planReconciliation(live, existing); + // Still written (re-emitted; identical bytes => git no-op), no move/delete. + expect(plan.toWrite).toEqual([{ pageId: 'p1', relPath: 'Space/Doc.md' }]); + expect(plan.toDelete).toEqual([]); + expect(plan.moved).toEqual([]); + }); + + it('MOVE: same pageId, new path -> write new + recorded as moved (NOT in toDelete)', () => { + const live: LiveEntry[] = [{ pageId: 'p1', relPath: 'Space/NewParent/Doc.md' }]; + const existing: ExistingEntry[] = [ + { pageId: 'p1', relPath: 'Space/OldParent/Doc.md' }, + ]; + const plan = planReconciliation(live, existing); + expect(plan.toWrite).toEqual([ + { pageId: 'p1', relPath: 'Space/NewParent/Doc.md' }, + ]); + // The old path is a MOVE removal, NOT an absence delete -> not in toDelete. + expect(plan.toDelete).toEqual([]); + expect(plan.moved).toEqual([ + { + pageId: 'p1', + fromRelPath: 'Space/OldParent/Doc.md', + toRelPath: 'Space/NewParent/Doc.md', + removeOldPath: true, + }, + ]); + }); + + it('DELETE: a tracked pageId gone from live -> its file is deleted', () => { + const live: LiveEntry[] = [{ pageId: 'p1', relPath: 'Space/Keep.md' }]; + const existing: ExistingEntry[] = [ + { pageId: 'p1', relPath: 'Space/Keep.md' }, + { pageId: 'p2', relPath: 'Space/Gone.md' }, + ]; + const plan = planReconciliation(live, existing); + expect(plan.toWrite).toEqual([{ pageId: 'p1', relPath: 'Space/Keep.md' }]); + expect(plan.toDelete).toEqual(['Space/Gone.md']); + expect(plan.moved).toEqual([]); + }); + + it('NO-OP: live and existing identical -> writes (re-emit) but no deletes/moves', () => { + const live: LiveEntry[] = [ + { pageId: 'p1', relPath: 'A.md' }, + { pageId: 'p2', relPath: 'B.md' }, + ]; + const existing: ExistingEntry[] = [ + { pageId: 'p1', relPath: 'A.md' }, + { pageId: 'p2', relPath: 'B.md' }, + ]; + const plan = planReconciliation(live, existing); + expect(plan.toWrite).toEqual(live); + expect(plan.toDelete).toEqual([]); + 
expect(plan.moved).toEqual([]); + }); + + it('does NOT delete an old path that another live page will write (path reuse)', () => { + // p1 moves from X.md to Y.md; p2 is a NEW page taking over X.md. The old + // X.md must NOT be deleted, because p2 writes it. + const live: LiveEntry[] = [ + { pageId: 'p1', relPath: 'Y.md' }, + { pageId: 'p2', relPath: 'X.md' }, + ]; + const existing: ExistingEntry[] = [{ pageId: 'p1', relPath: 'X.md' }]; + const plan = planReconciliation(live, existing); + expect(new Set(plan.toWrite)).toEqual( + new Set([ + { pageId: 'p1', relPath: 'Y.md' }, + { pageId: 'p2', relPath: 'X.md' }, + ]), + ); + // X.md is a live target, so nothing is deleted. + expect(plan.toDelete).toEqual([]); + // The move is still recorded, but its old path is NOT removable (p2 writes + // X.md): removeOldPath:false protects the reused path from data loss. + expect(plan.moved).toEqual([ + { pageId: 'p1', fromRelPath: 'X.md', toRelPath: 'Y.md', removeOldPath: false }, + ]); + }); + + it('combines add + update + move + delete in one plan', () => { + const live: LiveEntry[] = [ + { pageId: 'keep', relPath: 'Keep.md' }, // update in place + { pageId: 'mover', relPath: 'New/Moved.md' }, // moved + { pageId: 'fresh', relPath: 'Fresh.md' }, // added + ]; + const existing: ExistingEntry[] = [ + { pageId: 'keep', relPath: 'Keep.md' }, + { pageId: 'mover', relPath: 'Old/Moved.md' }, + { pageId: 'dead', relPath: 'Dead.md' }, // deleted + ]; + const plan = planReconciliation(live, existing); + expect(plan.toWrite).toEqual(live); + expect(plan.moved).toEqual([ + { + pageId: 'mover', + fromRelPath: 'Old/Moved.md', + toRelPath: 'New/Moved.md', + removeOldPath: true, + }, + ]); + // toDelete is ABSENCE-only now: the moved old path lives in `moved`, so only + // the genuinely-gone page (Dead.md) is here. 
+ expect(plan.toDelete).toEqual(['Dead.md']); + }); + + it('records each duplicate tracked row of a present pageId as a removable move', () => { + // Two stray files both claim pageId "dup"; the live page lives elsewhere. + // Each stray is a MOVE (same pageId, different path) -> recorded in `moved` + // with removeOldPath:true, NOT in absence-based toDelete. + const live: LiveEntry[] = [{ pageId: 'dup', relPath: 'Canonical.md' }]; + const existing: ExistingEntry[] = [ + { pageId: 'dup', relPath: 'StrayA.md' }, + { pageId: 'dup', relPath: 'StrayB.md' }, + ]; + const plan = planReconciliation(live, existing); + expect(plan.toWrite).toEqual([{ pageId: 'dup', relPath: 'Canonical.md' }]); + expect(plan.toDelete).toEqual([]); + expect(plan.moved).toEqual([ + { + pageId: 'dup', + fromRelPath: 'StrayA.md', + toRelPath: 'Canonical.md', + removeOldPath: true, + }, + { + pageId: 'dup', + fromRelPath: 'StrayB.md', + toRelPath: 'Canonical.md', + removeOldPath: true, + }, + ]); + }); +}); + +describe('decideAbsenceDeletions (SPEC §8)', () => { + it('APPLIES when the tree is complete and the delete count is modest', () => { + const d = decideAbsenceDeletions({ + treeComplete: true, + liveCount: 10, + existingCount: 10, + deleteCount: 1, + }); + expect(d).toEqual({ apply: true }); + }); + + it('SUPPRESSES all absence deletions when the tree fetch is incomplete', () => { + // Even a single absence delete is suppressed on a partial tree (a missing + // pageId in a partial tree is NOT proof of deletion). 
+ const d = decideAbsenceDeletions({ + treeComplete: false, + liveCount: 9, + existingCount: 10, + deleteCount: 1, + }); + expect(d).toEqual({ apply: false, reason: 'incomplete-fetch' }); + }); + + it('SUPPRESSES when live returned 0 pages but files are tracked (complete flag aside)', () => { + const d = decideAbsenceDeletions({ + treeComplete: true, + liveCount: 0, + existingCount: 5, + deleteCount: 5, + }); + expect(d).toEqual({ apply: false, reason: 'empty-live' }); + }); + + it('SUPPRESSES over the mass-delete guard (> 50% of a non-trivial vault)', () => { + const d = decideAbsenceDeletions({ + treeComplete: true, + liveCount: 4, + existingCount: 10, + deleteCount: 6, // 60% > 50% + }); + expect(d).toEqual({ apply: false, reason: 'mass-delete' }); + }); + + it('does NOT apply the fraction guard for a tiny vault (below the floor)', () => { + // 1-of-2 is normal in a tiny vault; the fraction guard does not fire. + const d = decideAbsenceDeletions({ + treeComplete: true, + liveCount: 1, + existingCount: 2, + deleteCount: 1, + }); + expect(d).toEqual({ apply: true }); + }); + + it('incomplete-fetch takes precedence over the mass-delete reason', () => { + const d = decideAbsenceDeletions({ + treeComplete: false, + liveCount: 4, + existingCount: 10, + deleteCount: 6, + }); + expect(d).toEqual({ apply: false, reason: 'incomplete-fetch' }); + }); + + it('trivially applies when nothing is tracked or nothing would be deleted', () => { + expect( + decideAbsenceDeletions({ + treeComplete: false, + liveCount: 0, + existingCount: 0, + deleteCount: 0, + }), + ).toEqual({ apply: true }); + expect( + decideAbsenceDeletions({ + treeComplete: false, + liveCount: 5, + existingCount: 5, + deleteCount: 0, + }), + ).toEqual({ apply: true }); + }); +});