/**
 * Pull cycle — Docmost -> vault (SPEC §6 "Docmost -> ФС", i.e. Docmost ->
 * filesystem).
 *
 * This increment turns the read-only mirror into the git-backed pull cycle:
 *
 *   1. ensureRepo(vault); refuse if a merge is in progress (SPEC §9/§12);
 *      ensureBranch("docmost", "main") (SPEC §5 branches)
 *   2. checkout docmost
 *   3. fetch the live tree (listSpaceTree -> {pages, complete}) -> compute the
 *      desired `live` files (relPath via the pure sanitize/disambiguation layout)
 *   4. parse `existing` tracked .md files (pageId + relPath from docmost:meta)
 *   5. plan = planReconciliation(live, existing) (pure, SPEC §5/§8); toDelete
 *      is absence-only, moves are separate
 *   6. decideAbsenceDeletions: SUPPRESS absence deletions on an incomplete tree
 *      fetch (SPEC §8) and behind the mass-delete guard (defense in depth)
 *   7. write each live page in its fixpoint form (normalize-on-write, SPEC §11);
 *      apply moved-old-path removals (only when the move write SUCCEEDED) and
 *      absence-delete removals (only when the decision allowed them)
 *   8. stageAll + commit on `docmost` with the provenance trailer (SPEC §7.3)
 *   9. checkout main + merge docmost (conflicts are surfaced, NOT auto-resolved,
 *      SPEC §9); push is deferred (SPEC §7)
 *  10. one-line summary
 *
 * DIRECTION IS Docmost -> vault ONLY. Nothing here ever writes to Docmost
 * (read-only: listSpaceTree + getPageJson). All git operations run against
 * the vault repo (`cwd = vaultPath`), never the source repo (see ./git.ts).
 *
 * Requires a `.env` with real Docmost credentials. This file must COMPILE and
 * be correct, but is NOT expected to be run without live access.
 *
 * Run via: npm run pull   (-> node build/pull.js)
 */

import { mkdir, readFile, rm, writeFile } from "node:fs/promises";
import { dirname, join, sep } from "node:path";
import { pathToFileURL } from "node:url";

import { DocmostClient, parseDocmostMarkdown } from "docmost-client";
import { loadSettings } from "./settings.js";
import { buildVaultLayout, type PageNode } from "./layout.js";
import {
  VaultGit,
  BOT_AUTHOR_NAME,
  BOT_AUTHOR_EMAIL,
  DEFAULT_BRANCH,
} from "./git.js";
import {
  planReconciliation,
  decideAbsenceDeletions,
  type LiveEntry,
} from "./reconcile.js";
import { stabilizePageFile, type PageMeta } from "./stabilize.js";

// Engine-only mirror branch (SPEC §5): the engine writes here, humans never do.
const DOCMOST_BRANCH = "docmost";

// Machine-readable provenance the loop-guard keys on (SPEC §7.3 / §12).
const SOURCE_TRAILER = "Docmost-Sync-Source: docmost";

// Number of pages fetched/stabilized concurrently. Bounded so a large space
// does not open thousands of simultaneous requests/conversions at once.
const CONCURRENCY = 6;

// How often to log incremental progress (every N completed pages).
const PROGRESS_EVERY = 25;

/**
 * Convert a vault-relative path (forward-slash separated) to a filesystem path
 * under `vaultRoot`.
 *
 * @param vaultRoot - Root directory of the vault.
 * @param relPath - Vault-relative path using "/" as the separator.
 * @returns `vaultRoot` joined with each "/"-separated segment of `relPath`.
 */
function relToAbs(vaultRoot: string, relPath: string): string {
  return join(vaultRoot, ...relPath.split("/"));
}

/**
 * Build a vault-relative path from directory segments plus a file stem.
 *
 * @param segments - Directory segments, outermost first.
 * @param stem - File name without extension; ".md" is appended.
 * @returns The segments and `<stem>.md` joined with "/".
 */
function segmentsToRelPath(segments: string[], stem: string): string {
  return [...segments, `${stem}.md`].join("/");
}

/**
 * Read every tracked .md file in the vault and parse its `docmost:meta` to
 * recover `{ pageId, relPath }`. Files without a parseable pageId in meta are
 * skipped (they are not engine-tracked pages — e.g. a stray hand-written file).
 *
 * @param git - Vault git wrapper used to list tracked "*.md" files.
 * @param vaultRoot - Root directory of the vault on disk.
 * @returns One entry per tracked file whose meta yields a truthy pageId.
 */
async function readExisting(
  git: VaultGit,
  vaultRoot: string,
): Promise<{ pageId: string; relPath: string }[]> {
  const tracked = await git.listTrackedFiles("*.md");
  const existing: { pageId: string; relPath: string }[] = [];
  for (const relPath of tracked) {
    // git ls-files always emits forward-slash paths; normalize just in case.
    const rel = relPath.split(sep).join("/");
    let text: string;
    try {
      text = await readFile(relToAbs(vaultRoot, rel), "utf8");
    } catch {
      // Tracked but missing on disk (mid-operation race) — skip; the next pull
      // converges.
      continue;
    }
    let pageId: string | undefined;
    try {
      const { meta } = parseDocmostMarkdown(text);
      pageId = meta?.pageId;
    } catch {
      // Unparseable meta — not engine-tracked; leave it alone.
      pageId = undefined;
    }
    if (pageId) existing.push({ pageId, relPath: rel });
  }
  return existing;
}

/**
 * Run one pull cycle: mirror the configured Docmost space into the vault on
 * the `docmost` branch, commit, then merge that branch into the default
 * branch. The numbered steps below match the list in the file header.
 *
 * Sets `process.exitCode = 1` (without exiting) when the vault is mid-merge,
 * when any page failed to write, or when the merge conflicted or failed.
 * Errors thrown by the settings loader, the Docmost client, or the git wrapper
 * outside the per-page / per-removal handlers propagate to the caller.
 */
async function main(): Promise<void> {
  const s = loadSettings();
  const client = new DocmostClient(
    s.docmostApiUrl,
    s.docmostEmail,
    s.docmostPassword,
  );
  const spaceId = s.docmostSpaceId;
  const vaultRoot = s.vaultPath;

  // 1. Ensure the vault git repo exists with main + an initial commit, and the
  //    engine-only `docmost` branch exists, branched from main.
  const git = new VaultGit(vaultRoot);
  // Preflight: fail fast (with an actionable message via main().catch) if the
  // git binary is missing — the entire vault state store relies on it.
  await git.assertGitAvailable();
  await git.ensureRepo();

  // 1b. Refuse to run on top of an unresolved merge (SPEC §9 / §12). A previous
  //     conflicting pull leaves the vault mid-merge; the next `checkout` would
  //     fail with a raw "you need to resolve your current index first". Detect
  //     it BEFORE any checkout and exit cleanly with an actionable message so
  //     re-runs converge once the human resolves (or aborts) the merge.
  if (await git.isMergeInProgress()) {
    console.error(
      `vault has an unresolved merge at ${vaultRoot} — resolve it (or ` +
        `'git merge --abort') and re-run (SPEC §9)`,
    );
    process.exitCode = 1;
    return;
  }

  await git.ensureBranch(DOCMOST_BRANCH, DEFAULT_BRANCH);

  // 2. Work on the docmost mirror branch.
  await git.checkout(DOCMOST_BRANCH);

  // 3. Fetch the live tree and compute the desired files (relPath via the pure
  //    sanitize + disambiguation layout). `listSpaceTree` reports completeness:
  //    if ANY branch's children fetch failed or the node cap was hit, the tree
  //    is PARTIAL and absence-based deletions must be suppressed this cycle
  //    (SPEC §8) — a missing pageId in a partial tree is NOT proof of deletion.
  const { pages: rawPages, complete: treeComplete } =
    await client.listSpaceTree(spaceId);
  const pages = rawPages as PageNode[];
  const layout = buildVaultLayout(pages);

  const live: LiveEntry[] = [];
  const liveNodeByPageId = new Map<string, PageNode>();
  for (const p of pages) {
    if (!p || !p.id) continue;
    const entry = layout.get(p.id);
    if (!entry) continue;
    live.push({
      pageId: p.id,
      relPath: segmentsToRelPath(entry.segments, entry.stem),
    });
    liveNodeByPageId.set(p.id, p);
  }

  // 4. Parse the existing tracked .md files (pageId + relPath).
  const existing = await readExisting(git, vaultRoot);

  // 5. Plan reconciliation (pure). `plan.toDelete` is ABSENCE-based only;
  //    `plan.moved` carries move old-path removals separately.
  const plan = planReconciliation(live, existing);

  // 6. Decide whether the ABSENCE-based deletions (`plan.toDelete`) may be
  //    applied this cycle (SPEC §8). The pure helper folds in BOTH the
  //    incomplete-fetch suppression (a partial tree must not look like
  //    deletions) AND the mass-delete guard (defense in depth). Moves are NOT
  //    governed by this — a moved page is present in `live`, so its old-path
  //    removal is real and applied unconditionally (subject only to its write
  //    succeeding).
  const deleteDecision = decideAbsenceDeletions({
    treeComplete,
    liveCount: live.length,
    existingCount: existing.length,
    deleteCount: plan.toDelete.length,
  });
  if (!deleteDecision.apply) {
    if (deleteDecision.reason === "incomplete-fetch") {
      console.warn(
        "pull: tree fetch incomplete — deletions suppressed this cycle (SPEC §8)",
      );
    } else if (deleteDecision.reason === "empty-live") {
      console.warn(
        `pull: live fetch returned 0 pages but ${existing.length} file(s) are ` +
          `tracked — deletions suppressed this cycle (SPEC §8). Re-run when ` +
          `Docmost is reachable.`,
      );
    } else {
      console.warn(
        `pull: plan would delete ${plan.toDelete.length} of ${existing.length} ` +
          `tracked file(s) (mass-delete guard) — deletions suppressed this ` +
          `cycle (SPEC §8). Verify the live Docmost tree, then re-run.`,
      );
    }
  }

  // 7. Write each live page in its fixpoint form (normalize-on-write, SPEC §11),
  //    then apply move-old-path + absence-delete removals.
  let written = 0;
  let failed = 0;
  let completed = 0;
  let nextIndex = 0;
  // pageIds whose write FAILED. A moved page whose new-path write failed must
  // NOT have its old path removed (otherwise the page vanishes entirely).
  const failedPageIds = new Set<string>();

  const writeOne = async (w: {
    pageId: string;
    relPath: string;
  }): Promise<void> => {
    try {
      // Defensive: every planned write is expected to correspond to a live
      // node. If one does not, record it as a FAILED write instead of skipping
      // it silently, so a pending move never removes the old path of a page
      // that was not actually rewritten, and the progress count stays exact.
      if (!liveNodeByPageId.has(w.pageId)) {
        throw new Error("page is missing from the fetched live tree");
      }
      const page = await client.getPageJson(w.pageId);
      const meta: PageMeta = {
        version: 1,
        pageId: page.id,
        slugId: page.slugId,
        title: page.title,
        spaceId: page.spaceId,
        parentPageId: page.parentPageId ?? null,
      };
      const text = await stabilizePageFile(page.content, meta);
      const abs = relToAbs(vaultRoot, w.relPath);
      await mkdir(dirname(abs), { recursive: true });
      await writeFile(abs, text, "utf8");
      written++;
    } catch (err) {
      failed++;
      failedPageIds.add(w.pageId);
      console.error(
        `pull: failed page ${w.pageId}:`,
        err instanceof Error ? err.message : String(err),
      );
    } finally {
      completed++;
      if (completed % PROGRESS_EVERY === 0) {
        console.log(`pulled ${completed}/${plan.toWrite.length}`);
      }
    }
  };

  // Bounded-concurrency pool (dependency-free): a fixed set of runners each
  // take the next index until the write list is exhausted. One bad page never
  // aborts the whole pull (mirrors the fault-tolerant tree walk).
  const runner = async (): Promise<void> => {
    while (true) {
      const i = nextIndex++;
      if (i >= plan.toWrite.length) return;
      await writeOne(plan.toWrite[i]);
    }
  };
  await Promise.all(
    Array.from(
      // `|| 1` keeps at least one runner when there is nothing to write; that
      // runner returns immediately.
      { length: Math.min(CONCURRENCY, plan.toWrite.length) || 1 },
      () => runner(),
    ),
  );

  // Helper: `rm` with force:true is a no-op if the file is already gone.
  // Resolves to true when the removal did not throw, false (after logging)
  // when it did.
  const removePath = async (rel: string, what: string): Promise<boolean> => {
    try {
      await rm(relToAbs(vaultRoot, rel), { force: true });
      return true;
    } catch (err) {
      console.error(
        `pull: failed to ${what} ${rel}:`,
        err instanceof Error ? err.message : String(err),
      );
      return false;
    }
  };

  // 7a. Apply MOVE old-path removals. A moved page IS present in `live`, so its
  //     old path is genuinely stale — this is NOT subject to the incomplete-
  //     fetch suppression. BUT only remove the old path when (a) the planner
  //     marked it removable (not reused by another live page) AND (b) the new-
  //     path write actually SUCCEEDED — otherwise we would delete the only copy
  //     of a page whose move-write failed.
  let movedApplied = 0;
  for (const m of plan.moved) {
    if (!m.removeOldPath) continue;
    if (failedPageIds.has(m.pageId)) {
      console.warn(
        `pull: move write for ${m.pageId} failed — keeping old path ` +
          `${m.fromRelPath} (SPEC §8)`,
      );
      continue;
    }
    if (await removePath(m.fromRelPath, "remove moved old path")) movedApplied++;
  }

  // 7b. Apply ABSENCE-based deletions — ONLY if the decision allowed them
  //     (incomplete-fetch suppression + mass-delete guard, SPEC §8).
  let deleted = 0;
  if (deleteDecision.apply) {
    for (const rel of plan.toDelete) {
      if (await removePath(rel, "delete")) deleted++;
    }
  }

  // 8. Stage + commit on `docmost` (only if there is something to commit).
  //    Deterministic stabilized output means unchanged pages produce identical
  //    bytes -> git sees no diff -> no churn (SPEC §11). The subject reflects the
  //    ACTUAL work applied (pages written + files deleted), not the planned size,
  //    so a run with failures does not over-report (SPEC §5 nit).
  const subject =
    deleted > 0
      ? `docmost: sync ${written} page(s), ${deleted} deleted`
      : `docmost: sync ${written} page(s)`;
  await git.stageAll();
  const committed = await git.commit(subject, {
    authorName: BOT_AUTHOR_NAME,
    authorEmail: BOT_AUTHOR_EMAIL,
    trailers: [SOURCE_TRAILER],
  });

  // 9. Merge docmost -> main. Conflicts are surfaced and left in git (SPEC §9);
  //    we never push to Docmost. Push to a git remote is deferred (SPEC §7).
  await git.checkout(DEFAULT_BRANCH);
  const merge = await git.merge(DOCMOST_BRANCH);
  if (merge.conflict) {
    console.error(
      "pull: merge of docmost -> main CONFLICTED. Conflict markers were left " +
        "in the vault for manual resolution (SPEC §9). Nothing is pushed to " +
        "Docmost (read-only). Resolve locally, then re-run.",
    );
  } else if (!merge.ok) {
    console.error(`pull: merge of docmost -> main failed: ${merge.output}`);
  }
  console.log("pull: git push to remote is DEFERRED in this increment (SPEC §7).");

  // 10. One-line summary.
  console.log(
    `pull complete: ${written} written, ${movedApplied} moved, ` +
      `${deleted} deleted, committed=${committed}, ` +
      `merge=${merge.conflict ? "CONFLICT" : merge.ok ? "ok" : "failed"} ` +
      `(${failed} page failures) from space ${spaceId} into ${vaultRoot}`,
  );

  // Signal a partial mirror / conflict so callers/CI can react. Use
  // process.exitCode (not a hard exit) so buffered output flushes cleanly.
  if (failed > 0 || merge.conflict || !merge.ok) {
    process.exitCode = 1;
  }
}

// Only auto-run when invoked directly as the CLI entrypoint, not when this
// module is imported (e.g. by a unit test), so the import does not trigger
// loadSettings() + git/network access.
const invokedDirectly =
  typeof process.argv[1] === "string" &&
  import.meta.url === pathToFileURL(process.argv[1]).href;

if (invokedDirectly) {
  main().catch((err) => {
    console.error("pull failed:", err instanceof Error ? err.stack : err);
    process.exit(1);
  });
}