feat(sync): add git vault layer (§5) and the Docmost->vault pull cycle (§6)
Turn the read-only mirror into a git-backed pull cycle. Read-only toward Docmost.
- git.ts (VaultGit): system-git wrapper, all ops cwd=vaultPath (vault is its own
repo under data/vault, never the source repo); ensureRepo/branches main+docmost,
commit with provenance (author/committer identity + Docmost-Sync-Source trailer,
§7.3), merge with conflict surfacing (no auto-resolve, §9), isMergeInProgress;
GIT_DIR/GIT_WORK_TREE stripped from env (§12 cwd isolation)
- stabilize.ts: normalize-on-write (one export->import->export fixpoint pass, §11)
- reconcile.ts: pure planReconciliation (add/update/move/delete by pageId) +
decideAbsenceDeletions gate
- pull.ts: write/commit on docmost -> merge into main; listSpaceTree completeness
signal suppresses absence-deletions on a partial fetch (§8); mass-delete guard;
merge-in-progress guard makes re-runs converge (§12); move old-path removal only
on successful write
- docmost-client: listSpaceTree({pages, complete}) without touching the 1:1-copied
enumerateSpacePages
- tests: reconcile planner + decideAbsenceDeletions, VaultGit incl. real temp-repo
merge conflict, listSpaceTree completeness (586 green)
Push to a git remote and the FS->Docmost direction are deferred to the next increment.
This commit is contained in:
339
src/pull.ts
339
src/pull.ts
@@ -1,34 +1,110 @@
|
||||
/**
|
||||
* Read-only Docmost -> filesystem mirror (SPEC §6 pull, Phase 1).
|
||||
* Pull cycle — Docmost -> vault (SPEC §6 "Docmost -> ФС").
|
||||
*
|
||||
* Walks the configured space's page tree and writes one self-contained `.md`
|
||||
* per page under `<vaultPath>/<...ancestors>/<Title>.md`. This increment is
|
||||
* READ-ONLY toward Docmost (no writes, no git) — it only fetches and writes
|
||||
* local files. The meta block inside each file carries
|
||||
* `{ version, pageId, slugId, title, spaceId, parentPageId }` (identity), so no
|
||||
* external map file is needed.
|
||||
* This increment turns the read-only mirror into the git-backed pull cycle:
|
||||
*
|
||||
* The pure tree -> path mapping lives in `./layout.js`; this file is a thin,
|
||||
* fault-tolerant I/O loop around it.
|
||||
* 1. ensureRepo(vault); refuse if a merge is in progress (SPEC §9/§12);
|
||||
* ensureBranch("docmost", "main") (SPEC §5 branches)
|
||||
* 2. checkout docmost
|
||||
* 3. fetch the live tree (listSpaceTree -> {pages, complete}) -> compute the
|
||||
* desired `live` files (relPath via the pure sanitize/disambiguation layout)
|
||||
* 4. parse `existing` tracked .md files (pageId + relPath from docmost:meta)
|
||||
* 5. plan = planReconciliation(live, existing) (pure, SPEC §5/§8); toDelete
|
||||
* is absence-only, moves are separate
|
||||
* 6. decideAbsenceDeletions: SUPPRESS absence deletions on an incomplete tree
|
||||
* fetch (SPEC §8) and behind the mass-delete guard (defense in depth)
|
||||
* 7. write each live page in its fixpoint form (normalize-on-write, SPEC §11);
|
||||
* apply moved-old-path removals (only when the move write SUCCEEDED) and
|
||||
* absence-delete removals (only when the decision allowed them)
|
||||
* 8. stageAll + commit on `docmost` with the provenance trailer (SPEC §7.3)
|
||||
* 9. checkout main + merge docmost (conflicts are surfaced, NOT auto-resolved,
|
||||
* SPEC §9); push is deferred (SPEC §7)
|
||||
* 10. one-line summary
|
||||
*
|
||||
* DIRECTION IS Docmost -> vault ONLY. Nothing here ever writes to Docmost
|
||||
* (read-only: listSpaceTree + getPageJson). All git operations run against
|
||||
* the vault repo (`cwd = vaultPath`), never the source repo (see ./git.ts).
|
||||
*
|
||||
* Requires a `.env` with real Docmost credentials. This file must COMPILE and
|
||||
* be correct, but is not expected to be run without live access.
|
||||
* be correct, but is NOT expected to be run without live access.
|
||||
*
|
||||
* Run via: npm run pull (-> node build/pull.js)
|
||||
*/
|
||||
import { mkdir, writeFile } from "node:fs/promises";
|
||||
import { join } from "node:path";
|
||||
import { mkdir, readFile, rm, writeFile } from "node:fs/promises";
|
||||
import { dirname, join, sep } from "node:path";
|
||||
import { pathToFileURL } from "node:url";
|
||||
import { DocmostClient } from "docmost-client";
|
||||
import { DocmostClient, parseDocmostMarkdown } from "docmost-client";
|
||||
import { loadSettings } from "./settings.js";
|
||||
import { buildVaultLayout, type PageNode } from "./layout.js";
|
||||
import {
|
||||
VaultGit,
|
||||
BOT_AUTHOR_NAME,
|
||||
BOT_AUTHOR_EMAIL,
|
||||
DEFAULT_BRANCH,
|
||||
} from "./git.js";
|
||||
import {
|
||||
planReconciliation,
|
||||
decideAbsenceDeletions,
|
||||
type LiveEntry,
|
||||
} from "./reconcile.js";
|
||||
import { stabilizePageFile, type PageMeta } from "./stabilize.js";
|
||||
|
||||
// Number of pages fetched/written concurrently. Bounded so a large space does
|
||||
// not open thousands of simultaneous requests/file handles.
|
||||
// Engine-only mirror branch (SPEC §5): the engine writes here, humans never do.
|
||||
const DOCMOST_BRANCH = "docmost";
|
||||
// Machine-readable provenance the loop-guard keys on (SPEC §7.3 / §12).
|
||||
const SOURCE_TRAILER = "Docmost-Sync-Source: docmost";
|
||||
|
||||
// Number of pages fetched/stabilized concurrently. Bounded so a large space
|
||||
// does not open thousands of simultaneous requests/conversions at once.
|
||||
const CONCURRENCY = 6;
|
||||
// How often to log incremental progress (every N completed pages).
|
||||
const PROGRESS_EVERY = 25;
|
||||
|
||||
/**
 * Convert a vault-relative path into a filesystem path under `vaultRoot`.
 *
 * `relPath` is split on "/" and the pieces are passed to `path.join`, so the
 * result uses the host platform's separator. The result is absolute only when
 * `vaultRoot` itself is absolute.
 *
 * NOTE: `path.join` normalizes `.` and `..` segments, so a `relPath`
 * containing `..` can resolve outside `vaultRoot`. Callers are expected to
 * pass paths that are already vault-relative.
 *
 * @param vaultRoot - Root directory of the vault.
 * @param relPath - Forward-slash separated path relative to `vaultRoot`.
 * @returns `vaultRoot` joined with every "/"-separated segment of `relPath`.
 */
function relToAbs(vaultRoot: string, relPath: string): string {
  return join(vaultRoot, ...relPath.split("/"));
}
|
||||
|
||||
/**
 * Build the vault-relative, forward-slash path of a page's markdown file.
 *
 * @param segments - Path segments that precede the file name (may be empty,
 *   in which case the file sits directly at the vault root).
 * @param stem - File name without its extension; ".md" is appended.
 * @returns The segments followed by `<stem>.md`, joined with "/".
 */
function segmentsToRelPath(segments: readonly string[], stem: string): string {
  return [...segments, `${stem}.md`].join("/");
}
|
||||
|
||||
/**
 * Collect the engine-tracked pages currently present in the vault.
 *
 * Asks `git.listTrackedFiles("*.md")` for the tracked markdown files, reads
 * each one from disk and parses it with `parseDocmostMarkdown` to recover the
 * `pageId` stored in its meta block. Files are read one at a time.
 *
 * A file is skipped (not returned) when:
 *  - it cannot be read (a missing file is skipped silently, since that is the
 *    expected mid-operation race; any other read error is logged as a warning
 *    and the file is still skipped so one bad file never aborts the pull);
 *  - its meta cannot be parsed, or carries no `pageId` (such a file is treated
 *    as not engine-tracked, e.g. a stray hand-written file).
 *
 * @param git - Vault git wrapper used to list tracked files.
 * @param vaultRoot - Root directory of the vault on disk.
 * @returns One `{ pageId, relPath }` entry per readable tracked file with a
 *   `pageId`; `relPath` is forward-slash separated and vault-relative.
 */
async function readExisting(
  git: VaultGit,
  vaultRoot: string,
): Promise<{ pageId: string; relPath: string }[]> {
  const tracked = await git.listTrackedFiles("*.md");
  const existing: { pageId: string; relPath: string }[] = [];
  for (const trackedPath of tracked) {
    // Tracked paths are expected to be forward-slash already (git ls-files
    // output); converting the platform separator is a defensive no-op on
    // POSIX and normalizes native separators elsewhere.
    const rel = trackedPath.split(sep).join("/");

    let text: string;
    try {
      text = await readFile(relToAbs(vaultRoot, rel), "utf8");
    } catch (err) {
      // Tracked but missing on disk (mid-operation race) is expected and the
      // next pull converges. Anything else (permissions, a directory in the
      // way, ...) is surfaced so it is not silently mistaken for "no file".
      const code =
        typeof err === "object" && err !== null && "code" in err
          ? (err as { code?: unknown }).code
          : undefined;
      if (code !== "ENOENT") {
        console.warn(
          `pull: could not read tracked file ${rel}:`,
          err instanceof Error ? err.message : String(err),
        );
      }
      continue;
    }

    let pageId: string | undefined;
    try {
      const { meta } = parseDocmostMarkdown(text);
      pageId = meta?.pageId;
    } catch {
      // Unparseable meta: not engine-tracked, leave the file alone.
      pageId = undefined;
    }

    if (pageId) existing.push({ pageId, relPath: rel });
  }
  return existing;
}
|
||||
|
||||
async function main(): Promise<void> {
|
||||
const s = loadSettings();
|
||||
const client = new DocmostClient(
|
||||
@@ -40,75 +116,244 @@ async function main(): Promise<void> {
|
||||
const spaceId = s.docmostSpaceId;
|
||||
const vaultRoot = s.vaultPath;
|
||||
|
||||
const pages: PageNode[] = await client.listAllSpacePages(spaceId);
|
||||
// 1. Ensure the vault git repo exists with main + an initial commit, and the
|
||||
// engine-only `docmost` branch exists, branched from main.
|
||||
const git = new VaultGit(vaultRoot);
|
||||
await git.ensureRepo();
|
||||
|
||||
// 1b. Refuse to run on top of an unresolved merge (SPEC §9 / §12). A previous
|
||||
// conflicting pull leaves the vault mid-merge; the next `checkout` would
|
||||
// fail with a raw "you need to resolve your current index first". Detect
|
||||
// it BEFORE any checkout and exit cleanly with an actionable message so
|
||||
// re-runs converge once the human resolves (or aborts) the merge.
|
||||
if (await git.isMergeInProgress()) {
|
||||
console.error(
|
||||
`vault has an unresolved merge at ${vaultRoot} — resolve it (or ` +
|
||||
`'git merge --abort') and re-run (SPEC §9)`,
|
||||
);
|
||||
process.exitCode = 1;
|
||||
return;
|
||||
}
|
||||
|
||||
await git.ensureBranch(DOCMOST_BRANCH, DEFAULT_BRANCH);
|
||||
|
||||
// 2. Work on the docmost mirror branch.
|
||||
await git.checkout(DOCMOST_BRANCH);
|
||||
|
||||
// 3. Fetch the live tree and compute the desired files (relPath via the pure
|
||||
// sanitize + disambiguation layout). `listSpaceTree` reports completeness:
|
||||
// if ANY branch's children fetch failed or the node cap was hit, the tree
|
||||
// is PARTIAL and absence-based deletions must be suppressed this cycle
|
||||
// (SPEC §8) — a missing pageId in a partial tree is NOT proof of deletion.
|
||||
const { pages: rawPages, complete: treeComplete } =
|
||||
await client.listSpaceTree(spaceId);
|
||||
const pages = rawPages as PageNode[];
|
||||
const layout = buildVaultLayout(pages);
|
||||
|
||||
const total = pages.length;
|
||||
const live: LiveEntry[] = [];
|
||||
const liveNodeByPageId = new Map<string, PageNode>();
|
||||
for (const p of pages) {
|
||||
if (!p || !p.id) continue;
|
||||
const entry = layout.get(p.id);
|
||||
if (!entry) continue;
|
||||
live.push({
|
||||
pageId: p.id,
|
||||
relPath: segmentsToRelPath(entry.segments, entry.stem),
|
||||
});
|
||||
liveNodeByPageId.set(p.id, p);
|
||||
}
|
||||
|
||||
// 4. Parse the existing tracked .md files (pageId + relPath).
|
||||
const existing = await readExisting(git, vaultRoot);
|
||||
|
||||
// 5. Plan reconciliation (pure). `plan.toDelete` is ABSENCE-based only;
|
||||
// `plan.moved` carries move old-path removals separately.
|
||||
const plan = planReconciliation(live, existing);
|
||||
|
||||
// 6. Decide whether the ABSENCE-based deletions (`plan.toDelete`) may be
|
||||
// applied this cycle (SPEC §8). The pure helper folds in BOTH the
|
||||
// incomplete-fetch suppression (a partial tree must not look like
|
||||
// deletions) AND the mass-delete guard (defense in depth). Moves are NOT
|
||||
// governed by this — a moved page is present in `live`, so its old-path
|
||||
// removal is real and applied unconditionally (subject only to its write
|
||||
// succeeding).
|
||||
const deleteDecision = decideAbsenceDeletions({
|
||||
treeComplete,
|
||||
liveCount: live.length,
|
||||
existingCount: existing.length,
|
||||
deleteCount: plan.toDelete.length,
|
||||
});
|
||||
if (!deleteDecision.apply) {
|
||||
if (deleteDecision.reason === "incomplete-fetch") {
|
||||
console.warn(
|
||||
"pull: tree fetch incomplete — deletions suppressed this cycle (SPEC §8)",
|
||||
);
|
||||
} else if (deleteDecision.reason === "empty-live") {
|
||||
console.warn(
|
||||
`pull: live fetch returned 0 pages but ${existing.length} file(s) are ` +
|
||||
`tracked — deletions suppressed this cycle (SPEC §8). Re-run when ` +
|
||||
`Docmost is reachable.`,
|
||||
);
|
||||
} else {
|
||||
console.warn(
|
||||
`pull: plan would delete ${plan.toDelete.length} of ${existing.length} ` +
|
||||
`tracked file(s) (mass-delete guard) — deletions suppressed this ` +
|
||||
`cycle (SPEC §8). Verify the live Docmost tree, then re-run.`,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// 7. Write each live page in its fixpoint form (normalize-on-write, SPEC §11),
|
||||
// then apply move-old-path + absence-delete removals.
|
||||
let written = 0;
|
||||
let failed = 0;
|
||||
let completed = 0;
|
||||
let nextIndex = 0;
|
||||
// pageIds whose write FAILED. A moved page whose new-path write failed must
|
||||
// NOT have its old path removed (otherwise the page vanishes entirely).
|
||||
const failedPageIds = new Set<string>();
|
||||
|
||||
// Pull + write a single page. Each call is wrapped so one bad page (network
|
||||
// error, page deleted between the walk and the fetch, body conversion
|
||||
// failure) NEVER aborts the whole pull — it is counted as a failure and the
|
||||
// pool moves on. Mirrors the deliberately fault-tolerant enumerateSpacePages.
|
||||
const pullOne = async (page: PageNode): Promise<void> => {
|
||||
if (!page || !page.id) return;
|
||||
const entry = layout.get(page.id);
|
||||
if (!entry) return; // no layout entry (e.g. duplicate/skipped id)
|
||||
const writeOne = async (w: { pageId: string; relPath: string }): Promise<void> => {
|
||||
const node = liveNodeByPageId.get(w.pageId);
|
||||
if (!node) return;
|
||||
try {
|
||||
const dir = join(vaultRoot, ...entry.segments);
|
||||
await mkdir(dir, { recursive: true });
|
||||
// Body + meta only (no comments block) — SPEC §3.
|
||||
const md = await client.exportPageBody(page.id);
|
||||
await writeFile(join(dir, `${entry.stem}.md`), md, "utf8");
|
||||
const page = await client.getPageJson(w.pageId);
|
||||
const meta: PageMeta = {
|
||||
version: 1,
|
||||
pageId: page.id,
|
||||
slugId: page.slugId,
|
||||
title: page.title,
|
||||
spaceId: page.spaceId,
|
||||
parentPageId: page.parentPageId ?? null,
|
||||
};
|
||||
const text = await stabilizePageFile(page.content, meta);
|
||||
const abs = relToAbs(vaultRoot, w.relPath);
|
||||
await mkdir(dirname(abs), { recursive: true });
|
||||
await writeFile(abs, text, "utf8");
|
||||
written++;
|
||||
} catch (err) {
|
||||
failed++;
|
||||
failedPageIds.add(w.pageId);
|
||||
console.error(
|
||||
`pull: failed page ${page.id}:`,
|
||||
`pull: failed page ${w.pageId}:`,
|
||||
err instanceof Error ? err.message : String(err),
|
||||
);
|
||||
} finally {
|
||||
completed++;
|
||||
if (completed % PROGRESS_EVERY === 0) {
|
||||
console.log(`pulled ${completed}/${total}`);
|
||||
console.log(`pulled ${completed}/${plan.toWrite.length}`);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// A small dependency-free bounded-concurrency pool: a fixed set of runners
|
||||
// each pull the next index until the list is exhausted.
|
||||
// Bounded-concurrency pool (dependency-free): a fixed set of runners each
|
||||
// take the next index until the write list is exhausted. One bad page never
|
||||
// aborts the whole pull (mirrors the fault-tolerant tree walk).
|
||||
const runner = async (): Promise<void> => {
|
||||
while (true) {
|
||||
const i = nextIndex++;
|
||||
if (i >= pages.length) return;
|
||||
await pullOne(pages[i]);
|
||||
if (i >= plan.toWrite.length) return;
|
||||
await writeOne(plan.toWrite[i]);
|
||||
}
|
||||
};
|
||||
await Promise.all(
|
||||
Array.from(
|
||||
{ length: Math.min(CONCURRENCY, plan.toWrite.length) || 1 },
|
||||
() => runner(),
|
||||
),
|
||||
);
|
||||
|
||||
// Helper: `rm` with force:true is a no-op if the file is already gone.
|
||||
const removePath = async (rel: string, what: string): Promise<boolean> => {
|
||||
try {
|
||||
await rm(relToAbs(vaultRoot, rel), { force: true });
|
||||
return true;
|
||||
} catch (err) {
|
||||
console.error(
|
||||
`pull: failed to ${what} ${rel}:`,
|
||||
err instanceof Error ? err.message : String(err),
|
||||
);
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
const runners = Array.from(
|
||||
{ length: Math.min(CONCURRENCY, pages.length) },
|
||||
() => runner(),
|
||||
);
|
||||
await Promise.all(runners);
|
||||
// 7a. Apply MOVE old-path removals. A moved page IS present in `live`, so its
|
||||
// old path is genuinely stale — this is NOT subject to the incomplete-
|
||||
// fetch suppression. BUT only remove the old path when (a) the planner
|
||||
// marked it removable (not reused by another live page) AND (b) the new-
|
||||
// path write actually SUCCEEDED — otherwise we would delete the only copy
|
||||
// of a page whose move-write failed.
|
||||
let movedApplied = 0;
|
||||
for (const m of plan.moved) {
|
||||
if (!m.removeOldPath) continue;
|
||||
if (failedPageIds.has(m.pageId)) {
|
||||
console.warn(
|
||||
`pull: move write for ${m.pageId} failed — keeping old path ` +
|
||||
`${m.fromRelPath} (SPEC §8)`,
|
||||
);
|
||||
continue;
|
||||
}
|
||||
if (await removePath(m.fromRelPath, "remove moved old path")) movedApplied++;
|
||||
}
|
||||
|
||||
// 7b. Apply ABSENCE-based deletions — ONLY if the decision allowed them
|
||||
// (incomplete-fetch suppression + mass-delete guard, SPEC §8).
|
||||
let deleted = 0;
|
||||
if (deleteDecision.apply) {
|
||||
for (const rel of plan.toDelete) {
|
||||
if (await removePath(rel, "delete")) deleted++;
|
||||
}
|
||||
}
|
||||
|
||||
// 8. Stage + commit on `docmost` (only if there is something to commit).
|
||||
// Deterministic stabilized output means unchanged pages produce identical
|
||||
// bytes -> git sees no diff -> no churn (SPEC §11). The subject reflects the
|
||||
// ACTUAL work applied (pages written + files deleted), not the planned size,
|
||||
// so a run with failures does not over-report (SPEC §5 nit).
|
||||
const subject =
|
||||
deleted > 0
|
||||
? `docmost: sync ${written} page(s), ${deleted} deleted`
|
||||
: `docmost: sync ${written} page(s)`;
|
||||
await git.stageAll();
|
||||
const committed = await git.commit(subject, {
|
||||
authorName: BOT_AUTHOR_NAME,
|
||||
authorEmail: BOT_AUTHOR_EMAIL,
|
||||
trailers: [SOURCE_TRAILER],
|
||||
});
|
||||
|
||||
// 9. Merge docmost -> main. Conflicts are surfaced and left in git (SPEC §9);
|
||||
// we never push to Docmost. Push to a git remote is deferred (SPEC §7).
|
||||
await git.checkout(DEFAULT_BRANCH);
|
||||
const merge = await git.merge(DOCMOST_BRANCH);
|
||||
if (merge.conflict) {
|
||||
console.error(
|
||||
"pull: merge of docmost -> main CONFLICTED. Conflict markers were left " +
|
||||
"in the vault for manual resolution (SPEC §9). Nothing is pushed to " +
|
||||
"Docmost (read-only). Resolve locally, then re-run.",
|
||||
);
|
||||
} else if (!merge.ok) {
|
||||
console.error(`pull: merge of docmost -> main failed: ${merge.output}`);
|
||||
}
|
||||
console.log("pull: git push to remote is DEFERRED in this increment (SPEC §7).");
|
||||
|
||||
// 10. One-line summary.
|
||||
console.log(
|
||||
`pull complete: ${written} page(s) written, ${failed} failed, ` +
|
||||
`out of ${total} from space ${spaceId} into ${vaultRoot}`,
|
||||
`pull complete: ${written} written, ${movedApplied} moved, ` +
|
||||
`${deleted} deleted, committed=${committed}, ` +
|
||||
`merge=${merge.conflict ? "CONFLICT" : merge.ok ? "ok" : "failed"} ` +
|
||||
`(${failed} page failures) from space ${spaceId} into ${vaultRoot}`,
|
||||
);
|
||||
|
||||
// Signal a partial mirror so callers/CI can react. Use process.exitCode (not
|
||||
// a hard process.exit) so any buffered output is flushed cleanly.
|
||||
if (failed > 0) {
|
||||
// Signal a partial mirror / conflict so callers/CI can react. Use
|
||||
// process.exitCode (not a hard exit) so buffered output flushes cleanly.
|
||||
if (failed > 0 || merge.conflict || !merge.ok) {
|
||||
process.exitCode = 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Only auto-run when invoked directly as the CLI entrypoint, not when this
|
||||
// module is imported (e.g. by a unit test), so the import does not trigger
|
||||
// loadSettings() + process.exit.
|
||||
// loadSettings() + git/network access.
|
||||
const invokedDirectly =
|
||||
typeof process.argv[1] === "string" &&
|
||||
import.meta.url === pathToFileURL(process.argv[1]).href;
|
||||
|
||||
Reference in New Issue
Block a user