refactor(pull): extract tested vault-layout module; harden pull; close review findings
Address the Increment-1 code review (3 warnings + suggestions).

- layout: new pure src/layout.ts (buildVaultLayout) — page-tree -> vault paths, sibling + full-path collision disambiguation (sanitized ~slugId suffix), parent cycle guard; pull.ts is now a thin I/O loop
- layout: resolve orphan/root collisions at the NAME stage so an orphan ancestor can't desync its children's folder segments (fixes review Major); covered by test
- pull: per-page try/catch (one bad page no longer aborts the mirror), bounded concurrency (6), progress logging, process.exitCode=1 on partial mirror
- security: filename disambiguation suffix now passes through sanitizeTitle
- docs: AGENTS.md -> Increment 1 status/structure/run targets; pull.ts meta-block comment; collectRecentSince JSDoc (lexicographic UTC-ISO precondition)
- tests: layout (9), markdown-document round-trip (no comments block, SPEC §3), firstDivergence; export firstDivergence. 49 tests green.
This commit is contained in:
150
src/pull.ts
150
src/pull.ts
@@ -4,8 +4,12 @@
|
||||
* Walks the configured space's page tree and writes one self-contained `.md`
|
||||
* per page under `<vaultPath>/<...ancestors>/<Title>.md`. This increment is
|
||||
* READ-ONLY toward Docmost (no writes, no git) — it only fetches and writes
|
||||
* local files. The meta block inside each file carries pageId/slugId/
|
||||
* parentPageId (identity), so no external map file is needed.
|
||||
* local files. The meta block inside each file carries
|
||||
* `{ version, pageId, slugId, title, spaceId, parentPageId }` (identity), so no
|
||||
* external map file is needed.
|
||||
*
|
||||
* The pure tree -> path mapping lives in `./layout.js`; this file is a thin,
|
||||
* fault-tolerant I/O loop around it.
|
||||
*
|
||||
* Requires a `.env` with real Docmost credentials. This file must COMPILE and
|
||||
* be correct, but is not expected to be run without live access.
|
||||
@@ -17,43 +21,13 @@ import { join } from "node:path";
|
||||
import { pathToFileURL } from "node:url";
|
||||
import { DocmostClient } from "docmost-client";
|
||||
import { loadSettings } from "./settings.js";
|
||||
import { sanitizeTitle, disambiguate } from "./sanitize.js";
|
||||
import { buildVaultLayout, type PageNode } from "./layout.js";
|
||||
|
||||
/** Flat page node as returned by listAllSpacePages (no content). */
|
||||
interface PageNode {
|
||||
id: string;
|
||||
title?: string;
|
||||
slugId?: string;
|
||||
parentPageId?: string | null;
|
||||
hasChildren?: boolean;
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute a deterministic, collision-free folder/file name for a node among its
|
||||
* siblings. `usedBySibling` maps a parent key -> set of names already taken, so
|
||||
* two siblings that sanitize to the same name get a stable ` ~slugId` suffix
|
||||
* (SPEC §12). The name is COSMETIC; identity lives in the file's meta block.
|
||||
*/
|
||||
function nameForNode(
|
||||
node: PageNode,
|
||||
usedBySibling: Map<string, Set<string>>,
|
||||
): string {
|
||||
const parentKey = node.parentPageId ?? "__root__";
|
||||
let used = usedBySibling.get(parentKey);
|
||||
if (!used) {
|
||||
used = new Set<string>();
|
||||
usedBySibling.set(parentKey, used);
|
||||
}
|
||||
|
||||
let name = sanitizeTitle(node.title ?? "");
|
||||
if (used.has(name)) {
|
||||
// Sibling collision: disambiguate with the stable slugId (fall back to the
|
||||
// pageId if no slugId is present).
|
||||
name = disambiguate(name, node.slugId ?? node.id);
|
||||
}
|
||||
used.add(name);
|
||||
return name;
|
||||
}
|
||||
// Number of pages fetched/written concurrently. Bounded so a large space does
|
||||
// not open thousands of simultaneous requests/file handles.
|
||||
const CONCURRENCY = 6;
|
||||
// How often to log incremental progress (every N completed pages).
|
||||
const PROGRESS_EVERY = 25;
|
||||
|
||||
async function main(): Promise<void> {
|
||||
const s = loadSettings();
|
||||
@@ -67,64 +41,74 @@ async function main(): Promise<void> {
|
||||
const vaultRoot = s.vaultPath;
|
||||
|
||||
const pages: PageNode[] = await client.listAllSpacePages(spaceId);
|
||||
const layout = buildVaultLayout(pages);
|
||||
|
||||
// Index pages by id so the parent chain can be walked.
|
||||
const byId = new Map<string, PageNode>();
|
||||
for (const p of pages) {
|
||||
if (p && p.id) byId.set(p.id, p);
|
||||
}
|
||||
const total = pages.length;
|
||||
let written = 0;
|
||||
let failed = 0;
|
||||
let completed = 0;
|
||||
let nextIndex = 0;
|
||||
|
||||
// Resolve each node's display name once, deterministically, tracking sibling
|
||||
// collisions per parent.
|
||||
const usedBySibling = new Map<string, Set<string>>();
|
||||
const nameById = new Map<string, string>();
|
||||
for (const p of pages) {
|
||||
if (p && p.id) nameById.set(p.id, nameForNode(p, usedBySibling));
|
||||
}
|
||||
|
||||
// Build the folder path for a page by walking parentPageId to the root. The
|
||||
// page's OWN name is the file stem; its ancestors become folders. A `visited`
|
||||
// guard prevents an infinite loop on a malformed parent cycle.
|
||||
const folderSegmentsFor = (node: PageNode): string[] => {
|
||||
const ancestors: string[] = [];
|
||||
const visited = new Set<string>();
|
||||
let current: PageNode | undefined = node.parentPageId
|
||||
? byId.get(node.parentPageId)
|
||||
: undefined;
|
||||
while (current && current.id && !visited.has(current.id)) {
|
||||
visited.add(current.id);
|
||||
ancestors.unshift(
|
||||
nameById.get(current.id) ?? sanitizeTitle(current.title ?? ""),
|
||||
// Pull + write a single page. Each call is wrapped so one bad page (network
|
||||
// error, page deleted between the walk and the fetch, body conversion
|
||||
// failure) NEVER aborts the whole pull — it is counted as a failure and the
|
||||
// pool moves on. Mirrors the deliberately fault-tolerant enumerateSpacePages.
|
||||
const pullOne = async (page: PageNode): Promise<void> => {
|
||||
if (!page || !page.id) return;
|
||||
const entry = layout.get(page.id);
|
||||
if (!entry) return; // no layout entry (e.g. duplicate/skipped id)
|
||||
try {
|
||||
const dir = join(vaultRoot, ...entry.segments);
|
||||
await mkdir(dir, { recursive: true });
|
||||
// Body + meta only (no comments block) — SPEC §3.
|
||||
const md = await client.exportPageBody(page.id);
|
||||
await writeFile(join(dir, `${entry.stem}.md`), md, "utf8");
|
||||
written++;
|
||||
} catch (err) {
|
||||
failed++;
|
||||
console.error(
|
||||
`pull: failed page ${page.id}:`,
|
||||
err instanceof Error ? err.message : String(err),
|
||||
);
|
||||
current = current.parentPageId
|
||||
? byId.get(current.parentPageId)
|
||||
: undefined;
|
||||
} finally {
|
||||
completed++;
|
||||
if (completed % PROGRESS_EVERY === 0) {
|
||||
console.log(`pulled ${completed}/${total}`);
|
||||
}
|
||||
}
|
||||
return ancestors;
|
||||
};
|
||||
|
||||
let written = 0;
|
||||
for (const p of pages) {
|
||||
if (!p || !p.id) continue;
|
||||
const segments = folderSegmentsFor(p);
|
||||
const fileStem = nameById.get(p.id) ?? sanitizeTitle(p.title ?? "");
|
||||
const dir = join(vaultRoot, ...segments);
|
||||
await mkdir(dir, { recursive: true });
|
||||
// A small dependency-free bounded-concurrency pool: a fixed set of runners
|
||||
// each pull the next index until the list is exhausted.
|
||||
const runner = async (): Promise<void> => {
|
||||
while (true) {
|
||||
const i = nextIndex++;
|
||||
if (i >= pages.length) return;
|
||||
await pullOne(pages[i]);
|
||||
}
|
||||
};
|
||||
|
||||
// Body + meta only (no comments block) — SPEC §3.
|
||||
const fileMd = await client.exportPageBody(p.id);
|
||||
await writeFile(join(dir, `${fileStem}.md`), fileMd, "utf8");
|
||||
written++;
|
||||
}
|
||||
const runners = Array.from(
|
||||
{ length: Math.min(CONCURRENCY, pages.length) },
|
||||
() => runner(),
|
||||
);
|
||||
await Promise.all(runners);
|
||||
|
||||
console.log(
|
||||
`pull complete: ${written} page(s) from space ${spaceId} into ${vaultRoot}`,
|
||||
`pull complete: ${written} page(s) written, ${failed} failed, ` +
|
||||
`out of ${total} from space ${spaceId} into ${vaultRoot}`,
|
||||
);
|
||||
|
||||
// Signal a partial mirror so callers/CI can react. Use process.exitCode (not
|
||||
// a hard process.exit) so any buffered output is flushed cleanly.
|
||||
if (failed > 0) {
|
||||
process.exitCode = 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Only auto-run when invoked directly as the CLI entrypoint, not when this
|
||||
// module is imported (e.g. by a unit test importing sanitizeTitle / path
|
||||
// helpers), so the import does not trigger loadSettings() + process.exit.
|
||||
// module is imported (e.g. by a unit test), so the import does not trigger
|
||||
// loadSettings() + process.exit.
|
||||
const invokedDirectly =
|
||||
typeof process.argv[1] === "string" &&
|
||||
import.meta.url === pathToFileURL(process.argv[1]).href;
|
||||
|
||||
Reference in New Issue
Block a user