refactor(pull): extract tested vault-layout module; harden pull; close review findings

Address the Increment-1 code review (3 warnings + suggestions).

- layout: new pure src/layout.ts (buildVaultLayout) — page-tree -> vault paths,
  sibling + full-path collision disambiguation (sanitized ~slugId suffix), parent
  cycle guard; pull.ts is now a thin I/O loop
- layout: resolve orphan/root collisions at the NAME stage so an orphan ancestor
  can't desync its children's folder segments (fixes review Major); covered by test
- pull: per-page try/catch (one bad page no longer aborts the mirror), bounded
  concurrency (6), progress logging, process.exitCode=1 on partial mirror
- security: filename disambiguation suffix now passes through sanitizeTitle
- docs: AGENTS.md -> Increment 1 status/structure/run targets; pull.ts meta-block
  comment; collectRecentSince JSDoc (lexicographic UTC-ISO precondition)
- tests: layout (9), markdown-document round-trip (no comments block, SPEC §3),
  firstDivergence; export firstDivergence. 49 tests green.
This commit is contained in:
vvzvlad
2026-06-16 21:09:40 +03:00
parent 447d2508ae
commit c6edd73324
8 changed files with 511 additions and 87 deletions

View File

@@ -4,8 +4,12 @@
* Walks the configured space's page tree and writes one self-contained `.md`
* per page under `<vaultPath>/<...ancestors>/<Title>.md`. This increment is
* READ-ONLY toward Docmost (no writes, no git) — it only fetches and writes
* local files. The meta block inside each file carries pageId/slugId/
* parentPageId (identity), so no external map file is needed.
* local files. The meta block inside each file carries
* `{ version, pageId, slugId, title, spaceId, parentPageId }` (identity), so no
* external map file is needed.
*
* The pure tree -> path mapping lives in `./layout.js`; this file is a thin,
* fault-tolerant I/O loop around it.
*
* Requires a `.env` with real Docmost credentials. This file must COMPILE and
* be correct, but is not expected to be run without live access.
@@ -17,43 +21,13 @@ import { join } from "node:path";
import { pathToFileURL } from "node:url";
import { DocmostClient } from "docmost-client";
import { loadSettings } from "./settings.js";
import { sanitizeTitle, disambiguate } from "./sanitize.js";
import { buildVaultLayout, type PageNode } from "./layout.js";
/**
 * Flat page node as returned by listAllSpacePages (no content).
 *
 * Only `id` is required; every other field may be absent and is defaulted by
 * the code that consumes the node.
 */
interface PageNode {
// Page identifier; used as the lookup key when indexing pages and when
// walking a page's parent chain.
id: string;
// Raw page title. It is sanitized before being used as a file/folder name;
// a missing title is treated as the empty string.
title?: string;
// Suffix source for disambiguating sibling name collisions; the page id is
// used instead when this is absent.
slugId?: string;
// Id of the parent page. A null/undefined value is treated as a root-level
// page (no ancestor folders).
parentPageId?: string | null;
// NOTE(review): not read anywhere in the visible code — presumably flags
// that the page has child pages; confirm against the API response.
hasChildren?: boolean;
}
/**
 * Compute a deterministic, collision-free folder/file name for a node among its
 * siblings.
 *
 * The name is COSMETIC; identity lives in the file's meta block. Two siblings
 * that sanitize to the same name get a stable ` ~slugId` suffix (SPEC §12).
 *
 * @param node - Page to name. A missing title is treated as the empty string.
 * @param usedBySibling - Maps a parent key to the set of names already taken
 *   under that parent. Mutated: the returned name is recorded in it.
 * @returns A name that is unique among the names recorded so far for the
 *   node's parent.
 */
function nameForNode(
  node: PageNode,
  usedBySibling: Map<string, Set<string>>,
): string {
  // Root-level pages (null/undefined parent) share one synthetic bucket.
  const parentKey = node.parentPageId ?? "__root__";
  let used = usedBySibling.get(parentKey);
  if (!used) {
    used = new Set<string>();
    usedBySibling.set(parentKey, used);
  }

  const base = sanitizeTitle(node.title ?? "");
  let name = base;
  if (used.has(name)) {
    // Sibling collision: disambiguate with the stable slugId (fall back to the
    // pageId if no slugId is present). The suffix comes from the server and
    // ends up in a filesystem path, so it is sanitized exactly like a title —
    // a slug containing a path separator or ".." must not reach path.join.
    const suffix = sanitizeTitle(node.slugId ? node.slugId : node.id);
    name = disambiguate(base, suffix);
    // The disambiguated name can itself already be taken (e.g. a sibling whose
    // title literally matches it). Keep going until the name is really free,
    // otherwise two pages would be written to the same file.
    for (let attempt = 2; used.has(name); attempt++) {
      name = disambiguate(base, `${suffix}-${attempt}`);
    }
  }

  used.add(name);
  return name;
}
/**
 * Number of pages fetched/written concurrently. Bounded so a large space does
 * not open thousands of simultaneous requests/file handles. The pull starts at
 * most this many runners (fewer when the space has fewer pages).
 */
const CONCURRENCY = 6;
/**
 * How often to log incremental progress (every N completed pages). A page
 * counts as completed whether it was written or failed.
 */
const PROGRESS_EVERY = 25;
async function main(): Promise<void> {
const s = loadSettings();
@@ -67,64 +41,74 @@ async function main(): Promise<void> {
const vaultRoot = s.vaultPath;
const pages: PageNode[] = await client.listAllSpacePages(spaceId);
const layout = buildVaultLayout(pages);
// Index pages by id so the parent chain can be walked.
const byId = new Map<string, PageNode>();
for (const p of pages) {
if (p && p.id) byId.set(p.id, p);
}
const total = pages.length;
let written = 0;
let failed = 0;
let completed = 0;
let nextIndex = 0;
// Resolve each node's display name once, deterministically, tracking sibling
// collisions per parent.
const usedBySibling = new Map<string, Set<string>>();
const nameById = new Map<string, string>();
for (const p of pages) {
if (p && p.id) nameById.set(p.id, nameForNode(p, usedBySibling));
}
// Build the folder path for a page by walking parentPageId to the root. The
// page's OWN name is the file stem; its ancestors become folders. A `visited`
// guard prevents an infinite loop on a malformed parent cycle.
const folderSegmentsFor = (node: PageNode): string[] => {
const ancestors: string[] = [];
const visited = new Set<string>();
let current: PageNode | undefined = node.parentPageId
? byId.get(node.parentPageId)
: undefined;
while (current && current.id && !visited.has(current.id)) {
visited.add(current.id);
ancestors.unshift(
nameById.get(current.id) ?? sanitizeTitle(current.title ?? ""),
// Pull + write a single page. Each call is wrapped so one bad page (network
// error, page deleted between the walk and the fetch, body conversion
// failure) NEVER aborts the whole pull — it is counted as a failure and the
// pool moves on. Mirrors the deliberately fault-tolerant enumerateSpacePages.
const pullOne = async (page: PageNode): Promise<void> => {
if (!page || !page.id) return;
const entry = layout.get(page.id);
if (!entry) return; // no layout entry (e.g. duplicate/skipped id)
try {
const dir = join(vaultRoot, ...entry.segments);
await mkdir(dir, { recursive: true });
// Body + meta only (no comments block) — SPEC §3.
const md = await client.exportPageBody(page.id);
await writeFile(join(dir, `${entry.stem}.md`), md, "utf8");
written++;
} catch (err) {
failed++;
console.error(
`pull: failed page ${page.id}:`,
err instanceof Error ? err.message : String(err),
);
current = current.parentPageId
? byId.get(current.parentPageId)
: undefined;
} finally {
completed++;
if (completed % PROGRESS_EVERY === 0) {
console.log(`pulled ${completed}/${total}`);
}
}
return ancestors;
};
let written = 0;
for (const p of pages) {
if (!p || !p.id) continue;
const segments = folderSegmentsFor(p);
const fileStem = nameById.get(p.id) ?? sanitizeTitle(p.title ?? "");
const dir = join(vaultRoot, ...segments);
await mkdir(dir, { recursive: true });
// A small dependency-free bounded-concurrency pool: a fixed set of runners
// each pull the next index until the list is exhausted.
const runner = async (): Promise<void> => {
while (true) {
const i = nextIndex++;
if (i >= pages.length) return;
await pullOne(pages[i]);
}
};
// Body + meta only (no comments block) — SPEC §3.
const fileMd = await client.exportPageBody(p.id);
await writeFile(join(dir, `${fileStem}.md`), fileMd, "utf8");
written++;
}
const runners = Array.from(
{ length: Math.min(CONCURRENCY, pages.length) },
() => runner(),
);
await Promise.all(runners);
console.log(
`pull complete: ${written} page(s) from space ${spaceId} into ${vaultRoot}`,
`pull complete: ${written} page(s) written, ${failed} failed, ` +
`out of ${total} from space ${spaceId} into ${vaultRoot}`,
);
// Signal a partial mirror so callers/CI can react. Use process.exitCode (not
// a hard process.exit) so any buffered output is flushed cleanly.
if (failed > 0) {
process.exitCode = 1;
}
}
// Only auto-run when invoked directly as the CLI entrypoint, not when this
// module is imported (e.g. by a unit test importing sanitizeTitle / path
// helpers), so the import does not trigger loadSettings() + process.exit.
// module is imported (e.g. by a unit test), so the import does not trigger
// loadSettings() + process.exit.
const invokedDirectly =
typeof process.argv[1] === "string" &&
import.meta.url === pathToFileURL(process.argv[1]).href;