build(git-sync): land the @docmost/git-sync package into develop, code-only (#326 step 1 / PR-A)

The git-sync converter + engine source lived only on the #119 branch; develop
had just the dead compiled build/. Bring the whole package (src + ~700 tests)
onto develop under CI, with NO consumer wired — git-sync stays fully inert in
develop (nothing in apps/server imports it), so runtime behavior is unchanged.
This unblocks #293 (extract the shared converter package from the landed source)
and lets #119's functionality land LAST, already writing the canonical format
(per the #326 landing order).

- packages/git-sync: src (lib converter + engine) + test corpus + configs.
- Remove develop's dead committed packages/git-sync/build/; gitignore it
  (built in CI/Docker via pnpm build, never committed — no src/build drift).
- pnpm-lock.yaml: add the @docmost/git-sync importer (a missing workspace
  package in the lock is a CI blocker). `pnpm install --frozen-lockfile` passes.
- NO server integration / loader / Dockerfile runtime changes (those come with
  #119 at step 6).

Verified: tsc clean; vitest 711 passed | 1 expected-fail, 0 failures, 0 type
errors; pnpm --frozen-lockfile EXIT 0; apps/server has no git-sync import.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
claude code agent 227
2026-07-04 06:21:41 +03:00
parent 0a3e32e7f6
commit 24b903aaf3
130 changed files with 24311 additions and 7692 deletions
@@ -0,0 +1,136 @@
/**
* The client seam. `pull.ts`/`push.ts` depend on a narrow STRUCTURAL interface
* rather than any concrete client, because the gitmost server writes NATIVELY —
* through repositories + collab `openDirectConnection`.
*
* `GitSyncClient` is that interface: the native datasource (server side)
* implements it, and the engine only ever uses `Pick<GitSyncClient, ...>`
* subsets of it. The signatures below MIRROR exactly the methods the engine's
* `pull.ts`/`push.ts` actually call (arg shapes + the fields the engine reads
* off each result), so a REST-style client is still structurally assignable and
* the native adapter has a precise contract.
*/
/**
 * Body-less page node as produced by the `listSpaceTree` walk.
 *
 * The engine layout (`buildVaultLayout`) takes its `PageNode` from `./layout`,
 * which requires only `id` (plus optional `title` / `slugId` / `parentPageId`).
 * This lite shape names the fields the tree walk surfaces; whatever else a real
 * tree node carries (`position`, `icon`, …) is admitted by the index signature.
 */
export interface GitSyncPageNodeLite {
  /** Page id — the only required field. */
  id: string;
  title?: string;
  slugId?: string;
  /** Optional parent reference; may also be explicitly `null`. */
  parentPageId?: string | null;
  hasChildren?: boolean;
  /** Open-ended: `listSpaceTree` nodes carry extra fields (position, icon, …). */
  [key: string]: unknown;
}
/**
 * Structural contract between the sync engine and whatever talks to Docmost.
 *
 * The engine never takes the whole interface — only `Pick<GitSyncClient, ...>`
 * slices of it:
 *   - pull (reads):   `getPageJson`, plus `listSpaceTree` for the tree walk;
 *   - push (writes):  `importPageMarkdown` / `createPage` / `deletePage` /
 *                     `movePage` / `renamePage`;
 *   - continuous (phase B+): `listRecentSince` / `listTrash` / `restorePage`.
 */
export interface GitSyncClient {
  // ---------------------------------------------------------------- reads (pull)

  /**
   * Walk the page tree of a space, or only the subtree under `rootPageId`.
   * Nodes come back WITHOUT body content.
   *
   * `complete` is `false` when the walk was truncated or a fetch failed; the
   * pull side then suppresses absence-based deletions (SPEC §8). The native
   * implementation reads the DB rather than a paginated REST endpoint and
   * always reports `complete: true`.
   */
  listSpaceTree(
    spaceId: string,
    rootPageId?: string,
  ): Promise<{ pages: GitSyncPageNodeLite[]; complete: boolean }>;

  /**
   * Fetch a single page INCLUDING its ProseMirror body.
   *
   * `applyPullActions` reads `id`, `slugId`, `title`, `parentPageId` and
   * `spaceId` for the file meta, and `content` to stabilize/serialize.
   * `updatedAt` rides along for the poll-suppression loop-guard.
   */
  getPageJson(pageId: string): Promise<{
    id: string;
    slugId: string;
    title: string;
    parentPageId: string | null;
    spaceId: string;
    updatedAt: string;
    content: unknown;
  }>;

  // --------------------------------------------------------------- writes (push)

  /**
   * Merge a markdown BODY into the live page through the collab/Yjs write path
   * (SPEC §2/§15.6) — never a raw jsonb overwrite.
   *
   * `fullMarkdown` is clean body text: `applyPushActions` strips the
   * frontmatter and any git conflict markers before calling, so this is NOT
   * the raw self-contained file (the datasource still re-parses defensively).
   *
   * `baseMarkdown` is the last-synced body taken from
   * `refs/docmost/last-pushed` (stripped the same way). It is the common
   * ancestor for a THREE-WAY merge against the live doc, so concurrent human
   * edits survive (review #5). Omitted or `null` means a 2-way merge.
   *
   * Only an optional `updatedAt` is read off the result (via
   * `extractUpdatedAt`).
   */
  importPageMarkdown(
    pageId: string,
    fullMarkdown: string,
    baseMarkdown?: string | null,
  ): Promise<{ updatedAt?: string; [key: string]: unknown }>;

  /**
   * Create a page. The new id must be reported at `data.id`:
   * `applyPushActions` reads `result.data.id` and writes it back into the
   * file's meta. An optional `updatedAt` (top-level or under `data`) feeds the
   * loop-guard.
   */
  createPage(
    title: string,
    content: string,
    spaceId: string,
    parentPageId?: string,
  ): Promise<{ data: { id: string }; updatedAt?: string; [key: string]: unknown }>;

  /** Soft-delete: move the page to Trash (SPEC §8). The result is ignored. */
  deletePage(pageId: string): Promise<unknown>;

  /**
   * Reparent a page, optionally setting its fractional-index `position`. For
   * now the engine leaves `position` undefined and the native implementation
   * picks a default between siblings. The result is ignored.
   */
  movePage(
    pageId: string,
    parentPageId: string | null,
    position?: string,
  ): Promise<unknown>;

  /** Update the title only; the body is untouched. The result is ignored. */
  renamePage(pageId: string, title: string): Promise<unknown>;

  // ------------------------------------------------------- continuous (phase B+)

  /**
   * List pages updated since `sinceIso` — the poll-safety reconciliation
   * (SPEC §8). An undefined `spaceId` means all spaces; `hardPageCap` bounds
   * the walk.
   */
  listRecentSince(
    spaceId: string | undefined,
    sinceIso: string | null,
    hardPageCap?: number,
  ): Promise<unknown[]>;

  /** List the space's soft-deleted (trashed) pages, for deletion detection. */
  listTrash(spaceId: string): Promise<unknown[]>;

  /** Bring a soft-deleted page back from Trash. The result is ignored. */
  restorePage(pageId: string): Promise<unknown>;
}
+244
View File
@@ -0,0 +1,244 @@
import { VaultGit, DEFAULT_BRANCH } from "./git.js";
import { GitSyncClient } from "./client.types.js";
import { Settings } from "./settings.js";
import { readExisting, computePullActions, applyPullActions } from "./pull.js";
import { runPush } from "./push.js";
import { assertVaultPathSafe, type PathGuardIo } from "./path-guard.js";
/**
 * Filesystem primitives (absolute paths) that a cycle needs. They are injected
 * rather than imported so the engine itself performs no IO and can be
 * unit-tested.
 *
 * The inherited `lstat` / `realpath` power the SYMLINK GUARD
 * (see ./path-guard.ts): each read/write/mkdir is screened so that a pushed
 * symlink (e.g. `leak.md -> /etc/passwd` or `-> .env`) cannot be followed to
 * publish or overwrite a file outside the vault. Both of them MUST resolve to
 * `null` on ENOENT and reject on every other error.
 */
export interface CycleFs extends PathGuardIo {
  /** Read a file as text. */
  readFile: (absPath: string) => Promise<string>;
  /** Write text to a file. */
  writeFile: (absPath: string, text: string) => Promise<void>;
  /** Create a directory — recursive. */
  mkdir: (absDir: string) => Promise<void>;
  /** Remove a path — force semantics, so a missing file is a no-op. */
  rm: (absPath: string) => Promise<void>;
}
/** Everything `runCycle` needs, supplied by the caller. */
export interface RunCycleDeps {
  /** The space being reconciled. */
  spaceId: string;
  /** Docmost seam: read side for the pull, write side for the push. */
  client: GitSyncClient;
  /** Per-space git vault — a real working repository. */
  vault: VaultGit;
  /** Engine settings; `vaultPath` is the root for relPath -> absolute mapping. */
  settings: Settings;
  /** Injected filesystem primitives (wrapped by the symlink guard in the cycle). */
  fs: CycleFs;
  /** Line-oriented log sink. */
  log: (line: string) => void;
  /**
   * Optional cooperative-abort signal, wired by the caller (orchestrator) to
   * the per-space lock. When a heartbeat refresh cannot CONFIRM the lock is
   * still held (CAS-miss / Redis error) the signal is aborted, and the cycle
   * stops at its next checkpoint — one sits before the pull-apply and one
   * before the push-apply, the two destructive write phases — rather than
   * writing blind after a possible lock loss.
   *
   * This is a COARSE, best-effort guard only: a fully fenced cross-process
   * single-writer still requires the fencing-token redesign (follow-up).
   */
  signal?: AbortSignal;
}
/** Outcome summary of one `runCycle` invocation. */
export interface RunCycleResult {
  /** Whether the cycle ran. */
  ran: boolean;
  /** Present when the cycle short-circuited without running pull/push. */
  skipped?: "merge-in-progress";
  /** Pull-phase counters and whether the docmost -> main merge conflicted. */
  pull?: { written: number; deleted: number; conflict: boolean };
  /** Push-phase mode and the number of failures. */
  push?: { mode: string; failures: number };
  /**
   * Copied from the push result. `true` means the push REFUSED to fast-forward
   * a divergent `docmost` mirror, i.e. the §5 invariant (`docmost` mirrors what
   * Docmost contains) is broken. Exposed here so a caller driving `runCycle`
   * can detect the breach without scraping logs (red-team #15).
   */
  divergentDocmost?: boolean;
}
/**
 * Run ONE full reconcile cycle for a space: PULL (Docmost -> vault) then PUSH
 * (vault -> Docmost), under the engine's required branch choreography. This is
 * the single entry point the app drives — it owns the staging order so it can
 * never drift from the engine it ships with.
 *
 * Staging (the data-loss-critical order, SPEC §6/§9):
 *   1.  assertGitAvailable, then clearStaleGitLocks, then ensureRepo. Stale
 *       locks are cleared BEFORE ensureRepo because ensureRepo writes git
 *       config on every run; a crash-leftover `.git/config.lock` would
 *       otherwise make it throw each cycle before the lock cleaner ever ran.
 *   1c. ensureMainBranch — restore a missing `main`.
 *   2.  recover from an unresolved merge left by a prior cycle (abort it, and
 *       hard-reset if the abort does not clear it) instead of skipping.
 *   3.  ensureBranch('docmost','main') + checkout('docmost'). Pull writes MUST
 *       land on `docmost`, not `main`: applyPullActions commits on `docmost`,
 *       then checks out `main` and merges docmost -> main. Writing Docmost
 *       content straight onto `main` would clobber local file edits before
 *       push can diff them.
 *   4.  PULL: readExisting -> listSpaceTree -> computePullActions -> apply.
 *   5.  PUSH: vault -> Docmost apply.
 *
 * Lock POLICY lives in the caller; this owns only the mechanics. Deletes are
 * soft (Trash, reversible) and always logged, so there is no per-cycle
 * delete-cap — engine convergence is the guard against phantom deletions.
 *
 * @param deps - client, vault, settings, injected fs, logger and optional
 *   abort signal for this space.
 * @returns Counters for the pull and push phases plus the forwarded
 *   `divergentDocmost` flag.
 * @throws Whatever a preflight/pull/push step throws, and the signal's abort
 *   reason when `deps.signal` is aborted at one of the two checkpoints.
 */
export async function runCycle(deps: RunCycleDeps): Promise<RunCycleResult> {
  const { spaceId, client, vault, settings, fs, log, signal } = deps;
  const vaultRoot = settings.vaultPath;
  const abs = (relPath: string) => `${vaultRoot}/${relPath}`;

  // SYMLINK GUARD (defense-in-depth, see ./path-guard.ts). Wrap the injected
  // read/write/mkdir primitives so EVERY engine file access is screened: a path
  // that is — or traverses — a symlink, or whose realpath escapes the vault, is
  // refused. `rm` is deliberately NOT wrapped: removing a path only deletes the
  // link itself (force, non-recursive), never the target, and we WANT to be able
  // to clean up a stray pushed symlink. A refusal THROWS; the pull/push loops
  // already isolate per-file errors (skip + log), so a single poisoned entry is
  // skipped while the rest of the space keeps syncing.
  const guard = (p: string) => assertVaultPathSafe(fs, vaultRoot, p);
  const safeFs = {
    readFile: async (p: string): Promise<string> => {
      await guard(p);
      return fs.readFile(p);
    },
    writeFile: async (p: string, text: string): Promise<void> => {
      await guard(p);
      return fs.writeFile(p, text);
    },
    mkdir: async (p: string): Promise<void> => {
      await guard(p);
      return fs.mkdir(p);
    },
    rm: (p: string): Promise<void> => fs.rm(p),
  };

  // 1. The engine state store is git: the binary must be runnable before
  //    anything else.
  await vault.assertGitAvailable();

  // 1a. CLEAR stale git lock files left by an interrupted git op (bug D3-N3). A
  //     hard crash / OOM-kill / abrupt container stop mid `git add`/`commit`/
  //     `checkout`/`config` leaves a `.git/index.lock`, `config.lock` or a ref
  //     `*.lock`; git then refuses later ops ("Unable to create '…/index.lock':
  //     File exists"), wedging the space with no self-heal. Only locks OLDER
  //     than the staleness threshold are removed (a fresh lock from a concurrent
  //     replica in the TTL-lapse window is preserved).
  //     This MUST run before ensureRepo: ensureRepo rewrites the vault's git
  //     config every cycle, so a stale `config.lock` would make it throw before
  //     a later cleanup could ever run. The cleanup only stats/removes files and
  //     is a no-op when the vault or its `.git` directory does not exist yet.
  await vault.clearStaleGitLocks();

  // 1b. Make sure the repo + branches exist before any tracked-file listing or
  //     diff.
  await vault.ensureRepo();

  // 1c. RESTORE a missing `main` branch (bug D3-N1). Ref-store damage can leave an
  //     existing repo without `main`; the ensureBranch("docmost","main") + checkout
  //     below would then throw every cycle ("pathspec 'main' did not match"),
  //     wedging the space forever. Re-create it from `docmost`/HEAD before use.
  await vault.ensureMainBranch();

  // 2. RECOVER from a vault left mid-merge by a PRIOR cycle (SPEC §9 wedge fix).
  //    A leftover merge used to WEDGE THE WHOLE SPACE: this check returned
  //    `skipped: "merge-in-progress"` so EVERY later cycle skipped the entire
  //    space (all pages, both directions) forever, with no recovery. The pull
  //    phase below no longer leaves the vault mid-merge (it commits a conflicting
  //    merge with markers and isolates the one bad page), but a vault wedged by a
  //    PRE-FIX build (or a manual/interrupted git op) must still self-heal.
  //    So instead of skipping, ABORT the stale half-merge and continue — the
  //    fresh pull re-runs and, on a real conflict, commits-with-markers rather
  //    than re-wedging. A stray unmerged index that `merge --abort` can't clear
  //    (no MERGE_HEAD) is force-cleared with a hard reset to HEAD.
  if (await vault.isMergeInProgress()) {
    log(
      `vault was left mid-merge by a prior cycle — aborting the stale merge and ` +
        `continuing so the space is not wedged (SPEC §9 recovery).`,
    );
    await vault.abortMerge();
    if (await vault.isMergeInProgress()) {
      log(
        `vault still mid-merge after 'merge --abort' — hard-resetting to HEAD ` +
          `to recover (SPEC §9).`,
      );
      await vault.resetHardToHead();
    }
  }

  try {
    // 3. Pull writes happen on `docmost`; be on it BEFORE applying (see docstring).
    await vault.ensureBranch("docmost", "main");
    await vault.checkout("docmost");

    // 4. PULL ------------------------------------------------------------------
    const existing = await readExisting({
      listTracked: () => vault.listTrackedFiles("*.md"),
      readFile: (relPath) => safeFs.readFile(abs(relPath)),
    });
    const tree = await client.listSpaceTree(spaceId);
    const pullActions = computePullActions({
      pages: tree.pages,
      treeComplete: tree.complete,
      existing,
    });

    // Bail before the first destructive write phase if the lock was lost.
    signal?.throwIfAborted();
    const pullResult = await applyPullActions(
      {
        client,
        git: vault,
        writeFile: (absPath, text) => safeFs.writeFile(absPath, text),
        mkdir: (absDir) => safeFs.mkdir(absDir),
        rm: (absPath) => safeFs.rm(absPath),
        log,
      },
      pullActions,
      vaultRoot,
    );

    // 5. PUSH ------------------------------------------------------------------
    const pushDeps = {
      settings,
      git: vault,
      makeClient: () => client,
      readFile: (relPath: string) => safeFs.readFile(abs(relPath)),
      writeFile: (relPath: string, text: string) =>
        safeFs.writeFile(abs(relPath), text),
      log,
    };

    // Bail before pushing to Docmost if the lock was lost during pull.
    signal?.throwIfAborted();
    const pushResult = await runPush(pushDeps, { dryRun: false });

    return {
      ran: true,
      pull: {
        written: pullResult.written,
        deleted: pullResult.deleted,
        conflict: pullResult.merge.conflict,
      },
      push: {
        mode: pushResult.mode,
        failures: pushResult.failures?.length ?? 0,
      },
      // Forward a divergent-`docmost` escalation so the caller can act on the §5
      // invariant breach without scraping logs (red-team #15).
      divergentDocmost: pushResult.divergentDocmost ?? false,
    };
  } finally {
    // STABLE SERVED HEAD (bug #3). The pull transiently checks out the read-only
    // `docmost` mirror, and the smart-HTTP host advertises whatever HEAD resolves
    // to — so a clone racing a cycle could default to `docmost`. The happy path
    // already ends on `main` (runPush), but a throw mid-pull would leave HEAD on
    // `docmost`; restore it here so the advertised default branch is `main` BETWEEN
    // cycles. Best-effort: skipped if the lock was lost (do not write the working
    // tree after a possible takeover), and a failing checkout (e.g. a dirty tree
    // from an aborted write) is swallowed — the next cycle's recovery resyncs and
    // the read advertisement pins HEAD under the lock regardless.
    if (!signal?.aborted) {
      try {
        await vault.checkout(DEFAULT_BRANCH);
      } catch {
        /* best-effort: next cycle recovers; advertisement pins HEAD under lock */
      }
    }
  }
}
+850
View File
@@ -0,0 +1,850 @@
/**
* Thin async wrapper over the system `git` binary (SPEC §5: state store = git).
*
* IMPORTANT — VAULT-SCOPED: every operation here runs with `cwd = vaultPath`,
* which is the vault's OWN git repository (default `data/vault`), SEPARATE from
* the gitmost application repo. This module MUST NEVER run git against the
* application repo. `data/` is gitignored, so a nested repo under `data/vault`
* is safe. The pull cycle is READ-ONLY toward Docmost; this module only touches
* the local vault git, never a git remote (push is deferred, see SPEC §7).
*
* Implementation notes:
* - We shell out via `node:child_process` `execFile` (promisified), passing
* ARGS AS AN ARRAY — no shell, so there is no command injection surface even
* if a page title / branch name contains shell metacharacters.
* - EVERY git invocation funnels through the single `runRaw` primitive, which
* ALWAYS prepends `--no-pager -c core.quotepath=false` to the argv (so git
* never blocks on a pager and always prints verbatim UTF-8 paths). There is
* no exception — even the `git --version` preflight goes through `runRaw`.
* - "nothing to commit" is treated as a graceful no-op, not an error.
*/
import { execFile } from "node:child_process";
import { mkdir, rm, stat } from "node:fs/promises";
import { promisify } from "node:util";
/** Promise-returning form of `execFile`; `runRaw` spawns git through it. */
const execFileAsync = promisify(execFile);

/**
 * Safety net that kills a hung git subprocess. The engine performs only LOCAL
 * git operations (no network pushes), so a legitimate call never gets near this
 * bound; it exists purely so an indefinitely-stuck subprocess cannot wedge a
 * sync cycle (the same risk the http-backend watchdog guards on the server
 * side).
 */
const GIT_EXEC_TIMEOUT_MS = 2 * 60 * 1000;

/**
 * Minimum age before a git lock file counts as stale (6 min, far above the
 * longest possible git op). A live git op is killed after GIT_EXEC_TIMEOUT_MS,
 * so a lock whose mtime is several multiples older than that cannot have a live
 * holder — it is a crash leftover and safe to delete. A younger lock, e.g. one
 * held by a concurrently-running replica, is left alone so a lock that a live
 * git process still holds is never removed.
 */
const STALE_LOCK_MIN_AGE_MS = 3 * GIT_EXEC_TIMEOUT_MS;

/** Bot identity (name) for engine-authored vault commits (SPEC §7.3). */
export const BOT_AUTHOR_NAME = "Docmost Sync";
/** Bot identity (email) for engine-authored vault commits (SPEC §7.3). */
export const BOT_AUTHOR_EMAIL = "docmost-sync@local";

/** Branch the vault repo is initialized on. */
export const DEFAULT_BRANCH = "main";
/**
 * A single row of `git diff --name-status` output (SPEC §6 "FS -> Docmost"),
 * produced with `-M` rename detection enabled.
 *
 * For `A` / `M` / `D` only `path` is set. For a rename or copy (`R` / `C`),
 * `oldPath` is the source, `path` the destination, and `score` is git's
 * similarity index (0–100).
 */
export interface DiffEntry {
  /** Single-letter change code. */
  status: "A" | "M" | "D" | "R" | "C";
  /** The new (destination) path; the only path for A/M/D. */
  path: string;
  /** Source path — set only for R/C. */
  oldPath?: string;
  /** Rename/copy similarity score, 0–100 — set only for R/C. */
  score?: number;
}
/** Outcome of a `merge`: either it applied cleanly or it stopped on conflicts. */
export interface MergeResult {
  /** `true` when the merge applied cleanly (fast-forward or clean 3-way). */
  ok: boolean;
  /** `true` when the merge stopped on conflicts, leaving markers in the worktree. */
  conflict: boolean;
  /** Raw stdout+stderr combined, kept for logging and diagnostics. */
  output: string;
}
/** Provenance options for an engine-authored commit (SPEC §7.3). */
export interface CommitOptions {
  /** Author name recorded on the commit. */
  authorName: string;
  /** Author email recorded on the commit. */
  authorEmail: string;
  /**
   * Trailer lines added to the end of the commit message body, for example
   * `Docmost-Sync-Source: docmost`. They are the machine-readable provenance
   * that the loop-guard keys on (SPEC §12, "commit-attribution").
   */
  trailers?: string[];
}
/**
* A git wrapper bound to a single vault path. Construct once per vault; every
* method runs git with `cwd = vaultPath`.
*/
export class VaultGit {
/** Bind the wrapper to one vault; `vaultPath` is the `cwd` git runs in by default. */
constructor(private readonly vaultPath: string) {}
/**
 * Preflight: confirm that a runnable `git` binary is on PATH.
 *
 * Every vault operation shells out to the system `git`, so a missing binary
 * (e.g. a slim container image without git) should fail fast with an
 * actionable message instead of a cryptic ENOENT inside the first real git
 * call. This is a presence check only — no particular version is required.
 *
 * @throws Error when `git --version` exits non-zero or cannot be spawned.
 */
async assertGitAvailable(): Promise<void> {
  // Routed through `runRaw` like every other invocation. `cwd: null` tells it
  // NOT to set a cwd: at preflight time the vault directory may not exist yet,
  // and git must not be pointed at a missing directory.
  const { code, stdout, stderr } = await this.runRaw(["--version"], { cwd: null });
  if (code === 0) return;

  const detail = (stderr || stdout || "").trim();
  throw new Error(
    "git binary not found or not runnable — install git (the vault state " +
      `store requires it). Underlying error: ${detail}`,
  );
}
/**
 * Run a git command in the vault and hand back its trimmed stdout.
 *
 * A thin layer over `runRaw`: a non-zero exit becomes one uniform Error whose
 * message carries the command and git's stderr (or stdout when stderr is
 * empty).
 *
 * @param args - git arguments, without the baseline flags `runRaw` prepends.
 * @param opts - optional cwd override (`null` = no cwd) and extra env.
 * @returns Trimmed stdout of the command.
 * @throws Error on any non-zero exit.
 */
private async run(
  args: string[],
  opts?: { cwd?: string | null; env?: Record<string, string> },
): Promise<string> {
  const { code, stdout, stderr } = await this.runRaw(args, opts);
  if (code === 0) return stdout.trim();

  const detail = (stderr || stdout || "").trim();
  throw new Error(`git ${args.join(" ")} failed: ${detail}`);
}
/**
 * The single primitive through which every git invocation in this module
 * flows. It assembles argv, env, cwd and maxBuffer, runs git, and NEVER
 * throws: the exit info is returned so a caller can treat a non-zero exit
 * either as an error (`run`) or as a meaningful state (a merge conflict, a
 * porcelain diff that "fails" on purpose).
 *
 *   - argv: `--no-pager -c core.quotepath=false` is ALWAYS prepended, so git
 *     never blocks on a pager and always prints verbatim UTF-8 paths (no octal
 *     escaping/quoting) — the baseline for all path-printing commands.
 *   - cwd: `opts.cwd === null` leaves cwd unset (the preflight case, where the
 *     vault dir may not exist); otherwise `opts.cwd ?? this.vaultPath`.
 *   - env: `vaultGitEnv(opts?.env)` (cwd-isolation plus caller extras).
 *   - a spawn/exec error's `message` is captured as well, so a failure that
 *     happens before git could write to stderr (e.g. ENOENT) is not lost.
 *
 * @returns Exit code (1 when the error carries no numeric code), stdout and
 *   stderr.
 */
private async runRaw(
  args: string[],
  opts?: { cwd?: string | null; env?: Record<string, string> },
): Promise<{ code: number; stdout: string; stderr: string }> {
  const cwd = opts?.cwd === null ? undefined : (opts?.cwd ?? this.vaultPath);
  try {
    const argv = ["--no-pager", "-c", "core.quotepath=false", ...args];
    const { stdout, stderr } = await execFileAsync("git", argv, {
      ...(cwd === undefined ? {} : { cwd }),
      // Generous buffer: file listings / porcelain output on a large vault can
      // be sizable.
      maxBuffer: 64 * 1024 * 1024,
      timeout: GIT_EXEC_TIMEOUT_MS,
      env: vaultGitEnv(opts?.env),
    });
    return { code: 0, stdout, stderr };
  } catch (err: unknown) {
    const failure = err as {
      code?: number;
      stdout?: string;
      stderr?: string;
      message?: string;
    };
    const code = typeof failure.code === "number" ? failure.code : 1;
    // `||`, not `??`: on a spawn failure such as ENOENT the promisified
    // execFile sets stderr to an EMPTY STRING, and we want to fall through to
    // the error message in that case.
    const stderr = failure.stderr || failure.message || "";
    return { code, stdout: failure.stdout ?? "", stderr };
  }
}
/**
 * Ensure the vault directory exists and is an initialized git repo on `main`
 * with an initial (empty) commit, so that branches exist. Idempotent — safe to
 * call on every run. Also sets a LOCAL bot identity when none is configured (so
 * engine commits never fall back to a global/unset identity) and pins the
 * correctness-affecting git config described inline.
 *
 * @throws Error when the repo cannot be initialized, the config cannot be
 *   written (e.g. a `config.lock` is present), or the initial commit fails.
 */
async ensureRepo(): Promise<void> {
  await mkdir(this.vaultPath, { recursive: true });

  if (!(await this.isRepo())) {
    // `git init -b <name>` only exists on git >= 2.28; an older binary rejects
    // the unknown switch with a non-zero exit. Probe with `runRaw` (which never
    // throws) and fall back to a plain `git init` so the older-git guard further
    // down (`checkout -B` before the first commit) is actually reachable. If the
    // plain init fails too, `run` surfaces that error.
    const init = await this.runRaw(["init", "-b", DEFAULT_BRANCH]);
    if (init.code !== 0) {
      await this.run(["init"]);
    }
  }

  // Set a local identity for the vault repo if unset, so engine commits have
  // a deterministic committer even on a machine with no global git config.
  if (!(await this.hasLocalConfig("user.name"))) {
    await this.run(["config", "user.name", BOT_AUTHOR_NAME]);
  }
  if (!(await this.hasLocalConfig("user.email"))) {
    await this.run(["config", "user.email", BOT_AUTHOR_EMAIL]);
  }

  // Neutralize correctness-affecting git config in the vault's LOCAL config so
  // a user's GLOBAL/system config cannot change porcelain BEHAVIOR (not just
  // output) and corrupt the vault. The vault is OUR dedicated repo, so LOCAL
  // values (which override global/system) are the right scope. Set
  // UNCONDITIONALLY every run — idempotent and cheap; `git config <key>`
  // writes to `--local` by default inside the repo. These MUST be in place
  // before any add/commit/checkout that could be affected, hence they run
  // before the initial-commit block below.
  //  - core.autocrlf=false — CRITICAL (SPEC §11): a global core.autocrlf=true
  //    would rewrite LF<->CRLF on add/checkout, making our deterministic,
  //    byte-stable markdown churn and breaking the round-trip invariant.
  //    `false` guarantees git stores/checks out verbatim bytes.
  //  - core.safecrlf=false — avoid CRLF-related warnings/aborts on add.
  //  - commit.gpgsign=false — the headless daemon must never try to GPG-sign
  //    a commit (would fail/hang; we already set GIT_TERMINAL_PROMPT=0).
  //  - core.attributesFile=/dev/null — neutralize the user's GLOBAL
  //    gitattributes so a global clean/smudge filter (filter.<name>.clean)
  //    cannot rewrite the STORED blob and break §11 byte-stability (a config
  //    that core.autocrlf=false does not cover). POSIX-only path, which is
  //    fine: the daemon runs on Linux (Docker) / macOS. A system
  //    /etc/gitattributes remains the host admin's domain (out of scope).
  //  - merge.conflictStyle=merge — CRITICAL (SPEC §9, conflict-marker leak):
  //    a global `merge.conflictStyle=diff3`/`zdiff3` makes a conflicting merge
  //    emit an EXTRA `|||||||` base-marker section. The conflict-marker
  //    scrub on the push side (`stripConflictMarkers`) handles `|||||||` too,
  //    but pinning the classic `merge` style keeps the markers the engine
  //    produces to the canonical three (`<<<<<<<`/`=======`/`>>>>>>>`) so
  //    behavior is deterministic regardless of the operator's global config.
  // NOTE: these stay PERSISTED LOCAL config (not `-c` flags) on purpose — a
  // human running git by hand in the vault must inherit the same neutralized
  // behavior; a transient `-c` would not persist. (core.quotepath, by
  // contrast, only affects OUR parsing of output and so is baked into the
  // `runRaw` argv baseline instead.)
  try {
    await this.run(["config", "core.autocrlf", "false"]);
    await this.run(["config", "core.safecrlf", "false"]);
    await this.run(["config", "commit.gpgsign", "false"]);
    await this.run(["config", "core.attributesFile", "/dev/null"]);
    await this.run(["config", "merge.conflictStyle", "merge"]);
  } catch (err: unknown) {
    const detail = err instanceof Error ? err.message : String(err);
    throw new Error(
      `failed to pin vault git config (SPEC §11) — ensure ${this.vaultPath}` +
        "/.git/config is writable and not locked (e.g. stale config.lock): " +
        detail,
    );
  }

  // Create the initial empty commit on `main` if the repo has no commits yet,
  // so both `main` and (later) `docmost` branches have a common base.
  if (!(await this.hasAnyCommit())) {
    // Make sure we are on the default branch before the first commit (covers
    // the older-git case where `init -b` was not available and the plain
    // `git init` fallback picked git's own default branch name).
    await this.run(["checkout", "-B", DEFAULT_BRANCH]);
    await this.commitRaw("init vault", {
      authorName: BOT_AUTHOR_NAME,
      authorEmail: BOT_AUTHOR_EMAIL,
      allowEmpty: true,
    });
  }
}
/**
 * Whether the vault path is inside a git work-tree, i.e. the vault has been
 * initialized. Any git failure counts as "not a repo".
 */
private async isRepo(): Promise<boolean> {
  const { code, stdout } = await this.runRaw(["rev-parse", "--is-inside-work-tree"]);
  if (code !== 0) return false;
  return stdout.trim() === "true";
}
/**
 * Whether `key` is set in the vault repo's LOCAL git config. A failing lookup
 * or an empty (whitespace-only) value counts as unset.
 */
private async hasLocalConfig(key: string): Promise<boolean> {
  const { code, stdout } = await this.runRaw(["config", "--local", "--get", key]);
  if (code !== 0) return false;
  return stdout.trim().length > 0;
}
/** Whether the repo has at least one commit, i.e. `HEAD` resolves. */
private async hasAnyCommit(): Promise<boolean> {
  const { code } = await this.runRaw(["rev-parse", "--verify", "HEAD"]);
  return code === 0;
}
/** Whether a local branch called `name` exists (`refs/heads/<name>` resolves). */
async branchExists(name: string): Promise<boolean> {
  const ref = `refs/heads/${name}`;
  const { code } = await this.runRaw(["rev-parse", "--verify", ref]);
  return code === 0;
}
/**
 * Create branch `name` starting at `fromBranch` unless it already exists. When
 * the branch is present nothing happens; the branch is never checked out here.
 */
async ensureBranch(name: string, fromBranch: string): Promise<void> {
  const alreadyPresent = await this.branchExists(name);
  if (!alreadyPresent) {
    await this.run(["branch", name, fromBranch]);
  }
}
/**
 * Bring back a MISSING `main` branch (bug D3-N1).
 *
 * Ref-store damage (a deleted `refs/heads/main`, a bad ref update) can leave an
 * existing repo without `main`. Every cycle would then throw
 * (`ensureBranch("docmost","main")` / `checkout main` -> "pathspec 'main' did
 * not match") and the space would stay wedged, because `ensureRepo` only
 * creates branches on a FRESH `git init`.
 *
 * The branch is restored from the best source available: the `docmost` mirror
 * branch when it exists (the two track each other), otherwise the current
 * `HEAD` commit. When the repo has no commit at all this does nothing — that
 * case belongs to `ensureRepo`'s fresh-init path.
 */
async ensureMainBranch(): Promise<void> {
  if (await this.branchExists(DEFAULT_BRANCH)) return;

  // Pick the start point: `docmost` first, else whatever HEAD resolves to.
  let startPoint: string | undefined;
  if (await this.branchExists("docmost")) {
    startPoint = "docmost";
  } else {
    const head = await this.runRaw(["rev-parse", "--verify", "--quiet", "HEAD"]);
    const sha = head.stdout.trim();
    if (head.code === 0 && sha.length > 0) {
      startPoint = sha;
    }
  }

  if (startPoint !== undefined) {
    await this.run(["branch", DEFAULT_BRANCH, startPoint]);
  }
}
/** Resolve the name of the branch that is currently checked out. */
async currentBranch(): Promise<string> {
  const branch = await this.run(["rev-parse", "--abbrev-ref", "HEAD"]);
  return branch;
}
/** Switch the working tree to the existing branch `name`; throws if git refuses. */
async checkout(name: string): Promise<void> {
  const argv = ["checkout", name];
  await this.run(argv);
}
/** Stage the whole working tree — additions, modifications and deletions. */
async stageAll(): Promise<void> {
  const argv = ["add", "-A"];
  await this.run(argv);
}
/**
 * Whether the vault is mid-merge, i.e. an unresolved merge was left behind by
 * a previous run (SPEC §9 / §12).
 *
 * Two signals are checked: a resolvable `MERGE_HEAD`, or any unmerged
 * (conflicted) index entries reported by `git ls-files -u`. Callers check this
 * BEFORE any checkout, since git refuses to check out over an unresolved index
 * ("you need to resolve your current index first").
 */
async isMergeInProgress(): Promise<boolean> {
  // MERGE_HEAD exists exactly while a merge is in progress.
  const mergeHead = await this.runRaw([
    "rev-parse",
    "--verify",
    "--quiet",
    "MERGE_HEAD",
  ]);
  const hasMergeHead =
    mergeHead.code === 0 && mergeHead.stdout.trim().length > 0;
  if (hasMergeHead) return true;

  // Belt-and-suspenders: unmerged index entries also mean the working tree is
  // mid-conflict and a checkout would refuse.
  const { code, stdout } = await this.runRaw(["ls-files", "-u"]);
  return code === 0 && stdout.trim().length > 0;
}
/**
 * Best-effort removal of STALE git lock files left behind by an interrupted git
 * operation (bug D3-N3). A left-over `.git/index.lock` or ref `*.lock` makes git
 * refuse every later operation, which would wedge the sync loop.
 *
 * Only a fixed, hardcoded list of lock paths under `<vault>/.git` is considered
 * (never a `*.lock` glob). A lock is deleted only when its mtime is at least
 * `STALE_LOCK_MIN_AGE_MS` old; a fresher lock is left in place so a lock that a
 * live git process may still hold is never removed.
 *
 * Never throws for an individual lock: a missing or unreadable lock, or a
 * failed removal, is silently skipped.
 */
async clearStaleGitLocks(): Promise<void> {
  const gitDir = `${this.vaultPath}/.git`;
  const candidates = [
    "index.lock",
    "HEAD.lock",
    "config.lock",
    "packed-refs.lock",
    "MERGE_HEAD.lock",
    "ORIG_HEAD.lock",
    "refs/heads/main.lock",
    "refs/heads/docmost.lock",
    "refs/docmost/last-pushed.lock",
  ];

  // Inspect one lock file and delete it when it is old enough to be stale.
  const removeIfStale = async (lockPath: string): Promise<void> => {
    try {
      const { mtimeMs } = await stat(lockPath);
      const ageMs = Date.now() - mtimeMs;
      // Fresh locks (age below the threshold) are deliberately preserved.
      if (ageMs >= STALE_LOCK_MIN_AGE_MS) {
        await rm(lockPath, { force: true });
      }
    } catch {
      // Missing lock (ENOENT), unreadable lock, or failed removal — skip it.
    }
  };

  // All candidates are probed concurrently; each one swallows its own errors.
  await Promise.all(
    candidates.map((relative) => removeIfStale(`${gitDir}/${relative}`)),
  );
}
/**
 * Commit whatever is currently STAGED, using an explicit author/committer
 * identity and appending the given trailers to the message (SPEC §7.3).
 *
 * The caller stages first (e.g. via `stageAll`). When nothing is staged the
 * call is a graceful no-op: a deterministic re-pull of unchanged pages yields
 * no diff (SPEC §11) and must not raise.
 *
 * @param message Commit subject/body, before trailers are appended.
 * @param opts Author identity and optional trailers.
 * @returns `true` when a commit was created, `false` when the index matched
 *   HEAD and nothing was committed.
 */
async commit(message: string, opts: CommitOptions): Promise<boolean> {
  // `git diff --cached --quiet` exits 0 when the index equals HEAD and 1 when
  // there are staged changes.
  const probe = await this.runRaw(["diff", "--cached", "--quiet"]);
  const nothingStaged = probe.code === 0;
  if (!nothingStaged) {
    await this.commitRaw(message, opts);
    return true;
  }
  return false;
}
/**
 * Shared low-level commit, used by `commit` and `commitMerge` in this class
 * (the original notes say `ensureRepo`'s initial commit uses it too).
 *
 * Builds the final message via `buildCommitMessage` (subject + trailers) and
 * supplies the author AND committer identity through environment variables, so
 * the committer equals the author rather than the repo default.
 *
 * @param message Commit subject/body, before trailers are appended.
 * @param opts Identity, optional trailers, and `allowEmpty` to permit a commit
 *   with no tree change.
 * @throws Error with git's stderr (or stdout) when `git commit` exits non-zero.
 */
private async commitRaw(
  message: string,
  opts: CommitOptions & { allowEmpty?: boolean },
): Promise<void> {
  // `--no-verify` keeps any configured pre-commit/commit-msg hook from
  // interfering with engine commits in the dedicated vault repo.
  const args = [
    "commit",
    "--no-verify",
    "-m",
    buildCommitMessage(message, opts.trailers),
  ];
  if (opts.allowEmpty) args.push("--allow-empty");

  // Same name/email for author and committer.
  const identity = {
    GIT_AUTHOR_NAME: opts.authorName,
    GIT_AUTHOR_EMAIL: opts.authorEmail,
    GIT_COMMITTER_NAME: opts.authorName,
    GIT_COMMITTER_EMAIL: opts.authorEmail,
  };

  const result = await this.runRaw(args, { env: identity });
  if (result.code === 0) return;

  const detail = (result.stderr || result.stdout || "").trim();
  throw new Error(`git ${args.join(" ")} failed: ${detail}`);
}
/**
 * Merge `fromBranch` into the currently checked-out branch with
 * `git merge --no-edit`.
 *
 * A conflict is SURFACED to the caller, never auto-resolved here (SPEC §9):
 * whatever state git leaves behind after a failed merge stays in place for the
 * caller to handle.
 *
 * @param fromBranch Branch (or commit-ish) to merge in.
 * @returns `{ ok: true, conflict: false }` on exit code 0. On a non-zero exit,
 *   `ok` is `false` and `conflict` is `true` only when unmerged paths are
 *   actually present, so an unrelated failure is not mislabelled as a
 *   conflict. `output` is git's stdout and stderr joined by a newline, trimmed.
 */
async merge(fromBranch: string): Promise<MergeResult> {
  const result = await this.runRaw(["merge", "--no-edit", fromBranch]);
  const output = `${result.stdout}\n${result.stderr}`.trim();

  if (result.code !== 0) {
    // Confirm the failure really is a conflict before reporting it as one.
    const conflict = await this.hasUnmergedPaths();
    return { ok: false, conflict, output };
  }
  return { ok: true, conflict: false, output };
}
/**
 * Check for unmerged (conflicted) paths using
 * `git diff --name-only --diff-filter=U`.
 *
 * @returns `true` only when the command succeeds AND lists at least one path.
 */
private async hasUnmergedPaths(): Promise<boolean> {
  const probe = await this.runRaw(["diff", "--name-only", "--diff-filter=U"]);
  if (probe.code !== 0) return false;
  return probe.stdout.trim().length > 0;
}
/**
 * List the vault-relative paths that have UNMERGED (conflicted) index entries
 * after a conflicting merge.
 *
 * Output is requested NUL-delimited (`-z`) and split on `\0`, so paths with
 * spaces or non-ASCII characters come back verbatim. The pull cycle uses the
 * result to log and isolate the conflicted page(s) (SPEC §9 wedge fix).
 *
 * @returns The conflicted paths, or `[]` when the diff command exits non-zero
 *   (best-effort diagnostics; never throws on a git failure).
 */
async listUnmergedPaths(): Promise<string[]> {
  const result = await this.runRaw([
    "diff",
    "--name-only",
    "--diff-filter=U",
    "-z",
  ]);
  return result.code === 0
    ? result.stdout.split("\0").filter((entry) => entry !== "")
    : [];
}
/**
 * Conclude an IN-PROGRESS (conflicted) merge by committing it AS-IS, so the
 * vault is not left wedged mid-merge (SPEC §9 wedge fix).
 *
 * Everything is staged with `git add -A` — conflicted files included, with
 * whatever content they currently hold — and the commit is then recorded
 * through `commitRaw`.
 *
 * `allowEmpty` is forced on so the commit still goes through when the staged
 * result nets to no tree change, which clears the half-merge state.
 *
 * @param message Commit subject/body, before trailers are appended.
 * @param opts Author identity and optional trailers.
 * @throws Error when staging or the commit itself fails.
 */
async commitMerge(message: string, opts: CommitOptions): Promise<void> {
  await this.run(["add", "-A"]);
  const mergeCommitOpts = { ...opts, allowEmpty: true };
  await this.commitRaw(message, mergeCommitOpts);
}
/**
 * Run `git merge --abort` to back out of an in-progress merge.
 *
 * Best-effort: the exit code is deliberately ignored, so calling this when no
 * merge is in progress is harmless. It belongs to the cycle's recovery path
 * for a vault that an earlier cycle left mid-merge (SPEC §9 wedge recovery).
 */
async abortMerge(): Promise<void> {
  const args = ["merge", "--abort"];
  await this.runRaw(args);
}
/**
 * Run `git reset --hard HEAD`, discarding working-tree and index changes.
 *
 * Recovery primitive (SPEC §9) for a stray half-merge that `merge --abort`
 * could not clear: no `MERGE_HEAD`, yet unmerged entries linger. Best-effort:
 * the exit code is ignored.
 */
async resetHardToHead(): Promise<void> {
  const args = ["reset", "--hard", "HEAD"];
  await this.runRaw(args);
}
/**
 * List the files git tracks on the current branch, as vault-relative paths.
 *
 * Vault file names routinely contain Cyrillic (the target wiki is Russian).
 * With git's default `core.quotepath=true`, `ls-files` would octal-escape and
 * quote such paths, which downstream parsing would read as garbage. Two
 * measures prevent that:
 *   - `core.quotepath=false`, which this file's notes say `runRaw` prepends to
 *     every invocation (so it is not repeated here);
 *   - `-z`, which makes git emit NUL-delimited raw UTF-8 paths.
 * The raw stdout is split on `\0` (the trimming `run()` helper is avoided on
 * purpose) and empty entries are dropped.
 *
 * @param glob Optional git pathspec narrowing the listing, e.g. `"*.md"`. An
 *   empty string is treated the same as omitting it.
 * @returns Tracked paths exactly as git printed them.
 * @throws Error when `git ls-files` exits non-zero.
 */
async listTrackedFiles(glob?: string): Promise<string[]> {
  const args = ["ls-files", "-z"];
  if (glob) args.push(glob);

  const result = await this.runRaw(args);
  if (result.code !== 0) {
    const detail = (result.stderr || result.stdout || "").trim();
    throw new Error(`git ls-files failed: ${detail}`);
  }
  return result.stdout.split("\0").filter((entry) => entry.length > 0);
}
/**
 * Diff two refs with `git diff --name-status -M -z` and parse the result into
 * structured entries (SPEC §6: the push direction diffs `main` against
 * `refs/docmost/last-pushed`).
 *
 * `-M` turns rename detection on, so a moved file arrives as ONE `R` row with
 * both paths rather than a delete+add pair — the push planner depends on that
 * to tell a move from a delete+create (SPEC §8).
 *
 * `-z` yields NUL-delimited raw UTF-8 fields with no quoting. Record shapes:
 *   - A / M / D: `status\0path\0`
 *   - R / C:     `Rnnn\0oldPath\0newPath\0` (nnn = similarity score)
 * The raw stdout is split on `\0`, empty fields are dropped, and the fields are
 * walked consuming one or two path fields per status field.
 *
 * @param fromRef Left side of the diff.
 * @param toRef Right side of the diff.
 * @returns One entry per A/M/D/R/C record, paths verbatim. Records with any
 *   other status are skipped; a truncated trailing record ends the walk.
 * @throws Error when `git diff` exits non-zero.
 */
async diffNameStatus(
  fromRef: string,
  toRef: string,
): Promise<DiffEntry[]> {
  const result = await this.runRaw([
    "diff",
    "--name-status",
    "-M",
    "-z",
    fromRef,
    toRef,
  ]);
  if (result.code !== 0) {
    const detail = (result.stderr || result.stdout || "").trim();
    throw new Error(`git diff --name-status failed: ${detail}`);
  }

  // Every status code and every path is its own NUL-delimited field.
  const fields = result.stdout.split("\0").filter((f) => f.length > 0);
  const entries: DiffEntry[] = [];
  let cursor = 0;

  while (cursor < fields.length) {
    const statusField = fields[cursor++];
    // First character = change kind; any digits after it = similarity score.
    const kind = statusField[0] as DiffEntry["status"];

    switch (kind) {
      case "R":
      case "C": {
        const score = Number.parseInt(statusField.slice(1), 10);
        const oldPath = fields[cursor++];
        const path = fields[cursor++];
        // Malformed tail: stop and return what was parsed so far.
        if (oldPath === undefined || path === undefined) return entries;
        entries.push({
          status: kind,
          path,
          oldPath,
          ...(Number.isFinite(score) ? { score } : {}),
        });
        break;
      }
      case "A":
      case "M":
      case "D": {
        const path = fields[cursor++];
        // Malformed tail: stop and return what was parsed so far.
        if (path === undefined) return entries;
        entries.push({ status: kind, path });
        break;
      }
      default:
        // Any other status (e.g. T type-change, U unmerged): skip its single
        // path field so the walk stays aligned, and emit nothing for it.
        cursor++;
    }
  }
  return entries;
}
/**
 * Resolve a ref or commit-ish to the SHA that
 * `git rev-parse --verify --quiet` prints for it.
 *
 * An unknown ref makes the command exit non-zero with no output, which maps
 * cleanly to `null`. Used, for example, to read `refs/docmost/last-pushed`
 * (SPEC §5), which does not exist before the first push.
 *
 * @param ref Ref name or commit-ish to resolve.
 * @returns The trimmed SHA, or `null` when the command fails or prints nothing.
 */
async revParse(ref: string): Promise<string | null> {
  const probe = await this.runRaw(["rev-parse", "--verify", "--quiet", ref]);
  const sha = probe.code === 0 ? probe.stdout.trim() : "";
  return sha === "" ? null : sha;
}
/**
 * Read a ref's SHA. This is a thin, intention-revealing alias of `revParse`,
 * named for the push marker `refs/docmost/last-pushed` (SPEC §5).
 *
 * @param ref Ref to read.
 * @returns The SHA, or `null` when the ref is unset.
 */
async readRef(ref: string): Promise<string | null> {
  const sha = await this.revParse(ref);
  return sha;
}
/**
 * Point `ref` at `target` via `git update-ref <ref> <target>`.
 *
 * Used to advance `refs/docmost/last-pushed` to the just-pushed `main` commit
 * (SPEC §6 step 3 / §5).
 *
 * @param ref Full ref name to move.
 * @param target A SHA or any commit-ish git accepts.
 */
async updateRef(ref: string, target: string): Promise<void> {
  const args = ["update-ref", ref, target];
  await this.run(args);
}
/**
 * Advance `branch` to `toCommit`, but ONLY when that is a true fast-forward:
 * the current tip of `branch` must be an ancestor of `toCommit`, as verified
 * with `git merge-base --is-ancestor`.
 *
 * Purpose (SPEC §6 step 3 / §10): after a clean push the `docmost` mirror must
 * reflect the pushed `main` content, otherwise the next pull would diff our own
 * write back and re-pull it.
 *
 * Safety — never force, never clobber divergent history:
 *   - ancestor check passes: the ref `refs/heads/<branch>` is moved directly
 *     with `update-ref`, so no checkout or working-tree change is involved;
 *   - ancestor check fails: the branch is left untouched and the refusal is
 *     reported to the caller.
 *
 * @param branch Short branch name (without `refs/heads/`).
 * @param toCommit Commit-ish the branch should be advanced to.
 * @returns `{ ok: true }` when the branch was advanced or already sat at the
 *   target; otherwise `{ ok: false, reason }`, where `reason` is
 *   `"not-fast-forward"` or a message naming the endpoint that did not resolve.
 */
async fastForwardBranch(
  branch: string,
  toCommit: string,
): Promise<{ ok: boolean; reason?: string }> {
  const refuse = (reason: string): { ok: boolean; reason?: string } => ({
    ok: false,
    reason,
  });
  const fullRef = `refs/heads/${branch}`;

  // Resolve both endpoints up front so a missing one is a clean refusal rather
  // than a confusing `merge-base` failure.
  const currentTip = await this.revParse(fullRef);
  if (currentTip === null) return refuse(`branch ${branch} does not exist`);

  const desiredTip = await this.revParse(toCommit);
  if (desiredTip === null) return refuse(`target ${toCommit} does not resolve`);

  // Identical tips are a degenerate fast-forward: nothing to move, still ok.
  if (currentTip !== desiredTip) {
    // `merge-base --is-ancestor A B` exits 0 iff A is an ancestor of B.
    const ancestry = await this.runRaw([
      "merge-base",
      "--is-ancestor",
      currentTip,
      desiredTip,
    ]);
    if (ancestry.code !== 0) return refuse("not-fast-forward");

    await this.updateRef(fullRef, desiredTip);
  }
  return { ok: true };
}
/**
 * Read a file's content as of a given ref via `git show <ref>:<path>`.
 *
 * The push direction uses this to read the PRE-IMAGE of a deleted file (for
 * example at `refs/docmost/last-pushed`), so its `docmost:meta` — and with it
 * the `pageId` — can be recovered to turn the deletion into a page delete
 * (SPEC §6/§8).
 *
 * @param ref Ref or commit-ish to read from.
 * @param path Repo-root-relative, forward-slash path, passed verbatim.
 * @returns The raw stdout of `git show`, or `null` on ANY non-zero exit
 *   (typically: the path does not exist at that ref). Never throws for that.
 */
async showFileAtRef(ref: string, path: string): Promise<string | null> {
  const objectSpec = `${ref}:${path}`;
  const result = await this.runRaw(["show", objectSpec]);
  return result.code === 0 ? result.stdout : null;
}
/**
 * Read ONE side of a conflicted file straight from the merge index via
 * `git show :<stage>:<path>`. The stage is the standard 3-way merge slot:
 *   1 = merge BASE (common ancestor),
 *   2 = OURS (the current branch, i.e. `main`),
 *   3 = THEIRS (the merged-in branch, i.e. `docmost`).
 *
 * The pull cycle (SPEC §9) uses this to resolve a conflicted docmost->main
 * merge deterministically instead of committing raw conflict markers.
 *
 * @param stage Index stage to read.
 * @param path Repo-root-relative path of the conflicted file.
 * @returns The blob text, or `null` when the command exits non-zero — for
 *   example when that stage is absent (an add/add conflict has no base).
 */
async showStage(stage: 1 | 2 | 3, path: string): Promise<string | null> {
  const indexSpec = `:${stage}:${path}`;
  const result = await this.runRaw(["show", indexSpec]);
  return result.code === 0 ? result.stdout : null;
}
/**
 * Point the repo's symbolic `HEAD` at the default branch with
 * `git symbolic-ref HEAD refs/heads/<DEFAULT_BRANCH>`.
 *
 * Rationale from the original notes: the smart-HTTP host advertises whatever
 * `HEAD` resolves to as a clone's default branch, so a clone racing a cycle
 * that has transiently checked out the read-only `docmost` mirror would
 * otherwise default to `docmost`. Pinning makes the advertised symref
 * deterministic.
 *
 * CAUTION: this command is issued without any checkout, so per the original
 * notes it only rewrites `.git/HEAD` and must run ONLY while the working tree
 * is already on the default branch (between cycles, under the per-space lock).
 * Otherwise HEAD and the index would desync.
 */
async pinHeadToMain(): Promise<void> {
  const target = `refs/heads/${DEFAULT_BRANCH}`;
  await this.run(["symbolic-ref", "HEAD", target]);
}
}
/**
 * Build the environment for a vault git invocation (SPEC §12 cwd-isolation).
 * Per the original notes this feeds the single `runRaw` primitive every git
 * command flows through, so the pins apply uniformly.
 *
 * cwd-isolation is the central safety guarantee: every git command MUST
 * operate on the vault repo at `cwd: vaultPath` and nothing else. Git lets
 * several environment variables override where the repository, its index, its
 * object store and its refs live. An inherited value (for instance when the
 * process is started from inside a git hook, which exports `GIT_DIR` and
 * `GIT_INDEX_FILE`) would silently redirect the operation away from `cwd`.
 *
 * Handling, in order:
 *   1. Start from a copy of `process.env` (the real environment is not mutated).
 *   2. Drop INHERITED `GIT_INDEX_FILE`, `GIT_OBJECT_DIRECTORY`,
 *      `GIT_COMMON_DIR` and `GIT_NAMESPACE`.
 *   3. Pin locale, pager and prompt behaviour.
 *   4. Apply the caller's `extra` (author/committer identity, etc.), which may
 *      override the pins and anything inherited.
 *   5. ALWAYS drop `GIT_DIR` and `GIT_WORK_TREE`, even if `extra` supplied them.
 *
 * Exported for unit testing.
 *
 * @param extra Additional variables merged on top of the pinned baseline.
 * @returns A fresh environment object for the child git process.
 */
export function vaultGitEnv(
  extra?: Record<string, string>,
): NodeJS.ProcessEnv {
  const env: NodeJS.ProcessEnv = { ...process.env };

  // Inherited overrides that relocate the index, object store, common dir or
  // ref namespace would defeat cwd-isolation just like GIT_DIR does. Only the
  // INHERITED values are removed here; `extra` is applied afterwards.
  const inheritedRedirects = [
    "GIT_INDEX_FILE",
    "GIT_OBJECT_DIRECTORY",
    "GIT_COMMON_DIR",
    "GIT_NAMESPACE",
  ];
  for (const name of inheritedRedirects) {
    delete env[name];
  }

  Object.assign(
    env,
    {
      // Locale-independent output (defense in depth): keeps any git message we
      // might key on from being translated by an inherited LC_ALL/LANG.
      LC_ALL: "C",
      LANG: "C",
      // Never page and never block on an interactive prompt (e.g. for
      // credentials) — the daemon runs unattended and must not hang.
      GIT_PAGER: "cat",
      GIT_TERMINAL_PROMPT: "0",
    },
    extra,
  );

  // Stripped unconditionally, regardless of what the caller added.
  delete env.GIT_DIR;
  delete env.GIT_WORK_TREE;
  return env;
}
/**
 * Compose a commit message from a subject and optional trailer lines
 * (SPEC §7.3). A single blank line separates the trailers from the subject so
 * that `git interpret-trailers` / `git log --format=%(trailers)` recognise
 * them as trailers.
 *
 * Exported for unit testing.
 *
 * @param subject The message text that precedes the trailers.
 * @param trailers Trailer lines, one per entry; omitted or empty means none.
 * @returns `subject` unchanged when there are no trailers; otherwise the
 *   subject, a blank line, then the trailers joined by newlines.
 */
export function buildCommitMessage(
  subject: string,
  trailers?: string[],
): string {
  const trailerLines = trailers ?? [];
  if (trailerLines.length === 0) return subject;
  return [subject, "", ...trailerLines].join("\n");
}
+202
View File
@@ -0,0 +1,202 @@
/**
* Pure page-tree -> vault path mapping (SPEC §12).
*
* Given the flat list of page nodes for a space (as returned by
* `listAllSpacePages`), compute for every page a deterministic, collision-free
* destination: a folder path (root -> leaf ancestors) plus a file stem (the
* page's own name, no extension). This module is intentionally PURE and
* dependency-free apart from the sanitization helpers, so the whole tree ->
* path logic is unit-testable without any I/O. The names are COSMETIC; identity
* lives in each file's meta block (pageId / slugId).
*/
import { sanitizeTitle, disambiguate } from "./sanitize.js";
/**
 * Flat page node as returned by `listAllSpacePages` (no content). This is the
 * input element type of `buildVaultLayout`.
 */
export interface PageNode {
// Page identity; the layout map is keyed by this, and nodes without it are skipped.
id: string;
// Display title; sanitized before use, with a missing title treated as "".
title?: string;
// Preferred disambiguation key for colliding names; `id` is used when absent.
slugId?: string;
// Parent page id. Null/absent, or an id not present in the input set, places
// the page at the vault root.
parentPageId?: string | null;
// When truthy, the page's file is relocated into a folder named after itself
// (folder-note layout).
hasChildren?: boolean;
}
/**
 * A page's resolved vault destination: folder path + file stem. The full
 * destination key is `[...segments, stem].join("/")`; `buildVaultLayout`
 * aims to make that key unique for every page.
 */
export interface VaultEntry {
/** Folder path, root -> leaf (the page's ancestors). Empty for a root page. */
segments: string[];
/** The page's own file name without extension. */
stem: string;
}
/**
 * Build the full vault layout for a space.
 *
 * Returns a Map keyed by pageId -> `{ segments, stem }`. The result is
 * deterministic for a given input and is designed so that every full
 * destination path (`[...segments, stem].join("/")`) is unique, so no page can
 * silently overwrite another.
 *
 * Disambiguation is layered:
 * 1. Sibling collisions (same sanitized title under the same parent) are
 * resolved with a stable ` ~<slugId>` suffix (the suffix is itself
 * sanitized, since slugId/id is untrusted data that must never inject a
 * path separator).
 * 2. A final full-path pass catches residual collisions that sibling-scoping
 * cannot see — e.g. two pages whose parents are BOTH outside the input set
 * both bucket at the root with `segments: []`.
 *
 * @param pages Flat list of page nodes. Falsy entries and entries without an
 * `id` are skipped; for a duplicated `id` the first occurrence wins. The
 * array and its nodes are only read, never mutated (the one sort below runs
 * on a filtered copy).
 * @returns Map from page id to its freshly created `{ segments, stem }` entry.
 * @throws Error if a page id has no resolved name (internal invariant guard in
 * `nameOf`); any error thrown by the `sanitizeTitle` / `disambiguate` helpers
 * would also propagate.
 */
export function buildVaultLayout(pages: PageNode[]): Map<string, VaultEntry> {
// Index pages by id so the parent chain can be walked. Guard against
// duplicate ids in the input (first one wins).
const byId = new Map<string, PageNode>();
for (const p of pages) {
if (p && p.id && !byId.has(p.id)) byId.set(p.id, p);
}
// Resolve each node's display name once, deterministically. The bucket key is
// the node's parent ONLY when that parent is actually present in `byId`;
// otherwise (null parent, or an orphan whose parent is outside the input set)
// the node buckets at `"__root__"`. This is critical: orphans land at the vault
// root (see `folderSegmentsFor`), so they MUST share the root bucket with real
// root pages to be disambiguated against each other here — making `nameById`
// final before any `segments` are computed, so no ancestor name can drift.
const parentKeyOf = (p: PageNode): string =>
p.parentPageId && byId.has(p.parentPageId) ? p.parentPageId : "__root__";
// Group nodes by (parentKey, sanitized base title) so sibling collisions are
// resolved by a STABLE rule that does NOT depend on input array order. Dedupe
// ids (first occurrence wins, matching `byId`).
// The group key joins the two parts with a NUL character as separator.
const siblingGroups = new Map<string, PageNode[]>();
const namedIds = new Set<string>();
for (const p of pages) {
if (!p || !p.id || namedIds.has(p.id)) continue;
namedIds.add(p.id);
const key = `${parentKeyOf(p)}\u0000${sanitizeTitle(p.title ?? "")}`;
const bucket = siblingGroups.get(key);
if (bucket) bucket.push(p);
else siblingGroups.set(key, [p]);
}
// Assign each node its display name. Within a colliding group, sort the
// siblings by their stable disambiguation key (`slugId` else `id`) and let the
// FIRST keep the bare sanitized title; every OTHER gets the ` ~<slugId>`
// suffix. This makes `nameById` a pure function of the page SET — reordering
// the input never moves the suffix onto a different page (red-team #4a). The
// suffix is itself sanitized (the slugId/id is untrusted and must never inject
// a path separator).
const nameById = new Map<string, string>();
const disambKeyOf = (p: PageNode): string => p.slugId ?? p.id;
for (const bucket of siblingGroups.values()) {
// Every member of a bucket shares the same sanitized title by construction
// of the group key, so the first member's title is representative.
const base = sanitizeTitle(bucket[0].title ?? "");
if (bucket.length === 1) {
nameById.set(bucket[0].id, base);
continue;
}
// Sort a COPY by plain code-unit comparison of the disambiguation key.
const sorted = [...bucket].sort((a, b) => {
const ka = disambKeyOf(a);
const kb = disambKeyOf(b);
return ka < kb ? -1 : ka > kb ? 1 : 0;
});
sorted.forEach((p, i) => {
nameById.set(
p.id,
i === 0 ? base : disambiguate(base, sanitizeTitle(disambKeyOf(p))),
);
});
}
// Every id we index above MUST get a resolved name; this helper returns it
// and THROWS if it is somehow absent, rather than silently recomputing a
// DIFFERENT, non-disambiguated name (which would desync a folder segment from
// its target file).
const nameOf = (id: string): string => {
const name = nameById.get(id);
if (name === undefined) {
throw new Error(`buildVaultLayout: no resolved name for page id ${id}`);
}
return name;
};
// Build the folder path for a page by walking parentPageId to the root. The
// page's OWN name is the file stem; its ancestors become folders. A `visited`
// guard prevents an infinite loop on a malformed parent cycle.
// NOTE(review): `visited` starts empty — the starting node itself is not
// marked — so on a parent cycle the walk can add the node's own name once as
// an "ancestor" before it terminates. Confirm that is the intended layout for
// malformed input.
const folderSegmentsFor = (node: PageNode): string[] => {
const ancestors: string[] = [];
const visited = new Set<string>();
let current: PageNode | undefined = node.parentPageId
? byId.get(node.parentPageId)
: undefined;
while (current && current.id && !visited.has(current.id)) {
visited.add(current.id);
// unshift keeps the list ordered root -> leaf while walking leaf -> root.
ancestors.unshift(nameOf(current.id));
current = current.parentPageId
? byId.get(current.parentPageId)
: undefined;
}
return ancestors;
};
// First pass: compute the provisional { segments, stem } for every node.
const layout = new Map<string, VaultEntry>();
for (const p of pages) {
if (!p || !p.id || layout.has(p.id)) continue;
layout.set(p.id, {
segments: folderSegmentsFor(p),
stem: nameOf(p.id),
});
}
// FOLDER-NOTE transform (native-Obsidian layout): a page WITH CHILDREN lives at
// `<…>/<stem>/<stem>.md` — its body is the folder-note INSIDE its own folder
// (LostPaul Folder Notes convention), and its children sit alongside it in that
// folder. A leaf stays `<…>/<stem>.md`. Children's segments already point into
// the parent's folder (folderSegmentsFor walks ancestor NAMES), so only the
// parent's own file relocates here; the sibling name pass above already made
// the parent name unique, so folder == file name stays consistent.
// NOTE(review): this loop does not dedupe ids. For an id that appears twice
// with `hasChildren` set on both occurrences, the stem would be appended
// twice — confirm duplicate ids cannot reach this point in practice.
for (const p of pages) {
if (!p || !p.id) continue;
const entry = layout.get(p.id);
if (entry && p.hasChildren) {
entry.segments = [...entry.segments, entry.stem];
}
}
// Final full-path uniqueness pass — a belt-and-suspenders safety net. Note
// that cross-bucket (orphan/root) collisions are now resolved in the name pass
// above (orphans share the "__root__" bucket), so ancestor names are final
// before `segments` are built and this pass should rarely/never re-stem an
// ancestor. It only re-stems the colliding LATER leaf via the sanitized
// slugId/id, then (if still colliding) appends the id.
//
// Process FOLDER-NOTES (pages with children) FIRST so a parent claims its
// canonical `<name>/<name>.md` before a same-named CHILD — the child (a leaf)
// is the one that disambiguates, never the folder-note.
const usedPaths = new Set<string>();
const seenIds = new Set<string>();
const pathKey = (e: VaultEntry): string => [...e.segments, e.stem].join("/");
// `filter` returns a new array, so the sort below never reorders the caller's
// `pages`. The comparator only separates has-children from leaves; order
// within each group relies on Array.prototype.sort being stable (ES2019+).
const ordered = pages
.filter((p): p is PageNode => Boolean(p && p.id))
.sort(
(a, b) =>
Number(Boolean(b.hasChildren)) - Number(Boolean(a.hasChildren)),
);
for (const p of ordered) {
if (seenIds.has(p.id)) continue;
seenIds.add(p.id);
const entry = layout.get(p.id);
if (!entry) continue;
if (usedPaths.has(pathKey(entry))) {
// First attempt: disambiguate the stem with the sanitized slugId (or id).
entry.stem = disambiguate(entry.stem, sanitizeTitle(p.slugId ?? p.id));
if (usedPaths.has(pathKey(entry))) {
// Still colliding: append the (sanitized) id as a last resort. The id
// is globally unique, so this is expected to resolve the collision.
// NOTE(review): there is no re-check after this second attempt, and the
// guarantee presumes `sanitizeTitle` keeps distinct ids distinct — verify.
entry.stem = disambiguate(entry.stem, sanitizeTitle(p.id));
}
}
usedPaths.add(pathKey(entry));
}
return layout;
}
@@ -0,0 +1,29 @@
/**
* Loop-guard primitives (SPEC §10). The sync engine must never re-pull its OWN
* write as if it were a remote edit: after a push, the next poll will see the
* page it just wrote with a fresh `updatedAt`. To suppress that, we key on two
* signals — the body HASH of what we pushed (this module) and the `updatedAt`
* returned by the write — recorded per page at push time.
*
* This module owns the PURE, deterministic body-hash. The CONSUMPTION on the
* pull side (comparing an incoming page's body hash against the last pushed hash
* to decide "this is our own write, ignore it") is a future increment — here we
* only PRODUCE the hash and the per-page push record (see `src/push.ts`).
*/
import { createHash } from "node:crypto";
/**
 * Compute the stable hash of a page's markdown BODY (SPEC §10 "body hash"),
 * used later to recognise our own write (loop suppression).
 *
 * The string is hashed exactly as given: encoded as UTF-8, digested with
 * SHA-256, returned as lowercase hex. NO normalization is applied, so a caller
 * that wants equal hashes across cosmetic-only differences must pass an
 * already-canonical representation.
 *
 * @param markdownBody The exact body text to fingerprint.
 * @returns The 64-character lowercase hex SHA-256 digest.
 */
export function bodyHash(markdownBody: string): string {
  const hasher = createHash("sha256");
  hasher.update(markdownBody, "utf8");
  return hasher.digest("hex");
}
+132
View File
@@ -0,0 +1,132 @@
/**
* Vault path guard (security, defense-in-depth).
*
* A user with push access to a git-sync space could commit a `.md` entry that is
* a SYMLINK (e.g. `leak.md -> /etc/passwd` or `-> <server>/.env`). On the next
* cycle a naive `fs.readFile` would follow the link and PUBLISH the target's
* contents as a Docmost page (a read primitive that escalates a writer to
* arbitrary server-file disclosure — including the JWT secret / DB creds in
* `.env`); a symlinked DIRECTORY gives the inverse write-outside-the-vault
* primitive on pull. The primary defense is `core.symlinks=false` in each
* vault's git config (git then materializes a pushed symlink as a PLAIN FILE
* holding the link text, never a real link). This module is the second layer:
* before every engine read/write/mkdir we reject a path that IS — or traverses —
* a symlink, or whose real location escapes the vault root.
*
* IO-free by construction: the `lstat`/`realpath` primitives are injected
* (mirroring the rest of the engine) so the rules are unit-testable with fakes
* and the engine never imports `node:fs`. Path math uses `node:path`, which is
* pure.
*/
import { isAbsolute, relative, resolve, sep } from "node:path";
/**
 * Why a path was refused:
 * - `"symlink"` — the path is, or traverses, a symlink under the vault;
 * - `"escape"` — the path resolves outside the vault root.
 */
export type VaultPathUnsafeReason = "symlink" | "escape";
/**
 * Error raised when the guard refuses a path. The original notes explain the
 * choice of an exception: the engine's read/write loops already isolate
 * per-file errors (skip + log), so throwing gives "skip + log" behaviour
 * without a separate control channel.
 *
 * The offending path, the refusal reason and the vault root are exposed as
 * readonly properties for callers that want to log or branch on them.
 */
export class VaultPathUnsafeError extends Error {
  /** Render the human-readable message for a refusal. */
  private static describe(
    absPath: string,
    reason: VaultPathUnsafeReason,
    vaultRoot: string,
  ): string {
    const prefix = `git-sync: refusing to access '${absPath}' — `;
    const suffix = `vault '${vaultRoot}' (symlink guard)`;
    if (reason === "symlink") {
      return `${prefix}it is (or traverses) a symlink under ${suffix}`;
    }
    return `${prefix}it resolves outside ${suffix}`;
  }

  /**
   * @param absPath The path that was refused, as given by the caller.
   * @param reason Which rule rejected it.
   * @param vaultRoot The vault root the path was checked against.
   */
  constructor(
    readonly absPath: string,
    readonly reason: VaultPathUnsafeReason,
    readonly vaultRoot: string,
  ) {
    super(VaultPathUnsafeError.describe(absPath, reason, vaultRoot));
    this.name = "VaultPathUnsafeError";
  }
}
/**
 * The injected IO the guard needs. Both MUST resolve to `null` on ENOENT (the
 * normal case for a not-yet-created file on a write/mkdir) and reject on any
 * other error. Injecting these keeps the guard's rules testable with fakes,
 * without touching a real filesystem.
 */
export interface PathGuardIo {
/**
 * lstat WITHOUT following the final symlink. `null` when the path is absent.
 * Only the `isSymbolicLink` flag (a boolean property here, not a method) is
 * part of this contract.
 */
lstat: (absPath: string) => Promise<{ isSymbolicLink: boolean } | null>;
/** realpath (follows symlinks). `null` when the path is absent. */
realpath: (absPath: string) => Promise<string | null>;
}
/**
 * Purely lexical containment test: does `target` name `root` itself or
 * something nested beneath it? No IO is performed. Both arguments go through
 * `resolve` first, so `.`/`..` segments are collapsed before the comparison —
 * which is what exposes a `..` traversal hidden inside a relative path.
 *
 * @param root The directory that must contain `target`.
 * @param target The path being tested.
 * @returns `true` when `target` resolves to `root` or to a descendant of it.
 */
export function isWithinRoot(root: string, target: string): boolean {
  const base = resolve(root);
  const candidate = resolve(target);
  if (candidate === base) {
    return true;
  }

  const hop = relative(base, candidate);
  // An empty hop for two non-identical strings is not treated as contained.
  if (hop.length === 0) {
    return false;
  }
  // A hop that is `..` or begins with `../` climbs out of the root. A name
  // that merely starts with two dots (e.g. `..notes`) does not.
  if (hop === ".." || hop.startsWith(`..${sep}`)) {
    return false;
  }
  // An absolute hop means there is no relative route from the root at all.
  return !isAbsolute(hop);
}
/**
 * Guard a filesystem access: resolves quietly when `absPath` is safe to touch
 * and throws a `VaultPathUnsafeError` otherwise. Three checks run in order:
 *
 *   1. lexical — the resolved path must equal, or sit under, the resolved
 *      `vaultRoot` (stops a `..` traversal before any IO happens);
 *   2. lstat walk — no EXISTING segment below the root, down to and including
 *      the target, may be a symlink;
 *   3. realpath — the deepest existing ancestor of the target must resolve
 *      inside the realpath of the root.
 *
 * A missing tail is fine (the normal case when creating a NEW file or
 * directory): the walk stops at the first absent segment, since there is
 * nothing there to follow.
 *
 * @param io Injected `lstat`/`realpath`; both report an absent path as `null`.
 *   Any rejection from them propagates to the caller.
 * @param vaultRoot Root directory every access must stay inside.
 * @param absPath The path about to be read, written or created.
 * @throws VaultPathUnsafeError with reason `"escape"` from checks 1 and 3
 *   (carrying `absPath` as given) or `"symlink"` from check 2 (carrying the
 *   offending segment's path).
 */
export async function assertVaultPathSafe(
  io: PathGuardIo,
  vaultRoot: string,
  absPath: string,
): Promise<void> {
  const rootAbs = resolve(vaultRoot);
  const targetAbs = resolve(absPath);

  // Check 1 — lexical containment; a `..` escape never reaches an lstat.
  if (!isWithinRoot(rootAbs, targetAbs)) {
    throw new VaultPathUnsafeError(absPath, "escape", vaultRoot);
  }

  // Check 2 — descend from the root one segment at a time. A symlinked
  // ancestor and a symlinked target are equally dangerous, so the link itself
  // is rejected wherever it appears. When the target IS the root there are no
  // segments to walk.
  const parts =
    targetAbs === rootAbs
      ? []
      : relative(rootAbs, targetAbs)
          .split(sep)
          .filter((part) => part.length > 0);
  let walked = rootAbs;
  for (const part of parts) {
    walked = resolve(walked, part);
    const info = await io.lstat(walked);
    if (info === null) {
      break; // nothing exists from here down, so nothing can be followed
    }
    if (info.isSymbolicLink) {
      throw new VaultPathUnsafeError(walked, "symlink", vaultRoot);
    }
  }

  // Check 3 — realpath belt-and-suspenders. Catches a relocation the lexical
  // check cannot see (e.g. the data dir itself being a link farm) and bounds
  // the lstat→use TOCTOU window.
  const rootReal = await io.realpath(rootAbs);
  if (rootReal === null) {
    return; // root absent — ensureRepo creates it first
  }
  let cursor = targetAbs;
  let cursorReal = await io.realpath(cursor);
  while (cursorReal === null) {
    if (cursor === rootAbs) {
      break;
    }
    const up = resolve(cursor, "..");
    if (up === cursor) {
      break; // reached the filesystem root
    }
    cursor = up;
    cursorReal = await io.realpath(cursor);
  }
  if (cursorReal === null) {
    return;
  }
  if (!isWithinRoot(rootReal, cursorReal)) {
    throw new VaultPathUnsafeError(absPath, "escape", vaultRoot);
  }
}
+545
View File
@@ -0,0 +1,545 @@
/**
* Pull cycle — Docmost -> vault (SPEC §6 "Docmost -> FS").
*
* This increment turns the read-only mirror into the git-backed pull cycle:
*
* 1. ensureRepo(vault); refuse if a merge is in progress (SPEC §9/§12);
* ensureBranch("docmost", "main") (SPEC §5 branches)
* 2. checkout docmost
* 3. fetch the live tree (listSpaceTree -> {pages, complete}) -> compute the
* desired `live` files (relPath via the pure sanitize/disambiguation layout)
* 4. parse `existing` tracked .md files (pageId + relPath from gitmost_id frontmatter)
* 5. plan = planReconciliation(live, existing) (pure, SPEC §5/§8); toDelete
* is absence-only, moves are separate
* 6. decideAbsenceDeletions: SUPPRESS absence deletions on an incomplete tree
* fetch (SPEC §8) and behind the mass-delete guard (defense in depth)
* 7. write each live page in its fixpoint form (normalize-on-write, SPEC §11);
* apply moved-old-path removals (only when the move write SUCCEEDED) and
* absence-delete removals (only when the decision allowed them)
* 8. stageAll + commit on `docmost` with the provenance trailer (SPEC §7.3)
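* 9. checkout main + merge docmost; on a conflict every unmerged file is
* rewritten to a marker-free resolution (spurious trailing-whitespace
* conflicts are normalized, genuine ones resolve to the main/git side) and
* the merge is committed (SPEC §9); push is deferred (SPEC §7)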
* 10. one-line summary
*
* DIRECTION IS Docmost -> vault ONLY. Nothing here ever writes to Docmost
* (read-only: listSpaceTree + getPageJson). All git operations run against
* the vault repo (`cwd = vaultPath`), never the source repo (see ./git.ts).
*
* The client seam is the native `GitSyncClient` (`Pick<GitSyncClient, ...>`);
* the gitmost server drives the engine in-process (there is no standalone CLI
* entry point).
*/
import { dirname } from "node:path";
import { sep } from "node:path";
import { parsePageFile, serializePageFile } from "../lib/page-file.js";
import type { GitSyncClient } from "./client.types.js";
import { buildVaultLayout, type PageNode } from "./layout.js";
import {
VaultGit,
BOT_AUTHOR_NAME,
BOT_AUTHOR_EMAIL,
DEFAULT_BRANCH,
} from "./git.js";
import {
planReconciliation,
decideAbsenceDeletions,
type LiveEntry,
type MovedEntry,
type DeletionDecision,
} from "./reconcile.js";
import { stabilizePageBody } from "./stabilize.js";
// Engine-only mirror branch (SPEC §5): the engine writes here, humans never do.
// `applyPullActions` merges this branch into DEFAULT_BRANCH after committing.
const DOCMOST_BRANCH = "docmost";
// Machine-readable provenance the loop-guard keys on (SPEC §7.3 / §12).
// Passed as a commit trailer on both the sync commit and the merge commit.
const SOURCE_TRAILER = "Docmost-Sync-Source: docmost";
// Number of pages fetched/stabilized concurrently. Bounded so a large space
// does not open thousands of simultaneous requests/conversions at once.
// This is an upper bound: fewer runners start when fewer pages are queued.
const CONCURRENCY = 6;
// How often to log incremental progress (every N completed pages). A page
// counts as completed whether its write succeeded or failed.
const PROGRESS_EVERY = 25;
/**
 * Turn a vault-relative, forward-slash path into a path rooted at `vaultRoot`.
 * The two are joined with a single "/" and nothing is normalized: `.`/`..`
 * segments and doubled slashes pass through untouched.
 */
function relToAbs(vaultRoot: string, relPath: string): string {
  return `${vaultRoot}/${relPath}`;
}
/**
 * Canonicalize a file's TRAILING whitespace: drop any trailing blank /
 * whitespace-only lines (and trailing spaces on the last line) and end with
 * exactly one newline; an empty body becomes a single "\n". This matches
 * `serializePageFile`'s trailing form (`body.trim()` + a single "\n").
 *
 * Why (SPEC §9 spurious-conflict fix): the engine writes pages in their
 * normalize-on-write form (one trailing newline), but a user can push a `.md` to
 * `main` with EXTRA trailing/empty lines (e.g. a double-blank-line append). When
 * the docmost mirror (normalized) and `main` (raw) both change near end-of-file,
 * git's line-based 3-way merge reports a CONFLICT even though the only difference
 * is trailing blank lines. Normalizing BOTH sides before comparing collapses that
 * difference to nothing, so the pull cycle can recognize the conflict as SPURIOUS
 * and resolve it cleanly instead of committing raw conflict markers onto `main`.
 *
 * Implementation note: `trimEnd()` strips the same character set as the regex
 * class `\s` (WhiteSpace + LineTerminator), but in a single linear pass. An
 * end-anchored `\s+$` regex is retried from every position of an interior
 * whitespace run and backtracks each time, which goes quadratic on a file
 * that contains one very long run of blanks — and this text is user-pushed.
 *
 * @param text File content (frontmatter + body) to canonicalize.
 * @returns `text` without trailing whitespace, terminated by exactly one "\n".
 */
function normalizeTrailingWhitespace(text: string): string {
  const body = text.trimEnd();
  return body.length > 0 ? `${body}\n` : "\n";
}
/**
 * Build a vault-relative, forward-slash path from a page's folder segments and
 * its file stem: the segments come first, then `<stem>.md`, joined with "/".
 */
function segmentsToRelPath(segments: string[], stem: string): string {
  const fileName = `${stem}.md`;
  return segments.concat(fileName).join("/");
}
/**
* Injectable IO for `readExisting` (R-Pull-1, test-strategy report §5). The real
* `main` wires these to `git.listTrackedFiles("*.md")` and an `fs.readFile`
* rooted at the vault; tests pass fakes so the parsing/skip rules are unit-
* testable without a real git repo or filesystem.
*/
export interface ReadExistingDeps {
/**
* List tracked .md paths (forward-slash, vault-relative). A rejection here is
* NOT caught by `readExisting` — it rejects the whole call.
*/
listTracked: () => Promise<string[]>;
/**
* Read a tracked file's text by its (forward-slash) vault-relative path.
* `readExisting` treats ANY failure of this call as "skip this file".
*/
readFile: (relPath: string) => Promise<string>;
}
/**
 * Recover `{ pageId, relPath }` for every tracked `.md` file in the vault by
 * parsing the `gitmost_id` frontmatter (native-Obsidian format) of each one.
 *
 * All IO comes through `deps` (R-Pull-1), so the rules below can be exercised
 * with fakes. Files are read one at a time, in listing order. A file is left
 * out of the result when:
 *   - its `readFile` fails (tracked but missing on disk, a mid-operation
 *     race) — skipped rather than thrown; the next pull converges;
 *   - `parsePageFile` yields no id (a stray hand-written Obsidian file that is
 *     not an engine-tracked page yet; PUSH adopts those separately).
 *
 * @param deps Listing + reading primitives.
 * @returns One entry per tracked file that carries a page id, in listing order.
 */
export async function readExisting(
  deps: ReadExistingDeps,
): Promise<{ pageId: string; relPath: string }[]> {
  const found: { pageId: string; relPath: string }[] = [];

  for (const trackedPath of await deps.listTracked()) {
    // git ls-files always emits forward-slash paths; normalize just in case.
    const relPath = trackedPath.split(sep).join("/");

    let fileText: string;
    try {
      fileText = await deps.readFile(relPath);
    } catch {
      // Unreadable right now — leave it out; the next pull converges.
      continue;
    }

    const pageId = parsePageFile(fileText).id;
    if (!pageId) {
      continue;
    }
    found.push({ pageId, relPath });
  }

  return found;
}
/**
* Input to the PURE `computePullActions` (R-Pull-2). All data, no IO: the live
* tree nodes + completeness flag (from `listSpaceTree`) and the parsed
* `existing` tracked files (from `readExisting`).
*/
export interface PullActionsInput {
/**
* Live page nodes for the space (from `listSpaceTree`). Entries that are
* falsy, have no `id`, or have no layout entry are ignored by the planner.
*/
pages: PageNode[];
/** Whether the live tree fetch was COMPLETE (SPEC §8 suppression). */
treeComplete: boolean;
/**
* Parsed tracked files: `{ pageId, relPath }` (from `readExisting`). Its
* length is also what the deletion guard sees as the tracked-file count.
*/
existing: { pageId: string; relPath: string }[];
}
/**
* The PURE decisions object computed by `computePullActions` (no IO). It holds
* the reconciliation plan plus the SPEC §8 absence-deletion decision, with the
* suppression already folded in: `toDelete` is the POST-suppression set the
* caller should actually remove (empty when `deletionDecision.apply` is false).
*/
export interface PullActions {
/** Pages to (re)write at their relPath (add + update + move target). */
toWrite: { pageId: string; relPath: string }[];
/**
* Moves: write new path, then remove old path (only on a successful write).
* Moves are passed through from the plan unchanged — the suppression below
* never empties this list.
*/
moved: MovedEntry[];
/**
* Absence-based paths to delete AFTER suppression. Empty when the decision
* suppressed deletions this cycle, so the caller can apply it unconditionally.
*/
toDelete: string[];
/** Why absence deletions were (or were not) applied (for logging + tests). */
deletionDecision: DeletionDecision;
/** Tracked-file count (for the suppression log messages). */
existingCount: number;
/**
* Planned absence-delete count BEFORE suppression (for the log message). May
* therefore be larger than `toDelete.length`.
*/
plannedDeleteCount: number;
}
/**
 * PURE pull planner (R-Pull-2, test-strategy report §5): turns the live tree,
 * its completeness flag and the tracked files into the full set of decisions,
 * performing NO IO.
 *
 *   - `buildVaultLayout` gives every live page a deterministic relPath;
 *   - `planReconciliation` splits the work into writes, moves and
 *     absence-based deletes;
 *   - `decideAbsenceDeletions` applies the SPEC §8 suppression (incomplete
 *     fetch, empty live set, mass-delete guard). Its verdict is folded into
 *     the result, so `toDelete` is already the POST-suppression set (empty
 *     when suppressed).
 *
 * Moves are deliberately outside the suppression: a moved page IS present in
 * `live`, so removing its old path is real work (the applier still gates that
 * removal on the new-path write succeeding). Content fetches, file writes and
 * git operations all live in the thin `applyPullActions`.
 *
 * @param input Live nodes, completeness flag and parsed tracked files.
 * @returns The decisions for `applyPullActions` to carry out.
 */
export function computePullActions(input: PullActionsInput): PullActions {
  const { pages, treeComplete, existing } = input;
  const layout = buildVaultLayout(pages);

  // Desired file per live page. Nodes without an id, or without a layout
  // entry, contribute nothing.
  const live: LiveEntry[] = [];
  for (const node of pages) {
    const placement = node && node.id ? layout.get(node.id) : undefined;
    if (placement) {
      live.push({
        pageId: node.id,
        relPath: segmentsToRelPath(placement.segments, placement.stem),
      });
    }
  }

  // Pure plan: `absenceDeletes` holds ABSENCE-based deletions only; old paths
  // of moved pages travel separately in `moved`.
  const {
    toWrite,
    moved,
    toDelete: absenceDeletes,
  } = planReconciliation(live, existing);

  // SPEC §8 gate for the absence deletions (moves are not governed by it).
  const deletionDecision = decideAbsenceDeletions({
    treeComplete,
    liveCount: live.length,
    existingCount: existing.length,
    deleteCount: absenceDeletes.length,
  });

  return {
    toWrite,
    moved,
    // A suppressed cycle deletes nothing.
    toDelete: deletionDecision.apply ? absenceDeletes : [],
    deletionDecision,
    existingCount: existing.length,
    plannedDeleteCount: absenceDeletes.length,
  };
}
/**
* Injectable IO for `applyPullActions` (R-Pull-2). The real `main` wires these
* to the live client, the vault git wrapper, and `node:fs/promises`; tests pass
* fakes that RECORD calls so the ordering + the move-on-success data-loss guard
* are testable without real git/fs/network.
*/
export interface ApplyPullActionsDeps {
/** Read-only page source; only `getPageJson` is used. */
client: Pick<GitSyncClient, "getPageJson">;
/** The subset of the vault git wrapper the applier calls. */
git: Pick<
VaultGit,
| "stageAll"
| "commit"
| "checkout"
| "merge"
| "listUnmergedPaths"
| "commitMerge"
| "showStage"
>;
/**
* Write a file by ABSOLUTE path (mkdir of the parent is done internally).
* `applyPullActions` still calls `mkdir` on the parent before each page
* write, but its conflict-resolution path calls `writeFile` on its own.
*/
writeFile: (absPath: string, text: string) => Promise<void>;
/** Recursive mkdir of an ABSOLUTE directory path. */
mkdir: (absDir: string) => Promise<void>;
/**
* Remove a file by ABSOLUTE path (force: a missing file is a no-op). A
* rejection is logged by the applier and the path is not counted as removed.
*/
rm: (absPath: string) => Promise<void>;
/**
* Injected logger for cycle diagnostics (mirrors the push side). Optional —
* falls back to `console.log` so existing callers stay green.
*/
log?: (line: string) => void;
}
/** Outcome counters from `applyPullActions` (for the summary + tests). */
export interface ApplyResult {
/** Pages whose fetch + stabilize + write all succeeded. */
written: number;
/** Old paths of moved pages that were actually removed. */
movedApplied: number;
/** Absence-based deletions that were actually removed. */
deleted: number;
/** Pages whose fetch, stabilize or write threw (logged, not rethrown). */
failed: number;
/**
* The value returned by `git.commit` for the sync commit — presumably
* `false` when there was nothing to commit; confirm against `VaultGit.commit`.
*/
committed: boolean;
/**
* Result of merging `docmost` into the default branch. When the merge
* conflicted and was auto-resolved this is rewritten to `ok: true` with
* `conflict` set only if at least one GENUINE conflict was resolved; `output`
* keeps the original merge output.
*/
merge: { ok: boolean; conflict: boolean; output: string };
/**
* Vault-relative paths of the page(s) that had a GENUINE same-block conflict in
* the docmost -> main merge and were AUTO-RESOLVED to the git/main side (git
* wins, SPEC §9) — committed CLEAN, never with raw conflict markers. Empty on a
* clean merge AND when the only conflicts were spurious trailing-whitespace
* differences (those are normalized, not reported). Surfaced for logging /
* /status visibility; the docmost-side content stays recoverable via the
* `docmost` branch + page history.
*/
conflictedPaths: string[];
}
/**
* THIN IO applier (R-Pull-2). Performs the side effects in the EXACT current
* order, with all the original safety guards preserved bit-for-bit:
*
* 1. for each `toWrite`: fetch content (`client.getPageJson`) -> stabilize
* (normalize-on-write fixpoint, SPEC §11) -> mkdir + write. One bad page
* never aborts the pull (bounded-concurrency pool, fault-tolerant).
* 2. apply MOVE old-path removals — ONLY when the planner marked the old path
* removable AND the new-path write SUCCEEDED (the ⭐ data-loss guard: a
* failed move-write keeps the old path so the page never vanishes).
* 3. apply (post-suppression) absence deletes.
* 4. stageAll + commit on `docmost` (subject from ACTUAL written/deleted
* counts) + checkout main + merge docmost. On a merge conflict every
* unmerged file is rewritten to a marker-free resolution and the merge is
* committed through `git.commitMerge` (SPEC §9).
*
* `vaultRoot` roots the relPath -> absolute-path conversion for the fs deps.
*
* Error behavior: per-page fetch/write failures and `rm` failures are caught,
* logged and counted. Nothing in here catches a rejection from the git calls
* or from the conflict-resolution `writeFile`, so those reject the returned
* promise.
*
* NOTE(review): nothing in this function checks out `docmost` before the sync
* commit — it assumes the caller already did; confirm against the call site.
*
* @param deps Injected client, git wrapper, fs primitives and optional logger.
* @param actions The decisions produced by `computePullActions`.
* @param vaultRoot Root that every vault-relative path is joined onto.
* @returns Counters for the work actually applied plus the merge outcome.
*/
export async function applyPullActions(
deps: ApplyPullActionsDeps,
actions: PullActions,
vaultRoot: string,
): Promise<ApplyResult> {
const { client, git } = deps;
// One channel, mirroring the push side: route every cycle diagnostic through
// the injected logger; fall back to `console.log` when none is supplied.
const log = deps.log ?? ((line: string) => console.log(line));
// Emit the SPEC §8 suppression warnings (preserved from the original `main`).
// Logging only: `actions.toDelete` is already empty when suppressed.
const decision = actions.deletionDecision;
if (!decision.apply) {
if (decision.reason === "incomplete-fetch") {
log(
"pull: tree fetch incomplete — deletions suppressed this cycle (SPEC §8)",
);
} else if (decision.reason === "empty-live") {
log(
`pull: live fetch returned 0 pages but ${actions.existingCount} file(s) are ` +
`tracked — deletions suppressed this cycle (SPEC §8). Re-run when ` +
`Docmost is reachable.`,
);
} else {
// The only remaining reason is "mass-delete".
log(
`pull: plan would delete ${actions.plannedDeleteCount} of ${actions.existingCount} ` +
`tracked file(s) (mass-delete guard) — deletions suppressed this ` +
`cycle (SPEC §8). Verify the live Docmost tree, then re-run.`,
);
}
}
// 1. Write each live page in its fixpoint form (normalize-on-write, SPEC §11).
let written = 0;
let failed = 0;
// `completed` counts successes AND failures (it drives the progress log);
// `nextIndex` is the shared cursor the runners below pull work from.
let completed = 0;
let nextIndex = 0;
// pageIds whose write FAILED. A moved page whose new-path write failed must
// NOT have its old path removed (otherwise the page vanishes entirely).
const failedPageIds = new Set<string>();
// Fetch, stabilize and write ONE page. Never rejects: any failure is counted,
// remembered in `failedPageIds` and logged.
const writeOne = async (w: {
pageId: string;
relPath: string;
}): Promise<void> => {
try {
const page = await client.getPageJson(w.pageId);
// Native-Obsidian format: a minimal `gitmost_id` frontmatter + the fixpoint
// markdown body. title/parent/space are DERIVED (filename / folder / repo),
// so nothing but the pageId is persisted as meta.
const text = serializePageFile(
page.id,
await stabilizePageBody(page.content),
);
const abs = relToAbs(vaultRoot, w.relPath);
await deps.mkdir(dirname(abs));
await deps.writeFile(abs, text);
written++;
} catch (err) {
failed++;
failedPageIds.add(w.pageId);
log(
`pull: failed page ${w.pageId}: ` +
(err instanceof Error ? err.message : String(err)),
);
} finally {
completed++;
if (completed % PROGRESS_EVERY === 0) {
log(`pulled ${completed}/${actions.toWrite.length}`);
}
}
};
// Bounded-concurrency pool (dependency-free): a fixed set of runners each
// take the next index until the write list is exhausted. One bad page never
// aborts the whole pull (mirrors the fault-tolerant tree walk).
const runner = async (): Promise<void> => {
while (true) {
// Claiming an index is a single synchronous step (no await in between), so
// two runners never take the same page.
const i = nextIndex++;
if (i >= actions.toWrite.length) return;
await writeOne(actions.toWrite[i]);
}
};
await Promise.all(
Array.from(
// `|| 1` still starts one runner for an empty write list; it returns at once.
{ length: Math.min(CONCURRENCY, actions.toWrite.length) || 1 },
() => runner(),
),
);
// Helper: `rm` with force:true is a no-op if the file is already gone.
// Returns whether the removal call succeeded; a failure is logged, not thrown.
const removePath = async (rel: string, what: string): Promise<boolean> => {
try {
await deps.rm(relToAbs(vaultRoot, rel));
return true;
} catch (err) {
log(
`pull: failed to ${what} ${rel}: ` +
(err instanceof Error ? err.message : String(err)),
);
return false;
}
};
// 2. Apply MOVE old-path removals. A moved page IS present in `live`, so its
// old path is genuinely stale — NOT subject to the incomplete-fetch
// suppression. BUT only remove the old path when (a) the planner marked it
// removable (not reused by another live page) AND (b) the new-path write
// actually SUCCEEDED — otherwise we would delete the only copy of a page
// whose move-write failed (⭐ data-loss guard).
let movedApplied = 0;
for (const m of actions.moved) {
if (!m.removeOldPath) continue;
if (failedPageIds.has(m.pageId)) {
log(
`pull: move write for ${m.pageId} failed — keeping old path ` +
`${m.fromRelPath} (SPEC §8)`,
);
continue;
}
if (await removePath(m.fromRelPath, "remove moved old path")) movedApplied++;
}
// 3. Apply ABSENCE-based deletions — `actions.toDelete` is ALREADY the
// post-suppression set (empty when the decision suppressed them, SPEC §8).
let deleted = 0;
for (const rel of actions.toDelete) {
if (await removePath(rel, "delete")) deleted++;
}
// 4. Stage + commit on `docmost` (only if there is something to commit).
// Deterministic stabilized output means unchanged pages produce identical
// bytes -> git sees no diff -> no churn (SPEC §11). The subject reflects the
// ACTUAL work applied (pages written + files deleted), not the planned size,
// so a run with failures does not over-report (SPEC §5 nit). Removed old
// paths of moved pages are not part of the subject.
const subject =
deleted > 0
? `docmost: sync ${written} page(s), ${deleted} deleted`
: `docmost: sync ${written} page(s)`;
await git.stageAll();
const committed = await git.commit(subject, {
authorName: BOT_AUTHOR_NAME,
authorEmail: BOT_AUTHOR_EMAIL,
trailers: [SOURCE_TRAILER],
});
// Merge docmost -> main. A CONFLICT must NOT wedge the whole space (the
// reported bug: ONE same-line conflict on ONE page froze sync for EVERY page
// in both directions because the next cycle's `isMergeInProgress` check kept
// skipping the entire space). It must ALSO never commit raw `<<<<<<<`/`>>>>>>>`
// markers onto the published `main` (external clones would see the markers
// AND the body would re-conflict every cycle while git and Docmost silently
// diverge). So on a conflict we RESOLVE each conflicted file to a
// clean, marker-free form and commit that (SPEC §9):
//
// - SPURIOUS conflict — the ROOT CAUSE of the leak: the two sides differ ONLY
// in trailing/empty-line normalization (the engine writes one trailing
// newline; a user pushed extra blank lines). Once both sides are
// `normalizeTrailingWhitespace`d they are IDENTICAL, so this is no real
// conflict at all: write the normalized form. Content stays in sync; git
// and the page never diverge.
// - GENUINE same-block conflict: resolve to OURS (the `main`/git side), so git
// wins the published branch — mirroring the live-doc 3-way "git wins" rule.
// The docmost-side content is preserved on the `docmost` branch and remains
// recoverable via page history; the next push carries git's body to Docmost,
// so both sides converge. No markers ever reach `main`.
// The merge below runs whether or not the commit above created anything.
await git.checkout(DEFAULT_BRANCH);
const merge = await git.merge(DOCMOST_BRANCH);
let conflictedPaths: string[] = [];
let mergeResult = merge;
if (merge.conflict) {
const unmerged = await git.listUnmergedPaths();
const genuine: string[] = [];
for (const rel of unmerged) {
// A `null` stage means that side has no version of the file.
const ours = await git.showStage(2, rel); // main side
const theirs = await git.showStage(3, rel); // docmost side
if (
ours !== null &&
theirs !== null &&
normalizeTrailingWhitespace(ours) === normalizeTrailingWhitespace(theirs)
) {
// SPURIOUS: identical once trailing/empty-line normalization is applied.
// Commit the canonical (normalized) form — no conflict, no markers.
await deps.writeFile(
relToAbs(vaultRoot, rel),
normalizeTrailingWhitespace(theirs),
);
} else {
// GENUINE conflict: resolve to the non-null side (OURS preferred so git
// wins the published branch; THEIRS kept when OURS is absent — e.g. a
// modify/delete conflict — to avoid dropping the remaining content). If
// BOTH are null (delete/delete) leave it; commitMerge's `git add -A`
// stages the deletion.
genuine.push(rel);
const resolved = ours ?? theirs;
if (resolved !== null) {
await deps.writeFile(relToAbs(vaultRoot, rel), resolved);
}
}
}
conflictedPaths = genuine;
// Any value `commitMerge` returns is not inspected; only a rejection would
// surface from here.
await git.commitMerge(
genuine.length > 0
? `docmost: sync, ${genuine.length} page(s) auto-resolved (git wins, SPEC §9)`
: `docmost: sync (trailing-whitespace conflicts normalized, SPEC §9)`,
{
authorName: BOT_AUTHOR_NAME,
authorEmail: BOT_AUTHOR_EMAIL,
trailers: [SOURCE_TRAILER],
},
);
// The committed tree is CLEAN (every conflicted file was overwritten with a
// marker-free resolution). `conflict` now reflects only the GENUINE conflicts
// that were auto-resolved (git won); a merge that conflicted ONLY on trailing
// whitespace is reported as clean so /status does not cry wolf.
mergeResult = { ok: true, conflict: genuine.length > 0, output: merge.output };
if (genuine.length > 0) {
log(
`pull: merge of docmost -> main had ${genuine.length} GENUINE conflict(s) ` +
`auto-resolved to the git/main side (git wins, SPEC §9): ` +
`${genuine.join(", ")}. NO conflict markers were written to main; the ` +
`docmost-side content is on the 'docmost' branch and recoverable via ` +
`page history, and the next push reconciles Docmost to the git body.`,
);
} else {
log(
`pull: merge of docmost -> main conflicted ONLY on trailing/empty-line ` +
`normalization (${unmerged.length} file(s)) — auto-normalized, no ` +
`markers, content stays in sync (SPEC §9 spurious-conflict fix).`,
);
}
} else if (!merge.ok) {
// Failed without a conflict: log it and return the merge result unchanged.
log(`pull: merge of docmost -> main failed: ${merge.output}`);
}
// Logged on every run, whatever the merge outcome.
log("pull: git push to remote is DEFERRED in this increment (SPEC §7).");
return {
written,
movedApplied,
deleted,
failed,
committed,
merge: mergeResult,
conflictedPaths,
};
}
File diff suppressed because it is too large Load Diff
+200
View File
@@ -0,0 +1,200 @@
/**
* Pure reconciliation planner (SPEC §5/§6/§8).
*
* Given the desired live set of files (computed from the current Docmost tree)
* and the set of files currently tracked in the vault, compute what to write,
* what to move (old path to remove), and what to delete. Identity is `pageId`
* (the stable file<->page anchor, SPEC §4): a page that keeps its pageId but
* changes relPath is a MOVE, not delete+add; a tracked pageId that is gone from
* the live tree is a DELETE.
*
* This module is intentionally PURE (no IO, no git) so the whole plan is
* unit-testable. The actual file writing / git operations happen in pull.ts.
*/
/** A page that SHOULD exist in the vault at a given path. */
export interface LiveEntry {
/** Stable page identity; the key entries are matched on. */
pageId: string;
/** Vault-relative path (forward-slash), e.g. `Space/Parent/Child.md`. */
relPath: string;
}
/** A page currently tracked in the vault (pageId parsed from its meta). */
export interface ExistingEntry {
/** Page identity recovered from the tracked file. */
pageId: string;
/** Vault-relative path (forward-slash) of the tracked file. */
relPath: string;
}
/** A page to (re)write at its destination path. */
export interface WriteEntry {
/** Page whose content is written. */
pageId: string;
/** Vault-relative destination path (forward-slash). */
relPath: string;
}
/** A page that moved: written at its NEW relPath, with the OLD path removed. */
export interface MovedEntry {
/** Page that kept its identity but changed path. */
pageId: string;
/** Path the page is currently tracked at. */
fromRelPath: string;
/** Path the live tree wants the page at. */
toRelPath: string;
/**
* Whether the old path (`fromRelPath`) is SAFE to remove. False when another
* live page will (re)write that exact path (path reuse): removing it would
* destroy real data, so the caller must skip the removal. The move itself is
* still recorded (the new path is written regardless).
*/
removeOldPath: boolean;
}
/** The full reconciliation plan. */
export interface ReconciliationPlan {
/**
* Pages present in `live` -> (re)write at their relPath. This naturally
* covers add, content-update (same path) AND move (same pageId, new path),
* since every live page is (re)written regardless of whether it existed.
*/
toWrite: WriteEntry[];
/**
* Vault-relative paths to delete because their tracked pageId is ABSENT from
* `live` (page removed/trashed). This set is ONLY absence-based deletions —
* the OLD paths of moved pages are NOT here (they live in `moved` and are
* applied separately by the caller). Keeping the two apart lets pull.ts gate
* absence deletions behind the incomplete-fetch suppression + mass-delete
* guard (SPEC §8) while still applying real moves. Each path appears at most
* once, and never a path that a live page will write.
*/
toDelete: string[];
/**
* Tracked pages whose relPath changed. The caller writes the page at
* `toRelPath`, then removes `fromRelPath` — but ONLY after the new-path write
* succeeded. The old path is NOT in `toDelete`.
*/
moved: MovedEntry[];
}
/**
 * Work out how to bring the tracked files in line with the live page set.
 *
 * Matching is by `pageId`:
 *   - every live page is written at its relPath (add, in-place update and the
 *     destination of a move are all the same write);
 *   - a tracked file whose pageId is live under a DIFFERENT path is a move. It
 *     is reported in `moved` only — never in `toDelete` — so the caller can
 *     remove the old path after the new one has been written;
 *   - a tracked file whose pageId is not live at all is an absence delete and
 *     its path goes into `toDelete`.
 *
 * No-data-loss filter: a path that some live page is about to write is never
 * removed. For an absent page that means its path is simply not queued; for a
 * move it means `removeOldPath` is `false` (the move is still recorded).
 *
 * `existing` may contain the same pageId more than once (stray files sharing
 * an id); every such file that is not at the live path is reported as a
 * move or an absence, so the vault converges to exactly the live set.
 *
 * @param live Desired files, one per live page.
 * @param existing Files currently tracked in the vault.
 * @returns Writes, absence deletes (each path once) and moves.
 */
export function planReconciliation(
  live: LiveEntry[],
  existing: ExistingEntry[],
): ReconciliationPlan {
  // pageId -> path the live tree wants it at (a later duplicate id wins).
  const desiredPathOf = new Map<string, string>(
    live.map((entry): [string, string] => [entry.pageId, entry.relPath]),
  );
  // Every path a write will own; none of these may ever be removed.
  const claimedPaths = new Set<string>(live.map((entry) => entry.relPath));

  const toWrite: WriteEntry[] = live.map(({ pageId, relPath }) => ({
    pageId,
    relPath,
  }));

  const moved: MovedEntry[] = [];
  // A Set, so a path reported by several tracked rows is queued only once.
  const absentPaths = new Set<string>();

  for (const tracked of existing) {
    const ownedByWrite = claimedPaths.has(tracked.relPath);
    const desired = desiredPathOf.get(tracked.pageId);

    if (desired === undefined) {
      // Page is gone from the live tree: delete its file, unless a live page
      // is about to (re)write that very path.
      if (!ownedByWrite) {
        absentPaths.add(tracked.relPath);
      }
    } else if (desired !== tracked.relPath) {
      // Same page, new path: a move. The old path is removable only when no
      // write owns it.
      moved.push({
        pageId: tracked.pageId,
        fromRelPath: tracked.relPath,
        toRelPath: desired,
        removeOldPath: !ownedByWrite,
      });
    }
    // Otherwise the page is updated in place: the write re-emits the file and
    // identical bytes are a git no-op.
  }

  return { toWrite, toDelete: Array.from(absentPaths), moved };
}
/**
 * Minimum number of tracked files before the mass-delete fraction guard kicks
 * in. In a tiny vault, deleting "most" files is normal (e.g. 1-of-2).
 */
export const MASS_DELETE_MIN_EXISTING = 4;

/** Share of the tracked files above which a delete plan looks like a wipe. */
export const MASS_DELETE_FRACTION = 0.5;

/** Verdict on the absence-based deletions, with the reason when refused. */
export type DeletionDecision =
  | { apply: true }
  | { apply: false; reason: "incomplete-fetch" | "empty-live" | "mass-delete" };

/**
 * Pure gate for the ABSENCE-based deletions (`plan.toDelete`): may they be
 * applied this cycle? It holds the SPEC §8 safety invariants in one place so
 * they can be unit-tested without live creds or git. Checked in this order:
 *
 *   1. no tracked files, or nothing to delete -> apply (nothing at stake);
 *   2. `treeComplete` is false -> suppress ("incomplete-fetch"): a page missing
 *      from a partial tree is NOT proof of deletion. Writes, updates and moves
 *      still happen;
 *   3. zero live pages while files are tracked -> suppress ("empty-live"):
 *      almost always a failed fetch, never a real "delete everything";
 *   4. at least `MASS_DELETE_MIN_EXISTING` tracked files and MORE than
 *      `MASS_DELETE_FRACTION` of them would go -> suppress ("mass-delete"),
 *      as defense in depth.
 *
 * Moves are outside this decision: a moved page IS in `live`, so removing its
 * old path is real and is handled separately by the caller.
 *
 * @param args Completeness flag plus live / tracked / planned-delete counts.
 * @returns `{ apply: true }`, or `{ apply: false, reason }` for the first rule
 *   that refuses.
 */
export function decideAbsenceDeletions(args: {
  treeComplete: boolean;
  liveCount: number;
  existingCount: number;
  deleteCount: number;
}): DeletionDecision {
  const { treeComplete, liveCount, existingCount, deleteCount } = args;

  const nothingAtStake = existingCount === 0 || deleteCount === 0;
  if (nothingAtStake) {
    return { apply: true };
  }

  // The first matching rule names the reason; no match means "go ahead".
  let reason: "incomplete-fetch" | "empty-live" | "mass-delete" | undefined;
  if (!treeComplete) {
    reason = "incomplete-fetch";
  } else if (liveCount === 0) {
    reason = "empty-live";
  } else if (
    existingCount >= MASS_DELETE_MIN_EXISTING &&
    deleteCount > existingCount * MASS_DELETE_FRACTION
  ) {
    reason = "mass-delete";
  }

  return reason === undefined ? { apply: true } : { apply: false, reason };
}
+109
View File
@@ -0,0 +1,109 @@
/**
* Deterministic filename strategy (SPEC §12).
*
* The file name is COSMETIC — the source of truth for the file<->page link is
* `pageId` / `slugId` inside the meta block, so renaming a file is safe. These
* functions are intentionally dependency-free and pure, so they are trivially
* unit-testable.
*/
// Printable characters that cannot appear in a file name on common filesystems
// (Windows being the strictest): / \ < > : " | ? *. Every occurrence is
// replaced with one "-". Plain spaces are deliberately absent (whitespace is
// normalized on its own), and so are the ASCII control characters (code points
// 0..31): those are replaced in a separate pass (see stripControlChars), which
// keeps this literal free of embedded control bytes.
const FORBIDDEN_PRINTABLE_RE = /[/\\<>:"|?*]/g;

// One or more consecutive whitespace characters (tabs and newlines included);
// each run collapses to a single space.
const WHITESPACE_RUN_RE = /\s+/g;

// Reserved Windows device names, stored lower-case so callers can look them up
// case-insensitively. A bare match (with or without an extension) is unusable
// as a file name, so such names get a "_" prefix.
const RESERVED_WINDOWS_NAMES = new Set([
  // Console, printer, auxiliary and null devices.
  "con", "prn", "aux", "nul",
  // Serial ports.
  "com1", "com2", "com3", "com4", "com5", "com6", "com7", "com8", "com9",
  // Parallel ports.
  "lpt1", "lpt2", "lpt3", "lpt4", "lpt5", "lpt6", "lpt7", "lpt8", "lpt9",
]);

// Upper bound on the sanitized name length. Chosen to stay well within
// filesystem path-component limits (255 bytes on most FSes) while leaving room
// for an extension and a disambiguation suffix.
const MAX_LENGTH = 120;
/**
 * Return `input` with each ASCII control character (code points 0..31) swapped
 * for "-"; every other UTF-16 code unit is copied through unchanged. The check
 * is a numeric comparison on the code unit rather than a control-range regex
 * literal, so the source file carries no embedded control bytes.
 */
function stripControlChars(input: string): string {
  const units: string[] = [];
  let idx = 0;
  while (idx < input.length) {
    const isControl = input.charCodeAt(idx) < 32;
    units.push(isControl ? "-" : input.charAt(idx));
    idx += 1;
  }
  return units.join("");
}
/**
 * Sanitize a page title into a safe file-name component (WITHOUT extension).
 *
 * Steps: replace forbidden / control characters with "-", collapse whitespace
 * runs to a single space, trim, cap the length at `MAX_LENGTH` UTF-16 code
 * units (never cutting a surrogate pair in half), then guard against an empty
 * result, an all-dots result, or a reserved Windows device name by prefixing
 * with "_".
 *
 * @param title the page title; a nullish value is treated as the empty string
 * @returns a non-empty name that is safe to use as a single path component
 */
export function sanitizeTitle(title: string): string {
  let name = stripControlChars(title ?? "")
    .replace(FORBIDDEN_PRINTABLE_RE, "-")
    .replace(WHITESPACE_RUN_RE, " ")
    .trim();

  if (name.length > MAX_LENGTH) {
    // `length` and `slice` count UTF-16 code units, so a naive cut can land
    // between the two halves of a surrogate pair (any character outside the
    // BMP, e.g. an emoji). That would leave a lone high surrogate at the end of
    // the name, which is not well-formed Unicode and cannot be represented in a
    // UTF-8 file name. When the cut would split a pair, drop the whole pair.
    let cut = MAX_LENGTH;
    const lastKept = name.charCodeAt(cut - 1);
    const firstDropped = name.charCodeAt(cut);
    const splitsSurrogatePair =
      lastKept >= 0xd800 &&
      lastKept <= 0xdbff &&
      firstDropped >= 0xdc00 &&
      firstDropped <= 0xdfff;
    if (splitsSurrogatePair) {
      cut -= 1;
    }
    // Re-trim: the cut may have exposed a trailing space.
    name = name.slice(0, cut).trim();
  }

  // Compare the base name (before the first dot) against reserved names, so
  // both "CON" and "con.md" are caught.
  const base = name.split(".")[0]?.toLowerCase() ?? "";

  // A name that is empty, consists only of dots ("." / ".." / "..."), or is a
  // reserved Windows device name is unusable as a path component. The all-dots
  // case is a path-traversal hazard in particular: an unprefixed ".." would
  // become a parent-directory segment and let a page escape the vault, so it
  // MUST be neutralized here (becomes "_..", which is a literal file name).
  if (
    name.length === 0 ||
    /^\.+$/.test(name) ||
    RESERVED_WINDOWS_NAMES.has(base)
  ) {
    name = "_" + name;
  }
  return name;
}
/**
 * Make a sanitized name unique when two siblings in the same folder sanitize
 * to the same name. The suffix is derived from the page's `slugId`, so the
 * result is deterministic across runs (SPEC §12: `Title ~slugId`).
 *
 * @param name   the already-sanitized file-name component
 * @param slugId the page's slug id, used as the stable suffix
 * @returns `name`, a space, a tilde, then `slugId`
 */
export function disambiguate(name: string, slugId: string): string {
  const suffix = " ~" + slugId;
  return name + suffix;
}
+28
View File
@@ -0,0 +1,28 @@
/**
* Engine settings.
*
* The engine is driven IN-PROCESS by the NestJS server, which builds the
* `Settings` object from `EnvironmentService`. This module therefore exposes
* ONLY the `Settings` type the engine consumes — there is no `.env`-loading
* side-effecting entry point and no env-validation here (the server owns that).
*/
/**
* Plain-data configuration object consumed by the engine. The per-field notes
* below that are marked NOTE(review) are inferred from the field names only.
*/
export type Settings = {
// Identifier of a Docmost space.
// NOTE(review): presumably the single space this engine instance syncs — confirm.
docmostSpaceId: string;
// NOTE(review): presumably the filesystem path of the local vault / working
// tree the engine reads and writes — confirm against the server wiring.
vaultPath: string;
// Optional. NOTE(review): presumably the git remote to sync with; behavior
// when it is omitted is not visible here.
gitRemote?: string;
// NOTE(review): presumably the polling period, in milliseconds per the `Ms` suffix.
pollIntervalMs: number;
// NOTE(review): presumably a debounce window, in milliseconds per the `Ms` suffix.
debounceMs: number;
// Log verbosity; one of the four listed levels.
logLevel: 'debug' | 'info' | 'warn' | 'error';
/**
* Per-space PUSH policy for a page whose committed body still contains
* unresolved git conflict markers (`<<<<<<<` / `=======` / `>>>>>>>`):
* - false (DEFAULT, SAFE): SKIP that page's push (it is recorded as a push
* failure, so refs are NOT advanced) — the user must resolve the git
* conflict first before the page reaches Docmost.
* - true: strip the marker lines and push BOTH sides' content (the
* `stripConflictMarkers` behavior).
* Optional/undefined is treated as false.
*/
autoMergeConflicts?: boolean;
};
+78
View File
@@ -0,0 +1,78 @@
/**
* Normalize-on-write helper (SPEC §11 "Resolution").
*
* git diffs byte-for-byte, so writing a page in a NON-fixpoint markdown form
* would make the next pull re-export it to a slightly different (but stable)
* form and produce a phantom diff -> churny commits. The converter has a couple
* of known one-pass asymmetries (a block image after a paragraph adds an empty
* paragraph; a diagram materializes `data-align`), all of which converge to a
* fixpoint after ONE `export -> import -> export` round-trip.
*
* So at write time we run exactly that one pass and persist the fixpoint form.
* Already-stable content is unaffected (the pass is idempotent), so re-pulls of
* unchanged pages produce identical bytes and git sees no diff.
*/
import {
convertProseMirrorToMarkdown,
markdownToProseMirror,
serializeDocmostMarkdownBody,
type DocmostMdMeta,
} from "../lib/index.js";
/**
* Meta object as `exportPageBody` builds it (SPEC §4). Kept byte-for-byte
* compatible so files produced here match `exportPageBody`'s output exactly.
*/
export interface PageMeta {
// Meta format version; the literal type admits only the value 1.
version: 1;
// Page identifier.
pageId: string;
// Page slug identifier.
slugId: string;
// Page title.
title: string;
// Identifier of the space the page belongs to.
spaceId: string;
// Identifier of the parent page, or null.
// NOTE(review): null presumably marks a top-level page with no parent — confirm.
parentPageId: string | null;
}
/**
 * Build the self-contained `.md` file text for a page: the fixpoint markdown
 * body of its raw ProseMirror `content`, wrapped with its identity `meta`.
 *
 *   stableBody = stabilizePageBody(content)   // export -> import -> export
 *   file       = serializeDocmostMarkdownBody(meta, stableBody)
 *
 * The single export->import->export pass is the verified fixpoint (SPEC §11):
 * already-stable content passes through unchanged, and the known converter
 * asymmetries converge to it.
 *
 * @param content the page's raw ProseMirror document
 * @param meta    the page's identity meta, in the shape `exportPageBody` writes
 * @returns the full file text (meta envelope + stable body)
 */
export async function stabilizePageFile(
  content: unknown,
  meta: PageMeta,
): Promise<string> {
  const stableBody = await stabilizePageBody(content);

  // PageMeta is exactly what `exportPageBody` writes; the lib's DocmostMdMeta
  // is a superset with optional fields, so the serializer accepts it via a cast.
  const serializerMeta = meta as DocmostMdMeta;

  return serializeDocmostMarkdownBody(serializerMeta, stableBody);
}
/**
 * Compute the fixpoint markdown BODY for a page's ProseMirror `content`, with
 * no meta envelope around it. Reading the expression below from the inside out:
 *
 *   1. export `content` to markdown,
 *   2. import that markdown back into a ProseMirror document,
 *   3. export the re-imported document again.
 *
 * That single export->import->export pass is the verified fixpoint (SPEC §11):
 * idempotent for already-stable content, and the convergence point for the
 * known converter asymmetries. The native-Obsidian writer (`serializePageFile`)
 * wraps this body with a minimal `gitmost_id` frontmatter; the determinism of
 * this function is what keeps re-pulls of an unchanged page byte-identical (no
 * churn, loop-guard).
 *
 * @param content the page's raw ProseMirror document
 * @returns the stable markdown body
 */
export async function stabilizePageBody(content: unknown): Promise<string> {
  return convertProseMirrorToMarkdown(
    // Only the import step is awaited; both exports are called without await.
    await markdownToProseMirror(convertProseMirrorToMarkdown(content)),
  );
}