diff --git a/Dockerfile b/Dockerfile index 5c62b5c..954478d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,6 +2,13 @@ FROM node:22-slim WORKDIR /app +# The daemon shells out to the system `git` binary at runtime (git is the vault +# state store), but node:22-slim does NOT ship git. Install it and KEEP it in +# the final image. Placed before `npm ci` so this layer caches across rebuilds. +# `npm prune --omit=dev` below cannot remove this — it is an OS package, not an +# npm dependency. +RUN apt-get update && apt-get install -y --no-install-recommends git && rm -rf /var/lib/apt/lists/* + # Dependencies first (better layer caching): copy the root manifest, the lock, # and the workspace package manifest so `npm ci` can link the workspace. COPY package.json package-lock.json ./ diff --git a/src/git.ts b/src/git.ts index e6e982d..3cb7645 100644 --- a/src/git.ts +++ b/src/git.ts @@ -58,6 +58,26 @@ export interface CommitOptions { export class VaultGit { constructor(private readonly vaultPath: string) {} + /** + * Preflight: verify a runnable `git` binary is on PATH. The daemon shells out + * to system `git` for every vault operation, so a missing binary (e.g. a slim + * container image without git) must fail fast with an actionable message + * rather than a cryptic ENOENT deep inside the first real git call. Presence + * check only — we do NOT gate on a specific version. Runs `git --version` + * with NO `cwd` (the vault dir may not exist yet at preflight time). + */ + async assertGitAvailable(): Promise { + try { + await execFileAsync("git", ["--version"], { env: vaultGitEnv() }); + } catch (err: unknown) { + const e = err as { message?: string }; + throw new Error( + "git binary not found or not runnable — install git (the vault state " + + `store requires it). Underlying error: ${(e.message ?? "").trim()}`, + ); + } + } + /** * Run `git --no-pager ` in the vault. Returns trimmed stdout. * Throws a clear Error (including stderr) on a non-zero exit. @@ -132,6 +152,32 @@ export class VaultGit { await this.run(["config", "user.email", BOT_AUTHOR_EMAIL]); } + // Neutralize correctness-affecting git config in the vault's LOCAL config so + // a user's GLOBAL/system config cannot change porcelain BEHAVIOR (not just + // output) and corrupt the vault. The vault is OUR dedicated repo, so LOCAL + // values (which override global/system) are the right scope. Set + // UNCONDITIONALLY every run — idempotent and cheap; `git config ` + // writes to `--local` by default inside the repo. These MUST be in place + // before any add/commit/checkout that could be affected, hence they run + // before the initial-commit block below. + // - core.autocrlf=false — CRITICAL (SPEC §11): a global core.autocrlf=true + // would rewrite LF<->CRLF on add/checkout, making our deterministic, + // byte-stable markdown churn and breaking the round-trip invariant. + // `false` guarantees git stores/checks out verbatim bytes. + // - core.safecrlf=false — avoid CRLF-related warnings/aborts on add. + // - commit.gpgsign=false — the headless daemon must never try to GPG-sign + // a commit (would fail/hang; we already set GIT_TERMINAL_PROMPT=0). + // - core.attributesFile=/dev/null — neutralize the user's GLOBAL + // gitattributes so a global clean/smudge filter (filter..clean) + // cannot rewrite the STORED blob and break §11 byte-stability (a config + // that core.autocrlf=false does not cover). POSIX-only path, which is + // fine: the daemon runs on Linux (Docker) / macOS. A system + // /etc/gitattributes remains the host admin's domain (out of scope). + await this.run(["config", "core.autocrlf", "false"]); + await this.run(["config", "core.safecrlf", "false"]); + await this.run(["config", "commit.gpgsign", "false"]); + await this.run(["config", "core.attributesFile", "/dev/null"]); + // Create the initial empty commit on `main` if the repo has no commits yet, // so both `main` and (later) `docmost` branches have a common base. if (!(await this.hasAnyCommit())) { @@ -257,7 +303,10 @@ export class VaultGit { opts: CommitOptions & { allowEmpty?: boolean }, ): Promise { const fullMessage = buildCommitMessage(message, opts.trailers); - const args = ["commit", "-m", fullMessage]; + // `--no-verify` skips pre-commit/commit-msg hooks: a global core.hooksPath + // (or any injected hook) must never interfere with engine commits in our + // dedicated vault repo. + const args = ["commit", "--no-verify", "-m", fullMessage]; if (opts.allowEmpty) args.push("--allow-empty"); await execFileAsync("git", ["--no-pager", ...args], { @@ -308,13 +357,27 @@ export class VaultGit { * List tracked files on the current branch (paths relative to the vault * root, forward-slash separated). An optional glob (a git pathspec) narrows * the listing, e.g. `"*.md"`. + * + * The target wiki is RUSSIAN, so vault file names routinely contain Cyrillic + * (e.g. `Колонка.md`). With git's DEFAULT `core.quotepath=true`, `ls-files` + * returns non-ASCII paths octal-escaped and double-quoted (`"\320\232..."`), + * which `src/pull.ts` `readExisting` would then parse as garbage paths, + * breaking move/duplicate detection. We defeat that two ways at once: + * - `-c core.quotepath=false` disables the octal-escape/quoting. + * - `-z` emits NUL-delimited RAW UTF-8 paths (no quoting, no newline + * ambiguity), which we split on `\0`. + * We read the RAW stdout (NOT the trimming `run()` helper, which would mangle + * the NUL-delimited bytes) and split on `\0`, dropping empty entries. Paths + * are returned verbatim — git already emits forward slashes. */ async listTrackedFiles(glob?: string): Promise { - const args = ["ls-files"]; + const args = ["-c", "core.quotepath=false", "ls-files", "-z"]; if (glob) args.push(glob); - const out = await this.run(args); - if (out.length === 0) return []; - return out.split("\n").filter((l) => l.length > 0); + const r = await this.runRaw(args); + if (r.code !== 0) { + throw new Error(`git ls-files failed: ${r.stderr.trim()}`); + } + return r.stdout.split("\0").filter((p) => p.length > 0); } } @@ -331,7 +394,20 @@ export class VaultGit { function vaultGitEnv( extra?: Record, ): NodeJS.ProcessEnv { - const env: NodeJS.ProcessEnv = { ...process.env, ...extra }; + const env: NodeJS.ProcessEnv = { + ...process.env, + // Locale-independent output (defense in depth). We never parse localized + // prose, but pinning the locale prevents a future regression where some + // git message we DO key on is translated by an inherited LC_ALL/LANG. + LC_ALL: "C", + LANG: "C", + // Never page (we already pass --no-pager, but a stray GIT_PAGER could still + // bite) and never block on an interactive prompt (e.g. credentials) — the + // daemon runs unattended and must not hang. + GIT_PAGER: "cat", + GIT_TERMINAL_PROMPT: "0", + ...extra, + }; delete env.GIT_DIR; delete env.GIT_WORK_TREE; return env; diff --git a/src/pull.ts b/src/pull.ts index e948e6e..991368b 100644 --- a/src/pull.ts +++ b/src/pull.ts @@ -119,6 +119,9 @@ async function main(): Promise { // 1. Ensure the vault git repo exists with main + an initial commit, and the // engine-only `docmost` branch exists, branched from main. const git = new VaultGit(vaultRoot); + // Preflight: fail fast (with an actionable message via main().catch) if the + // git binary is missing — the entire vault state store relies on it. + await git.assertGitAvailable(); await git.ensureRepo(); // 1b. Refuse to run on top of an unresolved merge (SPEC §9 / §12). A previous diff --git a/test/git.test.ts b/test/git.test.ts index 7b56b30..a8a2e3a 100644 --- a/test/git.test.ts +++ b/test/git.test.ts @@ -1,5 +1,5 @@ import { execFile } from 'node:child_process'; -import { mkdtemp, rm, writeFile } from 'node:fs/promises'; +import { mkdir, mkdtemp, rm, writeFile } from 'node:fs/promises'; import { tmpdir } from 'node:os'; import { join } from 'node:path'; import { promisify } from 'node:util'; @@ -106,6 +106,67 @@ describe('VaultGit (integration; temp repo)', () => { expect(count.trim()).toBe('1'); }); + it('ensureRepo neutralizes correctness-affecting LOCAL config', async () => { + if (!available) return; + const vault = await freshDir(); + const git = new VaultGit(vault); + await git.ensureRepo(); + + // These LOCAL values neutralize a hostile GLOBAL/system config that would + // otherwise change porcelain BEHAVIOR and corrupt the vault (SPEC §11 for + // core.autocrlf; gpgsign/safecrlf for the headless daemon). + const localConfig = async (key: string): Promise => { + const { stdout } = await execFileAsync( + 'git', + ['config', '--local', '--get', key], + { cwd: vault }, + ); + return stdout.trim(); + }; + expect(await localConfig('core.autocrlf')).toBe('false'); + expect(await localConfig('commit.gpgsign')).toBe('false'); + expect(await localConfig('core.safecrlf')).toBe('false'); + expect(await localConfig('core.attributesFile')).toBe('/dev/null'); + + // Idempotent: a second run leaves the same single values (no duplicates). + await git.ensureRepo(); + expect(await localConfig('core.autocrlf')).toBe('false'); + expect(await localConfig('commit.gpgsign')).toBe('false'); + expect(await localConfig('core.safecrlf')).toBe('false'); + }); + + it('preserves LF bytes verbatim on commit (SPEC §11: autocrlf=false)', async () => { + if (!available) return; + const vault = await freshDir(); + const git = new VaultGit(vault); + await git.ensureRepo(); + + // Write content with explicit LF line endings. With a hostile + // core.autocrlf=true git would translate these to CRLF in the stored blob, + // breaking the byte-stable round-trip invariant. ensureRepo pins + // core.autocrlf=false locally, so the stored bytes must round-trip exactly. + const fileName = 'lf.md'; + const content = 'line1\nline2\nline3\n'; + await writeFile(join(vault, fileName), content, 'utf8'); + await git.stageAll(); + const made = await git.commit('add LF file', { + authorName: BOT_AUTHOR_NAME, + authorEmail: BOT_AUTHOR_EMAIL, + }); + expect(made).toBe(true); + + // Read the STORED blob (not the worktree file) and assert verbatim bytes: + // still LF-only, no CRLF translation. + const { stdout: stored } = await execFileAsync( + 'git', + ['--no-pager', 'show', `HEAD:${fileName}`], + { cwd: vault, encoding: 'buffer' }, + ); + const storedBuf = stored as unknown as Buffer; + expect(storedBuf.includes(Buffer.from('\r\n'))).toBe(false); + expect(storedBuf.toString('utf8')).toBe(content); + }); + it('ensureBranch creates the docmost branch from main', async () => { if (!available) return; const vault = await freshDir(); @@ -289,4 +350,53 @@ describe('VaultGit (integration; temp repo)', () => { const all = await git.listTrackedFiles(); expect(new Set(all)).toEqual(new Set(['keep.md', 'note.txt'])); }); + + it('listTrackedFiles returns RAW UTF-8 Cyrillic paths (not octal-escaped/quoted)', async () => { + if (!available) return; + const vault = await freshDir(); + const git = new VaultGit(vault); + await git.ensureRepo(); + + // The target wiki is Russian, so file names contain Cyrillic. With git's + // DEFAULT core.quotepath=true these come back as `"\320\232..."` from + // ls-files; `listTrackedFiles` must return them verbatim as UTF-8. + const topName = 'Колонка.md'; + const nestedDir = 'Раздел'; + const nestedName = 'Подстраница.md'; + await writeFile(join(vault, topName), 'top\n', 'utf8'); + await mkdir(join(vault, nestedDir), { recursive: true }); + await writeFile(join(vault, nestedDir, nestedName), 'nested\n', 'utf8'); + await git.stageAll(); + await git.commit('add cyrillic files', { + authorName: BOT_AUTHOR_NAME, + authorEmail: BOT_AUTHOR_EMAIL, + }); + + const md = await git.listTrackedFiles('*.md'); + // Exact UTF-8 names, forward-slash separated for the nested one — NOT an + // escaped/quoted form like `"\320\232..."`. + expect(new Set(md)).toEqual( + new Set([topName, `${nestedDir}/${nestedName}`]), + ); + // Guard explicitly against the quotepath regression: no entry is quoted or + // contains a backslash escape sequence. + for (const p of md) { + expect(p.startsWith('"')).toBe(false); + expect(p.includes('\\')).toBe(false); + } + + // No-glob listing also returns the raw Cyrillic names. + const all = await git.listTrackedFiles(); + expect(all).toContain(topName); + expect(all).toContain(`${nestedDir}/${nestedName}`); + }); + + it('assertGitAvailable resolves when git is present', async () => { + if (!available) return; + const vault = await freshDir(); + const git = new VaultGit(vault); + // No repo needed: it only probes `git --version` (and the vault dir need + // not even exist yet). + await expect(git.assertGitAvailable()).resolves.toBeUndefined(); + }); });