fix(sync): robust git coupling — non-ASCII paths, config neutralization, runtime git

Address git-integration fragility (output is not parsed for control flow; we rely
on exit codes + plumbing — but porcelain BEHAVIOR is config-sensitive, and the
runtime image lacked git).

- listTrackedFiles: `git -c core.quotepath=false ls-files -z` + NUL split — fixes
  Cyrillic/UTF-8 vault filenames being returned octal-escaped/quoted
- Dockerfile: install git (node:22-slim ships none; the daemon shells out at runtime)
- VaultGit env: LC_ALL=C/LANG=C, GIT_PAGER=cat, GIT_TERMINAL_PROMPT=0; keep
  stripping GIT_DIR/GIT_WORK_TREE (cwd-isolation, §12)
- ensureRepo local config: core.autocrlf=false + core.safecrlf=false (protect §11
  byte-stability from a global autocrlf=true), commit.gpgsign=false, and
  core.attributesFile=/dev/null (neutralize a global clean/smudge filter that
  would rewrite the stored blob); commit uses --no-verify (skip injected hooks)
- assertGitAvailable() preflight: clear error if the git binary is missing
- tests: Cyrillic listTrackedFiles, LF byte-preservation of the stored blob,
  local-config neutralization incl. attributesFile (590+ green)
This commit is contained in:
vvzvlad
2026-06-17 00:15:17 +03:00
parent 531b320776
commit ec0a3d47c7
4 changed files with 203 additions and 7 deletions

View File

@@ -2,6 +2,13 @@ FROM node:22-slim
WORKDIR /app
# The daemon shells out to the system `git` binary at runtime (git is the vault
# state store), but node:22-slim does NOT ship git. Install it and KEEP it in
# the final image. Placed before `npm ci` so this layer caches across rebuilds.
# `npm prune --omit=dev` below cannot remove this — it is an OS package, not an
# npm dependency.
RUN apt-get update && apt-get install -y --no-install-recommends git && rm -rf /var/lib/apt/lists/*
# Dependencies first (better layer caching): copy the root manifest, the lock,
# and the workspace package manifest so `npm ci` can link the workspace.
COPY package.json package-lock.json ./

View File

@@ -58,6 +58,26 @@ export interface CommitOptions {
export class VaultGit {
constructor(private readonly vaultPath: string) {}
/**
* Preflight: verify a runnable `git` binary is on PATH. The daemon shells out
* to system `git` for every vault operation, so a missing binary (e.g. a slim
* container image without git) must fail fast with an actionable message
* rather than a cryptic ENOENT deep inside the first real git call. Presence
* check only — we do NOT gate on a specific version. Runs `git --version`
* with NO `cwd` (the vault dir may not exist yet at preflight time).
*/
async assertGitAvailable(): Promise<void> {
try {
await execFileAsync("git", ["--version"], { env: vaultGitEnv() });
} catch (err: unknown) {
const e = err as { message?: string };
throw new Error(
"git binary not found or not runnable — install git (the vault state " +
`store requires it). Underlying error: ${(e.message ?? "").trim()}`,
);
}
}
/**
* Run `git --no-pager <args...>` in the vault. Returns trimmed stdout.
* Throws a clear Error (including stderr) on a non-zero exit.
@@ -132,6 +152,32 @@ export class VaultGit {
await this.run(["config", "user.email", BOT_AUTHOR_EMAIL]);
}
// Neutralize correctness-affecting git config in the vault's LOCAL config so
// a user's GLOBAL/system config cannot change porcelain BEHAVIOR (not just
// output) and corrupt the vault. The vault is OUR dedicated repo, so LOCAL
// values (which override global/system) are the right scope. Set
// UNCONDITIONALLY every run — idempotent and cheap; `git config <key>`
// writes to `--local` by default inside the repo. These MUST be in place
// before any add/commit/checkout that could be affected, hence they run
// before the initial-commit block below.
// - core.autocrlf=false — CRITICAL (SPEC §11): a global core.autocrlf=true
// would rewrite LF<->CRLF on add/checkout, making our deterministic,
// byte-stable markdown churn and breaking the round-trip invariant.
// `false` guarantees git stores/checks out verbatim bytes.
// - core.safecrlf=false — avoid CRLF-related warnings/aborts on add.
// - commit.gpgsign=false — the headless daemon must never try to GPG-sign
// a commit (would fail/hang; we already set GIT_TERMINAL_PROMPT=0).
// - core.attributesFile=/dev/null — neutralize the user's GLOBAL
// gitattributes so a global clean/smudge filter (filter.<name>.clean)
// cannot rewrite the STORED blob and break §11 byte-stability (a config
// that core.autocrlf=false does not cover). POSIX-only path, which is
// fine: the daemon runs on Linux (Docker) / macOS. A system
// /etc/gitattributes remains the host admin's domain (out of scope).
await this.run(["config", "core.autocrlf", "false"]);
await this.run(["config", "core.safecrlf", "false"]);
await this.run(["config", "commit.gpgsign", "false"]);
await this.run(["config", "core.attributesFile", "/dev/null"]);
// Create the initial empty commit on `main` if the repo has no commits yet,
// so both `main` and (later) `docmost` branches have a common base.
if (!(await this.hasAnyCommit())) {
@@ -257,7 +303,10 @@ export class VaultGit {
opts: CommitOptions & { allowEmpty?: boolean },
): Promise<void> {
const fullMessage = buildCommitMessage(message, opts.trailers);
const args = ["commit", "-m", fullMessage];
// `--no-verify` skips pre-commit/commit-msg hooks: a global core.hooksPath
// (or any injected hook) must never interfere with engine commits in our
// dedicated vault repo.
const args = ["commit", "--no-verify", "-m", fullMessage];
if (opts.allowEmpty) args.push("--allow-empty");
await execFileAsync("git", ["--no-pager", ...args], {
@@ -308,13 +357,27 @@ export class VaultGit {
* List tracked files on the current branch (paths relative to the vault
* root, forward-slash separated). An optional glob (a git pathspec) narrows
* the listing, e.g. `"*.md"`.
*
* The target wiki is RUSSIAN, so vault file names routinely contain Cyrillic
* (e.g. `Колонка.md`). With git's DEFAULT `core.quotepath=true`, `ls-files`
* returns non-ASCII paths octal-escaped and double-quoted (`"\320\232..."`),
* which `src/pull.ts` `readExisting` would then parse as garbage paths,
* breaking move/duplicate detection. We defeat that two ways at once:
* - `-c core.quotepath=false` disables the octal-escape/quoting.
* - `-z` emits NUL-delimited RAW UTF-8 paths (no quoting, no newline
* ambiguity), which we split on `\0`.
* We read the RAW stdout (NOT the trimming `run()` helper, which would mangle
* the NUL-delimited bytes) and split on `\0`, dropping empty entries. Paths
* are returned verbatim — git already emits forward slashes.
*/
async listTrackedFiles(glob?: string): Promise<string[]> {
const args = ["ls-files"];
const args = ["-c", "core.quotepath=false", "ls-files", "-z"];
if (glob) args.push(glob);
const out = await this.run(args);
if (out.length === 0) return [];
return out.split("\n").filter((l) => l.length > 0);
const r = await this.runRaw(args);
if (r.code !== 0) {
throw new Error(`git ls-files failed: ${r.stderr.trim()}`);
}
return r.stdout.split("\0").filter((p) => p.length > 0);
}
}
@@ -331,7 +394,20 @@ export class VaultGit {
function vaultGitEnv(
extra?: Record<string, string>,
): NodeJS.ProcessEnv {
const env: NodeJS.ProcessEnv = { ...process.env, ...extra };
const env: NodeJS.ProcessEnv = {
...process.env,
// Locale-independent output (defense in depth). We never parse localized
// prose, but pinning the locale prevents a future regression where some
// git message we DO key on is translated by an inherited LC_ALL/LANG.
LC_ALL: "C",
LANG: "C",
// Never page (we already pass --no-pager, but a stray GIT_PAGER could still
// bite) and never block on an interactive prompt (e.g. credentials) — the
// daemon runs unattended and must not hang.
GIT_PAGER: "cat",
GIT_TERMINAL_PROMPT: "0",
...extra,
};
delete env.GIT_DIR;
delete env.GIT_WORK_TREE;
return env;

View File

@@ -119,6 +119,9 @@ async function main(): Promise<void> {
// 1. Ensure the vault git repo exists with main + an initial commit, and the
// engine-only `docmost` branch exists, branched from main.
const git = new VaultGit(vaultRoot);
// Preflight: fail fast (with an actionable message via main().catch) if the
// git binary is missing — the entire vault state store relies on it.
await git.assertGitAvailable();
await git.ensureRepo();
// 1b. Refuse to run on top of an unresolved merge (SPEC §9 / §12). A previous

View File

@@ -1,5 +1,5 @@
import { execFile } from 'node:child_process';
import { mkdtemp, rm, writeFile } from 'node:fs/promises';
import { mkdir, mkdtemp, rm, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { promisify } from 'node:util';
@@ -106,6 +106,67 @@ describe('VaultGit (integration; temp repo)', () => {
expect(count.trim()).toBe('1');
});
it('ensureRepo neutralizes correctness-affecting LOCAL config', async () => {
if (!available) return;
const vault = await freshDir();
const git = new VaultGit(vault);
await git.ensureRepo();
// These LOCAL values neutralize a hostile GLOBAL/system config that would
// otherwise change porcelain BEHAVIOR and corrupt the vault (SPEC §11 for
// core.autocrlf; gpgsign/safecrlf for the headless daemon).
const localConfig = async (key: string): Promise<string> => {
const { stdout } = await execFileAsync(
'git',
['config', '--local', '--get', key],
{ cwd: vault },
);
return stdout.trim();
};
expect(await localConfig('core.autocrlf')).toBe('false');
expect(await localConfig('commit.gpgsign')).toBe('false');
expect(await localConfig('core.safecrlf')).toBe('false');
expect(await localConfig('core.attributesFile')).toBe('/dev/null');
// Idempotent: a second run leaves the same single values (no duplicates).
await git.ensureRepo();
expect(await localConfig('core.autocrlf')).toBe('false');
expect(await localConfig('commit.gpgsign')).toBe('false');
expect(await localConfig('core.safecrlf')).toBe('false');
});
it('preserves LF bytes verbatim on commit (SPEC §11: autocrlf=false)', async () => {
if (!available) return;
const vault = await freshDir();
const git = new VaultGit(vault);
await git.ensureRepo();
// Write content with explicit LF line endings. With a hostile
// core.autocrlf=true git would translate these to CRLF in the stored blob,
// breaking the byte-stable round-trip invariant. ensureRepo pins
// core.autocrlf=false locally, so the stored bytes must round-trip exactly.
const fileName = 'lf.md';
const content = 'line1\nline2\nline3\n';
await writeFile(join(vault, fileName), content, 'utf8');
await git.stageAll();
const made = await git.commit('add LF file', {
authorName: BOT_AUTHOR_NAME,
authorEmail: BOT_AUTHOR_EMAIL,
});
expect(made).toBe(true);
// Read the STORED blob (not the worktree file) and assert verbatim bytes:
// still LF-only, no CRLF translation.
const { stdout: stored } = await execFileAsync(
'git',
['--no-pager', 'show', `HEAD:${fileName}`],
{ cwd: vault, encoding: 'buffer' },
);
const storedBuf = stored as unknown as Buffer;
expect(storedBuf.includes(Buffer.from('\r\n'))).toBe(false);
expect(storedBuf.toString('utf8')).toBe(content);
});
it('ensureBranch creates the docmost branch from main', async () => {
if (!available) return;
const vault = await freshDir();
@@ -289,4 +350,53 @@ describe('VaultGit (integration; temp repo)', () => {
const all = await git.listTrackedFiles();
expect(new Set(all)).toEqual(new Set(['keep.md', 'note.txt']));
});
it('listTrackedFiles returns RAW UTF-8 Cyrillic paths (not octal-escaped/quoted)', async () => {
if (!available) return;
const vault = await freshDir();
const git = new VaultGit(vault);
await git.ensureRepo();
// The target wiki is Russian, so file names contain Cyrillic. With git's
// DEFAULT core.quotepath=true these come back as `"\320\232..."` from
// ls-files; `listTrackedFiles` must return them verbatim as UTF-8.
const topName = 'Колонка.md';
const nestedDir = 'Раздел';
const nestedName = 'Подстраница.md';
await writeFile(join(vault, topName), 'top\n', 'utf8');
await mkdir(join(vault, nestedDir), { recursive: true });
await writeFile(join(vault, nestedDir, nestedName), 'nested\n', 'utf8');
await git.stageAll();
await git.commit('add cyrillic files', {
authorName: BOT_AUTHOR_NAME,
authorEmail: BOT_AUTHOR_EMAIL,
});
const md = await git.listTrackedFiles('*.md');
// Exact UTF-8 names, forward-slash separated for the nested one — NOT an
// escaped/quoted form like `"\320\232..."`.
expect(new Set(md)).toEqual(
new Set([topName, `${nestedDir}/${nestedName}`]),
);
// Guard explicitly against the quotepath regression: no entry is quoted or
// contains a backslash escape sequence.
for (const p of md) {
expect(p.startsWith('"')).toBe(false);
expect(p.includes('\\')).toBe(false);
}
// No-glob listing also returns the raw Cyrillic names.
const all = await git.listTrackedFiles();
expect(all).toContain(topName);
expect(all).toContain(`${nestedDir}/${nestedName}`);
});
it('assertGitAvailable resolves when git is present', async () => {
if (!available) return;
const vault = await freshDir();
const git = new VaultGit(vault);
// No repo needed: it only probes `git --version` (and the vault dir need
// not even exist yet).
await expect(git.assertGitAvailable()).resolves.toBeUndefined();
});
});