fix(git-sync): push 503 starvation + concurrent-edit marker leak/silent loss
Bug #1 (push 503 starvation): an external receive-pack that briefly overlapped a poll cycle immediately 503'd because the per-space single-writer lock was held. Add a BOUNDED retry-acquire on the PUSH path only (SpaceLockService .withSpaceLock acquireRetry: capped exponential backoff up to ~5s); a transient overlap now waits and succeeds, a genuinely stuck cycle still 503s after the bound. The poll cycle passes no retry (immediate skip). Push result stays deterministic: the receive-pack only runs once the lock is held, so a 503 never leaves a half-applied ref. Bug #2 (concurrent-edit marker leak + silent same-block loss): - Marker leak (a): the push UPDATE path stripped markers for the body sent to Docmost but left raw <<<<<<</>>>>>>> committed on the published `main` vault forever (autoMergeConflicts ON). Now the cleaned body is written back to the vault file + recorded in writtenBack so runPush commits it on `main` and the vault converges to clean bytes. - Marker leak (b): pin merge.conflictStyle=merge in ensureRepo and teach stripConflictMarkers/hasConflictMarkers about the diff3 `|||||||` base section (drop the marker AND the stale base region) so diff3/zdiff3 conflicts can never leak `|||||||` + base content into a page. Also scrub the 3-way merge BASE markdown. - Silent same-block loss: the block 3-way merge still resolves same-block conflicts deterministically to git, but it is no longer silent: diff3Plan now reports a conflict count (mergeXmlFragments3WayWithStats), gitSyncWriteBody logs it, and the persistence boundary-snapshot now fires for git-sync writes over a non-git-sync baseline so the human's pre-merge content is preserved in page history (recoverable). Full both-preserved persisted-conflict UI remains the deferred redesign. Tests: space-lock bounded-retry (success/stuck/poll-immediate); push vault-clean + diff3 ||||||| strip; ensureRepo conflictStyle pin; diff3Plan/3-way conflict counts; persistence git-sync boundary snapshot. Server tsc clean; git-sync vitest + server collaboration/git-sync jest all green. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -22,6 +22,11 @@ import { EnvironmentService } from '../../environment/environment.service';
|
||||
import { GitmostDataSourceService } from './gitmost-datasource.service';
|
||||
import { VaultRegistryService } from './vault-registry.service';
|
||||
import { SpaceLockService } from './space-lock.service';
|
||||
import {
|
||||
GIT_SYNC_PUSH_LOCK_RETRY_BASE_MS,
|
||||
GIT_SYNC_PUSH_LOCK_RETRY_MAX_MS,
|
||||
GIT_SYNC_PUSH_LOCK_RETRY_TOTAL_MS,
|
||||
} from '../git-sync.constants';
|
||||
|
||||
/** A space the poll loop should reconcile: its id + the workspace it lives in. */
|
||||
interface EnabledSpace {
|
||||
@@ -244,7 +249,9 @@ export class GitSyncOrchestrator implements OnModuleInit, OnModuleDestroy {
|
||||
}
|
||||
const serviceUserId = this.environmentService.getGitSyncServiceUserId();
|
||||
|
||||
const result = await this.spaceLock.withSpaceLock(spaceId, async (signal) => {
|
||||
const result = await this.spaceLock.withSpaceLock(
|
||||
spaceId,
|
||||
async (signal) => {
|
||||
// 1) Stream the receive-pack to the client (durable commits land on main).
|
||||
// Pass the lost-lock signal so the receive-pack child is killed if the lock
|
||||
// lapses mid-write (no concurrent working-tree writer across replicas).
|
||||
@@ -273,7 +280,23 @@ export class GitSyncOrchestrator implements OnModuleInit, OnModuleDestroy {
|
||||
);
|
||||
}
|
||||
return;
|
||||
});
|
||||
},
|
||||
// BOUNDED retry-acquire (push path only): a push that briefly overlaps a
|
||||
// poll cycle waits a moment (capped backoff up to the budget) instead of
|
||||
// immediately 503-ing — the cycle releases the lock in well under a second
|
||||
// for most spaces, so this turns a transient overlap into a SUCCESS rather
|
||||
// than a spurious failure. A genuinely long/stuck cycle still skips after
|
||||
// the bound -> GitSyncLockHeldError -> 503, and git retries the whole push
|
||||
// (the receive-pack only runs once the lock is held, so there is never a
|
||||
// half-applied ref on a 503).
|
||||
{
|
||||
acquireRetry: {
|
||||
timeoutMs: GIT_SYNC_PUSH_LOCK_RETRY_TOTAL_MS,
|
||||
baseMs: GIT_SYNC_PUSH_LOCK_RETRY_BASE_MS,
|
||||
maxMs: GIT_SYNC_PUSH_LOCK_RETRY_MAX_MS,
|
||||
},
|
||||
},
|
||||
);
|
||||
|
||||
// The lock was held (in-progress or another replica) — surface to the caller
|
||||
// so the HTTP handler can answer 503 and let git retry.
|
||||
|
||||
@@ -123,6 +123,65 @@ describe('SpaceLockService', () => {
|
||||
});
|
||||
});
|
||||
|
||||
// Bug #1 (push 503 starvation): the PUSH path passes a bounded acquireRetry so a
|
||||
// transient overlap with a poll cycle is retried (and succeeds) instead of an
|
||||
// immediate 503. A genuinely stuck lock still skips after the bound. The poll
|
||||
// cycle passes NO retry (immediate skip), so only the push path waits.
|
||||
describe('bounded acquire-retry (push path)', () => {
|
||||
const retry = { timeoutMs: 5_000, baseMs: 100, maxMs: 500 };
|
||||
|
||||
it('retries the acquire and SUCCEEDS when the lock is briefly held then released', async () => {
|
||||
const { service, redis } = build();
|
||||
// First acquire attempt fails (lock briefly held by a cycle), the next
|
||||
// succeeds — the bounded retry must turn this into a SUCCESS, not a skip.
|
||||
redis.set
|
||||
.mockResolvedValueOnce(null) // attempt 1: held
|
||||
.mockResolvedValueOnce(null) // attempt 2: still held
|
||||
.mockResolvedValue('OK'); // attempt 3+: released -> acquired
|
||||
const fn = jest.fn(async () => 'pushed');
|
||||
|
||||
const result = await service.withSpaceLock('space-1', fn, {
|
||||
acquireRetry: retry,
|
||||
});
|
||||
|
||||
expect(result).toBe('pushed');
|
||||
expect(fn).toHaveBeenCalledTimes(1);
|
||||
expect(redis.set.mock.calls.length).toBeGreaterThanOrEqual(3);
|
||||
// The acquired lock is released in finally (DEL-CAS eval).
|
||||
expect(redis.eval).toHaveBeenCalledTimes(1);
|
||||
expect(redis.eval.mock.calls[0][0]).toContain('del');
|
||||
});
|
||||
|
||||
it('still skips (lock-held) after the bound when the lock stays stuck — and never runs fn', async () => {
|
||||
const { service, redis } = build();
|
||||
redis.set.mockResolvedValue(null); // permanently held
|
||||
const fn = jest.fn(async () => 'pushed');
|
||||
|
||||
const result = await service.withSpaceLock('space-1', fn, {
|
||||
acquireRetry: { timeoutMs: 300, baseMs: 50, maxMs: 100 },
|
||||
});
|
||||
|
||||
expect(result).toEqual({ skipped: 'lock-held' });
|
||||
expect(fn).not.toHaveBeenCalled();
|
||||
// It retried more than once before giving up (bound > one interval).
|
||||
expect(redis.set.mock.calls.length).toBeGreaterThan(1);
|
||||
// Never acquired -> never released.
|
||||
expect(redis.eval).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it('without acquireRetry (poll path) a held lock skips IMMEDIATELY (single attempt)', async () => {
|
||||
const { service, redis } = build();
|
||||
redis.set.mockResolvedValue(null);
|
||||
const fn = jest.fn(async () => 'cycle');
|
||||
|
||||
const result = await service.withSpaceLock('space-1', fn);
|
||||
|
||||
expect(result).toEqual({ skipped: 'lock-held' });
|
||||
expect(redis.set).toHaveBeenCalledTimes(1); // no retry
|
||||
expect(fn).not.toHaveBeenCalled();
|
||||
});
|
||||
});
|
||||
|
||||
describe('fn throwing', () => {
|
||||
it('propagates the throw AND still releases (eval) in finally', async () => {
|
||||
const { service, redis } = build();
|
||||
|
||||
@@ -120,41 +120,57 @@ export class SpaceLockService {
|
||||
}
|
||||
|
||||
/**
|
||||
* Run `fn` under the per-space lock: the in-process mutex (no overlapping
|
||||
* cycles on this instance) AND the Redis leader lock (single writer across
|
||||
* replicas). Returns `fn`'s result, or a skip sentinel when the lock could not
|
||||
* be acquired — `{ skipped: 'in-progress' }` (this instance is mid-cycle) or
|
||||
* `{ skipped: 'lock-held' }` (another replica holds the Redis lock). The mutex
|
||||
* + Redis lock are always released in a `finally`, even when `fn` throws (the
|
||||
* throw propagates to the caller). This is the single reusable wrapper shared
|
||||
* by `runOnce` (the poll/admin cycle) and `ingestExternalPush` (a push from a
|
||||
* git client over HTTP) so both serialize against each other identically.
|
||||
* Options for `withSpaceLock`. `acquireRetry` (PUSH path only) bounds a
|
||||
* retry-acquire loop: if the lock cannot be entered on the first try, keep
|
||||
* retrying with a capped exponential backoff until `timeoutMs` elapses before
|
||||
* returning the skip sentinel. The poll cycle holds the lock while it
|
||||
* processes a whole space, so a legitimate external push that briefly overlaps
|
||||
* a cycle should WAIT a moment rather than immediately 503 (bug: ~60% of
|
||||
* pushes 503'd under continuous polling). The poll cycle passes NO retry (it
|
||||
* just skips and the next tick reconciles).
|
||||
*/
|
||||
async withSpaceLock<T>(
|
||||
spaceId: string,
|
||||
fn: (signal: AbortSignal) => Promise<T>,
|
||||
options?: {
|
||||
acquireRetry?: { timeoutMs: number; baseMs: number; maxMs: number };
|
||||
},
|
||||
): Promise<T | { skipped: 'lock-held' | 'in-progress' }> {
|
||||
if (this.running.has(spaceId)) {
|
||||
return { skipped: 'in-progress' };
|
||||
}
|
||||
// Cross-instance, same-process single-writer guard: another live holder (a
|
||||
// different SpaceLockService in this process) is mid-cycle for this space.
|
||||
// This survives a swallowed heartbeat / Redis TTL lapse, so a second writer
|
||||
// in the process cannot race the working tree — it is rejected 'lock-held'.
|
||||
if (SpaceLockService.liveLocks.has(spaceId)) {
|
||||
return { skipped: 'lock-held' };
|
||||
}
|
||||
// Reserve the in-process slot synchronously (before any await) so two
|
||||
// concurrent same-space calls on THIS instance cannot both pass the guard and
|
||||
// race acquire(). Redis NX is already authoritative across replicas; this just
|
||||
// closes the in-process TOCTOU window. Released in the outer finally on every
|
||||
// path (acquire-failure, fn-throw, normal completion).
|
||||
this.running.add(spaceId);
|
||||
SpaceLockService.liveLocks.set(spaceId, this.instanceId);
|
||||
try {
|
||||
if (!(await this.acquire(spaceId))) {
|
||||
const retry = options?.acquireRetry;
|
||||
// Deadline for the bounded retry-acquire (push path). `Date.now()` once so a
|
||||
// slow first attempt does not over-extend the budget.
|
||||
const deadline = retry ? Date.now() + retry.timeoutMs : 0;
|
||||
let attempt = 0;
|
||||
for (;;) {
|
||||
// Reserve the in-process slot synchronously (before any await) so two
|
||||
// concurrent same-space calls on THIS instance cannot both pass the guard
|
||||
// and race acquire(). On any failure this is released before we retry/skip.
|
||||
const reservation = this.tryReserveInProcess(spaceId);
|
||||
if (reservation) {
|
||||
// Could not even reserve in-process (this instance mid-cycle, or another
|
||||
// live holder in the process). Retry within the bound, else skip.
|
||||
if (retry && Date.now() < deadline) {
|
||||
await this.sleep(this.nextBackoff(attempt++, retry, deadline));
|
||||
continue;
|
||||
}
|
||||
return reservation;
|
||||
}
|
||||
// Reserved in-process — now contend for the Redis leader lock. Release the
|
||||
// in-process slot on EVERY non-running path so a retry/skip leaves no leak.
|
||||
let acquired = false;
|
||||
try {
|
||||
acquired = await this.acquire(spaceId);
|
||||
} finally {
|
||||
if (!acquired) this.releaseInProcess(spaceId);
|
||||
}
|
||||
if (!acquired) {
|
||||
if (retry && Date.now() < deadline) {
|
||||
await this.sleep(this.nextBackoff(attempt++, retry, deadline));
|
||||
continue;
|
||||
}
|
||||
return { skipped: 'lock-held' };
|
||||
}
|
||||
// Both locks held — run `fn` under the heartbeat, releasing in `finally`.
|
||||
// Lost-lock signal: a failed/CAS-missed heartbeat refresh aborts this so the
|
||||
// protected fn can stop instead of writing blind after our lock lapsed.
|
||||
const controller = new AbortController();
|
||||
@@ -172,10 +188,64 @@ export class SpaceLockService {
|
||||
} finally {
|
||||
clearInterval(heartbeat);
|
||||
await this.release(spaceId);
|
||||
this.releaseInProcess(spaceId);
|
||||
}
|
||||
} finally {
|
||||
this.running.delete(spaceId);
|
||||
SpaceLockService.liveLocks.delete(spaceId);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Synchronously try to reserve the in-process single-writer slot for a space.
|
||||
* Returns a skip sentinel when another holder is live (this instance mid-cycle
|
||||
* -> 'in-progress'; another SpaceLockService in this process -> 'lock-held'),
|
||||
* or `null` when the slot was reserved (caller MUST `releaseInProcess` later).
|
||||
* Both checks + the reservation happen with NO await between them so two
|
||||
* concurrent same-space calls cannot both pass.
|
||||
*/
|
||||
private tryReserveInProcess(
|
||||
spaceId: string,
|
||||
): { skipped: 'lock-held' | 'in-progress' } | null {
|
||||
if (this.running.has(spaceId)) {
|
||||
return { skipped: 'in-progress' };
|
||||
}
|
||||
// Cross-instance, same-process single-writer guard: another live holder (a
|
||||
// different SpaceLockService in this process) is mid-cycle for this space.
|
||||
// This survives a swallowed heartbeat / Redis TTL lapse, so a second writer
|
||||
// in the process cannot race the working tree — it is rejected 'lock-held'.
|
||||
if (SpaceLockService.liveLocks.has(spaceId)) {
|
||||
return { skipped: 'lock-held' };
|
||||
}
|
||||
this.running.add(spaceId);
|
||||
SpaceLockService.liveLocks.set(spaceId, this.instanceId);
|
||||
return null;
|
||||
}
|
||||
|
||||
/** Release the in-process single-writer slot reserved by tryReserveInProcess. */
|
||||
private releaseInProcess(spaceId: string): void {
|
||||
this.running.delete(spaceId);
|
||||
SpaceLockService.liveLocks.delete(spaceId);
|
||||
}
|
||||
|
||||
/**
|
||||
* Backoff (ms) before the next push lock-acquire attempt: capped exponential
|
||||
* (`baseMs * 2^attempt`, ceilinged at `maxMs`) clamped so it never overshoots
|
||||
* the retry `deadline`. Deterministic (no jitter) so the bound is testable.
|
||||
*/
|
||||
private nextBackoff(
|
||||
attempt: number,
|
||||
retry: { baseMs: number; maxMs: number },
|
||||
deadline: number,
|
||||
): number {
|
||||
const exp = retry.baseMs * 2 ** attempt;
|
||||
const capped = Math.min(exp, retry.maxMs);
|
||||
const remaining = Math.max(0, deadline - Date.now());
|
||||
return Math.max(0, Math.min(capped, remaining));
|
||||
}
|
||||
|
||||
/** Promise-based delay (extracted so tests can reason about the retry loop). */
|
||||
private sleep(ms: number): Promise<void> {
|
||||
return new Promise((resolve) => {
|
||||
const t = setTimeout(resolve, ms);
|
||||
t.unref?.();
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user