From a20f4c3876580709ab7837e964e89df1ab5f04ad Mon Sep 17 00:00:00 2001 From: claude code agent 227 Date: Sun, 21 Jun 2026 04:14:38 +0300 Subject: [PATCH] fix(mcp): close the brute-force limiter check-then-act race (#83) isBlocked was checked synchronously but recordFailure ran only AFTER the bcrypt awaits, so N concurrent /mcp Basic requests for one email all slipped past the threshold. Add FailedLoginLimiter.tryReserve (atomic synchronous check+increment) + release (undo), and reserve all 3 keys BEFORE any await so the (threshold+1)-th concurrent attempt is rejected before its bcrypt runs. The reservation IS the failure record (post-await recordFailure removed -> counted exactly once). Non- credential early throws (missing workspace, SSO/MFA gate) and business errors release the reservation so they don't burn a victim's budget; success clears. Tests prove login() runs exactly threshold times under concurrency and that gate/config rejects don't consume budget. Co-Authored-By: Claude Opus 4.8 --- .../src/integrations/mcp/mcp-auth.helpers.ts | 174 +++++++++++++----- .../src/integrations/mcp/mcp.service.spec.ts | 159 ++++++++++++++++ 2 files changed, 286 insertions(+), 47 deletions(-) diff --git a/apps/server/src/integrations/mcp/mcp-auth.helpers.ts b/apps/server/src/integrations/mcp/mcp-auth.helpers.ts index c3aa3292..f71dff9a 100644 --- a/apps/server/src/integrations/mcp/mcp-auth.helpers.ts +++ b/apps/server/src/integrations/mcp/mcp-auth.helpers.ts @@ -84,6 +84,39 @@ export class FailedLoginLimiter { b.count += 1; } + /** + * Atomic check-and-reserve: if the key is already at/over the threshold this + * window, return false (blocked). Otherwise count this in-flight attempt + * (count += 1) and return true. Being synchronous, concurrent callers cannot + * interleave between the check and the increment, so the (threshold+1)-th + * concurrent attempt is rejected even before its bcrypt runs. + * + * This is the brute-force fix for the /mcp Basic path: the increment happens + * BEFORE the async credential check, not after it, so N concurrent requests for + * one email cannot all observe count=0 and all run bcrypt. A failed login then + * leaves the reservation in place (it IS the recorded failure); a SUCCESSFUL + * login clears it via reset(); a non-credential business error releases it via + * release() so it does not count as a guessed-password signal. + */ + tryReserve(key: string, now: number = Date.now()): boolean { + const b = this.bucket(key, now); + if (b.count >= this.threshold) return false; + b.count += 1; + return true; + } + + /** + * Undo a previous tryReserve for the key within the same window (count -= 1, + * floored at 0). Used to release an optimistic in-flight reservation when the + * attempt turned out NOT to be a password-guess signal (e.g. an "email not + * verified" business error), so it does not burn a victim's limiter budget. + * A no-op if the bucket rolled over to a fresh window in the meantime. + */ + release(key: string, now: number = Date.now()): void { + const b = this.bucket(key, now); + if (b.count > 0) b.count -= 1; + } + /** Clear the key after a successful login so it does not accumulate. */ reset(key: string): void { this.buckets.delete(key); @@ -530,51 +563,84 @@ export async function resolveMcpSessionConfig( // trusted proxy is configured; the per-email global key is the part that // does NOT depend on a trustworthy IP and is the real brute-force backstop. const emailKey = `email:${emailLc}`; - if ( - deps.limiter.isBlocked(ipKey) || - deps.limiter.isBlocked(ipEmailKey) || - deps.limiter.isBlocked(emailKey) - ) { + // Atomic check-AND-reserve, synchronously and BEFORE any await. The old code + // did a read-only isBlocked() pre-check here and only recordFailure()'d the + // failure AFTER the awaited bcrypt login — so N concurrent requests for one + // email all saw count=0, all ran bcrypt, all failed, and only then all + // recorded, blowing far past the threshold. tryReserve() folds the check and + // the increment into one synchronous, non-interleavable step: it counts this + // in-flight attempt NOW, so the (threshold+1)-th concurrent attempt is + // rejected before its bcrypt ever runs. The reservation IS the recorded + // failure (no separate recordFailure on the failure path below); a successful + // login clears it via reset(), and a non-credential business error releases + // it via release(). Reserve ALL keys so each per-key budget is charged. + const ipOk = deps.limiter.tryReserve(ipKey); + const ipEmailOk = deps.limiter.tryReserve(ipEmailKey); + const emailOk = deps.limiter.tryReserve(emailKey); + if (!ipOk || !ipEmailOk || !emailOk) { + // At least one key is at/over threshold: blocked. Release the keys we DID + // manage to reserve in this same call so a rejected (already-throttled) + // request does not over-charge the keys that were still under budget — the + // same observable outcome as the old isBlocked() pre-check, which never + // incremented on a blocked request. + if (ipOk) deps.limiter.release(ipKey); + if (ipEmailOk) deps.limiter.release(ipEmailKey); + if (emailOk) deps.limiter.release(emailKey); throw new UnauthorizedException( 'Too many failed MCP login attempts. Try again later.', ); } - const workspace = await deps.findWorkspace(); - if (!workspace) { - throw new UnauthorizedException('No workspace is configured.'); - } - - // SSO/MFA pre-token gate (BLOCKER fix): replicate the AuthController.login - // gates BEFORE any token is issued on the Basic path. If the workspace - // enforces SSO, or the EE MFA module is bundled and this user/workspace - // requires MFA, this throws and we never mint a token. The Bearer path is - // intentionally NOT gated here (its JWT was already minted post-gate). This - // runs on BOTH init and subsequent Basic requests, but it must run before - // login()/verifyCredentials so an SSO/MFA user cannot authenticate at all. - // We do NOT count a gate rejection toward the brute-force limiter: it is not - // a password-guess signal. - if (deps.enforceBasicGate) { - await deps.enforceBasicGate(workspace, { - email: basic.email, - password: basic.password, - }); - } - - // Fix 1 (init vs subsequent): - // - SESSION INIT (no mcp-session-id): full login() mints the user JWT - // (the one allowed session creation + audit event for this MCP - // session). The DocmostClient caches that token, so later tool calls - // never re-login. - // - SUBSEQUENT request (has mcp-session-id): we only need to re-validate - // the caller's credentials for anti-fixation. verifyCredentials() does - // the SAME lookup/password/email-verified/disabled checks as login() - // but mints NO session, writes NO audit row and updates NO lastLoginAt, - // so a correct repeat does not spawn a DB session per request while a - // wrong password still 401s. The getToken here is never used to mint a - // new session: on a subsequent request the existing session already - // holds its token; this config is only consulted at init. + // Everything from here through the credential evaluation runs UNDER one + // try/catch so a SINGLE rule governs the reservation we took above: + // "release the reserved keys unless the error is a genuine credential + // failure." That covers all three early-throw paths uniformly — + // (a) findWorkspace() returning null (a CONFIG error), + // (b) the SSO/MFA enforceBasicGate throwing (a BUSINESS error), + // (c) login()/verifyCredentials() throwing a non-credential business error + // (e.g. "email not verified") — + // none of which are password-guess signals, so none may burn a victim's + // limiter budget. Only a genuine credential failure (isCredentialsFailure) + // leaves the reservation in place, because the reservation IS its recorded + // failure. Without this, an attacker could exhaust a victim's per-email + // backstop with SSO/MFA-gated or misconfigured-workspace requests that never + // even run bcrypt. The reservation stays at the TOP (before any await) so the + // concurrency race the #83 fix closed is NOT re-introduced. try { + const workspace = await deps.findWorkspace(); + if (!workspace) { + throw new UnauthorizedException('No workspace is configured.'); + } + + // SSO/MFA pre-token gate (BLOCKER fix): replicate the AuthController.login + // gates BEFORE any token is issued on the Basic path. If the workspace + // enforces SSO, or the EE MFA module is bundled and this user/workspace + // requires MFA, this throws and we never mint a token. The Bearer path is + // intentionally NOT gated here (its JWT was already minted post-gate). This + // runs on BOTH init and subsequent Basic requests, but it must run before + // login()/verifyCredentials so an SSO/MFA user cannot authenticate at all. + // We do NOT count a gate rejection toward the brute-force limiter: it is + // not a password-guess signal (the catch below releases the reservation). + if (deps.enforceBasicGate) { + await deps.enforceBasicGate(workspace, { + email: basic.email, + password: basic.password, + }); + } + + // Fix 1 (init vs subsequent): + // - SESSION INIT (no mcp-session-id): full login() mints the user JWT + // (the one allowed session creation + audit event for this MCP + // session). The DocmostClient caches that token, so later tool calls + // never re-login. + // - SUBSEQUENT request (has mcp-session-id): we only need to re-validate + // the caller's credentials for anti-fixation. verifyCredentials() does + // the SAME lookup/password/email-verified/disabled checks as login() + // but mints NO session, writes NO audit row and updates NO lastLoginAt, + // so a correct repeat does not spawn a DB session per request while a + // wrong password still 401s. The getToken here is never used to mint a + // new session: on a subsequent request the existing session already + // holds its token; this config is only consulted at init. if (deps.isSessionInit) { const authToken = await deps.login( { email: basic.email, password: basic.password }, @@ -593,14 +659,19 @@ export async function resolveMcpSessionConfig( workspace.id, ); } catch (err) { - // Only count an actual CREDENTIALS failure (wrong email/password) toward - // the brute-force limiter. Business errors like "email not verified" are - // a 401/400 surface but are NOT a guessed-password signal, so they must - // not let an attacker burn a victim's limiter budget or mask brute-force. - if (isCredentialsFailure(err)) { - deps.limiter.recordFailure(ipKey); - deps.limiter.recordFailure(ipEmailKey); - deps.limiter.recordFailure(emailKey); + // The in-flight reservation taken above already counted this attempt, so + // an actual CREDENTIALS failure (wrong email/password) needs NO separate + // recordFailure — the reservation IS the recorded failure (avoiding the + // old double-count). But ANY other throw between the reservation and here + // — a missing-workspace config error, an SSO/MFA gate rejection, or a + // business error like "email not verified" — is a 401/400 surface, NOT a + // guessed-password signal, so it must not burn a victim's limiter budget: + // release the optimistic reservation (only the keys we actually reserved, + // which on this non-blocked path is all three) in that case. + if (!isCredentialsFailure(err)) { + deps.limiter.release(ipKey); + deps.limiter.release(ipEmailKey); + deps.limiter.release(emailKey); } const message = err instanceof Error && err.message @@ -616,8 +687,17 @@ export async function resolveMcpSessionConfig( // email. The global email key is reset ONLY on a session-INIT login() // success (above), which is a single deliberate authentication, not a // high-frequency re-validation. + // + // Under the reserve model we DID optimistically increment emailKey up front + // (tryReserve), so a plain "leave it intact" would let every periodic tool + // call of the victim's own live session permanently grow their email bucket + // and throttle THEMSELVES. release() undoes exactly the one increment THIS + // call took (count -= 1), restoring the pre-request budget — it does NOT + // clear a parallel attacker's accumulated failures (that's reset()), so the + // brute-force backstop survives while the victim's success is budget-neutral. deps.limiter.reset(ipKey); deps.limiter.reset(ipEmailKey); + deps.limiter.release(emailKey); return { config: { apiUrl, getToken: async () => '' }, identity: `basic:${emailLc}`, diff --git a/apps/server/src/integrations/mcp/mcp.service.spec.ts b/apps/server/src/integrations/mcp/mcp.service.spec.ts index cfa8472d..a6f93d5d 100644 --- a/apps/server/src/integrations/mcp/mcp.service.spec.ts +++ b/apps/server/src/integrations/mcp/mcp.service.spec.ts @@ -209,6 +209,71 @@ describe('FailedLoginLimiter', () => { expect(lim.isBlocked(k, 1000)).toBe(false); }); + describe('tryReserve (atomic check-and-increment, brute-force race fix)', () => { + it('allows exactly `threshold` reserves then blocks within the window', () => { + const lim = new FailedLoginLimiter(3, 1000); + const k = 'ip:1.2.3.4'; + // threshold (3) successful reserves return true... + expect(lim.tryReserve(k, 0)).toBe(true); + expect(lim.tryReserve(k, 0)).toBe(true); + expect(lim.tryReserve(k, 0)).toBe(true); + // ...the next one is blocked (count is now at threshold). + expect(lim.tryReserve(k, 0)).toBe(false); + // A blocked reserve does NOT increment, so isBlocked stays true at threshold. + expect(lim.isBlocked(k, 0)).toBe(true); + }); + + it('reserves again after the window rolls over', () => { + const lim = new FailedLoginLimiter(2, 1000); + const k = 'ip:1.2.3.4'; + expect(lim.tryReserve(k, 0)).toBe(true); + expect(lim.tryReserve(k, 0)).toBe(true); + expect(lim.tryReserve(k, 0)).toBe(false); // blocked in this window + // Past windowMs (>= is inclusive): a fresh bucket, so reserve succeeds again. + expect(lim.tryReserve(k, 1000)).toBe(true); + }); + + it('reset releases the reservation (reserve succeeds again after reset)', () => { + const lim = new FailedLoginLimiter(1, 1000); + const k = 'ip:1.2.3.4'; + expect(lim.tryReserve(k, 0)).toBe(true); + expect(lim.tryReserve(k, 0)).toBe(false); // at threshold 1 -> blocked + lim.reset(k); + expect(lim.tryReserve(k, 0)).toBe(true); // reset cleared the bucket + }); + + it('release undoes one reservation without clearing accumulated failures', () => { + const lim = new FailedLoginLimiter(2, 1000); + const k = 'email:victim@example.com'; + expect(lim.tryReserve(k, 0)).toBe(true); // count 1 + expect(lim.tryReserve(k, 0)).toBe(true); // count 2 == threshold + expect(lim.isBlocked(k, 0)).toBe(true); + lim.release(k, 0); // undo exactly one -> count 1 + expect(lim.isBlocked(k, 0)).toBe(false); + expect(lim.tryReserve(k, 0)).toBe(true); // count 2 again + expect(lim.tryReserve(k, 0)).toBe(false); // blocked: prior failures survived + }); + + it('RACE: threshold+1 SYNCHRONOUS reserves (no await) yield only `threshold` trues', () => { + // Simulate N concurrent /mcp requests hitting the check-and-increment with + // zero interleaved awaits — the very scenario the old isBlocked()-then- + // recordFailure() flow lost to (all saw count=0, all ran bcrypt). Because + // tryReserve folds check+increment into one synchronous step, only the + // first `threshold` callers win; the (threshold+1)-th is rejected up front. + const threshold = 5; + const lim = new FailedLoginLimiter(threshold, 60_000); + const k = 'email:victim@example.com'; + const results: boolean[] = []; + for (let i = 0; i < threshold + 1; i++) { + results.push(lim.tryReserve(k, 0)); + } + expect(results.filter((r) => r === true)).toHaveLength(threshold); + expect(results.filter((r) => r === false)).toHaveLength(1); + // The rejected one is the LAST: the first `threshold` all reserved. + expect(results[threshold]).toBe(false); + }); + }); + describe('sweep (expired-bucket eviction, injectable clock)', () => { // sweep() drops buckets whose windowStart is older than windowMs so // never-revisited keys cannot accumulate forever. It takes an injectable @@ -406,6 +471,44 @@ describe('resolveMcpSessionConfig', () => { ).rejects.toThrow(/Too many failed MCP login attempts/); }); + it('concurrent Basic requests cannot bypass the limiter (atomic reserve before bcrypt)', async () => { + // The race the fix closes: fire threshold+ concurrent /mcp Basic logins for + // one email. Each login() (bcrypt-bearing) resolves only after all requests + // have entered the flow, so under the OLD check-then-act code every request + // would pass the read-only isBlocked() pre-check (count=0) and run bcrypt. + // With the atomic reserve, only `threshold` requests get past the synchronous + // tryReserve; the rest are throttled BEFORE login() is invoked. + const threshold = 5; + const limiter = new FailedLoginLimiter(threshold, 60_000); + let release!: () => void; + const gate = new Promise((r) => { + release = r; + }); + const login = jest.fn().mockImplementation(async () => { + await gate; // hold every in-flight login open until we release the gate + throw new UnauthorizedException('Email or password does not match'); + }); + const total = threshold + 4; + const calls = Array.from({ length: total }, () => + resolveMcpSessionConfig( + basicHeader('victim@example.com', 'wrong'), + makeDeps({ login, limiter, clientIp: '10.0.0.1' }), + ).then( + () => 'resolved' as const, + (e) => (/Too many failed/.test(e.message) ? 'throttled' : 'badcreds'), + ), + ); + release(); + const outcomes = await Promise.all(calls); + // Only `threshold` requests ever reached bcrypt/login(); the extras were + // rejected up front by the atomic reserve, never invoking login(). + expect(login).toHaveBeenCalledTimes(threshold); + expect(outcomes.filter((o) => o === 'badcreds')).toHaveLength(threshold); + expect(outcomes.filter((o) => o === 'throttled')).toHaveLength( + total - threshold, + ); + }); + it('Bearer -> verifies as ACCESS and returns a getToken config', async () => { const verifyAccessJwt = jest .fn() @@ -606,6 +709,62 @@ describe('resolveMcpSessionConfig', () => { expect(login).not.toHaveBeenCalled(); }); + it('SSO/MFA gate rejection does NOT burn the limiter budget (no token, no count)', async () => { + // Follow-up to #83: the brute-force keys are reserved at the TOP of the + // Basic flow (before any await) to close the concurrency race. But an + // enforceBasicGate rejection is a BUSINESS error (SSO enforced / MFA + // required), NOT a password-guess signal, so it must release the reservation + // — otherwise an attacker could exhaust an SSO/MFA victim's per-email + // backstop by firing gate-rejected requests with any password (no bcrypt + // even runs). Drive threshold+1 such requests and confirm none are blocked: + // every one reaches the gate (proving the email bucket never filled). + const threshold = 3; + const limiter = new FailedLoginLimiter(threshold, 60_000); + const login = jest.fn().mockResolvedValue('issued-user-jwt'); + const enforceBasicGate = jest + .fn() + .mockRejectedValue( + new UnauthorizedException('This workspace has enforced SSO login.'), + ); + for (let i = 0; i < threshold + 1; i++) { + await expect( + resolveMcpSessionConfig( + basicHeader('victim@example.com', `pw-${i}`), + makeDeps({ login, enforceBasicGate, limiter }), + ), + ).rejects.toThrow(/enforced SSO/); + } + // The gate fired on every attempt (the limiter never throttled before it), + // and login() never ran: the victim's budget was preserved. + expect(enforceBasicGate).toHaveBeenCalledTimes(threshold + 1); + expect(login).not.toHaveBeenCalled(); + // The global per-email backstop is still fully under budget afterwards. + expect(limiter.isBlocked('email:victim@example.com')).toBe(false); + }); + + it('missing-workspace config error does NOT burn the limiter budget', async () => { + // findWorkspace() returning undefined is a CONFIG error, not a brute-force + // signal, so (like the gate) it must release the up-front reservation. With + // threshold 1, a counted attempt would throttle the very next one; instead + // every attempt reaches findWorkspace() and surfaces the same config 401. + const limiter = new FailedLoginLimiter(1, 60_000); + const findWorkspace = jest.fn().mockResolvedValue(undefined); + const login = jest.fn().mockResolvedValue('issued-user-jwt'); + const deps = () => + makeDeps({ findWorkspace, login, limiter, clientIp: '10.0.0.42' }); + await expect( + resolveMcpSessionConfig(basicHeader('user@example.com', 'pw'), deps()), + ).rejects.toThrow(/No workspace is configured/); + // If the first attempt had counted, threshold 1 would now throttle. Instead + // the second attempt must reach findWorkspace() again (same config error). + await expect( + resolveMcpSessionConfig(basicHeader('user@example.com', 'pw'), deps()), + ).rejects.toThrow(/No workspace is configured/); + expect(findWorkspace).toHaveBeenCalledTimes(2); + expect(login).not.toHaveBeenCalled(); + expect(limiter.isBlocked('email:user@example.com')).toBe(false); + }); + it('Bearer path is NOT subjected to the Basic SSO/MFA gate', async () => { // The gate is only consulted on the Basic branch. A Bearer token (minted // post-gate by the normal login) must not be blocked by it.