fix(mcp): close the brute-force limiter check-then-act race (#83)

isBlocked was checked synchronously but recordFailure ran only AFTER the bcrypt awaits, so N concurrent /mcp Basic requests for one email all slipped past the threshold. Add FailedLoginLimiter.tryReserve (atomic synchronous check+increment) + release (undo), and reserve all 3 keys BEFORE any await so the (threshold+1)-th concurrent attempt is rejected before its bcrypt runs. The reservation IS the failure record (post-await recordFailure removed -> counted exactly once). Non- credential early throws (missing workspace, SSO/MFA gate) and business errors release the reservation so they don't burn a victim's budget; success clears. Tests prove login() runs exactly threshold times under concurrency and that gate/config rejects don't consume budget. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-21 04:14:38 +03:00
parent 31fcb764d7
commit a20f4c3876
2 changed files with 286 additions and 47 deletions
--- a/apps/server/src/integrations/mcp/mcp-auth.helpers.ts
+++ b/apps/server/src/integrations/mcp/mcp-auth.helpers.ts
@@ -84,6 +84,39 @@ export class FailedLoginLimiter {
    b.count += 1;
  }

+  /**
+   * Atomic check-and-reserve: if the key is already at/over the threshold this
+   * window, return false (blocked). Otherwise count this in-flight attempt
+   * (count += 1) and return true. Being synchronous, concurrent callers cannot
+   * interleave between the check and the increment, so the (threshold+1)-th
+   * concurrent attempt is rejected even before its bcrypt runs.
+   *
+   * This is the brute-force fix for the /mcp Basic path: the increment happens
+   * BEFORE the async credential check, not after it, so N concurrent requests for
+   * one email cannot all observe count=0 and all run bcrypt. A failed login then
+   * leaves the reservation in place (it IS the recorded failure); a SUCCESSFUL
+   * login clears it via reset(); a non-credential business error releases it via
+   * release() so it does not count as a guessed-password signal.
+   */
+  tryReserve(key: string, now: number = Date.now()): boolean {
+    const b = this.bucket(key, now);
+    if (b.count >= this.threshold) return false;
+    b.count += 1;
+    return true;
+  }
+
+  /**
+   * Undo a previous tryReserve for the key within the same window (count -= 1,
+   * floored at 0). Used to release an optimistic in-flight reservation when the
+   * attempt turned out NOT to be a password-guess signal (e.g. an "email not
+   * verified" business error), so it does not burn a victim's limiter budget.
+   * A no-op if the bucket rolled over to a fresh window in the meantime.
+   */
+  release(key: string, now: number = Date.now()): void {
+    const b = this.bucket(key, now);
+    if (b.count > 0) b.count -= 1;
+  }
+
  /** Clear the key after a successful login so it does not accumulate. */
  reset(key: string): void {
    this.buckets.delete(key);
@@ -530,51 +563,84 @@ export async function resolveMcpSessionConfig(
    // trusted proxy is configured; the per-email global key is the part that
    // does NOT depend on a trustworthy IP and is the real brute-force backstop.
    const emailKey = `email:${emailLc}`;
-    if (
-      deps.limiter.isBlocked(ipKey) ||
-      deps.limiter.isBlocked(ipEmailKey) ||
-      deps.limiter.isBlocked(emailKey)
-    ) {
+    // Atomic check-AND-reserve, synchronously and BEFORE any await. The old code
+    // did a read-only isBlocked() pre-check here and only recordFailure()'d the
+    // failure AFTER the awaited bcrypt login — so N concurrent requests for one
+    // email all saw count=0, all ran bcrypt, all failed, and only then all
+    // recorded, blowing far past the threshold. tryReserve() folds the check and
+    // the increment into one synchronous, non-interleavable step: it counts this
+    // in-flight attempt NOW, so the (threshold+1)-th concurrent attempt is
+    // rejected before its bcrypt ever runs. The reservation IS the recorded
+    // failure (no separate recordFailure on the failure path below); a successful
+    // login clears it via reset(), and a non-credential business error releases
+    // it via release(). Reserve ALL keys so each per-key budget is charged.
+    const ipOk = deps.limiter.tryReserve(ipKey);
+    const ipEmailOk = deps.limiter.tryReserve(ipEmailKey);
+    const emailOk = deps.limiter.tryReserve(emailKey);
+    if (!ipOk || !ipEmailOk || !emailOk) {
+      // At least one key is at/over threshold: blocked. Release the keys we DID
+      // manage to reserve in this same call so a rejected (already-throttled)
+      // request does not over-charge the keys that were still under budget — the
+      // same observable outcome as the old isBlocked() pre-check, which never
+      // incremented on a blocked request.
+      if (ipOk) deps.limiter.release(ipKey);
+      if (ipEmailOk) deps.limiter.release(ipEmailKey);
+      if (emailOk) deps.limiter.release(emailKey);
      throw new UnauthorizedException(
        'Too many failed MCP login attempts. Try again later.',
      );
    }

-    const workspace = await deps.findWorkspace();
-    if (!workspace) {
-      throw new UnauthorizedException('No workspace is configured.');
-    }
-
-    // SSO/MFA pre-token gate (BLOCKER fix): replicate the AuthController.login
-    // gates BEFORE any token is issued on the Basic path. If the workspace
-    // enforces SSO, or the EE MFA module is bundled and this user/workspace
-    // requires MFA, this throws and we never mint a token. The Bearer path is
-    // intentionally NOT gated here (its JWT was already minted post-gate). This
-    // runs on BOTH init and subsequent Basic requests, but it must run before
-    // login()/verifyCredentials so an SSO/MFA user cannot authenticate at all.
-    // We do NOT count a gate rejection toward the brute-force limiter: it is not
-    // a password-guess signal.
-    if (deps.enforceBasicGate) {
-      await deps.enforceBasicGate(workspace, {
-        email: basic.email,
-        password: basic.password,
-      });
-    }
-
-    // Fix 1 (init vs subsequent):
-    //   - SESSION INIT (no mcp-session-id): full login() mints the user JWT
-    //     (the one allowed session creation + audit event for this MCP
-    //     session). The DocmostClient caches that token, so later tool calls
-    //     never re-login.
-    //   - SUBSEQUENT request (has mcp-session-id): we only need to re-validate
-    //     the caller's credentials for anti-fixation. verifyCredentials() does
-    //     the SAME lookup/password/email-verified/disabled checks as login()
-    //     but mints NO session, writes NO audit row and updates NO lastLoginAt,
-    //     so a correct repeat does not spawn a DB session per request while a
-    //     wrong password still 401s. The getToken here is never used to mint a
-    //     new session: on a subsequent request the existing session already
-    //     holds its token; this config is only consulted at init.
+    // Everything from here through the credential evaluation runs UNDER one
+    // try/catch so a SINGLE rule governs the reservation we took above:
+    // "release the reserved keys unless the error is a genuine credential
+    // failure." That covers all three early-throw paths uniformly —
+    //   (a) findWorkspace() returning null (a CONFIG error),
+    //   (b) the SSO/MFA enforceBasicGate throwing (a BUSINESS error),
+    //   (c) login()/verifyCredentials() throwing a non-credential business error
+    //       (e.g. "email not verified") —
+    // none of which are password-guess signals, so none may burn a victim's
+    // limiter budget. Only a genuine credential failure (isCredentialsFailure)
+    // leaves the reservation in place, because the reservation IS its recorded
+    // failure. Without this, an attacker could exhaust a victim's per-email
+    // backstop with SSO/MFA-gated or misconfigured-workspace requests that never
+    // even run bcrypt. The reservation stays at the TOP (before any await) so the
+    // concurrency race the #83 fix closed is NOT re-introduced.
    try {
+      const workspace = await deps.findWorkspace();
+      if (!workspace) {
+        throw new UnauthorizedException('No workspace is configured.');
+      }
+
+      // SSO/MFA pre-token gate (BLOCKER fix): replicate the AuthController.login
+      // gates BEFORE any token is issued on the Basic path. If the workspace
+      // enforces SSO, or the EE MFA module is bundled and this user/workspace
+      // requires MFA, this throws and we never mint a token. The Bearer path is
+      // intentionally NOT gated here (its JWT was already minted post-gate). This
+      // runs on BOTH init and subsequent Basic requests, but it must run before
+      // login()/verifyCredentials so an SSO/MFA user cannot authenticate at all.
+      // We do NOT count a gate rejection toward the brute-force limiter: it is
+      // not a password-guess signal (the catch below releases the reservation).
+      if (deps.enforceBasicGate) {
+        await deps.enforceBasicGate(workspace, {
+          email: basic.email,
+          password: basic.password,
+        });
+      }
+
+      // Fix 1 (init vs subsequent):
+      //   - SESSION INIT (no mcp-session-id): full login() mints the user JWT
+      //     (the one allowed session creation + audit event for this MCP
+      //     session). The DocmostClient caches that token, so later tool calls
+      //     never re-login.
+      //   - SUBSEQUENT request (has mcp-session-id): we only need to re-validate
+      //     the caller's credentials for anti-fixation. verifyCredentials() does
+      //     the SAME lookup/password/email-verified/disabled checks as login()
+      //     but mints NO session, writes NO audit row and updates NO lastLoginAt,
+      //     so a correct repeat does not spawn a DB session per request while a
+      //     wrong password still 401s. The getToken here is never used to mint a
+      //     new session: on a subsequent request the existing session already
+      //     holds its token; this config is only consulted at init.
      if (deps.isSessionInit) {
        const authToken = await deps.login(
          { email: basic.email, password: basic.password },
@@ -593,14 +659,19 @@ export async function resolveMcpSessionConfig(
        workspace.id,
      );
    } catch (err) {
-      // Only count an actual CREDENTIALS failure (wrong email/password) toward
-      // the brute-force limiter. Business errors like "email not verified" are
-      // a 401/400 surface but are NOT a guessed-password signal, so they must
-      // not let an attacker burn a victim's limiter budget or mask brute-force.
-      if (isCredentialsFailure(err)) {
-        deps.limiter.recordFailure(ipKey);
-        deps.limiter.recordFailure(ipEmailKey);
-        deps.limiter.recordFailure(emailKey);
+      // The in-flight reservation taken above already counted this attempt, so
+      // an actual CREDENTIALS failure (wrong email/password) needs NO separate
+      // recordFailure — the reservation IS the recorded failure (avoiding the
+      // old double-count). But ANY other throw between the reservation and here
+      // — a missing-workspace config error, an SSO/MFA gate rejection, or a
+      // business error like "email not verified" — is a 401/400 surface, NOT a
+      // guessed-password signal, so it must not burn a victim's limiter budget:
+      // release the optimistic reservation (only the keys we actually reserved,
+      // which on this non-blocked path is all three) in that case.
+      if (!isCredentialsFailure(err)) {
+        deps.limiter.release(ipKey);
+        deps.limiter.release(ipEmailKey);
+        deps.limiter.release(emailKey);
      }
      const message =
        err instanceof Error && err.message
@@ -616,8 +687,17 @@ export async function resolveMcpSessionConfig(
    // email. The global email key is reset ONLY on a session-INIT login()
    // success (above), which is a single deliberate authentication, not a
    // high-frequency re-validation.
+    //
+    // Under the reserve model we DID optimistically increment emailKey up front
+    // (tryReserve), so a plain "leave it intact" would let every periodic tool
+    // call of the victim's own live session permanently grow their email bucket
+    // and throttle THEMSELVES. release() undoes exactly the one increment THIS
+    // call took (count -= 1), restoring the pre-request budget — it does NOT
+    // clear a parallel attacker's accumulated failures (that's reset()), so the
+    // brute-force backstop survives while the victim's success is budget-neutral.
    deps.limiter.reset(ipKey);
    deps.limiter.reset(ipEmailKey);
+    deps.limiter.release(emailKey);
    return {
      config: { apiUrl, getToken: async () => '' },
      identity: `basic:${emailLc}`,
--- a/apps/server/src/integrations/mcp/mcp.service.spec.ts
+++ b/apps/server/src/integrations/mcp/mcp.service.spec.ts
@@ -209,6 +209,71 @@ describe('FailedLoginLimiter', () => {
    expect(lim.isBlocked(k, 1000)).toBe(false);
  });

+  describe('tryReserve (atomic check-and-increment, brute-force race fix)', () => {
+    it('allows exactly `threshold` reserves then blocks within the window', () => {
+      const lim = new FailedLoginLimiter(3, 1000);
+      const k = 'ip:1.2.3.4';
+      // threshold (3) successful reserves return true...
+      expect(lim.tryReserve(k, 0)).toBe(true);
+      expect(lim.tryReserve(k, 0)).toBe(true);
+      expect(lim.tryReserve(k, 0)).toBe(true);
+      // ...the next one is blocked (count is now at threshold).
+      expect(lim.tryReserve(k, 0)).toBe(false);
+      // A blocked reserve does NOT increment, so isBlocked stays true at threshold.
+      expect(lim.isBlocked(k, 0)).toBe(true);
+    });
+
+    it('reserves again after the window rolls over', () => {
+      const lim = new FailedLoginLimiter(2, 1000);
+      const k = 'ip:1.2.3.4';
+      expect(lim.tryReserve(k, 0)).toBe(true);
+      expect(lim.tryReserve(k, 0)).toBe(true);
+      expect(lim.tryReserve(k, 0)).toBe(false); // blocked in this window
+      // Past windowMs (>= is inclusive): a fresh bucket, so reserve succeeds again.
+      expect(lim.tryReserve(k, 1000)).toBe(true);
+    });
+
+    it('reset releases the reservation (reserve succeeds again after reset)', () => {
+      const lim = new FailedLoginLimiter(1, 1000);
+      const k = 'ip:1.2.3.4';
+      expect(lim.tryReserve(k, 0)).toBe(true);
+      expect(lim.tryReserve(k, 0)).toBe(false); // at threshold 1 -> blocked
+      lim.reset(k);
+      expect(lim.tryReserve(k, 0)).toBe(true); // reset cleared the bucket
+    });
+
+    it('release undoes one reservation without clearing accumulated failures', () => {
+      const lim = new FailedLoginLimiter(2, 1000);
+      const k = 'email:victim@example.com';
+      expect(lim.tryReserve(k, 0)).toBe(true); // count 1
+      expect(lim.tryReserve(k, 0)).toBe(true); // count 2 == threshold
+      expect(lim.isBlocked(k, 0)).toBe(true);
+      lim.release(k, 0); // undo exactly one -> count 1
+      expect(lim.isBlocked(k, 0)).toBe(false);
+      expect(lim.tryReserve(k, 0)).toBe(true); // count 2 again
+      expect(lim.tryReserve(k, 0)).toBe(false); // blocked: prior failures survived
+    });
+
+    it('RACE: threshold+1 SYNCHRONOUS reserves (no await) yield only `threshold` trues', () => {
+      // Simulate N concurrent /mcp requests hitting the check-and-increment with
+      // zero interleaved awaits — the very scenario the old isBlocked()-then-
+      // recordFailure() flow lost to (all saw count=0, all ran bcrypt). Because
+      // tryReserve folds check+increment into one synchronous step, only the
+      // first `threshold` callers win; the (threshold+1)-th is rejected up front.
+      const threshold = 5;
+      const lim = new FailedLoginLimiter(threshold, 60_000);
+      const k = 'email:victim@example.com';
+      const results: boolean[] = [];
+      for (let i = 0; i < threshold + 1; i++) {
+        results.push(lim.tryReserve(k, 0));
+      }
+      expect(results.filter((r) => r === true)).toHaveLength(threshold);
+      expect(results.filter((r) => r === false)).toHaveLength(1);
+      // The rejected one is the LAST: the first `threshold` all reserved.
+      expect(results[threshold]).toBe(false);
+    });
+  });
+
  describe('sweep (expired-bucket eviction, injectable clock)', () => {
    // sweep() drops buckets whose windowStart is older than windowMs so
    // never-revisited keys cannot accumulate forever. It takes an injectable
@@ -406,6 +471,44 @@ describe('resolveMcpSessionConfig', () => {
    ).rejects.toThrow(/Too many failed MCP login attempts/);
  });

+  it('concurrent Basic requests cannot bypass the limiter (atomic reserve before bcrypt)', async () => {
+    // The race the fix closes: fire threshold+ concurrent /mcp Basic logins for
+    // one email. Each login() (bcrypt-bearing) resolves only after all requests
+    // have entered the flow, so under the OLD check-then-act code every request
+    // would pass the read-only isBlocked() pre-check (count=0) and run bcrypt.
+    // With the atomic reserve, only `threshold` requests get past the synchronous
+    // tryReserve; the rest are throttled BEFORE login() is invoked.
+    const threshold = 5;
+    const limiter = new FailedLoginLimiter(threshold, 60_000);
+    let release!: () => void;
+    const gate = new Promise<void>((r) => {
+      release = r;
+    });
+    const login = jest.fn().mockImplementation(async () => {
+      await gate; // hold every in-flight login open until we release the gate
+      throw new UnauthorizedException('Email or password does not match');
+    });
+    const total = threshold + 4;
+    const calls = Array.from({ length: total }, () =>
+      resolveMcpSessionConfig(
+        basicHeader('victim@example.com', 'wrong'),
+        makeDeps({ login, limiter, clientIp: '10.0.0.1' }),
+      ).then(
+        () => 'resolved' as const,
+        (e) => (/Too many failed/.test(e.message) ? 'throttled' : 'badcreds'),
+      ),
+    );
+    release();
+    const outcomes = await Promise.all(calls);
+    // Only `threshold` requests ever reached bcrypt/login(); the extras were
+    // rejected up front by the atomic reserve, never invoking login().
+    expect(login).toHaveBeenCalledTimes(threshold);
+    expect(outcomes.filter((o) => o === 'badcreds')).toHaveLength(threshold);
+    expect(outcomes.filter((o) => o === 'throttled')).toHaveLength(
+      total - threshold,
+    );
+  });
+
  it('Bearer -> verifies as ACCESS and returns a getToken config', async () => {
    const verifyAccessJwt = jest
      .fn()
@@ -606,6 +709,62 @@ describe('resolveMcpSessionConfig', () => {
    expect(login).not.toHaveBeenCalled();
  });

+  it('SSO/MFA gate rejection does NOT burn the limiter budget (no token, no count)', async () => {
+    // Follow-up to #83: the brute-force keys are reserved at the TOP of the
+    // Basic flow (before any await) to close the concurrency race. But an
+    // enforceBasicGate rejection is a BUSINESS error (SSO enforced / MFA
+    // required), NOT a password-guess signal, so it must release the reservation
+    // — otherwise an attacker could exhaust an SSO/MFA victim's per-email
+    // backstop by firing gate-rejected requests with any password (no bcrypt
+    // even runs). Drive threshold+1 such requests and confirm none are blocked:
+    // every one reaches the gate (proving the email bucket never filled).
+    const threshold = 3;
+    const limiter = new FailedLoginLimiter(threshold, 60_000);
+    const login = jest.fn().mockResolvedValue('issued-user-jwt');
+    const enforceBasicGate = jest
+      .fn()
+      .mockRejectedValue(
+        new UnauthorizedException('This workspace has enforced SSO login.'),
+      );
+    for (let i = 0; i < threshold + 1; i++) {
+      await expect(
+        resolveMcpSessionConfig(
+          basicHeader('victim@example.com', `pw-${i}`),
+          makeDeps({ login, enforceBasicGate, limiter }),
+        ),
+      ).rejects.toThrow(/enforced SSO/);
+    }
+    // The gate fired on every attempt (the limiter never throttled before it),
+    // and login() never ran: the victim's budget was preserved.
+    expect(enforceBasicGate).toHaveBeenCalledTimes(threshold + 1);
+    expect(login).not.toHaveBeenCalled();
+    // The global per-email backstop is still fully under budget afterwards.
+    expect(limiter.isBlocked('email:victim@example.com')).toBe(false);
+  });
+
+  it('missing-workspace config error does NOT burn the limiter budget', async () => {
+    // findWorkspace() returning undefined is a CONFIG error, not a brute-force
+    // signal, so (like the gate) it must release the up-front reservation. With
+    // threshold 1, a counted attempt would throttle the very next one; instead
+    // every attempt reaches findWorkspace() and surfaces the same config 401.
+    const limiter = new FailedLoginLimiter(1, 60_000);
+    const findWorkspace = jest.fn().mockResolvedValue(undefined);
+    const login = jest.fn().mockResolvedValue('issued-user-jwt');
+    const deps = () =>
+      makeDeps({ findWorkspace, login, limiter, clientIp: '10.0.0.42' });
+    await expect(
+      resolveMcpSessionConfig(basicHeader('user@example.com', 'pw'), deps()),
+    ).rejects.toThrow(/No workspace is configured/);
+    // If the first attempt had counted, threshold 1 would now throttle. Instead
+    // the second attempt must reach findWorkspace() again (same config error).
+    await expect(
+      resolveMcpSessionConfig(basicHeader('user@example.com', 'pw'), deps()),
+    ).rejects.toThrow(/No workspace is configured/);
+    expect(findWorkspace).toHaveBeenCalledTimes(2);
+    expect(login).not.toHaveBeenCalled();
+    expect(limiter.isBlocked('email:user@example.com')).toBe(false);
+  });
+
  it('Bearer path is NOT subjected to the Basic SSO/MFA gate', async () => {
    // The gate is only consulted on the Basic branch. A Bearer token (minted
    // post-gate by the normal login) must not be blocked by it.