From 1d610b3a627889ff2356b93d231b84bac8872cb6 Mon Sep 17 00:00:00 2001
From: claude code agent 227 <claude_code@vvzvlad.xyz>
Date: Fri, 26 Jun 2026 06:23:48 +0300
Subject: [PATCH] fix(ai-chat): add per-workspace rolling-day token budget for
 anonymous share assistant (#159)

The anonymous public-share assistant only capped the COUNT of requests
(100/hour/workspace), not their cost. One accepted turn runs the agent loop
up to stepCountIs(5), re-sending the whole client-held transcript as input on
every step, while maxOutputTokens caps only the output; the request window is
hourly with no daily ceiling, so a steady stream at the cap sustains ~24x its
count per day. Counting requests therefore does not bound the owner's LLM bill
(red-team finding #5).

Add a second cost contour: a cluster-wide, sliding-window per-workspace TOKEN
budget over a rolling day. It is checked read-only BEFORE a turn streams (429,
no request slot consumed, nothing spent) and the turn's real usage
(totalUsage: input re-sent per step + output, summed across all steps) is
recorded once it finishes via streamText onFinish. Fails closed on the check
(deny when Redis can't prove we're under budget); best-effort on the record.
Env-overridable via SHARE_AI_WORKSPACE_TOKEN_BUDGET_PER_DAY (default 1M/day).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../public-share-chat.controller.spec.ts      |  36 ++++
 .../ai-chat/public-share-chat.controller.ts   |  19 +-
 .../core/ai-chat/public-share-chat.service.ts |  44 ++++
 .../core/ai-chat/public-share-chat.spec.ts    | 182 ++++++++++++++++
 .../ai-chat/public-share-workspace-limiter.ts | 199 ++++++++++++++++++
 5 files changed, 477 insertions(+), 3 deletions(-)

diff --git a/apps/server/src/core/ai-chat/public-share-chat.controller.spec.ts b/apps/server/src/core/ai-chat/public-share-chat.controller.spec.ts
index 08b20b43..66289d05 100644
--- a/apps/server/src/core/ai-chat/public-share-chat.controller.spec.ts
+++ b/apps/server/src/core/ai-chat/public-share-chat.controller.spec.ts
@@ -34,6 +34,7 @@ describe('resolveShareAssistantRequest (extracted controller funnel)', () => {
     resolveShareRole?: jest.Mock;
     getShareChatModel?: jest.Mock;
     tryConsumeWorkspaceQuota?: jest.Mock;
+    withinShareTokenBudget?: jest.Mock;
   } = {}) {
     const aiSettings = {
       isPublicShareAssistantEnabled: jest
@@ -65,6 +66,8 @@ describe('resolveShareAssistantRequest (extracted controller funnel)', () => {
         over.getShareChatModel ?? jest.fn().mockResolvedValue('MODEL'),
       tryConsumeWorkspaceQuota:
         over.tryConsumeWorkspaceQuota ?? jest.fn().mockResolvedValue(true),
+      withinShareTokenBudget:
+        over.withinShareTokenBudget ?? jest.fn().mockResolvedValue(true),
     };
     const deps: ShareAssistantDeps = {
       aiSettings: aiSettings as never,
@@ -191,6 +194,39 @@ describe('resolveShareAssistantRequest (extracted controller funnel)', () => {
     expect(publicShareChat.tryConsumeWorkspaceQuota).toHaveBeenCalledWith('ws-1');
   });
 
+  it('withinShareTokenBudget false => 429 thrown BEFORE any stream (cost cap, #159 #5)', async () => {
+    const { deps, publicShareChat } = makeDeps({
+      withinShareTokenBudget: jest.fn().mockResolvedValue(false),
+    });
+    expect(await statusOf(deps, body())).toBe(429);
+    expect(publicShareChat.withinShareTokenBudget).toHaveBeenCalledWith('ws-1');
+    // The token budget is the COST backstop: an over-budget workspace must be
+    // rejected WITHOUT consuming a request slot, so the request cap never runs.
+    expect(publicShareChat.tryConsumeWorkspaceQuota).not.toHaveBeenCalled();
+  });
+
+  it('the token budget is checked BEFORE the request cap (over-budget wins, no slot spent)', async () => {
+    // Over budget AND the request cap would also reject: the read-only budget
+    // gate must win so the (mutating) request-slot consume is never reached.
+    const { deps, publicShareChat } = makeDeps({
+      withinShareTokenBudget: jest.fn().mockResolvedValue(false),
+      tryConsumeWorkspaceQuota: jest.fn().mockResolvedValue(false),
+    });
+    expect(await statusOf(deps, body())).toBe(429);
+    expect(publicShareChat.tryConsumeWorkspaceQuota).not.toHaveBeenCalled();
+  });
+
+  it('the token-budget gate is checked BEFORE the payload caps (429 wins over 413)', async () => {
+    const { deps } = makeDeps({
+      withinShareTokenBudget: jest.fn().mockResolvedValue(false),
+    });
+    const huge = {
+      role: 'user',
+      parts: [{ type: 'text', text: 'x'.repeat(MAX_SHARE_MESSAGE_CHARS + 1) }],
+    };
+    expect(await statusOf(deps, body({ messages: [huge] }))).toBe(429);
+  });
+
   it('messages over MAX_SHARE_MESSAGES => 413', async () => {
     const { deps } = makeDeps();
     const tooMany = Array.from({ length: MAX_SHARE_MESSAGES + 1 }, () => ({
diff --git a/apps/server/src/core/ai-chat/public-share-chat.controller.ts b/apps/server/src/core/ai-chat/public-share-chat.controller.ts
index 74f8b538..fdab8582 100644
--- a/apps/server/src/core/ai-chat/public-share-chat.controller.ts
+++ b/apps/server/src/core/ai-chat/public-share-chat.controller.ts
@@ -151,6 +151,7 @@ export interface ShareAssistantDeps {
     | 'resolveShareRole'
     | 'getShareChatModel'
     | 'tryConsumeWorkspaceQuota'
+    | 'withinShareTokenBudget'
   >;
 }
 
@@ -267,9 +268,21 @@ export async function resolveShareAssistantRequest(
     throw new NotFoundException('Not found');
   }
 
-  // 5. Per-WORKSPACE anti-abuse cap (IP-independent; defense in depth). Checked
-  //    BEFORE res.hijack(), so an over-cap workspace gets a clean 429 and spends
-  //    nothing.
+  // 5a. Per-WORKSPACE rolling-day TOKEN budget (the COST backstop). Read-only and
+  //     checked FIRST so a workspace that has already burned its day's token
+  //     budget gets a clean 429 WITHOUT consuming a request slot, and spends
+  //     nothing. Counting requests alone does not bound the owner's provider
+  //     bill (issue #159, finding #5).
+  if (!(await deps.publicShareChat.withinShareTokenBudget(workspaceId))) {
+    throw new HttpException(
+      'This documentation assistant has reached its usage budget. Please try again later.',
+      HttpStatus.TOO_MANY_REQUESTS,
+    );
+  }
+
+  // 5b. Per-WORKSPACE anti-abuse request cap (IP-independent; defense in depth).
+  //     Checked BEFORE res.hijack(), so an over-cap workspace gets a clean 429
+  //     and spends nothing.
   if (!(await deps.publicShareChat.tryConsumeWorkspaceQuota(workspaceId))) {
     throw new HttpException(
       'This documentation assistant is temporarily busy. Please try again later.',
diff --git a/apps/server/src/core/ai-chat/public-share-chat.service.ts b/apps/server/src/core/ai-chat/public-share-chat.service.ts
index 8011814b..e0ac5282 100644
--- a/apps/server/src/core/ai-chat/public-share-chat.service.ts
+++ b/apps/server/src/core/ai-chat/public-share-chat.service.ts
@@ -17,7 +17,9 @@ import { buildShareSystemPrompt } from './public-share-chat.prompt';
 import { roleModelOverride } from './roles/role-model-config';
 import {
   PublicShareWorkspaceLimiter,
+  PublicShareWorkspaceTokenBudget,
   createPublicShareWorkspaceLimiter,
+  createPublicShareWorkspaceTokenBudget,
 } from './public-share-workspace-limiter';
 import { describeProviderError } from '../../integrations/ai/ai-error.util';
 import {
@@ -125,6 +127,16 @@ export class PublicShareChatService {
    */
   private readonly workspaceLimiter: PublicShareWorkspaceLimiter;
 
+  /**
+   * COST contour two: a per-workspace TOKEN budget over a rolling day. The
+   * request-count limiter above bounds how many anonymous calls run; this bounds
+   * how many provider TOKENS they spend (input re-sent per step + output),
+   * which is what the owner is actually billed for (issue #159, finding #5).
+   * Checked read-only before a turn streams; the real usage is recorded once the
+   * turn finishes (`onFinish`).
+   */
+  private readonly tokenBudget: PublicShareWorkspaceTokenBudget;
+
   constructor(
     private readonly ai: AiService,
     private readonly aiSettings: AiSettingsService,
@@ -133,6 +145,7 @@ export class PublicShareChatService {
     private readonly aiAgentRoleRepo: AiAgentRoleRepo,
   ) {
     this.workspaceLimiter = createPublicShareWorkspaceLimiter(redisService);
+    this.tokenBudget = createPublicShareWorkspaceTokenBudget(redisService);
   }
 
   /**
@@ -144,6 +157,25 @@ export class PublicShareChatService {
     return this.workspaceLimiter.tryConsume(workspaceId);
   }
 
+  /**
+   * Read-only pre-stream COST gate: true while the workspace is under its
+   * rolling-day token budget, false once the trailing-day token spend has
+   * reached it (the controller must then 429 BEFORE starting the stream). This
+   * bounds the owner's actual provider bill, which counting requests alone does
+   * not (issue #159, finding #5).
+   */
+  async withinShareTokenBudget(workspaceId: string): Promise<boolean> {
+    return this.tokenBudget.withinBudget(workspaceId);
+  }
+
+  /**
+   * Record a finished turn's real token spend against the rolling-day budget.
+   * Best-effort (the turn already ran): failures are swallowed by the budget.
+   */
+  async recordShareTokens(workspaceId: string, tokens: number): Promise<void> {
+    return this.tokenBudget.record(workspaceId, tokens);
+  }
+
   /**
    * Resolve the admin-selected agent role for the anonymous public-share
    * assistant, scoped to the workspace and soft-delete aware. Returns null when
@@ -231,6 +263,18 @@ export class PublicShareChatService {
         // bill even if the per-IP throttle is evaded; worst case = steps × this.
         maxOutputTokens: resolveShareAiMaxOutputTokens(),
         abortSignal: signal,
+        onFinish: ({ totalUsage }) => {
+          // Account the turn's REAL token spend (input re-sent per step + output,
+          // summed across all steps) against the per-workspace rolling-day budget
+          // so a future turn over budget is rejected up front (issue #159 #5).
+          // totalUsage fields are `number | undefined`; fall back to the sum of
+          // input+output when the provider omits totalTokens. Fire-and-forget:
+          // the turn already streamed, so a record failure must not break it.
+          const u = totalUsage ?? ({} as typeof totalUsage);
+          const tokens =
+            u?.totalTokens ?? (u?.inputTokens ?? 0) + (u?.outputTokens ?? 0);
+          void this.recordShareTokens(workspaceId, tokens);
+        },
         onError: ({ error }) => {
           // Reuse the shared formatter so provider error formatting stays
           // unified (statusCode + body) with the authenticated path.
diff --git a/apps/server/src/core/ai-chat/public-share-chat.spec.ts b/apps/server/src/core/ai-chat/public-share-chat.spec.ts
index 3b80e9be..3232e631 100644
--- a/apps/server/src/core/ai-chat/public-share-chat.spec.ts
+++ b/apps/server/src/core/ai-chat/public-share-chat.spec.ts
@@ -11,8 +11,11 @@ import {
 import { PublicShareChatToolsService } from './tools/public-share-chat-tools.service';
 import {
   PublicShareWorkspaceLimiter,
+  PublicShareWorkspaceTokenBudget,
   resolveShareAiWorkspaceMax,
+  resolveShareAiWorkspaceTokenBudget,
   SHARE_AI_WORKSPACE_MAX_PER_WINDOW,
+  SHARE_AI_WORKSPACE_TOKEN_BUDGET_DEFAULT,
 } from './public-share-workspace-limiter';
 
 /**
@@ -546,6 +549,185 @@ describe('PublicShareWorkspaceLimiter (cluster-wide sliding-window per-workspace
   });
 });
 
+/**
+ * In-memory fake of the ioredis slice the TOKEN budget uses. Unlike the request
+ * limiter (one Lua), the budget runs TWO scripts over the same sorted set:
+ *  - the read-only CHECK (sums the token counts encoded as each member's leading
+ *    integer, admits while the sum is under budget, never mutates), and
+ *  - the RECORD (ZADDs a finished turn's `<tokens>:<unique>` member).
+ * The fake faithfully reproduces both (branching on the script body) so the spec
+ * exercises the REAL budget math, not a re-implementation.
+ */
+class FakeTokenRedis {
+  private sets = new Map<string, Array<{ score: number; member: string }>>();
+
+  async eval(
+    script: string,
+    _numKeys: number,
+    key: string,
+    nowStr: string,
+    windowMsStr: string,
+    arg3: string,
+  ): Promise<number> {
+    const now = Number(nowStr);
+    const windowMs = Number(windowMsStr);
+    const cutoff = now - windowMs;
+    const arr = (this.sets.get(key) ?? []).filter((e) => e.score > cutoff);
+    if (script.includes('ZADD')) {
+      // RECORD: arg3 is the `<tokens>:<unique>` member; append at score=now.
+      arr.push({ score: now, member: arg3 });
+      this.sets.set(key, arr);
+      return 1;
+    }
+    // CHECK: arg3 is the budget; sum the leading integer of each survivor.
+    const budget = Number(arg3);
+    this.sets.set(key, arr);
+    const total = arr.reduce((sum, e) => {
+      const m = /^(\d+)/.exec(e.member);
+      return sum + (m ? Number(m[1]) : 0);
+    }, 0);
+    return total >= budget ? 0 : 1;
+  }
+}
+
+function makeTokenBudget(budget: number, windowMs: number, clock: () => number) {
+  const redis = new FakeTokenRedis() as unknown as import('ioredis').Redis;
+  return new PublicShareWorkspaceTokenBudget(redis, budget, windowMs, clock);
+}
+
+describe('resolveShareAiWorkspaceTokenBudget (env-overridable per-day token budget)', () => {
+  const KEY = 'SHARE_AI_WORKSPACE_TOKEN_BUDGET_PER_DAY';
+  const saved = process.env[KEY];
+  afterEach(() => {
+    if (saved === undefined) delete process.env[KEY];
+    else process.env[KEY] = saved;
+  });
+
+  it('falls back to the default when unset', () => {
+    delete process.env[KEY];
+    expect(resolveShareAiWorkspaceTokenBudget()).toBe(
+      SHARE_AI_WORKSPACE_TOKEN_BUDGET_DEFAULT,
+    );
+  });
+
+  it('honors a positive override', () => {
+    process.env[KEY] = '250000';
+    expect(resolveShareAiWorkspaceTokenBudget()).toBe(250000);
+  });
+
+  it('ignores a non-positive / unparseable value (uses the default)', () => {
+    for (const bad of ['0', '-5', 'nope', '']) {
+      process.env[KEY] = bad;
+      expect(resolveShareAiWorkspaceTokenBudget()).toBe(
+        SHARE_AI_WORKSPACE_TOKEN_BUDGET_DEFAULT,
+      );
+    }
+  });
+});
+
+describe('PublicShareWorkspaceTokenBudget (cluster-wide rolling-day token cap)', () => {
+  it('admits while under budget and rejects once the recorded spend reaches it', async () => {
+    const budget = makeTokenBudget(1000, 60_000, () => 1_000);
+    expect(await budget.withinBudget('ws-1')).toBe(true); // nothing spent yet
+    await budget.record('ws-1', 600);
+    expect(await budget.withinBudget('ws-1')).toBe(true); // 600 < 1000
+    await budget.record('ws-1', 400);
+    // 1000 >= 1000: the budget is exhausted, so the next turn is rejected up front.
+    expect(await budget.withinBudget('ws-1')).toBe(false);
+  });
+
+  it('counts TOKENS, not requests: one fat turn can exhaust the budget alone', async () => {
+    const budget = makeTokenBudget(1000, 60_000, () => 1_000);
+    // A single accepted turn re-sends the whole transcript across 5 steps; here
+    // it lands as 1200 tokens — already over the day budget on its own.
+    await budget.record('ws-1', 1200);
+    expect(await budget.withinBudget('ws-1')).toBe(false);
+  });
+
+  it('ages out spend older than the window so the budget recovers', async () => {
+    let now = 0;
+    const budget = makeTokenBudget(1000, 60_000, () => now);
+    await budget.record('ws-1', 1000); // at budget
+    now += 59_999; // still inside the day window
+    expect(await budget.withinBudget('ws-1')).toBe(false);
+    now += 2; // the spend is now strictly older than windowMs
+    expect(await budget.withinBudget('ws-1')).toBe(true);
+  });
+
+  it('ignores non-positive / non-finite usage (never records phantom spend)', async () => {
+    const budget = makeTokenBudget(1000, 60_000, () => 1_000);
+    await budget.record('ws-1', 0);
+    await budget.record('ws-1', -50);
+    await budget.record('ws-1', Number.NaN);
+    await budget.record('ws-1', Infinity);
+    expect(await budget.withinBudget('ws-1')).toBe(true); // nothing accumulated
+  });
+
+  it('keeps separate budgets per workspace', async () => {
+    const budget = makeTokenBudget(500, 60_000, () => 1_000);
+    await budget.record('ws-a', 500); // ws-a exhausted
+    expect(await budget.withinBudget('ws-a')).toBe(false);
+    expect(await budget.withinBudget('ws-b')).toBe(true); // ws-b untouched
+  });
+
+  it('FAILS CLOSED on the read-only check when Redis rejects', async () => {
+    const failingRedis = {
+      eval: () => Promise.reject(new Error('redis down')),
+    } as unknown as import('ioredis').Redis;
+    const budget = new PublicShareWorkspaceTokenBudget(
+      failingRedis,
+      1000,
+      60_000,
+      () => 1_000,
+    );
+    const errSpy = jest
+      .spyOn(Logger.prototype, 'error')
+      .mockImplementation(() => undefined);
+    expect(await budget.withinBudget('ws-1')).toBe(false);
+    expect(errSpy).toHaveBeenCalled();
+    errSpy.mockRestore();
+  });
+
+  it('SWALLOWS a record failure (best-effort post-accounting, never throws)', async () => {
+    // The turn already streamed; a record failure must not surface to the caller.
+    const failingRedis = {
+      eval: () => Promise.reject(new Error('redis down')),
+    } as unknown as import('ioredis').Redis;
+    const budget = new PublicShareWorkspaceTokenBudget(
+      failingRedis,
+      1000,
+      60_000,
+      () => 1_000,
+    );
+    const errSpy = jest
+      .spyOn(Logger.prototype, 'error')
+      .mockImplementation(() => undefined);
+    await expect(budget.record('ws-1', 100)).resolves.toBeUndefined();
+    expect(errSpy).toHaveBeenCalled();
+    errSpy.mockRestore();
+  });
+});
+
+describe('PublicShareChatService.withinShareTokenBudget / recordShareTokens', () => {
+  it('delegates the cost gate + accounting to the redis-backed token budget', async () => {
+    const redis = new FakeTokenRedis();
+    const redisService = { getOrThrow: () => redis } as never;
+    const service = new PublicShareChatService(
+      {} as never,
+      {} as never,
+      {} as never,
+      redisService,
+      {} as never,
+    );
+    // Default budget is large, so a fresh workspace is under budget; recording a
+    // modest spend keeps it under budget (asserts the wiring the controller +
+    // onFinish rely on).
+    expect(await service.withinShareTokenBudget('ws-1')).toBe(true);
+    await service.recordShareTokens('ws-1', 1234);
+    expect(await service.withinShareTokenBudget('ws-1')).toBe(true);
+  });
+});
+
 describe('PublicShareChatService.tryConsumeWorkspaceQuota', () => {
   it('delegates to the redis-backed per-workspace limiter', async () => {
     const redis = new FakeRedis();
diff --git a/apps/server/src/core/ai-chat/public-share-workspace-limiter.ts b/apps/server/src/core/ai-chat/public-share-workspace-limiter.ts
index cf0dd80d..d6f660a8 100644
--- a/apps/server/src/core/ai-chat/public-share-workspace-limiter.ts
+++ b/apps/server/src/core/ai-chat/public-share-workspace-limiter.ts
@@ -136,6 +136,177 @@ export class PublicShareWorkspaceLimiter {
   }
 }
 
+/**
+ * SECOND cost contour: a per-workspace TOKEN budget over a rolling DAY.
+ *
+ * The request-count cap above bounds how MANY anonymous calls a workspace
+ * admits, but NOT how expensive each one is: one accepted call runs the agent
+ * loop up to `stepCountIs(5)`, and every step re-sends the WHOLE client-held
+ * transcript (~hundreds of KB) as input, so the provider input alone can be tens
+ * of thousands of tokens PER step while `maxOutputTokens` only caps the output.
+ * The request cap is also hourly with no daily ceiling, so a steady stream at
+ * the hourly cap sustains ~24x its count per day. Counting requests therefore
+ * does not bound the owner's actual LLM bill (issue #159, finding #5).
+ *
+ * This contour caps the SPEND directly: the actual tokens consumed (input +
+ * output, summed across all steps of every accepted turn) over the trailing
+ * `windowMs` (one rolling day) must stay under `budget`. It is checked BEFORE a
+ * turn streams (read-only) and the turn's real usage is recorded AFTER it
+ * finishes (`streamText` onFinish). Like the request cap it is cluster-wide
+ * (shared Redis) and uses a sliding-window LOG so the day boundary cannot be
+ * gamed for a 2x burst.
+ *
+ * Pre-check is read-only, so a turn already over budget is rejected, but the
+ * tokens of an in-flight turn are not yet known and are accounted only once it
+ * finishes. The worst-case overshoot past the budget is therefore one turn
+ * (bounded by steps x (maxOutputTokens + transcript size)) — acceptable for a
+ * cost backstop on an optional anonymous assistant.
+ */
+
+/** Default per-workspace token budget over the rolling day. */
+export const SHARE_AI_WORKSPACE_TOKEN_BUDGET_DEFAULT = 1_000_000;
+/** Default token-budget window length: one rolling day. */
+export const SHARE_AI_WORKSPACE_TOKEN_WINDOW_MS = 24 * 60 * 60 * 1000;
+
+/** Redis key namespace for the per-workspace token-spend sliding-window log. */
+const TOKEN_KEY_PREFIX = 'share-ai:ws-tokens:';
+
+/**
+ * Read-only sliding-window token-budget check.
+ *
+ * KEYS[1] = the per-workspace token sorted-set key
+ * ARGV[1] = now (epoch ms)
+ * ARGV[2] = windowMs
+ * ARGV[3] = budget (max tokens in the trailing window)
+ *
+ * Drops entries older than the window, then sums the token counts encoded as the
+ * leading integer of each surviving member. Returns 1 if the running total is
+ * still UNDER budget (admit), 0 once it has reached/exceeded the budget. Does NOT
+ * add anything — the turn's real usage is recorded separately once it finishes.
+ */
+const TOKEN_BUDGET_CHECK_LUA = `
+local key = KEYS[1]
+local now = tonumber(ARGV[1])
+local windowMs = tonumber(ARGV[2])
+local budget = tonumber(ARGV[3])
+redis.call('ZREMRANGEBYSCORE', key, 0, now - windowMs)
+local members = redis.call('ZRANGE', key, 0, -1)
+local total = 0
+for i = 1, #members do
+  local t = tonumber(string.match(members[i], '^(%d+)'))
+  if t then total = total + t end
+end
+if total >= budget then
+  return 0
+end
+return 1
+`;
+
+/**
+ * Record one finished turn's token spend in the sliding-window log.
+ *
+ * KEYS[1] = the per-workspace token sorted-set key
+ * ARGV[1] = now (epoch ms) — the entry score
+ * ARGV[2] = windowMs
+ * ARGV[3] = member (`<tokens>:<unique>`; the leading integer is the token count)
+ *
+ * Always ZADDs (the turn already ran and spent the tokens) and refreshes the
+ * key TTL so idle workspaces cost no memory. Trims expired entries first so the
+ * set never grows unbounded for a busy workspace.
+ */
+const TOKEN_RECORD_LUA = `
+local key = KEYS[1]
+local now = tonumber(ARGV[1])
+local windowMs = tonumber(ARGV[2])
+local member = ARGV[3]
+redis.call('ZREMRANGEBYSCORE', key, 0, now - windowMs)
+redis.call('ZADD', key, now, member)
+redis.call('PEXPIRE', key, windowMs)
+return 1
+`;
+
+/**
+ * Cluster-wide, sliding-window per-workspace TOKEN budget backed by Redis.
+ * `withinBudget(key)` is a read-only pre-stream gate; `record(key, tokens)`
+ * accounts a finished turn's real usage. Decoupled from NestJS so it is testable
+ * against a mocked/real ioredis client, mirroring the request-count limiter.
+ */
+export class PublicShareWorkspaceTokenBudget {
+  private readonly logger = new Logger(PublicShareWorkspaceTokenBudget.name);
+  private counter = 0;
+
+  constructor(
+    private readonly redis: Redis,
+    private readonly budget: number = SHARE_AI_WORKSPACE_TOKEN_BUDGET_DEFAULT,
+    private readonly windowMs: number = SHARE_AI_WORKSPACE_TOKEN_WINDOW_MS,
+    private readonly now: () => number = Date.now,
+  ) {}
+
+  /**
+   * Read-only pre-stream check. Returns true while the workspace is under its
+   * rolling-day token budget, false once the trailing-window spend has reached
+   * it (caller must then 429 BEFORE streaming any tokens).
+   *
+   * FAILS CLOSED (false) on a Redis error: identical reasoning to the request
+   * limiter — when we cannot prove the workspace is under budget we DENY rather
+   * than admit an unmetered billable call. The assistant is optional, so a
+   * transient Redis blip briefly disabling it beats an unbounded provider bill.
+   */
+  async withinBudget(key: string): Promise<boolean> {
+    const t = this.now();
+    try {
+      const admitted = await this.redis.eval(
+        TOKEN_BUDGET_CHECK_LUA,
+        1,
+        TOKEN_KEY_PREFIX + key,
+        String(t),
+        String(this.windowMs),
+        String(this.budget),
+      );
+      return admitted === 1;
+    } catch (err) {
+      this.logger.error(
+        `share-ai token budget Redis failure for key "${key}"; failing closed`,
+        err as Error,
+      );
+      return false;
+    }
+  }
+
+  /**
+   * Record a finished turn's token spend. Best-effort: the turn already ran, so
+   * a Redis failure here is logged but not propagated — it would only cause a
+   * slight under-count of the running budget, never a wrong answer to the
+   * caller. Non-positive / non-finite usage is ignored.
+   */
+  async record(key: string, tokens: number): Promise<void> {
+    if (!Number.isFinite(tokens) || tokens <= 0) return;
+    const spend = Math.floor(tokens);
+    const t = this.now();
+    // Member: `<tokens>:<unique>` — the check Lua sums the leading integer, and
+    // the unique suffix keeps distinct turns in the same ms from colliding on
+    // the sorted-set member (which would drop one entry and under-count).
+    const member = `${spend}:${t}-${this.counter++}-${Math.random()
+      .toString(36)
+      .slice(2)}`;
+    try {
+      await this.redis.eval(
+        TOKEN_RECORD_LUA,
+        1,
+        TOKEN_KEY_PREFIX + key,
+        String(t),
+        String(this.windowMs),
+        member,
+      );
+    } catch (err) {
+      this.logger.error(
+        `share-ai token budget record failure for key "${key}" (${spend} tokens); ignoring`,
+        err as Error,
+      );
+    }
+  }
+}
+
 /**
  * Read the per-workspace cap from the environment (overridable seam), falling
  * back to the sane default. A non-positive / unparseable value uses the default.
@@ -162,3 +333,31 @@ export function createPublicShareWorkspaceLimiter(
     SHARE_AI_WORKSPACE_WINDOW_MS,
   );
 }
+
+/**
+ * Read the per-workspace rolling-day token budget from the environment
+ * (overridable seam), falling back to the sane default. A non-positive /
+ * unparseable value uses the default.
+ */
+export function resolveShareAiWorkspaceTokenBudget(): number {
+  const raw = Number(process.env.SHARE_AI_WORKSPACE_TOKEN_BUDGET_PER_DAY);
+  return Number.isFinite(raw) && raw > 0
+    ? Math.floor(raw)
+    : SHARE_AI_WORKSPACE_TOKEN_BUDGET_DEFAULT;
+}
+
+/**
+ * Build the per-workspace token budget from the injected RedisService (the same
+ * global ioredis client used by the request-count limiter). Tiny factory so the
+ * service constructor stays declarative and the budget stays unit-testable with
+ * a hand-rolled fake redis.
+ */
+export function createPublicShareWorkspaceTokenBudget(
+  redisService: RedisService,
+): PublicShareWorkspaceTokenBudget {
+  return new PublicShareWorkspaceTokenBudget(
+    redisService.getOrThrow(),
+    resolveShareAiWorkspaceTokenBudget(),
+    SHARE_AI_WORKSPACE_TOKEN_WINDOW_MS,
+  );
+}