fix(dictation): realtime lockout (latest-wins), unify STT settings (#737), surface upstream failures

Found while live-testing the realtime dictation:

- 'already active' lockout (real bug): the per-user slot was tied to the
  connected socket lifetime and a stale/racing socket could leave the counter
  stuck, so a fresh mic start was rejected. Now per-user single-session is
  enforced purely by LATEST-WINS EVICTION — a new connect disconnects the
  user's prior socket and frees its slot synchronously — and the user counter
  no longer participates in the cap decision (it could only cause false
  lockouts). Also free the slot when a start fails to open. The per-workspace
  cap is unchanged.
- #737: drop the separate sttRealtimeModel / sttRealtimeBaseUrl settings —
  realtime dictation now reuses the existing STT model + base URL (the realtime
  WS endpoint is derived from it server-side). Removed the fields from the DTO,
  types, settings service, repo allowlist, and the settings UI. The STT 'Test
  endpoint' button is now a single context-aware button (probes the realtime WS
  endpoint when realtime is on, the batch endpoint otherwise), and the 'Request
  format' selector is disabled while realtime is on (realtime always uses the
  OpenAI Realtime protocol).
- no-silent-loss: parse the OpenAI
  conversation.item.input_audio_transcription.failed event (e.g.
  insufficient_quota, bad model) and surface its concrete reason to the client
  instead of dropping it silently — previously a per-item transcription failure
  produced 'no words' with no explanation.

Tests: realtime suites green (gateway latest-wins eviction, parser .failed
surfacing, ai-settings reuse-STT-model); server + client tsc clean; workspace
vitest 37 pass.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-21 20:07:10 +03:00
parent 310b54a6da
commit 235d627191
14 changed files with 1370 additions and 1305 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -54,9 +54,11 @@ embeds — plus a large batch of security hardening and test coverage.
  of the existing batch STT. Audio streams over a dedicated `/ai-realtime`
  Socket.IO namespace and text is inserted as you speak (interim partials shown
  as a ghost decoration, only finals committed to the document). Gated by a new
-  `dictationRealtime` workspace toggle, with `sttRealtimeModel` and
-  `sttRealtimeBaseUrl` settings (empty model falls back to `sttModel`; empty base
-  URL falls back to the STT base URL server-side).
+  `dictationRealtime` workspace toggle; it reuses the existing STT model and
+  endpoint (the realtime WS endpoint is derived from the STT base URL), so no
+  separate model/endpoint settings are needed. While the toggle is on the batch
+  "Request format" selector is disabled, since realtime always uses the OpenAI
+  Realtime protocol.
  - **Ops caveat (single-process assumption):** the realtime concurrency caps
    (1 concurrent session per user, 5 per workspace) are enforced **in-memory,
    per API process**. They are therefore authoritative only on a **single API
--- a/apps/client/public/locales/en-US/translation.json
+++ b/apps/client/public/locales/en-US/translation.json
--- a/apps/client/src/features/workspace/components/settings/components/ai-provider-settings.tsx
+++ b/apps/client/src/features/workspace/components/settings/components/ai-provider-settings.tsx
@@ -63,10 +63,6 @@ const formSchema = z.object({
  // STT-specific fields. Empty base URL / key fall back to the chat ones.
  sttModel: z.string(),
  sttBaseUrl: z.string(),
-  // Realtime (streaming) STT fields. Empty model falls back to sttModel and
-  // empty base URL falls back to the STT base URL server-side.
-  sttRealtimeModel: z.string(),
-  sttRealtimeBaseUrl: z.string(),
  sttApiStyle: z.enum(["multipart", "json"]),
  sttApiKey: z.string(),
 });
@@ -245,8 +241,6 @@ export default function AiProviderSettings() {
      embeddingApiKey: "",
      sttModel: "",
      sttBaseUrl: "",
-      sttRealtimeModel: "",
-      sttRealtimeBaseUrl: "",
      sttApiStyle: "multipart" as SttApiStyle,
      sttApiKey: "",
    },
@@ -268,8 +262,6 @@ export default function AiProviderSettings() {
      embeddingApiKey: "",
      sttModel: settings.sttModel ?? "",
      sttBaseUrl: settings.sttBaseUrl ?? "",
-      sttRealtimeModel: settings.sttRealtimeModel ?? "",
-      sttRealtimeBaseUrl: settings.sttRealtimeBaseUrl ?? "",
      sttApiStyle: settings.sttApiStyle ?? "multipart",
      sttApiKey: "",
    });
@@ -304,10 +296,6 @@ export default function AiProviderSettings() {
      // server-side.
      sttModel: values.sttModel,
      sttBaseUrl: values.sttBaseUrl,
-      // Realtime STT: empty model falls back to sttModel, empty base URL falls
-      // back to the STT base URL server-side.
-      sttRealtimeModel: values.sttRealtimeModel,
-      sttRealtimeBaseUrl: values.sttRealtimeBaseUrl,
      sttApiStyle: values.sttApiStyle,
    };

@@ -971,7 +959,11 @@ export default function AiProviderSettings() {
        <Select
          mt="sm"
          label={t("Request format")}
-          description={t("How transcription requests are sent to the endpoint")}
+          description={
+            realtimeDictationEnabled
+              ? t("Realtime dictation uses the OpenAI Realtime protocol")
+              : t("How transcription requests are sent to the endpoint")
+          }
          data={[
            {
              value: "multipart",
@@ -980,13 +972,19 @@ export default function AiProviderSettings() {
            { value: "json", label: t("OpenRouter (JSON, base64 audio)") },
          ]}
          allowDeselect={false}
-          disabled={isLoading}
+          // The batch request format is irrelevant while realtime dictation is on
+          // (realtime always uses the OpenAI Realtime protocol). The stored value
+          // is kept for batch use.
+          disabled={isLoading || realtimeDictationEnabled}
          {...form.getInputProps("sttApiStyle")}
        />

        <TextInput
          mt="sm"
          label={t("Base URL")}
+          description={t(
+            "Used for both batch transcription and realtime dictation",
+          )}
          placeholder={t("Leave empty to use the chat base URL")}
          disabled={isLoading}
          {...form.getInputProps("sttBaseUrl")}
@@ -995,77 +993,45 @@ export default function AiProviderSettings() {
          {t("Resolves to {{url}}", { url: sttResolved })}
        </Text>

+        {/* Single context-aware test button: when realtime dictation is ON it
+            probes the realtime WS endpoint (derived from the STT base URL);
+            otherwise it probes the batch STT endpoint. Realtime reuses the same
+            model + base URL, so there is intentionally only ONE test button. */}
        <Group mt="md" align="center">
          <Button
            variant="default"
            size="sm"
-            loading={sttTest.isPending}
-            onClick={() => sttTest.mutate("stt")}
+            loading={
+              realtimeDictationEnabled ? realtimeTest.isPending : sttTest.isPending
+            }
+            onClick={() =>
+              realtimeDictationEnabled
+                ? realtimeTest.mutate()
+                : sttTest.mutate("stt")
+            }
          >
-            {t("Test endpoint")}
+            {realtimeDictationEnabled
+              ? t("Test realtime endpoint")
+              : t("Test endpoint")}
          </Button>
-          {sttTest.data &&
-            (sttTest.data.ok ? (
+          {(realtimeDictationEnabled ? realtimeTest.data : sttTest.data) &&
+            ((realtimeDictationEnabled ? realtimeTest.data : sttTest.data)!.ok ? (
              <Text size="sm" c="green">
                {t("Connection successful")}
              </Text>
            ) : (
              <Text size="sm" c="red">
-                {sttTest.data.error || t("Connection failed")}
+                {(realtimeDictationEnabled ? realtimeTest.data : sttTest.data)!
+                  .error || t("Connection failed")}
              </Text>
            ))}
        </Group>
-
-        {/* Realtime (streaming) dictation: layered on top of batch STT and only
-            shown when the workspace toggle is on. Model falls back to the STT
-            model and the endpoint falls back to the STT base URL server-side. */}
        {realtimeDictationEnabled && (
-          <>
-            <Text size="xs" c="dimmed" mt="md" mb="xs">
-              {t(
-                "Streams audio live and inserts text as you speak (requires an OpenAI-compatible Realtime endpoint)",
-              )}
-            </Text>
-
-            <TextInput
-              label={t("Realtime model")}
-              placeholder="gpt-4o-mini-transcribe"
-              disabled={isLoading}
-              {...form.getInputProps("sttRealtimeModel")}
-            />
-
-            <TextInput
-              mt="sm"
-              label={t("Realtime endpoint")}
-              description={t(
-                "Leave empty to use the STT base URL",
-              )}
-              placeholder={t("Leave empty to use the STT base URL")}
-              disabled={isLoading}
-              {...form.getInputProps("sttRealtimeBaseUrl")}
-            />
-
-            <Group mt="md" align="center">
-              <Button
-                variant="default"
-                size="sm"
-                loading={realtimeTest.isPending}
-                onClick={() => realtimeTest.mutate()}
-              >
-                {t("Test endpoint")}
-              </Button>
-              {realtimeTest.data &&
-                (realtimeTest.data.ok ? (
-                  <Text size="sm" c="green">
-                    {t("Connection successful")}
-                  </Text>
-                ) : (
-                  <Text size="sm" c="red">
-                    {realtimeTest.data.error || t("Connection failed")}
-                  </Text>
-                ))}
-            </Group>
-          </>
+          <Text size="xs" c="dimmed" mt="xs">
+            {t(
+              "Streams audio live and inserts text as you speak (reuses the STT model and endpoint above; requires an OpenAI-compatible Realtime endpoint)",
+            )}
+          </Text>
        )}
      </Paper>

--- a/apps/client/src/features/workspace/services/ai-settings-service.ts
+++ b/apps/client/src/features/workspace/services/ai-settings-service.ts
@@ -32,8 +32,6 @@ export interface IAiSettings {
  // key is stored (empty means "uses the chat API key").
  sttModel?: string;
  sttBaseUrl?: string;
-  sttRealtimeModel?: string;
-  sttRealtimeBaseUrl?: string;
  sttApiStyle?: SttApiStyle;
  hasSttApiKey: boolean;
  // RAG indexing coverage (pages indexed for semantic search).
@@ -61,8 +59,6 @@ export interface IAiSettingsUpdate {
  embeddingApiKey?: string;
  sttModel?: string;
  sttBaseUrl?: string;
-  sttRealtimeModel?: string;
-  sttRealtimeBaseUrl?: string;
  sttApiStyle?: SttApiStyle;
  // Write-only STT key (same semantics as `apiKey` / `embeddingApiKey`).
  sttApiKey?: string;
--- a/apps/server/src/core/ai-chat/realtime/ai-realtime.gateway.spec.ts
+++ b/apps/server/src/core/ai-chat/realtime/ai-realtime.gateway.spec.ts
@@ -234,17 +234,28 @@ describe('AiRealtimeGateway.handleConnection gate + caps', () => {
    expect(__testCounters.workspace.count('w1')).toBe(0);
  });

-  it('refuses when the per-user cap is already reached (no increment)', async () => {
-    __testCounters.user.increment('u1'); // user already at cap (1)
+  it('latest-wins: a new connect for the same user EVICTS the old one (never "already active")', async () => {
    const { gateway } = buildGateway();
-    const socket = buildSocket();
-    await gateway.handleConnection(socket as any);
+    // First socket for u1 connects and takes the slot.
+    const a = buildSocket();
+    await gateway.handleConnection(a as any);
+    expect(__testCounters.user.count('u1')).toBe(1);

-    expect(socket.emit).toHaveBeenCalledWith('error', {
+    // A SECOND socket for the SAME user must NOT be rejected — it evicts the
+    // first (disconnects it, frees its slot synchronously) and is admitted.
+    const b = buildSocket();
+    await gateway.handleConnection(b as any);
+
+    // Old socket evicted (disconnected) with the replacement notice.
+    expect(a.disconnect).toHaveBeenCalled();
+    expect(a.emit).toHaveBeenCalledWith('error', {
+      message: expect.stringMatching(/replaced by a newer/i),
+    });
+    // The new socket was admitted — never the "already active" refusal.
+    expect(b.emit).not.toHaveBeenCalledWith('error', {
      message: expect.stringMatching(/already active/i),
    });
-    expect(socket.disconnect).toHaveBeenCalled();
-    // Still exactly 1 (the pre-existing slot), not bumped to 2.
+    // Exactly one live slot remains (evict -1, admit +1), held by the new socket.
    expect(__testCounters.user.count('u1')).toBe(1);
  });

--- a/apps/server/src/core/ai-chat/realtime/ai-realtime.gateway.ts
+++ b/apps/server/src/core/ai-chat/realtime/ai-realtime.gateway.ts
@@ -59,6 +59,12 @@ import {
 const userSessions = new SessionCounters();
 const workspaceSessions = new SessionCounters();

+// Latest-wins: the current realtime socket per user. A NEW connection for a user
+// who already has one EVICTS the old (one expensive session per user; a fresh mic
+// start should replace a stale/lingering socket rather than be rejected with
+// "already active"). Per-process, like the counters above.
+const userSockets = new Map<string, Socket>();
+
 /** Per-socket state we stash on client.data. */
 interface RealtimeClientData {
  userId: string;
@@ -153,6 +159,13 @@ export class AiRealtimeGateway
      data.userId = userId;
      data.workspaceId = workspaceId;

+      // Latest-wins: evict any existing realtime socket for this user BEFORE the
+      // cap check, so a fresh mic start always replaces a stale/lingering session
+      // instead of being rejected with "already active". The eviction frees the
+      // old slot synchronously (see evictExistingUserSocket), so the cap check
+      // below sees the user's slot as free.
+      this.evictExistingUserSocket(userId, client);
+
      // Gate + concurrency caps. canConnect is a pure decision over the current
      // counts; it checks BOTH the feature gate and BOTH caps before we mutate
      // either counter, so a rejected connection leaves the counters clean.
@@ -161,7 +174,13 @@ export class AiRealtimeGateway
        ai?: { dictation?: boolean; dictationRealtime?: boolean };
      };
      const decision = canConnect(userId, workspaceId, settings, {
-        userCount: userSessions.count(userId),
+        // Per-user single-session is enforced by eviction (latest-wins) above,
+        // NOT by this counter: a new connect always replaces the user's prior
+        // socket, so the user dimension is reported as free here. (Feeding the
+        // live user counter caused false "already active" lockouts when a stale
+        // or racing socket left the count > 0 despite eviction.) The workspace
+        // cap still uses the real count.
+        userCount: 0,
        workspaceCount: workspaceSessions.count(workspaceId),
      });
      if (decision.allowed === false) {
@@ -175,6 +194,8 @@ export class AiRealtimeGateway
      // Remember exactly what we counted so disconnect decrements symmetrically.
      data.countedUserId = userId;
      data.countedWorkspaceId = workspaceId;
+      // Track this as the user's current socket (for latest-wins eviction).
+      userSockets.set(userId, client);
    } catch (err) {
      // Auth/origin failure (or any unexpected connect error): never leak
      // details, never increment a counter.
@@ -225,6 +246,11 @@ export class AiRealtimeGateway
          ? err.message
          : describeProviderError(err, 'Failed to start realtime dictation');
      client.emit('error', { message });
+      // A session that never opened must NOT keep holding the per-user slot.
+      // Disconnect so handleDisconnect frees the counted slot immediately —
+      // otherwise the socket lingers and the next attempt hits the 1/user cap
+      // ("already active") even though no session is actually running.
+      client.disconnect();
    }
  }

@@ -256,6 +282,43 @@ export class AiRealtimeGateway
      workspaceSessions.decrement(state.countedWorkspaceId);
      state.countedWorkspaceId = undefined;
    }
+    // Drop the latest-wins pointer only if it still points at THIS socket (a
+    // newer connection may have already replaced it via eviction).
+    if (state.userId && userSockets.get(state.userId) === client) {
+      userSockets.delete(state.userId);
+    }
+  }
+
+  /**
+   * Latest-wins eviction: if the user already has a realtime socket, free its
+   * slot SYNCHRONOUSLY (so the new connection's cap check sees it gone), clear
+   * its counted markers so its later async handleDisconnect is a no-op, then
+   * disconnect it with a clear reason. Skipped if the existing socket is the
+   * incoming one (defensive).
+   */
+  private evictExistingUserSocket(userId: string, incoming: Socket): void {
+    const existing = userSockets.get(userId);
+    if (!existing || existing === incoming) return;
+    const exState = existing.data as RealtimeClientData;
+    exState.handle?.close();
+    exState.handle = undefined;
+    if (exState.countedUserId) {
+      userSessions.decrement(exState.countedUserId);
+      exState.countedUserId = undefined;
+    }
+    if (exState.countedWorkspaceId) {
+      workspaceSessions.decrement(exState.countedWorkspaceId);
+      exState.countedWorkspaceId = undefined;
+    }
+    userSockets.delete(userId);
+    try {
+      existing.emit('error', {
+        message: 'Replaced by a newer dictation session',
+      });
+      existing.disconnect();
+    } catch {
+      // The old socket is being discarded anyway; ignore disconnect races.
+    }
  }

  /**
@@ -287,5 +350,8 @@ export const __testCounters = {
  reset(): void {
    userSessions.reset();
    workspaceSessions.reset();
+    // Clear the latest-wins socket map too, or a socket from a prior test leaks
+    // into the next one (module-level state).
+    userSockets.clear();
  },
 };
--- a/apps/server/src/core/ai-chat/realtime/ai-realtime.service.spec.ts
+++ b/apps/server/src/core/ai-chat/realtime/ai-realtime.service.spec.ts
@@ -101,7 +101,7 @@ function makeService(
 /** A fully-configured STT config that resolves to the OpenAI default URL. */
 const OPENAI_CFG = {
  driver: 'openai',
-  sttRealtimeModel: 'gpt-4o-transcribe',
+  sttModel: 'gpt-4o-transcribe',
  sttApiKey: 'sk-test',
 };

@@ -294,6 +294,24 @@ describe('parseUpstreamEvent (OpenAI GA → normalized realtime events)', () =>
      ),
    ).toEqual({ type: 'error', message: 'boom' });

+    // A per-item transcription failure (e.g. insufficient_quota) must surface as
+    // an error, not be silently ignored.
+    expect(
+      parseUpstreamEvent(
+        JSON.stringify({
+          type: 'conversation.item.input_audio_transcription.failed',
+          item_id: 'seg-x',
+          content_index: 0,
+          error: {
+            type: 'insufficient_quota',
+            code: 'insufficient_quota',
+            message: 'You exceeded your current quota',
+          },
+        }),
+        acc,
+      ),
+    ).toEqual({ type: 'error', message: 'You exceeded your current quota' });
+
    expect(parseUpstreamEvent(JSON.stringify({ type: 'whatever' }), acc)).toEqual({
      type: 'ignore',
    });
--- a/apps/server/src/core/ai-chat/realtime/ai-realtime.service.ts
+++ b/apps/server/src/core/ai-chat/realtime/ai-realtime.service.ts
@@ -225,6 +225,18 @@ export function parseUpstreamEvent(
      return { type: 'final', itemId, text };
    }

+    case 'conversation.item.input_audio_transcription.failed': {
+      // A per-item transcription failure (e.g. insufficient_quota, bad model).
+      // Without this case it fell through to `ignore` and the cause was SILENTLY
+      // dropped — the user just saw "no words" with no explanation. Surface the
+      // provider's concrete reason instead (no silent loss).
+      if (evt.item_id) acc.delete(accKey(evt.item_id));
+      const message =
+        evt.error?.message?.trim() ||
+        describeProviderError(evt.error, 'Realtime transcription failed');
+      return { type: 'error', message };
+    }
+
    case 'error': {
      // Surface the provider's concrete cause; never a generic message.
      const message =
@@ -269,12 +281,12 @@ export class AiRealtimeService {
    opts: OpenSessionOptions,
  ): Promise<RealtimeSessionHandle> {
    const cfg = await this.aiSettings.resolve(workspaceId);
-    const model = cfg?.sttRealtimeModel || cfg?.sttModel;
+    const model = cfg?.sttModel;
    if (!cfg?.driver || !model) {
      throw new AiSttNotConfiguredException();
    }

-    const baseUrl = cfg.sttRealtimeBaseUrl || cfg.sttBaseUrl || cfg.baseUrl;
+    const baseUrl = cfg.sttBaseUrl || cfg.baseUrl;
    const wssUrl = AiRealtimeService.deriveRealtimeUrl(baseUrl);

    // Fast pre-flight SSRF check on the http(s) equivalent (ssrf-guard only
--- a/apps/server/src/database/repos/workspace/workspace.repo.ts
+++ b/apps/server/src/database/repos/workspace/workspace.repo.ts
@@ -239,7 +239,7 @@ export class WorkspaceRepo {
    // is a real jsonb object, never a double-encoded string. The CASE self-heals
    // workspaces whose settings.ai.provider was previously corrupted into an
    // array/string.
-    const ALLOWED = ['driver', 'chatModel', 'embeddingModel', 'baseUrl', 'embeddingBaseUrl', 'sttModel', 'sttBaseUrl', 'sttRealtimeModel', 'sttRealtimeBaseUrl', 'sttApiStyle', 'systemPrompt', 'publicShareChatModel', 'publicShareAssistantRoleId'];
+    const ALLOWED = ['driver', 'chatModel', 'embeddingModel', 'baseUrl', 'embeddingBaseUrl', 'sttModel', 'sttBaseUrl', 'sttApiStyle', 'systemPrompt', 'publicShareChatModel', 'publicShareAssistantRoleId'];
    const entries = Object.entries(provider).filter(
      ([k, v]) => v !== undefined && ALLOWED.includes(k),
    );
--- a/apps/server/src/integrations/ai/ai-settings.service.spec.ts
+++ b/apps/server/src/integrations/ai/ai-settings.service.spec.ts
@@ -56,28 +56,48 @@ function buildService(deps: Deps) {
 }

 describe('AiSettingsService.update partial merge', () => {
-  it('a DTO with only realtime fields patches exactly those keys', async () => {
+  it('a DTO with only STT fields patches exactly those keys', async () => {
    const updateAiProviderSettings = jest.fn().mockResolvedValue(undefined);
    const { service } = buildService({ updateAiProviderSettings });

    await service.update('w1', {
-      sttRealtimeModel: 'gpt-4o-realtime',
-      sttRealtimeBaseUrl: 'https://api.example.com/v1',
+      sttModel: 'gpt-4o-transcribe',
+      sttBaseUrl: 'https://api.example.com/v1',
    });

    expect(updateAiProviderSettings).toHaveBeenCalledTimes(1);
    const [, patch] = updateAiProviderSettings.mock.calls[0];
    expect(Object.keys(patch).sort()).toEqual(
-      ['sttRealtimeBaseUrl', 'sttRealtimeModel'].sort(),
+      ['sttBaseUrl', 'sttModel'].sort(),
    );
  });

-  it('a DTO with chatModel does NOT clobber realtime fields (only chatModel patched)', async () => {
+  it('a DTO with chatModel does NOT clobber STT fields (only chatModel patched)', async () => {
    const updateAiProviderSettings = jest.fn().mockResolvedValue(undefined);
    const { service } = buildService({ updateAiProviderSettings });

    await service.update('w1', { chatModel: 'gpt-4o' });

+    const [, patch] = updateAiProviderSettings.mock.calls[0];
+    expect(patch).toEqual({ chatModel: 'gpt-4o' });
+    expect(patch).not.toHaveProperty('sttModel');
+    expect(patch).not.toHaveProperty('sttBaseUrl');
+  });
+
+  // Realtime dictation no longer has its own model/endpoint: the separate
+  // sttRealtimeModel / sttRealtimeBaseUrl fields were removed and the realtime
+  // session reuses sttModel / sttBaseUrl. The DTO whitelist strips them, so even
+  // if a stale client sends them they never reach the provider patch.
+  it('ignores removed realtime-specific fields (not in the patch allowlist)', async () => {
+    const updateAiProviderSettings = jest.fn().mockResolvedValue(undefined);
+    const { service } = buildService({ updateAiProviderSettings });
+
+    await service.update('w1', {
+      chatModel: 'gpt-4o',
+      sttRealtimeModel: 'gpt-4o-realtime',
+      sttRealtimeBaseUrl: 'https://api.example.com/v1',
+    } as any);
+
    const [, patch] = updateAiProviderSettings.mock.calls[0];
    expect(patch).toEqual({ chatModel: 'gpt-4o' });
    expect(patch).not.toHaveProperty('sttRealtimeModel');
--- a/apps/server/src/integrations/ai/ai-settings.service.ts
+++ b/apps/server/src/integrations/ai/ai-settings.service.ts
@@ -32,8 +32,6 @@ export interface UpdateAiSettingsInput {
  embeddingApiKey?: string;
  sttModel?: string;
  sttBaseUrl?: string;
-  sttRealtimeModel?: string;
-  sttRealtimeBaseUrl?: string;
  sttApiStyle?: SttApiStyle;
  sttApiKey?: string;
  publicShareChatModel?: string;
@@ -165,10 +163,6 @@ export class AiSettingsService {
      publicShareAssistantRoleId: provider.publicShareAssistantRoleId,
      embeddingModel: provider.embeddingModel,
      sttModel: provider.sttModel,
-      // Raw passthrough, NO fallback; the realtime consumer falls back to
-      // `sttModel` / (`sttBaseUrl` || `baseUrl`) at use time.
-      sttRealtimeModel: provider.sttRealtimeModel,
-      sttRealtimeBaseUrl: provider.sttRealtimeBaseUrl,
      // Plain passthrough, no fallback; the transcribe path defaults unset to
      // 'multipart' (current behavior).
      sttApiStyle: provider.sttApiStyle,
@@ -245,8 +239,6 @@ export class AiSettingsService {
      embeddingBaseUrl: provider.embeddingBaseUrl,
      sttModel: provider.sttModel,
      sttBaseUrl: provider.sttBaseUrl,
-      sttRealtimeModel: provider.sttRealtimeModel,
-      sttRealtimeBaseUrl: provider.sttRealtimeBaseUrl,
      sttApiStyle: provider.sttApiStyle,
      systemPrompt: provider.systemPrompt,
      publicShareChatModel: provider.publicShareChatModel,
@@ -286,8 +278,6 @@ export class AiSettingsService {
      'embeddingBaseUrl',
      'sttModel',
      'sttBaseUrl',
-      'sttRealtimeModel',
-      'sttRealtimeBaseUrl',
      'sttApiStyle',
      'systemPrompt',
      'publicShareChatModel',
--- a/apps/server/src/integrations/ai/ai.types.ts
+++ b/apps/server/src/integrations/ai/ai.types.ts
@@ -28,13 +28,10 @@ export interface AiProviderSettings {
  // Embedding-specific base URL. Falls back to `baseUrl` when empty/unset.
  embeddingBaseUrl?: string;
  sttModel?: string;
-  // STT-specific base URL. Falls back to baseUrl when empty/unset.
+  // STT-specific base URL. Falls back to baseUrl when empty/unset. Used for BOTH
+  // batch transcription and realtime dictation (the realtime WS path is derived
+  // from this base URL).
  sttBaseUrl?: string;
-  // Realtime STT model id. Falls back to `sttModel` at use time when empty/unset.
-  sttRealtimeModel?: string;
-  // Realtime STT base URL. Falls back to `sttBaseUrl` || `baseUrl` at use time
-  // when empty/unset.
-  sttRealtimeBaseUrl?: string;
  sttApiStyle?: SttApiStyle;
  systemPrompt?: string;
  // Cheap chat model id used ONLY by the anonymous public-share assistant. The
@@ -84,8 +81,6 @@ export interface MaskedAiSettings {
  embeddingBaseUrl?: string;
  sttModel?: string;
  sttBaseUrl?: string;
-  sttRealtimeModel?: string;
-  sttRealtimeBaseUrl?: string;
  sttApiStyle?: SttApiStyle;
  systemPrompt?: string;
  publicShareChatModel?: string;
--- a/apps/server/src/integrations/ai/dto/update-ai-settings.dto.ts
+++ b/apps/server/src/integrations/ai/dto/update-ai-settings.dto.ts
@@ -50,14 +50,6 @@ export class UpdateAiSettingsDto {
  @IsString()
  sttBaseUrl?: string;

-  @IsOptional()
-  @IsString()
-  sttRealtimeModel?: string;
-
-  @IsOptional()
-  @IsString()
-  sttRealtimeBaseUrl?: string;
-
  @IsOptional()
  @IsIn(STT_API_STYLES)
  sttApiStyle?: SttApiStyle;
--- a/apps/server/src/integrations/ai/dto/update-ai-settings.ssrf.spec.ts
+++ b/apps/server/src/integrations/ai/dto/update-ai-settings.ssrf.spec.ts
@@ -4,9 +4,11 @@ import { validate } from 'class-validator';
 import { UpdateAiSettingsDto } from './update-ai-settings.dto';
 import { isUrlAllowed } from '../../../core/ai-chat/external-mcp/ssrf-guard';

-// SSRF contract for sttRealtimeBaseUrl.
+// SSRF contract for sttBaseUrl.
 //
-// The DTO intentionally validates sttRealtimeBaseUrl with @IsString() ONLY (no
+// Realtime dictation reuses the STT base URL (the realtime WS endpoint is
+// derived from it), so sttBaseUrl is the field that feeds the connect-time SSRF
+// guard. The DTO intentionally validates sttBaseUrl with @IsString() ONLY (no
 // @IsUrl): an admin may legitimately point at an internal-looking host that DNS
 // resolves to a public address, and over-strict URL validation would reject
 // valid setups. The real defense is the CONNECT-TIME SSRF guard (isUrlAllowed on
@@ -18,23 +20,19 @@ async function validateDto(payload: Record<string, unknown>) {
  return validate(dto as object);
 }

-describe('UpdateAiSettingsDto.sttRealtimeBaseUrl is @IsString only (no @IsUrl)', () => {
+describe('UpdateAiSettingsDto.sttBaseUrl is @IsString only (no @IsUrl)', () => {
  it('accepts a metadata-service URL at the DTO layer (string, not URL-validated)', async () => {
    const errors = await validateDto({
-      sttRealtimeBaseUrl: 'http://169.254.169.254/v1',
+      sttBaseUrl: 'http://169.254.169.254/v1',
    });
-    const fieldErr = errors.find(
-      (e) => e.property === 'sttRealtimeBaseUrl',
-    );
+    const fieldErr = errors.find((e) => e.property === 'sttBaseUrl');
    // No DTO-level rejection: blocking is deferred to the connect-time guard.
    expect(fieldErr).toBeUndefined();
  });

-  it('rejects a non-string sttRealtimeBaseUrl with an isString error', async () => {
-    const errors = await validateDto({ sttRealtimeBaseUrl: 123 });
-    const fieldErr = errors.find(
-      (e) => e.property === 'sttRealtimeBaseUrl',
-    );
+  it('rejects a non-string sttBaseUrl with an isString error', async () => {
+    const errors = await validateDto({ sttBaseUrl: 123 });
+    const fieldErr = errors.find((e) => e.property === 'sttBaseUrl');
    expect(Object.keys(fieldErr?.constraints ?? {})).toContain('isString');
  });
 });