feat(dictation): add realtime streaming STT (live dictation)

Layer an optional realtime speech-to-text path on top of the existing batch dictation, so transcribed text appears as the user speaks. Transport A2: browser <-> our server (Socket.IO `/ai-realtime`) <-> OpenAI Realtime (raw ws). The provider API key never leaves the server; the upstream URL is SSRF-checked before connecting; the gateway enforces the dictation+dictationRealtime gate, cookie-JWT auth and per-user/per-workspace concurrency caps. Implemented against the GA (2026) OpenAI Realtime transcription contract (session.update / audio.input.format / server_vad), not the now-removed beta shape. Editor UI B2: interim text is shown as a meta-only ProseMirror ghost decoration (no Yjs/history noise); only completed segments are committed. Chat shows interim as a dimmed tail. The mic button switches realtime vs batch by the workspace flag; batch remains the default and fallback. Server: - AiRealtimeService (upstream ws proxy, normalized events, idle/max-duration timeouts, idempotent teardown) + parseUpstreamEvent unit tests - AiRealtimeGateway (Socket.IO `/ai-realtime`) wired into AiChatModule - admin-gated POST /ai-chat/realtime/test connectivity probe - config: settings.ai.dictationRealtime + provider sttRealtimeModel/sttRealtimeBaseUrl (realtime key reuses sttApiKey; no new secret) Client: - pcm16 AudioWorklet (24kHz mono PCM16), RealtimeDictationClient, use-realtime-dictation hook (status/start/stop/cancel + onInterim/onFinal) - RealtimeMicButton + dictation-interim ProseMirror decoration - editor/chat integration + AI settings UI (toggle, model, test endpoint) Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-21 14:47:28 +03:00
parent 74e2b7ad7f
commit 7db3f007cb
25 changed files with 2111 additions and 19 deletions
--- a/apps/client/src/features/ai-chat/components/chat-input.tsx
+++ b/apps/client/src/features/ai-chat/components/chat-input.tsx
@@ -1,11 +1,19 @@
-import { KeyboardEvent } from "react";
-import { ActionIcon, Group, Textarea, Tooltip } from "@mantine/core";
+import { KeyboardEvent, useState } from "react";
+import {
+  ActionIcon,
+  Group,
+  Stack,
+  Text,
+  Textarea,
+  Tooltip,
+} from "@mantine/core";
 import { IconPlayerStopFilled, IconSend } from "@tabler/icons-react";
 import { useTranslation } from "react-i18next";
 import { useAtom, useAtomValue } from "jotai";
 import { aiChatDraftAtom } from "@/features/ai-chat/atoms/ai-chat-atom.ts";
 import { workspaceAtom } from "@/features/user/atoms/current-user-atom";
 import { MicButton } from "@/features/dictation/components/mic-button";
+import { RealtimeMicButton } from "@/features/dictation/components/realtime-mic-button";

 interface ChatInputProps {
  onSend: (text: string) => void;
@@ -29,12 +37,17 @@ export default function ChatInput({
  const [value, setValue] = useAtom(aiChatDraftAtom);
  const workspace = useAtomValue(workspaceAtom);
  const isDictationEnabled = workspace?.settings?.ai?.dictation === true;
+  const isRealtime = workspace?.settings?.ai?.dictationRealtime === true;
+  // Live interim (partial) transcript shown as a dimmed tail under the input.
+  const [interim, setInterim] = useState("");

  const send = (): void => {
    const text = value.trim();
    if (!text || isStreaming || disabled) return;
    onSend(text);
    setValue("");
+    // Drop any leftover partial when a message is sent.
+    setInterim("");
  };

  const handleKeyDown = (e: KeyboardEvent<HTMLTextAreaElement>): void => {
@@ -45,7 +58,8 @@ export default function ChatInput({
  };

  return (
-    <Group gap="xs" align="flex-end" wrap="nowrap">
+    <Stack gap="xs">
+      <Group gap="xs" align="flex-end" wrap="nowrap">
      <Textarea
        style={{ flex: 1 }}
        placeholder={t("Ask the AI agent…")}
@@ -61,13 +75,24 @@ export default function ChatInput({
        // switch), so a fresh chat lands with the cursor ready in the field.
        autoFocus
      />
-      {isDictationEnabled && (
-        <MicButton
-          size="lg"
-          disabled={isStreaming || disabled}
-          onText={(text) => setValue((v) => (v ? `${v} ${text}` : text))}
-        />
-      )}
+      {isDictationEnabled &&
+        (isRealtime ? (
+          <RealtimeMicButton
+            size="lg"
+            disabled={isStreaming || disabled}
+            onInterim={(text) => setInterim(text)}
+            onFinal={(text) => {
+              setValue((v) => (v ? `${v} ${text}` : text));
+              setInterim("");
+            }}
+          />
+        ) : (
+          <MicButton
+            size="lg"
+            disabled={isStreaming || disabled}
+            onText={(text) => setValue((v) => (v ? `${v} ${text}` : text))}
+          />
+        ))}
      {isStreaming ? (
        <Tooltip label={t("Stop")} withArrow>
          <ActionIcon
@@ -93,6 +118,12 @@ export default function ChatInput({
          </ActionIcon>
        </Tooltip>
      )}
-    </Group>
+      </Group>
+      {interim && (
+        <Text size="sm" c="dimmed">
+          {interim}
+        </Text>
+      )}
+    </Stack>
  );
 }
--- a/apps/client/src/features/dictation/audio/audio-worklet.d.ts
+++ b/apps/client/src/features/dictation/audio/audio-worklet.d.ts
@@ -0,0 +1,33 @@
+// Minimal ambient declarations for the AudioWorklet global scope.
+//
+// The client tsconfig only pulls in the DOM libs (no "webworker"/"audioworklet"
+// lib), so the symbols available inside an AudioWorkletProcessor module are not
+// known to `tsc`. These declarations are intentionally narrow: just enough for
+// `pcm16-worklet.ts` to typecheck, matching the Web Audio API spec shapes used
+// by that processor. They describe the worklet global scope, not the main thread.
+
+/**
+ * Ambient shape of the base class a worklet processor extends. This is a
+ * type-only declaration (`declare`), so it contributes nothing at runtime; the
+ * real class is provided by the AudioWorklet global scope.
+ *
+ * NOTE(review): the constructor is declared without the optional options
+ * argument the Web Audio spec allows — fine while no processor here reads
+ * options; widen it if one ever needs them.
+ */
+declare abstract class AudioWorkletProcessor {
+  // Message channel back to the main thread (used to transfer PCM16 buffers).
+  readonly port: MessagePort;
+
+  constructor();
+
+  // Called for each render quantum. `inputs`/`outputs` are channel arrays
+  // indexed as [input][channel][sample]; `parameters` maps AudioParam names to
+  // their per-sample (or single-value) Float32Array. Return `true` to keep the
+  // processor alive.
+  abstract process(
+    inputs: Float32Array[][],
+    outputs: Float32Array[][],
+    parameters: Record<string, Float32Array>,
+  ): boolean;
+}
+
+// Registers a processor class under a name usable from `new AudioWorkletNode`.
+// `processorCtor` is typed as a zero-argument constructor, matching the narrow
+// `AudioWorkletProcessor` declaration in this file.
+declare function registerProcessor(
+  name: string,
+  processorCtor: new () => AudioWorkletProcessor,
+): void;
+
+// The render context's sample rate, in Hz, available in the worklet global scope.
+// Read by `pcm16-worklet.ts` to derive its resampling ratio.
+declare const sampleRate: number;
--- a/apps/client/src/features/dictation/audio/pcm16-worklet.ts
+++ b/apps/client/src/features/dictation/audio/pcm16-worklet.ts
@@ -0,0 +1,123 @@
+// Self-contained AudioWorkletProcessor that turns the live microphone stream into
+// PCM16 (signed 16-bit, little-endian), mono, 24000 Hz chunks for the realtime STT
+// upstream. It runs in the AudioWorklet global scope, so it MUST NOT import anything
+// (the worklet module has no module graph / bundler runtime around it).
+//
+// Per `process()` call the host hands us a render quantum (typically 128 frames) at
+// the context sample rate. We read the first input channel (mono), linearly resample
+// to 24000 Hz while carrying the fractional read position across calls (so we never
+// assume a particular input rate, e.g. 44.1k or 48k), accumulate the resampled
+// samples, and once we have ~150 ms worth (3600 samples) we emit them as an
+// Int16 ArrayBuffer transferred to the main thread.
+
+// Sample rate (Hz) the upstream transcription contract expects.
+const TARGET_RATE = 24000;
+// Samples per emitted message: ~150 ms at the target rate (24000 * 0.15 = 3600).
+const FRAME_SAMPLES = Math.round(TARGET_RATE * 0.15);
+
+/**
+ * Converts the live input to mono PCM16 LE at TARGET_RATE and posts it to the
+ * main thread in FRAME_SAMPLES-sized ArrayBuffers (ownership transferred).
+ *
+ * Resampling is linear and seamless across render quanta: the read cursor and
+ * the last input sample are carried from one `process()` call to the next.
+ */
+class Pcm16Worklet extends AudioWorkletProcessor {
+  // Read cursor, in input-sample units, relative to the start of the quantum
+  // being processed. A carried value in [-1, 0) means the next output sample
+  // lies between the previous quantum's last sample and this quantum's first.
+  private cursor = 0;
+
+  // Last input sample of the previous quantum (the virtual sample at index -1).
+  private lastSample = 0;
+
+  // Becomes true after the first non-empty quantum, i.e. once `lastSample` and
+  // `cursor` describe real input.
+  private seeded = false;
+
+  // Resampled Float32 samples waiting to be converted and posted.
+  private pending: Float32Array = new Float32Array(FRAME_SAMPLES);
+  private pendingCount = 0;
+
+  // Render callback. Only the first channel of the first input is consumed; the
+  // outputs are never written, so the node contributes silence downstream.
+  // Always returns true to keep the processor alive.
+  process(inputs: Float32Array[][]): boolean {
+    const mono = inputs[0]?.[0];
+    if (mono !== undefined && mono.length > 0) {
+      this.resample(mono);
+    }
+    return true;
+  }
+
+  // Linearly resample one quantum from the context `sampleRate` to TARGET_RATE,
+  // appending each output sample and posting a frame whenever one fills up.
+  private resample(channel: Float32Array): void {
+    // Input samples advanced per output sample.
+    const step = sampleRate / TARGET_RATE;
+    const frames = channel.length;
+    const lastIndex = frames - 1;
+
+    if (!this.seeded) {
+      // No predecessor exists yet: treat the first sample as its own left
+      // neighbor and begin reading at index 0.
+      this.seeded = true;
+      this.lastSample = channel[0];
+      this.cursor = 0;
+    }
+
+    // Produce outputs only while the right-hand neighbor (floor + 1) is inside
+    // this quantum, which is exactly `at < lastIndex`; `channel[frames]` is
+    // therefore never read. Whatever remains is resolved on the next call.
+    let at = this.cursor;
+    for (; at < lastIndex; at += step) {
+      const left = Math.floor(at);
+      const weight = at - left;
+      const a = left < 0 ? this.lastSample : channel[left];
+      const b = channel[left + 1];
+
+      this.pending[this.pendingCount] = a + (b - a) * weight;
+      this.pendingCount += 1;
+      if (this.pendingCount >= FRAME_SAMPLES) {
+        this.emitFrame();
+      }
+    }
+
+    // Re-express the cursor relative to the next quantum and remember this
+    // quantum's final sample as that quantum's index -1.
+    this.cursor = at - frames;
+    this.lastSample = channel[lastIndex];
+  }
+
+  // Convert the pending samples to signed 16-bit little-endian and transfer the
+  // resulting ArrayBuffer to the main thread. Writing through a DataView keeps
+  // the byte order little-endian regardless of the host.
+  private emitFrame(): void {
+    const total = this.pendingCount;
+    if (total === 0) return;
+
+    const out = new ArrayBuffer(total * 2);
+    const writer = new DataView(out);
+    for (let i = 0; i < total; i++) {
+      // Limit to [-1, 1], then scale asymmetrically onto the int16 range.
+      const level = Math.max(-1, Math.min(1, this.pending[i]));
+      const scaled = level < 0 ? level * 0x8000 : level * 0x7fff;
+      writer.setInt16(i * 2, scaled, true);
+    }
+    this.pendingCount = 0;
+
+    this.port.postMessage(out, [out]);
+  }
+}
+
+registerProcessor("pcm16-worklet", Pcm16Worklet);
--- a/apps/client/src/features/dictation/components/realtime-mic-button.tsx
+++ b/apps/client/src/features/dictation/components/realtime-mic-button.tsx
@@ -0,0 +1,84 @@
+import { FC, useEffect, useRef } from "react";
+import { ActionIcon, Tooltip } from "@mantine/core";
+import { IconMicrophone, IconPlayerStopFilled } from "@tabler/icons-react";
+import { useTranslation } from "react-i18next";
+import {
+  useRealtimeDictation,
+  type RealtimeDictationStatus,
+} from "@/features/dictation/hooks/use-realtime-dictation";
+
+interface RealtimeMicButtonProps {
+  // Receives each partial transcript; also called with "" when recording ends.
+  onInterim: (text: string) => void;
+  // Receives each completed segment.
+  onFinal: (text: string) => void;
+  // Forwarded to the dictation hook as its `onStart` option.
+  onStart?: () => void;
+  // Disables the start (mic) button only; the stop button stays clickable.
+  disabled?: boolean;
+  // Mantine ActionIcon size token; "lg" matches the chat composer, "md" the
+  // editor toolbar.
+  size?: "md" | "lg";
+}
+
+/**
+ * Realtime (streaming) counterpart of MicButton. Renders a mic icon that
+ * starts dictation and, while recording, a red stop icon that ends it.
+ * Transcripts reach the consumer through onInterim/onFinal as they arrive, so
+ * there is no spinner / "transcribing" state.
+ */
+export const RealtimeMicButton: FC<RealtimeMicButtonProps> = ({
+  onInterim,
+  onFinal,
+  onStart,
+  disabled,
+  size = "lg",
+}) => {
+  const { t } = useTranslation();
+  const { status, start, stop } = useRealtimeDictation({
+    onInterim,
+    onFinal,
+    onStart,
+  });
+  const iconSize = size === "lg" ? 18 : 16;
+
+  // Remember the status seen by the previous effect run so the consumer's
+  // partial is cleared exactly once, on the recording → idle/error transition.
+  const prevStatusRef = useRef<RealtimeDictationStatus>(status);
+  useEffect(() => {
+    const recordingJustEnded =
+      prevStatusRef.current === "recording" && status !== "recording";
+    if (recordingJustEnded) {
+      onInterim("");
+    }
+    prevStatusRef.current = status;
+  }, [status, onInterim]);
+
+  // idle / error: subtle mic that (re)starts dictation.
+  if (status !== "recording") {
+    const startLabel = t("Start dictation");
+    return (
+      <Tooltip label={startLabel} withArrow>
+        <ActionIcon
+          size={size}
+          variant="subtle"
+          onClick={() => void start()}
+          disabled={disabled}
+          aria-label={startLabel}
+        >
+          <IconMicrophone size={iconSize} />
+        </ActionIcon>
+      </Tooltip>
+    );
+  }
+
+  // recording: red stop control.
+  const stopLabel = t("Stop recording");
+  return (
+    <Tooltip label={stopLabel} withArrow>
+      <ActionIcon
+        size={size}
+        color="red"
+        variant="light"
+        onClick={stop}
+        aria-label={stopLabel}
+      >
+        <IconPlayerStopFilled size={iconSize} />
+      </ActionIcon>
+    </Tooltip>
+  );
+};
--- a/apps/client/src/features/dictation/hooks/use-realtime-dictation.ts
+++ b/apps/client/src/features/dictation/hooks/use-realtime-dictation.ts
@@ -0,0 +1,427 @@
+import { useCallback, useEffect, useRef, useState } from "react";
+import { notifications } from "@mantine/notifications";
+import { useTranslation } from "react-i18next";
+import { RealtimeDictationClient } from "@/features/dictation/services/realtime-dictation-client";
+
+// URL of the PCM16 worklet processor, loaded with `audioWorklet.addModule()`.
+//
+// The processor runs in the AudioWorklet global scope, so it has to reach the
+// browser as its own compiled JavaScript file. `new URL("…/pcm16-worklet.ts",
+// import.meta.url)` only works on the Vite dev server: in a production build Vite
+// treats that form as a static asset and emits the TypeScript source verbatim
+// (untranspiled, `.ts` extension), which the browser cannot load as a module.
+// The `?worker&url` suffix sends the file through Vite's worker pipeline
+// (transpiled and bundled on its own) and resolves to the emitted file's URL.
+// NOTE(review): typing of this import relies on the `vite/client` ambient
+// declarations being referenced by the client tsconfig — confirm.
+import PCM16_WORKLET_URL from "../audio/pcm16-worklet.ts?worker&url";
+
+// Hook lifecycle. "error" is transient: the hook resets it to "idle" after a
+// short delay. There is no "transcribing" state in the realtime flow.
+export type RealtimeDictationStatus = "idle" | "recording" | "error";
+
+// Consumer-supplied callbacks and limits. The hook always calls the latest
+// callbacks passed in, so they do not need to be memoized.
+export interface UseRealtimeDictationOptions {
+  onInterim: (text: string) => void; // latest partial for the live segment
+  onFinal: (text: string) => void; // a completed segment (trimmed, never empty)
+  onStart?: () => void; // fired right when capture begins (caret snapshot)
+  maxDurationMs?: number; // client-side cap before stop() is invoked; default 120000
+}
+
+// Value returned by `useRealtimeDictation`.
+export interface UseRealtimeDictationResult {
+  status: RealtimeDictationStatus;
+  start: () => Promise<void>;
+  stop: () => void;
+  cancel: () => void;
+}
+
+// Resolve the AudioContext constructor for this browser: the standard global
+// when it exists, otherwise the `webkit`-prefixed one found on some older Safari
+// builds. Yields `undefined` when neither is present so the hook can report the
+// problem instead of crashing.
+function getAudioContextCtor(): typeof AudioContext | undefined {
+  type PrefixedWindow = { webkitAudioContext?: typeof AudioContext };
+  return typeof AudioContext === "undefined"
+    ? (window as unknown as PrefixedWindow).webkitAudioContext
+    : AudioContext;
+}
+
+/**
+ * Streaming sibling of `use-dictation`. Captures the mic, resamples to PCM16
+ * 24 kHz in an AudioWorklet, and streams it over the normalized `/ai-realtime`
+ * Socket.IO namespace, surfacing interim/final transcripts as they arrive.
+ *
+ * Mirrors `use-dictation`'s conventions: refs hold the live graph/client/timers
+ * so re-renders never lose them, getUserMedia errors map to the same Mantine
+ * notifications, and every exit path stops the MediaStream tracks and closes the
+ * AudioContext. There is no `transcribing` state — final text arrives
+ * incrementally while `recording`.
+ *
+ * `start()` is asynchronous (it awaits getUserMedia and the worklet module).
+ * Each attempt carries a token that `teardown()` invalidates, so a stop, cancel,
+ * error or unmount that lands while an attempt is still awaiting makes that
+ * attempt release only what it created itself and exit, without touching state
+ * that a newer attempt may already own.
+ *
+ * @param options Callbacks (`onInterim`, `onFinal`, optional `onStart`) and the
+ *   optional `maxDurationMs` cap. The latest values are always used.
+ * @returns The current `status` plus `start`, `stop` and `cancel` controls.
+ */
+export function useRealtimeDictation(
+  options: UseRealtimeDictationOptions,
+): UseRealtimeDictationResult {
+  const { t, i18n } = useTranslation();
+  const [status, setStatus] = useState<RealtimeDictationStatus>("idle");
+
+  // Keep the latest callbacks in a ref so async socket handlers always call the
+  // current handlers without re-creating the capture graph.
+  const optionsRef = useRef(options);
+  optionsRef.current = options;
+
+  const streamRef = useRef<MediaStream | null>(null);
+  const audioContextRef = useRef<AudioContext | null>(null);
+  const sourceRef = useRef<MediaStreamAudioSourceNode | null>(null);
+  const workletRef = useRef<AudioWorkletNode | null>(null);
+  const clientRef = useRef<RealtimeDictationClient | null>(null);
+
+  const timerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
+  const errorTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
+
+  const canceledRef = useRef(false);
+  const startingRef = useRef(false);
+  // Id of the most recent start attempt. start() takes a fresh id and teardown()
+  // bumps it, so an attempt that is still awaiting can detect that it was
+  // superseded (stopped, canceled, errored or unmounted) by comparing ids.
+  const attemptRef = useRef(0);
+  // True once the server emits `ready`; audio is buffered until then, then flushed.
+  const readyRef = useRef(false);
+  // PCM16 chunks captured before the upstream session is ready.
+  const pendingAudioRef = useRef<ArrayBuffer[]>([]);
+  // Stable ref to the latest stop() so the max-duration timer (armed inside the
+  // start closure) can invoke the current version without re-arming every render.
+  const stopRef = useRef<() => void>(() => undefined);
+
+  const clearTimer = useCallback(() => {
+    if (timerRef.current !== null) {
+      clearTimeout(timerRef.current);
+      timerRef.current = null;
+    }
+  }, []);
+
+  const stopTracks = useCallback(() => {
+    streamRef.current?.getTracks().forEach((track) => track.stop());
+    streamRef.current = null;
+  }, []);
+
+  // Tear down the audio graph (worklet node, source, context). Never throws on a
+  // half-built or already-closed graph.
+  const teardownAudio = useCallback(() => {
+    const worklet = workletRef.current;
+    if (worklet) {
+      worklet.port.onmessage = null;
+      try {
+        worklet.disconnect();
+      } catch {
+        // Node may already be disconnected; ignore.
+      }
+      workletRef.current = null;
+    }
+
+    const source = sourceRef.current;
+    if (source) {
+      try {
+        source.disconnect();
+      } catch {
+        // Ignore disconnect of an already-detached node.
+      }
+      sourceRef.current = null;
+    }
+
+    const ctx = audioContextRef.current;
+    if (ctx) {
+      audioContextRef.current = null;
+      if (ctx.state !== "closed") {
+        // close() returns a promise; swallow rejections so teardown never throws.
+        void ctx.close().catch(() => undefined);
+      }
+    }
+  }, []);
+
+  // Full teardown shared by stop/cancel/error/unmount. Order: invalidate any
+  // in-flight start attempt, stop streaming upstream, disconnect the socket, then
+  // dismantle the local audio graph and tracks, then clear timers and reset the
+  // ready/pending state.
+  const teardown = useCallback(() => {
+    // Any start() still awaiting getUserMedia/addModule is now stale and will
+    // bail out on its own instead of finishing on top of this teardown.
+    attemptRef.current += 1;
+
+    const client = clientRef.current;
+    if (client) {
+      clientRef.current = null;
+      try {
+        client.stop();
+      } catch {
+        // Socket may already be gone; ignore.
+      }
+      client.disconnect();
+    }
+
+    teardownAudio();
+    stopTracks();
+    clearTimer();
+
+    readyRef.current = false;
+    pendingAudioRef.current = [];
+    startingRef.current = false;
+  }, [teardownAudio, stopTracks, clearTimer]);
+
+  // Surface a concrete failure: log it, notify, flip to "error", and reset to
+  // "idle" after a short delay (mirrors use-dictation's error timer).
+  const handleError = useCallback(
+    (message: string, err?: unknown) => {
+      if (canceledRef.current) return;
+      // Never log audio — only the textual reason.
+      console.error("[realtime-dictation]", message, err ?? "");
+      notifications.show({ color: "red", message });
+      teardown();
+      setStatus("error");
+      if (errorTimerRef.current !== null) {
+        clearTimeout(errorTimerRef.current);
+      }
+      errorTimerRef.current = setTimeout(() => {
+        errorTimerRef.current = null;
+        setStatus("idle");
+      }, 1500);
+    },
+    [teardown],
+  );
+
+  const start = useCallback(async (): Promise<void> => {
+    // Synchronous live guard: status is stale between renders, so also block on
+    // refs to prevent a double-click from opening two MediaStreams / sockets.
+    if (
+      startingRef.current ||
+      streamRef.current ||
+      audioContextRef.current ||
+      clientRef.current
+    ) {
+      return;
+    }
+    if (status !== "idle") return;
+    startingRef.current = true;
+    canceledRef.current = false;
+    readyRef.current = false;
+    pendingAudioRef.current = [];
+
+    // Claim a fresh attempt id. After every await, a mismatch means teardown()
+    // ran in the meantime (stop/cancel/error/unmount), possibly followed by a
+    // newer start(); this attempt must then release only what it created and
+    // leave the shared refs, guards and status alone.
+    attemptRef.current += 1;
+    const attempt = attemptRef.current;
+    const isStale = (): boolean => attemptRef.current !== attempt;
+
+    if (!navigator.mediaDevices?.getUserMedia) {
+      const reason =
+        "navigator.mediaDevices.getUserMedia is unavailable in this context";
+      console.error("[realtime-dictation] " + reason);
+      notifications.show({
+        color: "red",
+        message: t("Audio recording is not available in this browser/context"),
+      });
+      setStatus("idle");
+      startingRef.current = false;
+      return;
+    }
+
+    let stream: MediaStream;
+    try {
+      stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+    } catch (err) {
+      // Always log the full error for diagnosis (name, message, stack).
+      console.error("[realtime-dictation] getUserMedia failed", err);
+      // Superseded while waiting: the user already stopped/canceled, so neither
+      // notify nor touch guards/status that a newer attempt may own.
+      if (isStale()) return;
+      const name = (err as { name?: string })?.name;
+      const detail = (err as { message?: string })?.message ?? String(err);
+      let message: string;
+      if (name === "NotAllowedError" || name === "SecurityError") {
+        message = t("Microphone access denied");
+      } else if (name === "NotFoundError" || name === "OverconstrainedError") {
+        message = t("No microphone found");
+      } else if (name === "NotReadableError" || name === "AbortError") {
+        message = t("Microphone is unavailable or already in use");
+      } else {
+        // Unknown failure: show the real reason instead of a generic string.
+        message = `${t("Could not start recording")}: ${name ? `${name}: ` : ""}${detail}`;
+      }
+      notifications.show({ color: "red", message });
+      setStatus("idle");
+      startingRef.current = false;
+      return;
+    }
+
+    // If a stop/cancel/unmount landed during the await, drop the stream we just
+    // obtained and bail out. teardown() has already reset the shared state.
+    if (isStale()) {
+      stream.getTracks().forEach((track) => track.stop());
+      return;
+    }
+    streamRef.current = stream;
+
+    // Build the capture graph. The worklet still resamples robustly if the browser
+    // ignores the 24 kHz hint, so any actual context rate is handled correctly.
+    const AudioCtx = getAudioContextCtor();
+    if (!AudioCtx) {
+      stopTracks();
+      notifications.show({
+        color: "red",
+        message: t("Audio recording is not available in this browser/context"),
+      });
+      setStatus("idle");
+      startingRef.current = false;
+      return;
+    }
+
+    let audioContext: AudioContext;
+    try {
+      audioContext = new AudioCtx({ sampleRate: 24000 });
+      audioContextRef.current = audioContext;
+      // AudioWorklet requires a secure context (https/localhost), same constraint
+      // as getUserMedia. A failure here means the UI should fall back to batch.
+      await audioContext.audioWorklet.addModule(PCM16_WORKLET_URL);
+    } catch (err) {
+      console.error("[realtime-dictation] audio worklet setup failed", err);
+      if (isStale()) {
+        // teardown() ran during addModule() and already closed this context and
+        // stopped this stream (both were in the refs at that time). The shared
+        // refs may now belong to a newer attempt, so only re-stop our own tracks
+        // (idempotent) and exit quietly.
+        stream.getTracks().forEach((track) => track.stop());
+        return;
+      }
+      teardownAudio();
+      stopTracks();
+      const detail = (err as { message?: string })?.message ?? String(err);
+      notifications.show({
+        color: "red",
+        message: `${t("Could not start recording")}: ${detail}`,
+      });
+      setStatus("idle");
+      startingRef.current = false;
+      return;
+    }
+
+    // Another stop/cancel could have landed during addModule(). Release only the
+    // resources this attempt created; both operations are idempotent.
+    if (isStale()) {
+      stream.getTracks().forEach((track) => track.stop());
+      if (audioContext.state !== "closed") {
+        void audioContext.close().catch(() => undefined);
+      }
+      return;
+    }
+
+    let source: MediaStreamAudioSourceNode;
+    let worklet: AudioWorkletNode;
+    try {
+      source = audioContext.createMediaStreamSource(stream);
+      worklet = new AudioWorkletNode(audioContext, "pcm16-worklet");
+      sourceRef.current = source;
+      workletRef.current = worklet;
+      // MediaStreamSource → worklet → destination. The worklet emits silence, so
+      // connecting to destination drives the render graph without echoing the mic.
+      source.connect(worklet);
+      worklet.connect(audioContext.destination);
+    } catch (err) {
+      console.error("[realtime-dictation] audio graph wiring failed", err);
+      teardownAudio();
+      stopTracks();
+      const detail = (err as { message?: string })?.message ?? String(err);
+      notifications.show({
+        color: "red",
+        message: `${t("Could not start recording")}: ${detail}`,
+      });
+      setStatus("idle");
+      startingRef.current = false;
+      return;
+    }
+
+    // Each worklet message is a PCM16 ArrayBuffer. Forward it once the upstream
+    // session is ready; until then buffer so no leading audio is dropped.
+    worklet.port.onmessage = (event: MessageEvent) => {
+      if (canceledRef.current) return;
+      const buf = event.data as ArrayBuffer;
+      if (!(buf instanceof ArrayBuffer)) return;
+      if (readyRef.current && clientRef.current) {
+        clientRef.current.sendAudio(buf);
+      } else {
+        pendingAudioRef.current.push(buf);
+      }
+    };
+
+    // Wire the realtime transport. The server replies `ready` once the upstream
+    // STT session is live; we then flush any buffered audio.
+    const client = new RealtimeDictationClient({
+      onReady: () => {
+        if (canceledRef.current) return;
+        readyRef.current = true;
+        const pending = pendingAudioRef.current;
+        pendingAudioRef.current = [];
+        for (const buf of pending) clientRef.current?.sendAudio(buf);
+      },
+      onInterim: (_itemId, text) => {
+        if (canceledRef.current) return;
+        optionsRef.current.onInterim(text);
+      },
+      onFinal: (_itemId, text) => {
+        if (canceledRef.current) return;
+        const trimmed = text.trim();
+        if (trimmed.length > 0) optionsRef.current.onFinal(trimmed);
+      },
+      onError: (message) => {
+        handleError(message);
+      },
+      onClosed: () => {
+        // The server ended the session (idle/max-duration or graceful upstream
+        // close). Skip if a cancel already tore everything down, or if an error
+        // path already owns the status (its error→idle timer is pending), or if a
+        // local stop already cleared the live refs. Otherwise tear down the capture
+        // graph + socket and return to idle so the mic/AudioContext don't leak and
+        // the button doesn't stay stuck on "recording".
+        if (canceledRef.current) return;
+        if (errorTimerRef.current !== null) return;
+        if (
+          !clientRef.current &&
+          !audioContextRef.current &&
+          !streamRef.current
+        ) {
+          return;
+        }
+        teardown();
+        setStatus("idle");
+      },
+    });
+    clientRef.current = client;
+
+    // Notify the caller right when capture begins (before opening the socket) so
+    // the editor can snapshot the caret position.
+    try {
+      optionsRef.current.onStart?.();
+    } catch (err) {
+      console.error("[realtime-dictation] onStart callback threw", err);
+    }
+
+    // Open the socket, then ask the server to open the upstream session. The
+    // language hint is the base subtag of the resolved UI language (e.g. "en-US"
+    // → "en"), since the upstream transcription model expects an ISO language
+    // code, not a region-tagged locale; the server omits it upstream when absent.
+    client.connect();
+    const locale = i18n.resolvedLanguage || i18n.language || "";
+    const language = locale.split("-")[0] || undefined;
+    client.start({ language });
+
+    setStatus("recording");
+    // Capture has truly begun; release the synchronous start guard.
+    startingRef.current = false;
+
+    const maxDurationMs = optionsRef.current.maxDurationMs ?? 120000;
+    timerRef.current = setTimeout(() => {
+      // Reuse stop() so the session is torn down through the normal path.
+      stopRef.current?.();
+    }, maxDurationMs);
+  }, [status, t, i18n, stopTracks, teardownAudio, teardown, handleError]);
+
+  const stop = useCallback((): void => {
+    // Nothing live → no-op (never crash on an idle/destroyed state). A start that
+    // is still in flight counts as live: teardown() invalidates it so it cannot
+    // begin recording after the user asked to stop.
+    if (
+      !clientRef.current &&
+      !audioContextRef.current &&
+      !streamRef.current &&
+      !startingRef.current
+    ) {
+      return;
+    }
+    teardown();
+    setStatus("idle");
+  }, [teardown]);
+
+  // Keep the stop ref pointed at the latest stop() for the max-duration timer.
+  stopRef.current = stop;
+
+  const cancel = useCallback((): void => {
+    // Mark canceled first so any late socket/worklet callbacks are ignored.
+    canceledRef.current = true;
+    teardown();
+    setStatus("idle");
+  }, [teardown]);
+
+  // Clean up on unmount: stop tracks, close the context/worklet, disconnect the
+  // socket, and clear timers.
+  useEffect(() => {
+    return () => {
+      canceledRef.current = true;
+      if (errorTimerRef.current !== null) {
+        clearTimeout(errorTimerRef.current);
+        errorTimerRef.current = null;
+      }
+      teardown();
+    };
+  }, [teardown]);
+
+  return { status, start, stop, cancel };
+}
--- a/apps/client/src/features/dictation/services/realtime-dictation-client.ts
+++ b/apps/client/src/features/dictation/services/realtime-dictation-client.ts
@@ -0,0 +1,124 @@
+import { io, Socket } from "socket.io-client";
+import { SOCKET_URL } from "@/features/websocket/types";
+
+// Callbacks supplied by the hook. The client maps each normalized `/ai-realtime`
+// Socket.IO event onto one of them and keeps no React state of its own — it is
+// a thin transport wrapper so the hook can stay focused on the audio graph.
+export interface RealtimeDictationHandlers {
+  // Fired on `ready`: upstream STT session is established; audio may be sent.
+  onReady: () => void;
+  // Fired on `interim`: latest partial transcript of a not-yet-final segment.
+  onInterim: (itemId: string, text: string) => void;
+  // Fired on `final`: a completed segment's transcript.
+  onFinal: (itemId: string, text: string) => void;
+  // Concrete failure reason (server `error` event or socket `connect_error`).
+  onError: (message: string) => void;
+  // Session ended (graceful stop or upstream closed).
+  onClosed: () => void;
+}
+
+interface StartOptions {
+  language?: string; // forwarded unchanged in the `start` event payload
+}
+
+// Wraps the dedicated `/ai-realtime` Socket.IO namespace. Cookie-based auth rides
+// the handshake via `withCredentials` (no bearer token), exactly like the main
+// app socket. `autoConnect: false` lets the hook wire listeners up before the
+// handshake fires so no early event is missed.
+export class RealtimeDictationClient {
+  private socket: Socket | null = null;
+  // onError must fire at most once per session: the server `error` and socket
+  // `connect_error` can both arrive (e.g. an error then a failed reconnect), but
+  // the hook owns the error→idle flow and a second call would double-fire it.
+  private erroredFlag = false;
+
+  constructor(private readonly handlers: RealtimeDictationHandlers) {}
+
+  // Forward the first error reason only; later error/connect_error are swallowed.
+  private emitError(message: string): void {
+    if (this.erroredFlag) return;
+    this.erroredFlag = true;
+    this.handlers.onError(message);
+  }
+
+  // Create the socket, register listeners, then open the connection. Safe to call
+  // once per client instance; a second call is a no-op while a socket exists.
+  connect(): void {
+    if (this.socket) return;
+    // Fresh socket → allow onError to fire again for this connection.
+    this.erroredFlag = false;
+
+    // SOCKET_URL is undefined in this app (socket.io derives the page origin), so
+    // the `/ai-realtime` namespace rides the same `/socket.io` path as the main
+    // socket — which the Vite dev server proxies as a websocket.
+    const url = SOCKET_URL ? `${SOCKET_URL}/ai-realtime` : "/ai-realtime";
+    const socket: Socket = io(url, {
+      transports: ["websocket"],
+      withCredentials: true,
+      autoConnect: false,
+    });
+    this.socket = socket;
+
+    socket.on("ready", () => this.handlers.onReady());
+    socket.on("interim", (payload: { itemId: string; text: string }) => {
+      this.handlers.onInterim(payload?.itemId ?? "", payload?.text ?? "");
+    });
+    socket.on("final", (payload: { itemId: string; text: string }) => {
+      this.handlers.onFinal(payload?.itemId ?? "", payload?.text ?? "");
+    });
+    socket.on("error", (payload: { message?: string } | string) => {
+      const message =
+        typeof payload === "string"
+          ? payload
+          : payload?.message || "Realtime dictation error";
+      this.emitError(message);
+    });
+
+    // Report end-of-session once per connection: a server `closed` may be followed
+    // by the transport `disconnect`; a mid-session drop yields only `disconnect`.
+    let ended = false;
+    const notifyEnded = (): void => {
+      if (ended) return;
+      ended = true;
+      this.handlers.onClosed();
+    };
+    socket.on("closed", notifyEnded);
+    socket.on("disconnect", notifyEnded);
+
+    // Low-level transport failure (handshake/auth/proxy). Surface a concrete cause.
+    socket.on("connect_error", (err: Error) => {
+      const message = err?.message
+        ? `Realtime connection failed: ${err.message}`
+        : "Realtime connection failed";
+      this.emitError(message);
+    });
+
+    socket.connect();
+  }
+
+  // Ask the server to resolve config and open the upstream STT session.
+  start(opts: StartOptions): void {
+    this.socket?.emit("start", { language: opts.language });
+  }
+
+  // Forward a raw PCM16 chunk; socket.io serializes the ArrayBuffer as binary.
+  sendAudio(buf: ArrayBuffer): void {
+    this.socket?.emit("audio", buf);
+  }
+
+  // Request a graceful flush/close of the upstream session.
+  stop(): void {
+    this.socket?.emit("stop");
+  }
+
+  // Tear down: drop every listener and close the connection. Idempotent.
+  disconnect(): void {
+    const socket = this.socket;
+    if (!socket) return;
+    this.socket = null;
+    // Reset so a subsequent connect() on a reused instance can error again.
+    this.erroredFlag = false;
+    socket.removeAllListeners();
+    socket.disconnect();
+  }
+}
--- a/apps/client/src/features/editor/components/fixed-toolbar/groups/dictation-group.tsx
+++ b/apps/client/src/features/editor/components/fixed-toolbar/groups/dictation-group.tsx
@@ -1,12 +1,21 @@
 import { FC, useRef } from "react";
 import type { Editor } from "@tiptap/react";
+import { useAtomValue } from "jotai";
 import { MicButton } from "@/features/dictation/components/mic-button";
+import { RealtimeMicButton } from "@/features/dictation/components/realtime-mic-button";
+import { workspaceAtom } from "@/features/user/atoms/current-user-atom";
+import {
+  setDictationInterim,
+  clearDictationInterim,
+} from "@/features/editor/extensions/dictation-interim/dictation-interim.ts";

 interface Props {
  editor: Editor;
 }

 export const DictationGroup: FC<Props> = ({ editor }) => {
+  const workspace = useAtomValue(workspaceAtom);
+  const isRealtime = workspace?.settings?.ai?.dictationRealtime === true;
  const rangeRef = useRef<{ from: number; to: number } | null>(null);

  const handleStart = () => {
@@ -50,6 +59,33 @@ export const DictationGroup: FC<Props> = ({ editor }) => {
    }
  };

+  // Realtime path: finals are committed at the LIVE caret as plain text nodes (a
+  // bare string would be parsed as HTML); interim is ghost-decoration only.
+  if (isRealtime) {
+    return (
+      <RealtimeMicButton
+        size="md"
+        disabled={!editor.isEditable}
+        onStart={() => {
+          if (editor && !editor.isDestroyed) clearDictationInterim(editor);
+        }}
+        onInterim={(text) => {
+          if (editor && !editor.isDestroyed) setDictationInterim(editor, text);
+        }}
+        onFinal={(text) => {
+          if (!editor || editor.isDestroyed) return;
+          clearDictationInterim(editor);
+          const segment = { type: "text", text: `${text} ` };
+          try {
+            editor.chain().focus().insertContent(segment).run();
+          } catch {
+            // The editor may have been destroyed mid-stream; ignore.
+          }
+        }}
+      />
+    );
+  }
+
  return (
    <MicButton
      size="md"
--- a/apps/client/src/features/editor/extensions/dictation-interim/dictation-interim.ts
+++ b/apps/client/src/features/editor/extensions/dictation-interim/dictation-interim.ts
@@ -0,0 +1,97 @@
+import { Extension } from "@tiptap/core";
+import type { Editor } from "@tiptap/core";
+import { Plugin, PluginKey } from "@tiptap/pm/state";
+import { Decoration, DecorationSet } from "@tiptap/pm/view";
+
+// Plugin key shared by the extension and the imperative helpers below so they
+// dispatch/read the same plugin state.
+const dictationInterimKey = new PluginKey<DictationInterimState>(
+  "dictationInterim",
+);
+
+interface DictationInterimState {
+  // The current interim (partial) transcript. Empty string means "no ghost".
+  text: string;
+}
+
+/**
+ * B2 editor decoration: shows the realtime interim (partial) transcript as a
+ * ghost widget at the caret. The interim is held ONLY in plugin meta state and
+ * rendered as a widget Decoration — it is NEVER written into the document, so
+ * it produces no Yjs update and no history entry. Only final segments are
+ * committed (by the dictation-group / chat consumers).
+ */
+export const DictationInterim = Extension.create({
+  name: "dictationInterim",
+
+  addProseMirrorPlugins() {
+    return [
+      new Plugin<DictationInterimState>({
+        key: dictationInterimKey,
+        state: {
+          init: (): DictationInterimState => ({ text: "" }),
+          apply: (tr, value): DictationInterimState => {
+            const meta = tr.getMeta(dictationInterimKey) as
+              | DictationInterimState
+              | undefined;
+            // Meta-only updates replace the interim text; everything else keeps
+            // the existing value (it follows the caret on its own since the
+            // decoration is recomputed against the live selection).
+            return meta ? { text: meta.text } : value;
+          },
+        },
+        props: {
+          decorations(state) {
+            const pluginState = dictationInterimKey.getState(state);
+            const text = pluginState?.text ?? "";
+            if (!text) return null;
+
+            // Render the interim as an inline ghost at the caret. Inline styles
+            // keep this self-contained — no global CSS is required.
+            const widget = Decoration.widget(
+              state.selection.head,
+              () => {
+                const span = document.createElement("span");
+                span.textContent = text;
+                span.setAttribute("contenteditable", "false");
+                span.style.opacity = "0.5";
+                span.style.fontStyle = "italic";
+                return span;
+              },
+              {
+                side: 1,
+                ignoreSelection: true,
+                // Keyed by text: an unchanged ghost reuses its DOM node.
+                key: `dictation-interim:${text}`,
+              },
+            );
+
+            return DecorationSet.create(state.doc, [widget]);
+          },
+        },
+      }),
+    ];
+  },
+});
+
+/**
+ * Set the interim ghost text via a META-ONLY transaction — no doc steps, so it
+ * generates no Yjs update and no history entry. No-op when the text is unchanged.
+ */
+export function setDictationInterim(editor: Editor, text: string): void {
+  const current = dictationInterimKey.getState(editor.state)?.text;
+  if (current === text) return;
+  editor.view.dispatch(editor.state.tr.setMeta(dictationInterimKey, { text }));
+}
+
+/**
+ * Clear the interim ghost text. Delegates to setDictationInterim with an empty
+ * string (the plugin treats "" as "no ghost"), so it is the same META-ONLY
+ * transaction: no doc steps, hence no Yjs update and no history entry.
+ */
+export function clearDictationInterim(editor: Editor): void {
+  // Empty text makes the plugin's decorations() return null.
+  setDictationInterim(editor, "");
+}
+
+export default DictationInterim;
--- a/apps/client/src/features/editor/extensions/extensions.ts
+++ b/apps/client/src/features/editor/extensions/extensions.ts
@@ -123,6 +123,7 @@ import { countWords } from "alfaaz";
 import AutoJoiner from "@/features/editor/extensions/autojoiner.ts";
 import GlobalDragHandle from "@/features/editor/extensions/drag-handle.ts";
 import { CleanStyles } from "@/features/editor/extensions/clean-styles.ts";
+import { DictationInterim } from "@/features/editor/extensions/dictation-interim/dictation-interim.ts";

 const lowlight = createLowlight(common);
 lowlight.register("mermaid", plaintext);
@@ -343,6 +344,7 @@ export const mainExtensions = [
    },
  }),
  Selection,
+  DictationInterim,
  Attachment.configure({
    view: AttachmentView,
  }),
--- a/apps/client/src/features/workspace/components/settings/components/ai-provider-settings.tsx
+++ b/apps/client/src/features/workspace/components/settings/components/ai-provider-settings.tsx
@@ -32,6 +32,7 @@ import {
  useAiSettingsQuery,
  useReindexAiEmbeddingsMutation,
  useTestAiConnectionMutation,
+  useTestRealtimeConnectionMutation,
  useUpdateAiSettingsMutation,
 } from "@/features/workspace/queries/ai-settings-query.ts";
 import {
@@ -62,6 +63,10 @@ const formSchema = z.object({
  // STT-specific fields. Empty base URL / key fall back to the chat ones.
  sttModel: z.string(),
  sttBaseUrl: z.string(),
+  // Realtime (streaming) STT fields. Empty model falls back to sttModel and
+  // empty base URL falls back to the STT base URL server-side.
+  sttRealtimeModel: z.string(),
+  sttRealtimeBaseUrl: z.string(),
  sttApiStyle: z.enum(["multipart", "json"]),
  sttApiKey: z.string(),
 });
@@ -176,6 +181,8 @@ export default function AiProviderSettings() {
  const chatTest = useTestAiConnectionMutation();
  const embedTest = useTestAiConnectionMutation();
  const sttTest = useTestAiConnectionMutation();
+  // Realtime probe hits a separate /ai-chat/realtime/test route (admin-gated).
+  const realtimeTest = useTestRealtimeConnectionMutation();

  // Agent roles drive the public-share assistant identity picker. Admin-gated
  // (the component returns early for non-admins), same as the AI settings query.
@@ -192,6 +199,8 @@ export default function AiProviderSettings() {
  const [dictationEnabled, setDictationEnabled] = useState<boolean>(
    workspace?.settings?.ai?.dictation ?? false,
  );
+  const [realtimeDictationEnabled, setRealtimeDictationEnabled] =
+    useState<boolean>(workspace?.settings?.ai?.dictationRealtime ?? false);
  const [publicShareAssistantEnabled, setPublicShareAssistantEnabled] =
    useState<boolean>(
      workspace?.settings?.ai?.publicShareAssistant ?? false,
@@ -199,6 +208,10 @@ export default function AiProviderSettings() {
  const [chatToggleLoading, setChatToggleLoading] = useState(false);
  const [searchToggleLoading, setSearchToggleLoading] = useState(false);
  const [dictationToggleLoading, setDictationToggleLoading] = useState(false);
+  const [
+    realtimeDictationToggleLoading,
+    setRealtimeDictationToggleLoading,
+  ] = useState(false);
  const [
    publicShareAssistantToggleLoading,
    setPublicShareAssistantToggleLoading,
@@ -232,6 +245,8 @@ export default function AiProviderSettings() {
      embeddingApiKey: "",
      sttModel: "",
      sttBaseUrl: "",
+      sttRealtimeModel: "",
+      sttRealtimeBaseUrl: "",
      sttApiStyle: "multipart" as SttApiStyle,
      sttApiKey: "",
    },
@@ -253,6 +268,8 @@ export default function AiProviderSettings() {
      embeddingApiKey: "",
      sttModel: settings.sttModel ?? "",
      sttBaseUrl: settings.sttBaseUrl ?? "",
+      sttRealtimeModel: settings.sttRealtimeModel ?? "",
+      sttRealtimeBaseUrl: settings.sttRealtimeBaseUrl ?? "",
      sttApiStyle: settings.sttApiStyle ?? "multipart",
      sttApiKey: "",
    });
@@ -287,6 +304,10 @@ export default function AiProviderSettings() {
      // server-side.
      sttModel: values.sttModel,
      sttBaseUrl: values.sttBaseUrl,
+      // Realtime STT: empty model falls back to sttModel, empty base URL falls
+      // back to the STT base URL server-side.
+      sttRealtimeModel: values.sttRealtimeModel,
+      sttRealtimeBaseUrl: values.sttRealtimeBaseUrl,
      sttApiStyle: values.sttApiStyle,
    };

@@ -434,6 +455,35 @@ export default function AiProviderSettings() {
    }
  }

+  // Optimistic toggle for the "Realtime dictation" feature
+  // (settings.ai.dictationRealtime). Layered on top of batch dictation.
+  async function handleToggleRealtimeDictation(value: boolean) {
+    setRealtimeDictationToggleLoading(true);
+    const previous = realtimeDictationEnabled;
+    setRealtimeDictationEnabled(value);
+    try {
+      const updated = await updateWorkspace({ aiDictationRealtime: value });
+      setWorkspace({
+        ...updated,
+        settings: {
+          ...updated.settings,
+          ai: { ...updated.settings?.ai, dictationRealtime: value },
+        },
+      });
+      notifications.show({ message: t("Updated successfully") });
+    } catch (err) {
+      setRealtimeDictationEnabled(previous);
+      const message = (err as { response?: { data?: { message?: string } } })
+        ?.response?.data?.message;
+      notifications.show({
+        message: message ?? t("Failed to update data"),
+        color: "red",
+      });
+    } finally {
+      setRealtimeDictationToggleLoading(false);
+    }
+  }
+
  // Optimistic toggle for the anonymous public-share AI assistant
  // (settings.ai.publicShareAssistant). When off, the public endpoint 404s.
  async function handleTogglePublicShareAssistant(value: boolean) {
@@ -853,13 +903,24 @@ export default function AiProviderSettings() {
            <StatusDot status={sttStatus} label={cardStatusLabel(sttStatus, t)} />
            <Text fw={600}>{t("Voice / STT")}</Text>
          </Group>
-          <Switch
-            label={t("Voice dictation")}
-            labelPosition="left"
-            checked={dictationEnabled}
-            disabled={dictationToggleLoading}
-            onChange={(e) => handleToggleDictation(e.currentTarget.checked)}
-          />
+          <Group gap="md" align="center" wrap="nowrap">
+            <Switch
+              label={t("Voice dictation")}
+              labelPosition="left"
+              checked={dictationEnabled}
+              disabled={dictationToggleLoading}
+              onChange={(e) => handleToggleDictation(e.currentTarget.checked)}
+            />
+            <Switch
+              label={t("Realtime dictation")}
+              labelPosition="left"
+              checked={realtimeDictationEnabled}
+              disabled={realtimeDictationToggleLoading}
+              onChange={(e) =>
+                handleToggleRealtimeDictation(e.currentTarget.checked)
+              }
+            />
+          </Group>
        </Group>
        <Text size="xs" c="dimmed" mt={4} mb="md">
          {t(
@@ -954,6 +1015,58 @@ export default function AiProviderSettings() {
              </Text>
            ))}
        </Group>
+
+        {/* Realtime (streaming) dictation: layered on top of batch STT and only
+            shown when the workspace toggle is on. Model falls back to the STT
+            model and the endpoint falls back to the STT base URL server-side. */}
+        {realtimeDictationEnabled && (
+          <>
+            <Text size="xs" c="dimmed" mt="md" mb="xs">
+              {t(
+                "Streams audio live and inserts text as you speak (requires an OpenAI-compatible Realtime endpoint)",
+              )}
+            </Text>
+
+            <TextInput
+              label={t("Realtime model")}
+              placeholder="gpt-4o-mini-transcribe"
+              disabled={isLoading}
+              {...form.getInputProps("sttRealtimeModel")}
+            />
+
+            <TextInput
+              mt="sm"
+              label={t("Realtime endpoint")}
+              description={t(
+                "Leave empty to use the STT base URL",
+              )}
+              placeholder={t("Leave empty to use the STT base URL")}
+              disabled={isLoading}
+              {...form.getInputProps("sttRealtimeBaseUrl")}
+            />
+
+            <Group mt="md" align="center">
+              <Button
+                variant="default"
+                size="sm"
+                loading={realtimeTest.isPending}
+                onClick={() => realtimeTest.mutate()}
+              >
+                {t("Test endpoint")}
+              </Button>
+              {realtimeTest.data &&
+                (realtimeTest.data.ok ? (
+                  <Text size="sm" c="green">
+                    {t("Connection successful")}
+                  </Text>
+                ) : (
+                  <Text size="sm" c="red">
+                    {realtimeTest.data.error || t("Connection failed")}
+                  </Text>
+                ))}
+            </Group>
+          </>
+        )}
      </Paper>

      {/* Nested: external MCP tools the agent calls out to */}
--- a/apps/client/src/features/workspace/queries/ai-settings-query.ts
+++ b/apps/client/src/features/workspace/queries/ai-settings-query.ts
@@ -8,6 +8,7 @@ import {
  getAiSettings,
  updateAiSettings,
  testAiConnection,
+  testRealtimeConnection,
  reindexAiEmbeddings,
  IAiSettings,
  IAiSettingsUpdate,
@@ -55,6 +56,12 @@ export function useTestAiConnectionMutation() {
  });
 }

+// Mutation hook around the realtime STT probe; it takes no variables.
+export function useTestRealtimeConnectionMutation() {
+  const mutationFn = (): Promise<IAiTestResult> => testRealtimeConnection();
+  return useMutation<IAiTestResult, Error, void>({ mutationFn });
+}
+
 export function useReindexAiEmbeddingsMutation() {
  const { t } = useTranslation();
  const queryClient = useQueryClient();
--- a/apps/client/src/features/workspace/services/ai-settings-service.ts
+++ b/apps/client/src/features/workspace/services/ai-settings-service.ts
@@ -32,6 +32,8 @@ export interface IAiSettings {
  // key is stored (empty means "uses the chat API key").
  sttModel?: string;
  sttBaseUrl?: string;
+  sttRealtimeModel?: string;
+  sttRealtimeBaseUrl?: string;
  sttApiStyle?: SttApiStyle;
  hasSttApiKey: boolean;
  // RAG indexing coverage (pages indexed for semantic search).
@@ -59,6 +61,8 @@ export interface IAiSettingsUpdate {
  embeddingApiKey?: string;
  sttModel?: string;
  sttBaseUrl?: string;
+  sttRealtimeModel?: string;
+  sttRealtimeBaseUrl?: string;
  sttApiStyle?: SttApiStyle;
  // Write-only STT key (same semantics as `apiKey` / `embeddingApiKey`).
  sttApiKey?: string;
@@ -95,6 +99,14 @@ export async function testAiConnection(
  return req.data;
 }

+// Probes the realtime (streaming STT) endpoint via POST /ai-chat/realtime/test.
+// Unlike the other tests the route lives under /ai-chat (not
+// /workspace/ai-settings); it is admin-gated server-side. Returns the body.
+export async function testRealtimeConnection(): Promise<IAiTestResult> {
+  const { data } = await api.post<IAiTestResult>("/ai-chat/realtime/test");
+  return data;
+}
+
 export async function reindexAiEmbeddings(): Promise<IAiSettings> {
  const req = await api.post<IAiSettings>("/workspace/ai-settings/reindex");
  return req.data;
--- a/apps/client/src/features/workspace/types/workspace.types.ts
+++ b/apps/client/src/features/workspace/types/workspace.types.ts
@@ -25,6 +25,7 @@ export interface IWorkspace {
  mcpEnabled?: boolean;
  aiChat?: boolean;
  aiDictation?: boolean;
+  aiDictationRealtime?: boolean;
  aiPublicShareAssistant?: boolean;
  trashRetentionDays?: number;
  restrictApiToAdmins?: boolean;
@@ -62,6 +63,7 @@ export interface IWorkspaceAiSettings {
  mcp?: boolean;
  chat?: boolean;
  dictation?: boolean;
+  dictationRealtime?: boolean;
  publicShareAssistant?: boolean;
 }