feat(ai): server-side voice dictation (STT) with mic in chat and editor

Add push-to-talk voice dictation that transcribes recorded audio on the server via the workspace's OpenAI-compatible AI provider (Whisper / gpt-4o-transcribe / self-hosted whisper), then inserts the text.

Backend:
- New `stt_api_key_enc` column + migration; STT creds parity with chat/embeddings (sttModel/sttBaseUrl/sttApiKey, write-only key, fallbacks to chat baseUrl/key). Both provider whitelists updated (service + repo).
- AiService.getTranscriptionModel + AiTranscriptionService.
- Gated POST /ai-chat/transcribe (dictation flag → 403, JWT + workspace scope + throttle, 25MB cap, MIME whitelist, never logs audio/key).
- New `settings.ai.dictation` workspace flag (DTO + service + audit).

Frontend:
- Wire up the Voice/STT settings card (model/base URL/key) and the Voice-dictation toggle.
- New `features/dictation`: useDictation (MediaRecorder state machine), MicButton, transcribe service; integrated into the chat composer and a new editor-toolbar dictation group, both gated by ai.dictation.
2026-06-18 18:45:33 +03:00
parent 49eba22201
commit 874bdd021c
24 changed files with 845 additions and 39 deletions
--- a/apps/client/public/locales/en-US/translation.json
+++ b/apps/client/public/locales/en-US/translation.json
@@ -1181,5 +1181,13 @@
  "Embeddings": "Embeddings",
  "Leave empty to use the chat API key": "Leave empty to use the chat API key",
  "Leave empty to use the chat base URL": "Leave empty to use the chat base URL",
-  "Reindex now": "Reindex now"
+  "Reindex now": "Reindex now",
+  "Start dictation": "Start dictation",
+  "Stop recording": "Stop recording",
+  "Transcribing…": "Transcribing…",
+  "Microphone access denied": "Microphone access denied",
+  "No microphone found": "No microphone found",
+  "Could not start recording": "Could not start recording",
+  "Transcription failed": "Transcription failed",
+  "Voice dictation is not configured": "Voice dictation is not configured"
 }
--- a/apps/client/src/features/ai-chat/components/chat-input.tsx
+++ b/apps/client/src/features/ai-chat/components/chat-input.tsx
@@ -2,8 +2,10 @@ import { KeyboardEvent } from "react";
 import { ActionIcon, Group, Textarea, Tooltip } from "@mantine/core";
 import { IconPlayerStopFilled, IconSend } from "@tabler/icons-react";
 import { useTranslation } from "react-i18next";
-import { useAtom } from "jotai";
+import { useAtom, useAtomValue } from "jotai";
 import { aiChatDraftAtom } from "@/features/ai-chat/atoms/ai-chat-atom.ts";
+import { workspaceAtom } from "@/features/user/atoms/current-user-atom";
+import { MicButton } from "@/features/dictation/components/mic-button";

 interface ChatInputProps {
  onSend: (text: string) => void;
@@ -25,6 +27,8 @@ export default function ChatInput({
 }: ChatInputProps) {
  const { t } = useTranslation();
  const [value, setValue] = useAtom(aiChatDraftAtom);
+  const workspace = useAtomValue(workspaceAtom);
+  const isDictationEnabled = workspace?.settings?.ai?.dictation === true;

  const send = (): void => {
    const text = value.trim();
@@ -57,6 +61,13 @@ export default function ChatInput({
        // switch), so a fresh chat lands with the cursor ready in the field.
        autoFocus
      />
+      {isDictationEnabled && (
+        <MicButton
+          size="lg"
+          disabled={isStreaming || disabled}
+          onText={(text) => setValue((v) => (v ? `${v} ${text}` : text))}
+        />
+      )}
      {isStreaming ? (
        <Tooltip label={t("Stop")} withArrow>
          <ActionIcon
--- a/apps/client/src/features/dictation/components/mic-button.tsx
+++ b/apps/client/src/features/dictation/components/mic-button.tsx
@@ -0,0 +1,76 @@
+import { FC } from "react";
+import { ActionIcon, Loader, Tooltip } from "@mantine/core";
+import { IconMicrophone, IconPlayerStopFilled } from "@tabler/icons-react";
+import { useTranslation } from "react-i18next";
+import { useDictation } from "@/features/dictation/hooks/use-dictation";
+
+interface MicButtonProps {
+  onText: (text: string) => void; // forwarded to useDictation as its onText
+  onStart?: () => void; // forwarded to useDictation as its onStart
+  disabled?: boolean; // only applied to the idle "Start dictation" button
+  // Mantine ActionIcon size token; "lg" matches the chat composer, "md" the
+  // editor toolbar.
+  size?: "md" | "lg";
+}
+
+/**
+ * Self-contained dictation toggle driven by useDictation. Idle: a mic icon that
+ * starts recording. Recording: a red stop icon that ends it. Transcribing: a
+ * disabled spinner so requests cannot overlap. Error (brief, after a failed
+ * transcription): a disabled mic labelled "Transcription failed".
+ */
+export const MicButton: FC<MicButtonProps> = ({
+  onText,
+  onStart,
+  disabled,
+  size = "lg",
+}) => {
+  const { t } = useTranslation();
+  const { status, start, stop } = useDictation({ onText, onStart });
+  const iconSize = size === "lg" ? 18 : 16;
+
+  if (status === "recording") {
+    return (
+      <Tooltip label={t("Stop recording")} withArrow>
+        <ActionIcon
+          size={size}
+          color="red"
+          variant="light"
+          onClick={stop}
+          aria-label={t("Stop recording")}
+        >
+          <IconPlayerStopFilled size={iconSize} />
+        </ActionIcon>
+      </Tooltip>
+    );
+  }
+
+  if (status === "transcribing" || status === "error") {
+    // Both states keep the button disabled, but the short post-failure "error"
+    // window must not be announced as ongoing work: label it as a failure and
+    // drop the spinner.
+    const failed = status === "error";
+    const label = failed ? t("Transcription failed") : t("Transcribing…");
+    return (
+      <Tooltip label={label} withArrow>
+        <ActionIcon size={size} variant="subtle" disabled aria-label={label}>
+          {failed ? <IconMicrophone size={iconSize} /> : <Loader size="xs" />}
+        </ActionIcon>
+      </Tooltip>
+    );
+  }
+
+  return (
+    <Tooltip label={t("Start dictation")} withArrow>
+      <ActionIcon
+        size={size}
+        variant="subtle"
+        onClick={() => void start()}
+        disabled={disabled}
+        aria-label={t("Start dictation")}
+      >
+        <IconMicrophone size={iconSize} />
+      </ActionIcon>
+    </Tooltip>
+  );
+};
--- a/apps/client/src/features/dictation/hooks/use-dictation.ts
+++ b/apps/client/src/features/dictation/hooks/use-dictation.ts
@@ -0,0 +1,260 @@
+import { useCallback, useEffect, useRef, useState } from "react";
+import { notifications } from "@mantine/notifications";
+import { useTranslation } from "react-i18next";
+import { transcribeAudio } from "@/features/dictation/services/dictation-service";
+
+export type DictationStatus = "idle" | "recording" | "transcribing" | "error";
+
+interface UseDictationOptions {
+  onText: (text: string) => void; // gets the trimmed transcript; skipped if empty
+  onStart?: () => void; // invoked right before recorder.start()
+  maxDurationMs?: number; // recording auto-stops after this; default 120000
+}
+
+interface UseDictationResult {
+  status: DictationStatus;
+  start: () => Promise<void>; // ask for the mic and begin recording
+  stop: () => void; // end recording; the audio is then sent for transcription
+  cancel: () => void; // end recording and discard the audio
+}
+
+// Candidate container/codec combinations in preference order. The first one the
+// browser supports wins; if none do we let MediaRecorder pick its own default.
+const MIME_CANDIDATES = [
+  "audio/webm;codecs=opus",
+  "audio/webm",
+  "audio/mp4",
+  "audio/ogg;codecs=opus",
+  "audio/ogg",
+];
+
+// Choose the upload filename for a recording so its extension matches the
+// container named in the MIME type: "mp4" wins over "ogg", and anything else
+// (including an empty string) is treated as WebM.
+function filenameForMime(mime: string): string {
+  const isMp4 = mime.includes("mp4");
+  const isOgg = mime.includes("ogg");
+  return isMp4 ? "speech.mp4" : isOgg ? "speech.ogg" : "speech.webm";
+}
+
+// Return the first MIME_CANDIDATES entry the browser's MediaRecorder reports as
+// supported, or undefined when MediaRecorder is unavailable or nothing matches.
+function pickMimeType(): string | undefined {
+  if (typeof MediaRecorder === "undefined") return undefined;
+  const supports = (mime: string) => MediaRecorder.isTypeSupported?.(mime);
+  return MIME_CANDIDATES.find(supports);
+}
+
+/**
+ * Encapsulates the browser audio-capture state machine: request the mic, record
+ * with MediaRecorder, then POST the blob for transcription. Refs hold the live
+ * recorder/stream/chunks/timers/flags so component re-renders never lose them,
+ * and every exit path (including an unmount mid-start) stops the mic tracks.
+ */
+export function useDictation(
+  options: UseDictationOptions,
+): UseDictationResult {
+  const { t } = useTranslation();
+  const [status, setStatus] = useState<DictationStatus>("idle");
+
+  // Keep the latest callbacks in a ref so the recorder's onstop closure always
+  // calls the current handlers without re-creating the recorder.
+  const optionsRef = useRef(options);
+  optionsRef.current = options;
+
+  const recorderRef = useRef<MediaRecorder | null>(null);
+  const streamRef = useRef<MediaStream | null>(null);
+  const chunksRef = useRef<Blob[]>([]);
+  const timerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
+  const errorTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
+  const canceledRef = useRef(false);
+  const startingRef = useRef(false);
+  const unmountedRef = useRef(false); // set by the unmount cleanup below
+
+  const clearTimer = useCallback(() => {
+    if (timerRef.current !== null) {
+      clearTimeout(timerRef.current);
+      timerRef.current = null;
+    }
+  }, []);
+
+  const stopTracks = useCallback(() => {
+    streamRef.current?.getTracks().forEach((track) => track.stop());
+    streamRef.current = null;
+  }, []);
+
+  const start = useCallback(async (): Promise<void> => {
+    // Synchronous live guard: status is stale between renders, so also block on
+    // refs so a double-click cannot open (and leak) a second MediaStream.
+    if (startingRef.current || recorderRef.current || streamRef.current) return;
+    if (status !== "idle") return;
+    startingRef.current = true;
+
+    // Shared bail-out for a failed start: free mic, reopen guard, notify user.
+    const abortStart = (message: string): void => {
+      stopTracks();
+      recorderRef.current = null;
+      startingRef.current = false;
+      notifications.show({ color: "red", message });
+      setStatus("idle");
+    };
+
+    let stream: MediaStream;
+    try {
+      stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+    } catch (err) {
+      const name = (err as { name?: string })?.name;
+      let message: string;
+      if (name === "NotAllowedError" || name === "SecurityError") {
+        message = t("Microphone access denied");
+      } else if (name === "NotFoundError" || name === "OverconstrainedError") {
+        message = t("No microphone found");
+      } else {
+        message = t("Could not start recording");
+      }
+      abortStart(message);
+      return;
+    }
+
+    // The component unmounted while the permission prompt was pending: release
+    // the mic right away instead of recording for a component that is gone.
+    if (unmountedRef.current) {
+      stream.getTracks().forEach((track) => track.stop());
+      startingRef.current = false;
+      return;
+    }
+    streamRef.current = stream;
+    chunksRef.current = [];
+    canceledRef.current = false;
+
+    const mimeType = pickMimeType();
+    let recorder: MediaRecorder;
+    try {
+      recorder = new MediaRecorder(stream, mimeType ? { mimeType } : undefined);
+    } catch {
+      // The stream was acquired but the recorder failed to construct; stop the
+      // tracks so the MediaStream does not leak before bailing out.
+      abortStart(t("Could not start recording"));
+      return;
+    }
+    recorderRef.current = recorder;
+
+    recorder.ondataavailable = (e: BlobEvent) => {
+      if (e.data && e.data.size > 0) chunksRef.current.push(e.data);
+    };
+
+    recorder.onstop = () => {
+      clearTimer();
+      const recordedMime = recorder.mimeType || mimeType || "audio/webm";
+      const wasCanceled = canceledRef.current;
+
+      // Stop the mic tracks regardless of how we got here.
+      stopTracks();
+      recorderRef.current = null;
+
+      const blob = new Blob(chunksRef.current, { type: recordedMime });
+      chunksRef.current = [];
+
+      // Canceled, or nothing was captured (e.g. stopped instantly): there is
+      // nothing worth uploading, so go straight back to idle.
+      if (wasCanceled || blob.size === 0) {
+        setStatus("idle");
+        return;
+      }
+
+      setStatus("transcribing");
+      void transcribeAudio(blob, filenameForMime(recordedMime))
+        .then((text) => {
+          // Whisper often returns a leading space; insert the trimmed value.
+          const trimmed = text.trim();
+          if (trimmed.length > 0) optionsRef.current.onText(trimmed);
+          setStatus("idle");
+        })
+        .catch((err: unknown) => {
+          const httpStatus = (err as { response?: { status?: number } })
+            ?.response?.status;
+          // The server returns 503 when dictation is unconfigured and 403 when
+          // it is disabled server-side; both map to the same "not configured".
+          const message =
+            httpStatus === 503 || httpStatus === 403
+              ? t("Voice dictation is not configured")
+              : t("Transcription failed");
+          notifications.show({ color: "red", message });
+          // Surface the error state briefly, then return to idle. Store the
+          // timer so it can be cleared on unmount.
+          setStatus("error");
+          if (errorTimerRef.current !== null) {
+            clearTimeout(errorTimerRef.current);
+          }
+          errorTimerRef.current = setTimeout(() => {
+            errorTimerRef.current = null;
+            setStatus("idle");
+          }, 1500);
+        });
+    };
+
+    // Notify the caller just before recording begins (the mic is already
+    // granted at this point) so the editor can snapshot the caret position.
+    try {
+      optionsRef.current.onStart?.();
+      recorder.start();
+    } catch {
+      // recorder.start() can synchronously throw (InvalidStateError /
+      // NotSupportedError); clean up so the button is not left stuck.
+      abortStart(t("Could not start recording"));
+      return;
+    }
+    setStatus("recording");
+    // Recording has truly begun; release the synchronous start guard.
+    startingRef.current = false;
+
+    const maxDurationMs = optionsRef.current.maxDurationMs ?? 120000;
+    timerRef.current = setTimeout(() => {
+      if (recorderRef.current?.state === "recording") {
+        recorderRef.current.stop();
+      }
+    }, maxDurationMs);
+  }, [status, t, clearTimer, stopTracks]);
+
+  const stop = useCallback((): void => {
+    clearTimer();
+    const recorder = recorderRef.current;
+    if (recorder && recorder.state === "recording") {
+      recorder.stop();
+    }
+  }, [clearTimer]);
+
+  const cancel = useCallback((): void => {
+    clearTimer();
+    canceledRef.current = true;
+    const recorder = recorderRef.current;
+    if (recorder && recorder.state === "recording") {
+      // onstop sees canceledRef and skips transcription; it also stops tracks.
+      recorder.stop();
+    } else {
+      stopTracks();
+    }
+    setStatus("idle");
+  }, [clearTimer, stopTracks]);
+
+  // Clean up on unmount: stop any live recorder/stream and clear the timers.
+  useEffect(() => {
+    unmountedRef.current = false; // effects can re-run after a cleanup
+    return () => {
+      unmountedRef.current = true;
+      clearTimer();
+      if (errorTimerRef.current !== null) {
+        clearTimeout(errorTimerRef.current);
+        errorTimerRef.current = null;
+      }
+      const recorder = recorderRef.current;
+      if (recorder && recorder.state === "recording") {
+        canceledRef.current = true;
+        recorder.stop();
+      }
+      stopTracks();
+    };
+  }, [clearTimer, stopTracks]);
+
+  return { status, start, stop, cancel };
+}
--- a/apps/client/src/features/dictation/services/dictation-service.ts
+++ b/apps/client/src/features/dictation/services/dictation-service.ts
@@ -0,0 +1,17 @@
+import api from "@/lib/api-client";
+
+// Upload a recording to POST /ai-chat/transcribe as multipart/form-data under
+// the "file" field and resolve with the transcript. The response is presumably
+// the standard envelope (TODO confirm), so the text is read from `data.text`.
+// `filename` is the name attached to the uploaded part, not the field name.
+export async function transcribeAudio(
+  blob: Blob,
+  filename = "speech.webm",
+): Promise<string> {
+  const payload = new FormData();
+  payload.append("file", blob, filename);
+  const requestConfig = { headers: { "Content-Type": "multipart/form-data" } };
+  const url = "/ai-chat/transcribe";
+  const envelope = await api.post<{ text: string }>(url, payload, requestConfig);
+  return envelope.data.text;
+}
--- a/apps/client/src/features/editor/components/fixed-toolbar/fixed-toolbar.tsx
+++ b/apps/client/src/features/editor/components/fixed-toolbar/fixed-toolbar.tsx
@@ -13,6 +13,7 @@ import { QuickInsertsGroup } from "./groups/quick-inserts-group";
 import { MoreInsertsGroup } from "./groups/more-inserts-group";
 import { HistoryGroup } from "./groups/history-group";
 import { AskAiGroup } from "./groups/ask-ai-group";
+import { DictationGroup } from "./groups/dictation-group";
 import { workspaceAtom } from "@/features/user/atoms/current-user-atom";
 import classes from "./fixed-toolbar.module.css";

@@ -30,6 +31,7 @@ export const FixedToolbar: FC<FixedToolbarProps> = ({
  const state = useToolbarState(editor);
  const workspace = useAtomValue(workspaceAtom);
  const isGenerativeAiEnabled = workspace?.settings?.ai?.generative === true;
+  const isDictationEnabled = workspace?.settings?.ai?.dictation === true;

  if (!editor || !state) return null;

@@ -65,6 +67,12 @@ export const FixedToolbar: FC<FixedToolbarProps> = ({
          <MoreInsertsGroup editor={editor} templateMode={templateMode} />
          <div className={classes.divider} />
          <HistoryGroup editor={editor} state={state} />
+          {isDictationEnabled && (
+            <>
+              <div className={classes.divider} />
+              <DictationGroup editor={editor} />
+            </>
+          )}
        </div>
      </div>
      <div className={classes.spacer} aria-hidden />
--- a/apps/client/src/features/editor/components/fixed-toolbar/groups/dictation-group.tsx
+++ b/apps/client/src/features/editor/components/fixed-toolbar/groups/dictation-group.tsx
@@ -0,0 +1,61 @@
+import { FC, useRef } from "react";
+import type { Editor } from "@tiptap/react";
+import { MicButton } from "@/features/dictation/components/mic-button";
+
+interface Props {
+  editor: Editor;
+}
+
+export const DictationGroup: FC<Props> = ({ editor }) => {
+  // Selection captured when recording starts; used up by the next transcript.
+  const rangeRef = useRef<{ from: number; to: number } | null>(null);
+
+  const handleStart = () => {
+    const selection = editor.state.selection;
+    rangeRef.current = { from: selection.from, to: selection.to };
+  };
+
+  const handleText = (text: string) => {
+    // Transcription is asynchronous, so the editor may already be torn down;
+    // never touch a destroyed instance.
+    if (!editor || editor.isDestroyed) return;
+    const saved = rangeRef.current;
+    rangeRef.current = null;
+    // The transcript arrives trimmed; the trailing space keeps consecutive
+    // dictations from running together.
+    const spaced = `${text} `;
+    const insertAtCaret = () =>
+      editor.chain().focus().insertContent(spaced).run();
+    // The document can shrink while the audio is transcribed (for example via
+    // a collaborative edit), so saved positions are pulled back into bounds.
+    const limit = editor.state.doc.content.size;
+    const bound = (pos: number) => Math.max(0, Math.min(pos, limit));
+    try {
+      if (!saved) {
+        insertAtCaret();
+        return;
+      }
+      const range = { from: bound(saved.from), to: bound(saved.to) };
+      editor.chain().focus().insertContentAt(range, spaced).run();
+    } catch {
+      // The saved range no longer fits the document: retry at the live caret.
+      try {
+        insertAtCaret();
+      } catch {
+        // Deliberately ignored: a destroyed editor must not surface an
+        // uncaught error from this callback.
+      }
+    }
+  };
+
+  // Dictation writes into the document, so it is unavailable when read-only.
+  const isReadOnly = !editor.isEditable;
+  return (
+    <MicButton
+      size="md"
+      onStart={handleStart}
+      onText={handleText}
+      disabled={isReadOnly}
+    />
+  );
+};
--- a/apps/client/src/features/workspace/components/settings/components/ai-provider-settings.tsx
+++ b/apps/client/src/features/workspace/components/settings/components/ai-provider-settings.tsx
@@ -47,6 +47,10 @@ const formSchema = z.object({
  systemPrompt: z.string(),
  apiKey: z.string(),
  embeddingApiKey: z.string(),
+  // STT-specific fields. Empty base URL / key fall back to the chat ones.
+  sttModel: z.string(),
+  sttBaseUrl: z.string(),
+  sttApiKey: z.string(),
 });

 type FormValues = z.infer<typeof formSchema>;
@@ -101,8 +105,12 @@ export default function AiProviderSettings() {
  const [searchEnabled, setSearchEnabled] = useState<boolean>(
    workspace?.settings?.ai?.search ?? false,
  );
+  const [dictationEnabled, setDictationEnabled] = useState<boolean>(
+    workspace?.settings?.ai?.dictation ?? false,
+  );
  const [chatToggleLoading, setChatToggleLoading] = useState(false);
  const [searchToggleLoading, setSearchToggleLoading] = useState(false);
+  const [dictationToggleLoading, setDictationToggleLoading] = useState(false);

  // Whether a key is currently stored server-side (drives the placeholder).
  const [hasApiKey, setHasApiKey] = useState(false);
@@ -111,6 +119,9 @@ export default function AiProviderSettings() {
  // Same, for the embedding-specific key.
  const [hasEmbeddingApiKey, setHasEmbeddingApiKey] = useState(false);
  const [embeddingKeyCleared, setEmbeddingKeyCleared] = useState(false);
+  // Same, for the STT-specific key.
+  const [hasSttApiKey, setHasSttApiKey] = useState(false);
+  const [sttKeyCleared, setSttKeyCleared] = useState(false);

  // Modal for the (large) system message editor.
  const [promptOpened, promptHandlers] = useDisclosure(false);
@@ -125,6 +136,9 @@ export default function AiProviderSettings() {
      systemPrompt: "",
      apiKey: "",
      embeddingApiKey: "",
+      sttModel: "",
+      sttBaseUrl: "",
+      sttApiKey: "",
    },
  });

@@ -140,12 +154,17 @@ export default function AiProviderSettings() {
      systemPrompt: settings.systemPrompt ?? "",
      apiKey: "",
      embeddingApiKey: "",
+      sttModel: settings.sttModel ?? "",
+      sttBaseUrl: settings.sttBaseUrl ?? "",
+      sttApiKey: "",
    });
    form.resetDirty();
    setHasApiKey(settings.hasApiKey);
    setKeyCleared(false);
    setHasEmbeddingApiKey(settings.hasEmbeddingApiKey);
    setEmbeddingKeyCleared(false);
+    setHasSttApiKey(settings.hasSttApiKey);
+    setSttKeyCleared(false);
    // eslint-disable-next-line react-hooks/exhaustive-deps
  }, [settings]);

@@ -160,6 +179,10 @@ export default function AiProviderSettings() {
      baseUrl: values.baseUrl,
      embeddingBaseUrl: values.embeddingBaseUrl,
      systemPrompt: values.systemPrompt,
+      // The STT base URL is optional; empty falls back to the chat base URL
+      // server-side.
+      sttModel: values.sttModel,
+      sttBaseUrl: values.sttBaseUrl,
    };

    // Key semantics (never send the stored key back):
@@ -179,6 +202,13 @@ export default function AiProviderSettings() {
      payload.embeddingApiKey = "";
    }

+    // Same write-only semantics for the STT-specific key.
+    if (values.sttApiKey.length > 0) {
+      payload.sttApiKey = values.sttApiKey;
+    } else if (sttKeyCleared) {
+      payload.sttApiKey = "";
+    }
+
    return payload;
  }

@@ -191,6 +221,9 @@ export default function AiProviderSettings() {
    setHasEmbeddingApiKey(updated.hasEmbeddingApiKey);
    setEmbeddingKeyCleared(false);
    form.setFieldValue("embeddingApiKey", "");
+    setHasSttApiKey(updated.hasSttApiKey);
+    setSttKeyCleared(false);
+    form.setFieldValue("sttApiKey", "");
    form.resetDirty();
  }

@@ -206,6 +239,12 @@ export default function AiProviderSettings() {
    form.setFieldValue("embeddingApiKey", "");
  }

+  function handleClearSttKey() {
+    setSttKeyCleared(true); // next save sends sttApiKey: "" unless a key is typed
+    setHasSttApiKey(false); // hides "Clear" and the "•••• set" placeholder
+    form.setFieldValue("sttApiKey", ""); // drop anything typed but not yet saved
+  }
+
  // Optimistic toggle for the "AI chat" feature (settings.ai.chat).
  async function handleToggleChat(value: boolean) {
    setChatToggleLoading(true);
@@ -268,6 +307,34 @@ export default function AiProviderSettings() {
    }
  }

+  // Optimistic toggle for the "Voice dictation" feature (settings.ai.dictation).
+  async function handleToggleDictation(value: boolean) {
+    setDictationToggleLoading(true); // the switch is disabled while this is set
+    const previous = dictationEnabled; // remembered for rollback
+    setDictationEnabled(value); // optimistic: update the UI before the request
+    try {
+      const updated = await updateWorkspace({ aiDictation: value });
+      setWorkspace({
+        ...updated,
+        settings: {
+          ...updated.settings,
+          ai: { ...updated.settings?.ai, dictation: value }, // pin to the requested value
+        },
+      });
+      notifications.show({ message: t("Updated successfully") });
+    } catch (err) {
+      setDictationEnabled(previous); // roll the optimistic change back
+      const message = (err as { response?: { data?: { message?: string } } })
+        ?.response?.data?.message; // error text from the response, if any
+      notifications.show({
+        message: message ?? t("Failed to update data"),
+        color: "red",
+      });
+    } finally {
+      setDictationToggleLoading(false); // re-enable the switch on both outcomes
+    }
+  }
+
  // Admins only — match the previous behavior.
  if (!isAdmin) {
    return (
@@ -294,6 +361,11 @@ export default function AiProviderSettings() {
    "/embeddings",
    form.values.baseUrl,
  );
+  const sttResolved = resolveUrl(
+    form.values.sttBaseUrl,
+    "/audio/transcriptions",
+    form.values.baseUrl,
+  );

  const monoFont = "ui-monospace, Menlo, monospace";

@@ -541,8 +613,8 @@ export default function AiProviderSettings() {
        </Box>
      </Paper>

-      {/* Card 3 — Voice / STT (disabled stub, not wired to the form/backend) */}
-      <Paper withBorder radius="md" p="lg" opacity={0.6}>
+      {/* Card 3 — Voice / STT */}
+      <Paper withBorder radius="md" p="lg">
        <Group justify="space-between" align="center" wrap="nowrap">
          <Group gap="xs" align="center" wrap="nowrap">
            <StatusDot status="idle" />
@@ -551,8 +623,9 @@ export default function AiProviderSettings() {
          <Switch
            label={t("Voice dictation")}
            labelPosition="left"
-            checked={false}
-            disabled
+            checked={dictationEnabled}
+            disabled={dictationToggleLoading}
+            onChange={(e) => handleToggleDictation(e.currentTarget.checked)}
          />
        </Group>
        <Text size="xs" c="dimmed" mt={4} mb="md">
@@ -562,33 +635,46 @@ export default function AiProviderSettings() {
        </Text>

        <Group grow align="flex-start">
-          <TextInput label={t("Model")} value="" disabled readOnly />
-          <PasswordInput label={t("API key")} value="" disabled readOnly />
-        </Group>
-        <TextInput mt="sm" label={t("Base URL")} value="" disabled readOnly />
-
-        <Group mt="md">
-          <Button variant="default" size="sm" disabled>
-            {t("Test endpoint")}
-          </Button>
+          <TextInput
+            label={t("Model")}
+            disabled={isLoading}
+            {...form.getInputProps("sttModel")}
+          />
+          <Stack gap={4}>
+            <PasswordInput
+              label={t("API key")}
+              placeholder={
+                hasSttApiKey
+                  ? t("•••• set")
+                  : t("Leave empty to use the chat API key")
+              }
+              autoComplete="off"
+              {...form.getInputProps("sttApiKey")}
+            />
+            {hasSttApiKey && (
+              <Anchor
+                component="button"
+                type="button"
+                c="red"
+                size="xs"
+                onClick={handleClearSttKey}
+              >
+                {t("Clear")}
+              </Anchor>
+            )}
+          </Stack>
        </Group>

-        <Box
-          mt="md"
-          mx="calc(var(--mantine-spacing-lg) * -1)"
-          mb="calc(var(--mantine-spacing-lg) * -1)"
-          px="lg"
-          py="md"
-          style={{
-            borderTop: "1px solid var(--mantine-color-default-border)",
-            background: "var(--mantine-color-default-hover)",
-            borderRadius: "0 0 var(--mantine-radius-md) var(--mantine-radius-md)",
-          }}
-        >
-          <Text size="xs" c="dimmed">
-            {t("Voice dictation is not available yet.")}
-          </Text>
-        </Box>
+        <TextInput
+          mt="sm"
+          label={t("Base URL")}
+          placeholder={t("Leave empty to use the chat base URL")}
+          disabled={isLoading}
+          {...form.getInputProps("sttBaseUrl")}
+        />
+        <Text size="xs" c="dimmed" mt={4} style={{ fontFamily: monoFont }} truncate>
+          {t("Resolves to {{url}}", { url: sttResolved })}
+        </Text>
      </Paper>

      {/* Nested: external MCP tools the agent calls out to */}
--- a/apps/client/src/features/workspace/services/ai-settings-service.ts
+++ b/apps/client/src/features/workspace/services/ai-settings-service.ts
@@ -16,6 +16,12 @@ export interface IAiSettings {
  systemPrompt?: string;
  hasApiKey: boolean;
  hasEmbeddingApiKey: boolean;
+  // STT-specific settings. `sttBaseUrl` is the RAW stored value (empty means
+  // "uses the chat base URL"). `hasSttApiKey` indicates whether an STT-specific
+  // key is stored (empty means "uses the chat API key").
+  sttModel?: string;
+  sttBaseUrl?: string;
+  hasSttApiKey: boolean;
  // RAG indexing coverage (pages indexed for semantic search).
  indexedPages: number;
  totalPages: number;
@@ -35,6 +41,10 @@ export interface IAiSettingsUpdate {
  systemPrompt?: string;
  apiKey?: string;
  embeddingApiKey?: string;
+  sttModel?: string;
+  sttBaseUrl?: string;
+  // Write-only STT key (same semantics as `apiKey` / `embeddingApiKey`).
+  sttApiKey?: string;
 }

 // Result of a connection test against the configured provider.
--- a/apps/client/src/features/workspace/types/workspace.types.ts
+++ b/apps/client/src/features/workspace/types/workspace.types.ts
@@ -24,6 +24,7 @@ export interface IWorkspace {
  disablePublicSharing?: boolean;
  mcpEnabled?: boolean;
  aiChat?: boolean;
+  aiDictation?: boolean;
  trashRetentionDays?: number;
  restrictApiToAdmins?: boolean;
  allowMemberTemplates?: boolean;
@@ -46,6 +47,7 @@ export interface IWorkspaceAiSettings {
  generative?: boolean;
  mcp?: boolean;
  chat?: boolean;
+  dictation?: boolean;
 }

 export interface IWorkspaceSharingSettings {