feat(ai): server-side voice dictation (STT) with mic in chat and editor

Add push-to-talk voice dictation that transcribes recorded audio on the server via the workspace's OpenAI-compatible AI provider (Whisper / gpt-4o-transcribe / self-hosted whisper), then inserts the text. Backend: - New `stt_api_key_enc` column + migration; STT creds parity with chat/embeddings (sttModel/sttBaseUrl/sttApiKey, write-only key, falls back to chat baseUrl/key). Both provider whitelists updated (service + repo). - AiService.getTranscriptionModel + AiTranscriptionService. - Gated POST /ai-chat/transcribe (dictation flag off → 403, JWT + workspace scope + throttle, 25MB cap, MIME whitelist, never logs audio/key). - New `settings.ai.dictation` workspace flag (DTO + service + audit). Frontend: - Wire up the Voice/STT settings card (model/base URL/key) and the Voice-dictation toggle. - New `features/dictation`: useDictation (MediaRecorder state machine), MicButton, transcribe service; integrated into the chat composer and a new editor-toolbar dictation group, both gated by ai.dictation.
2026-06-18 18:45:33 +03:00
parent 49eba22201
commit 874bdd021c
24 changed files with 845 additions and 39 deletions
--- a/apps/client/public/locales/en-US/translation.json
+++ b/apps/client/public/locales/en-US/translation.json
@@ -1181,5 +1181,13 @@
  "Embeddings": "Embeddings",
  "Leave empty to use the chat API key": "Leave empty to use the chat API key",
  "Leave empty to use the chat base URL": "Leave empty to use the chat base URL",
-  "Reindex now": "Reindex now"
+  "Reindex now": "Reindex now",
+  "Start dictation": "Start dictation",
+  "Stop recording": "Stop recording",
+  "Transcribing…": "Transcribing…",
+  "Microphone access denied": "Microphone access denied",
+  "No microphone found": "No microphone found",
+  "Could not start recording": "Could not start recording",
+  "Transcription failed": "Transcription failed",
+  "Voice dictation is not configured": "Voice dictation is not configured"
 }
--- a/apps/client/src/features/ai-chat/components/chat-input.tsx
+++ b/apps/client/src/features/ai-chat/components/chat-input.tsx
@@ -2,8 +2,10 @@ import { KeyboardEvent } from "react";
 import { ActionIcon, Group, Textarea, Tooltip } from "@mantine/core";
 import { IconPlayerStopFilled, IconSend } from "@tabler/icons-react";
 import { useTranslation } from "react-i18next";
-import { useAtom } from "jotai";
+import { useAtom, useAtomValue } from "jotai";
 import { aiChatDraftAtom } from "@/features/ai-chat/atoms/ai-chat-atom.ts";
+import { workspaceAtom } from "@/features/user/atoms/current-user-atom";
+import { MicButton } from "@/features/dictation/components/mic-button";

 interface ChatInputProps {
  onSend: (text: string) => void;
@@ -25,6 +27,8 @@ export default function ChatInput({
 }: ChatInputProps) {
  const { t } = useTranslation();
  const [value, setValue] = useAtom(aiChatDraftAtom);
+  const workspace = useAtomValue(workspaceAtom);
+  const isDictationEnabled = workspace?.settings?.ai?.dictation === true;

  const send = (): void => {
    const text = value.trim();
@@ -57,6 +61,13 @@ export default function ChatInput({
        // switch), so a fresh chat lands with the cursor ready in the field.
        autoFocus
      />
+      {isDictationEnabled && (
+        <MicButton
+          size="lg"
+          disabled={isStreaming || disabled}
+          onText={(text) => setValue((v) => (v ? `${v} ${text}` : text))}
+        />
+      )}
      {isStreaming ? (
        <Tooltip label={t("Stop")} withArrow>
          <ActionIcon
--- a/apps/client/src/features/dictation/components/mic-button.tsx
+++ b/apps/client/src/features/dictation/components/mic-button.tsx
@@ -0,0 +1,76 @@
+import { FC } from "react";
+import { ActionIcon, Loader, Tooltip } from "@mantine/core";
+import { IconMicrophone, IconPlayerStopFilled } from "@tabler/icons-react";
+import { useTranslation } from "react-i18next";
+import { useDictation } from "@/features/dictation/hooks/use-dictation";
+
+interface MicButtonProps {
+  onText: (text: string) => void; // forwarded to useDictation
+  onStart?: () => void; // forwarded to useDictation
+  disabled?: boolean; // applied to the idle (mic) button only
+  // Mantine ActionIcon size token; "lg" matches the chat composer, "md" the
+  // editor toolbar.
+  size?: "md" | "lg";
+}
+
+/**
+ * Self-contained dictation toggle driven by useDictation's status. Idle: mic
+ * button that calls start() (the only state that honours `disabled`).
+ * Recording: red stop button that calls stop(). Transcribing — and the brief
+ * "error" status — render a disabled spinner so no overlapping request starts.
+ */
+export const MicButton: FC<MicButtonProps> = ({
+  onText,
+  onStart,
+  disabled,
+  size = "lg",
+}) => {
+  const { t } = useTranslation();
+  const { status, start, stop } = useDictation({ onText, onStart });
+  const iconSize = size === "lg" ? 18 : 16; // icon pixel size paired with the ActionIcon size token
+
+  if (status === "recording") {
+    return (
+      <Tooltip label={t("Stop recording")} withArrow>
+        <ActionIcon
+          size={size}
+          color="red"
+          variant="light"
+          onClick={stop}
+          aria-label={t("Stop recording")}
+        >
+          <IconPlayerStopFilled size={iconSize} />
+        </ActionIcon>
+      </Tooltip>
+    );
+  }
+
+  if (status === "transcribing" || status === "error") { // NOTE(review): the error status reuses the transcribing label and spinner; confirm intended
+    return (
+      <Tooltip label={t("Transcribing…")} withArrow>
+        <ActionIcon
+          size={size}
+          variant="subtle"
+          disabled
+          aria-label={t("Transcribing…")}
+        >
+          <Loader size="xs" />
+        </ActionIcon>
+      </Tooltip>
+    );
+  }
+
+  return (
+    <Tooltip label={t("Start dictation")} withArrow>
+      <ActionIcon
+        size={size}
+        variant="subtle"
+        onClick={() => void start()}
+        disabled={disabled}
+        aria-label={t("Start dictation")}
+      >
+        <IconMicrophone size={iconSize} />
+      </ActionIcon>
+    </Tooltip>
+  );
+};
--- a/apps/client/src/features/dictation/hooks/use-dictation.ts
+++ b/apps/client/src/features/dictation/hooks/use-dictation.ts
@@ -0,0 +1,260 @@
+import { useCallback, useEffect, useRef, useState } from "react";
+import { notifications } from "@mantine/notifications";
+import { useTranslation } from "react-i18next";
+import { transcribeAudio } from "@/features/dictation/services/dictation-service";
+
+export type DictationStatus = "idle" | "recording" | "transcribing" | "error"; // "error" is transient: useDictation resets it to "idle" after 1500 ms
+
+interface UseDictationOptions {
+  onText: (text: string) => void; // receives the trimmed transcript; not called when it is empty
+  onStart?: () => void; // invoked right before recorder.start()
+  maxDurationMs?: number; // auto-stop timeout; start() defaults it to 120000
+}
+
+interface UseDictationResult {
+  status: DictationStatus;
+  start: () => Promise<void>; // no-op unless idle with no live recorder/stream
+  stop: () => void; // ends recording; the recorder's onstop then uploads the audio
+  cancel: () => void; // ends recording and discards the captured audio
+}
+
+// Container/codec candidates in preference order, probed by pickMimeType(). The
+// first supported one wins; if none are, MediaRecorder picks its own default.
+const MIME_CANDIDATES = [
+  "audio/webm;codecs=opus",
+  "audio/webm",
+  "audio/mp4",
+  "audio/ogg;codecs=opus",
+  "audio/ogg",
+];
+
+// Derive the upload filename from the recorded MIME type (substring match,
+// defaulting to "speech.webm"). The server validates the part's MIME type, not
+// its filename, so a matching extension only keeps things tidy.
+function filenameForMime(mime: string): string {
+  if (mime.includes("mp4")) return "speech.mp4";
+  if (mime.includes("ogg")) return "speech.ogg";
+  return "speech.webm";
+}
+
+function pickMimeType(): string | undefined { // first supported MIME_CANDIDATES entry, else undefined
+  if (typeof MediaRecorder === "undefined") return undefined; // MediaRecorder is not available in this environment
+  for (const candidate of MIME_CANDIDATES) {
+    if (MediaRecorder.isTypeSupported?.(candidate)) return candidate; // `?.` tolerates a missing isTypeSupported
+  }
+  return undefined;
+}
+
+/**
+ * Browser audio-capture state machine: request the mic, record with
+ * MediaRecorder, then POST the blob for transcription. Refs hold the live
+ * recorder/stream/chunks/timers so re-renders never lose them. NOTE(review): an
+ * unmount or cancel() while getUserMedia is pending does not abort start().
+ */
+export function useDictation(
+  options: UseDictationOptions,
+): UseDictationResult {
+  const { t } = useTranslation();
+  const [status, setStatus] = useState<DictationStatus>("idle");
+
+  // Keep the latest callbacks in a ref so the recorder's onstop closure always
+  // calls the current handlers without re-creating the recorder.
+  const optionsRef = useRef(options);
+  optionsRef.current = options;
+
+  const recorderRef = useRef<MediaRecorder | null>(null);
+  const streamRef = useRef<MediaStream | null>(null);
+  const chunksRef = useRef<Blob[]>([]);
+  const timerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
+  const errorTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
+  const canceledRef = useRef(false);
+  const startingRef = useRef(false);
+
+  const clearTimer = useCallback(() => {
+    if (timerRef.current !== null) {
+      clearTimeout(timerRef.current);
+      timerRef.current = null;
+    }
+  }, []);
+
+  const stopTracks = useCallback(() => {
+    streamRef.current?.getTracks().forEach((track) => track.stop());
+    streamRef.current = null;
+  }, []);
+
+  const start = useCallback(async (): Promise<void> => {
+    // Synchronous live guard: status is stale between renders, so also block on
+    // refs to prevent a double-click from opening two MediaStreams (the first
+    // would leak).
+    if (startingRef.current || recorderRef.current || streamRef.current) return;
+    if (status !== "idle") return;
+    startingRef.current = true;
+
+    let stream: MediaStream;
+    try {
+      stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+    } catch (err) {
+      const name = (err as { name?: string })?.name;
+      let message: string;
+      if (name === "NotAllowedError" || name === "SecurityError") {
+        message = t("Microphone access denied");
+      } else if (name === "NotFoundError" || name === "OverconstrainedError") {
+        message = t("No microphone found");
+      } else {
+        message = t("Could not start recording");
+      }
+      notifications.show({ color: "red", message });
+      setStatus("idle");
+      startingRef.current = false;
+      return;
+    }
+
+    streamRef.current = stream;
+    chunksRef.current = [];
+    canceledRef.current = false;
+
+    const mimeType = pickMimeType();
+    let recorder: MediaRecorder;
+    try {
+      recorder = new MediaRecorder(
+        stream,
+        mimeType ? { mimeType } : undefined,
+      );
+    } catch {
+      // The stream was acquired but the recorder failed to construct; stop the
+      // tracks so the MediaStream does not leak before bailing out.
+      stopTracks();
+      notifications.show({
+        color: "red",
+        message: t("Could not start recording"),
+      });
+      setStatus("idle");
+      startingRef.current = false;
+      return;
+    }
+    recorderRef.current = recorder;
+
+    recorder.ondataavailable = (e: BlobEvent) => {
+      if (e.data && e.data.size > 0) chunksRef.current.push(e.data);
+    };
+
+    recorder.onstop = () => {
+      clearTimer();
+      const recordedMime = recorder.mimeType || mimeType || "audio/webm";
+      const wasCanceled = canceledRef.current;
+
+      // Stop the mic tracks regardless of how we got here.
+      stopTracks();
+      recorderRef.current = null;
+
+      if (wasCanceled) {
+        chunksRef.current = [];
+        setStatus("idle");
+        return;
+      }
+
+      const blob = new Blob(chunksRef.current, { type: recordedMime });
+      chunksRef.current = [];
+
+      setStatus("transcribing");
+      void transcribeAudio(blob, filenameForMime(recordedMime))
+        .then((text) => {
+          // Whisper often returns a leading space; insert the trimmed value.
+          const trimmed = text.trim();
+          if (trimmed.length > 0) optionsRef.current.onText(trimmed);
+          setStatus("idle");
+        })
+        .catch((err: unknown) => {
+          const httpStatus = (err as { response?: { status?: number } })
+            ?.response?.status;
+          // The server returns 503 when dictation is unconfigured and 403 when
+          // it is disabled server-side; both map to the same "not configured".
+          const message =
+            httpStatus === 503 || httpStatus === 403
+              ? t("Voice dictation is not configured")
+              : t("Transcription failed");
+          notifications.show({ color: "red", message });
+          // Surface the error state briefly, then return to idle. Store the
+          // timer so it can be cleared on unmount.
+          setStatus("error");
+          if (errorTimerRef.current !== null) {
+            clearTimeout(errorTimerRef.current);
+          }
+          errorTimerRef.current = setTimeout(() => {
+            errorTimerRef.current = null;
+            setStatus("idle");
+          }, 1500);
+        });
+    };
+
+    // Notify the caller just before recording starts (the mic stream is already
+    // acquired at this point) so the editor can snapshot the caret position.
+    try {
+      optionsRef.current.onStart?.();
+      recorder.start();
+    } catch {
+      // onStart or recorder.start() threw synchronously (e.g. InvalidStateError /
+      // NotSupportedError); clean up so the button is not left stuck and the
+      // MediaStream does not leak.
+      stopTracks();
+      recorderRef.current = null;
+      startingRef.current = false;
+      notifications.show({
+        color: "red",
+        message: t("Could not start recording"),
+      });
+      setStatus("idle");
+      return;
+    }
+    setStatus("recording");
+    // Recording has truly begun; release the synchronous start guard.
+    startingRef.current = false;
+
+    const maxDurationMs = optionsRef.current.maxDurationMs ?? 120000;
+    timerRef.current = setTimeout(() => {
+      if (recorderRef.current?.state === "recording") {
+        recorderRef.current.stop();
+      }
+    }, maxDurationMs);
+  }, [status, t, clearTimer, stopTracks]);
+
+  const stop = useCallback((): void => {
+    clearTimer();
+    const recorder = recorderRef.current;
+    if (recorder && recorder.state === "recording") {
+      recorder.stop();
+    }
+  }, [clearTimer]);
+
+  const cancel = useCallback((): void => {
+    clearTimer();
+    canceledRef.current = true;
+    const recorder = recorderRef.current;
+    if (recorder && recorder.state === "recording") {
+      // onstop sees canceledRef and skips transcription; it also stops tracks.
+      recorder.stop();
+    } else {
+      stopTracks();
+    }
+    setStatus("idle");
+  }, [clearTimer, stopTracks]);
+
+  // Clean up on unmount: stop any live recorder/stream and clear the timers.
+  useEffect(() => {
+    return () => {
+      clearTimer();
+      if (errorTimerRef.current !== null) {
+        clearTimeout(errorTimerRef.current);
+        errorTimerRef.current = null;
+      }
+      const recorder = recorderRef.current;
+      if (recorder && recorder.state === "recording") {
+        canceledRef.current = true;
+        recorder.stop();
+      }
+      stopTracks();
+    };
+  }, [clearTimer, stopTracks]);
+
+  return { status, start, stop, cancel };
+}
--- a/apps/client/src/features/dictation/services/dictation-service.ts
+++ b/apps/client/src/features/dictation/services/dictation-service.ts
@@ -0,0 +1,17 @@
+import api from "@/lib/api-client";
+
+// POST the recorded audio as multipart/form-data under the "file" field and
+// resolve to the transcript at `.data.text` of the response (the local named
+// `req`; presumably the api client unwraps the envelope — confirm). `filename`
+// is only the part's filename; the server validates the part's MIME type.
+export async function transcribeAudio(
+  blob: Blob,
+  filename = "speech.webm",
+): Promise<string> {
+  const form = new FormData();
+  form.append("file", blob, filename);
+  const req = await api.post<{ text: string }>("/ai-chat/transcribe", form, {
+    headers: { "Content-Type": "multipart/form-data" },
+  });
+  return req.data.text;
+}
--- a/apps/client/src/features/editor/components/fixed-toolbar/fixed-toolbar.tsx
+++ b/apps/client/src/features/editor/components/fixed-toolbar/fixed-toolbar.tsx
@@ -13,6 +13,7 @@ import { QuickInsertsGroup } from "./groups/quick-inserts-group";
 import { MoreInsertsGroup } from "./groups/more-inserts-group";
 import { HistoryGroup } from "./groups/history-group";
 import { AskAiGroup } from "./groups/ask-ai-group";
+import { DictationGroup } from "./groups/dictation-group";
 import { workspaceAtom } from "@/features/user/atoms/current-user-atom";
 import classes from "./fixed-toolbar.module.css";

@@ -30,6 +31,7 @@ export const FixedToolbar: FC<FixedToolbarProps> = ({
  const state = useToolbarState(editor);
  const workspace = useAtomValue(workspaceAtom);
  const isGenerativeAiEnabled = workspace?.settings?.ai?.generative === true;
+  const isDictationEnabled = workspace?.settings?.ai?.dictation === true;

  if (!editor || !state) return null;

@@ -65,6 +67,12 @@ export const FixedToolbar: FC<FixedToolbarProps> = ({
          <MoreInsertsGroup editor={editor} templateMode={templateMode} />
          <div className={classes.divider} />
          <HistoryGroup editor={editor} state={state} />
+          {isDictationEnabled && (
+            <>
+              <div className={classes.divider} />
+              <DictationGroup editor={editor} />
+            </>
+          )}
        </div>
      </div>
      <div className={classes.spacer} aria-hidden />
--- a/apps/client/src/features/editor/components/fixed-toolbar/groups/dictation-group.tsx
+++ b/apps/client/src/features/editor/components/fixed-toolbar/groups/dictation-group.tsx
@@ -0,0 +1,61 @@
+import { FC, useRef } from "react";
+import type { Editor } from "@tiptap/react";
+import { MicButton } from "@/features/dictation/components/mic-button";
+
+interface Props {
+  editor: Editor; // Tiptap editor that receives the dictated text
+}
+
+export const DictationGroup: FC<Props> = ({ editor }) => { // toolbar mic button that inserts the transcript into `editor`
+  const rangeRef = useRef<{ from: number; to: number } | null>(null); // selection captured by handleStart, consumed by handleText
+
+  const handleStart = () => {
+    const { from, to } = editor.state.selection;
+    rangeRef.current = { from, to };
+  };
+
+  const handleText = (text: string) => {
+    // The editor may be gone by the time async transcription returns; bail out
+    // instead of operating on a destroyed instance.
+    if (!editor || editor.isDestroyed) return;
+    const snapshot = rangeRef.current;
+    rangeRef.current = null;
+    // The document may have shrunk during transcription (e.g. a collaborative
+    // edit), so clamp the snapshot into the current bounds before inserting.
+    const docSize = editor.state.doc.content.size;
+    const clamp = (p: number) => Math.max(0, Math.min(p, docSize));
+    try {
+      if (snapshot) {
+        // Insert at the snapshotted caret; a trailing space keeps words
+        // separated (the hook already trims the transcribed text).
+        editor
+          .chain()
+          .focus()
+          .insertContentAt(
+            { from: clamp(snapshot.from), to: clamp(snapshot.to) },
+            `${text} `,
+          )
+          .run();
+      } else {
+        editor.chain().focus().insertContent(`${text} `).run();
+      }
+    } catch {
+      // The snapshot drifted out of range; fall back to the current caret.
+      try {
+        editor.chain().focus().insertContent(`${text} `).run();
+      } catch {
+        // The editor may have been destroyed; ignore so a dead editor can't
+        // surface an uncaught error.
+      }
+    }
+  };
+
+  return (
+    <MicButton
+      size="md"
+      onStart={handleStart}
+      onText={handleText}
+      disabled={!editor.isEditable}
+    />
+  );
+};
--- a/apps/client/src/features/workspace/components/settings/components/ai-provider-settings.tsx
+++ b/apps/client/src/features/workspace/components/settings/components/ai-provider-settings.tsx
@@ -47,6 +47,10 @@ const formSchema = z.object({
  systemPrompt: z.string(),
  apiKey: z.string(),
  embeddingApiKey: z.string(),
+  // STT-specific fields. Empty base URL / key fall back to the chat ones.
+  sttModel: z.string(),
+  sttBaseUrl: z.string(),
+  sttApiKey: z.string(),
 });

 type FormValues = z.infer<typeof formSchema>;
@@ -101,8 +105,12 @@ export default function AiProviderSettings() {
  const [searchEnabled, setSearchEnabled] = useState<boolean>(
    workspace?.settings?.ai?.search ?? false,
  );
+  const [dictationEnabled, setDictationEnabled] = useState<boolean>(
+    workspace?.settings?.ai?.dictation ?? false,
+  );
  const [chatToggleLoading, setChatToggleLoading] = useState(false);
  const [searchToggleLoading, setSearchToggleLoading] = useState(false);
+  const [dictationToggleLoading, setDictationToggleLoading] = useState(false);

  // Whether a key is currently stored server-side (drives the placeholder).
  const [hasApiKey, setHasApiKey] = useState(false);
@@ -111,6 +119,9 @@ export default function AiProviderSettings() {
  // Same, for the embedding-specific key.
  const [hasEmbeddingApiKey, setHasEmbeddingApiKey] = useState(false);
  const [embeddingKeyCleared, setEmbeddingKeyCleared] = useState(false);
+  // Same, for the STT-specific key.
+  const [hasSttApiKey, setHasSttApiKey] = useState(false);
+  const [sttKeyCleared, setSttKeyCleared] = useState(false);

  // Modal for the (large) system message editor.
  const [promptOpened, promptHandlers] = useDisclosure(false);
@@ -125,6 +136,9 @@ export default function AiProviderSettings() {
      systemPrompt: "",
      apiKey: "",
      embeddingApiKey: "",
+      sttModel: "",
+      sttBaseUrl: "",
+      sttApiKey: "",
    },
  });

@@ -140,12 +154,17 @@ export default function AiProviderSettings() {
      systemPrompt: settings.systemPrompt ?? "",
      apiKey: "",
      embeddingApiKey: "",
+      sttModel: settings.sttModel ?? "",
+      sttBaseUrl: settings.sttBaseUrl ?? "",
+      sttApiKey: "",
    });
    form.resetDirty();
    setHasApiKey(settings.hasApiKey);
    setKeyCleared(false);
    setHasEmbeddingApiKey(settings.hasEmbeddingApiKey);
    setEmbeddingKeyCleared(false);
+    setHasSttApiKey(settings.hasSttApiKey);
+    setSttKeyCleared(false);
    // eslint-disable-next-line react-hooks/exhaustive-deps
  }, [settings]);

@@ -160,6 +179,10 @@ export default function AiProviderSettings() {
      baseUrl: values.baseUrl,
      embeddingBaseUrl: values.embeddingBaseUrl,
      systemPrompt: values.systemPrompt,
+      // The STT base URL is optional; empty falls back to the chat base URL
+      // server-side.
+      sttModel: values.sttModel,
+      sttBaseUrl: values.sttBaseUrl,
    };

    // Key semantics (never send the stored key back):
@@ -179,6 +202,13 @@ export default function AiProviderSettings() {
      payload.embeddingApiKey = "";
    }

+    // Same write-only semantics for the STT-specific key.
+    if (values.sttApiKey.length > 0) {
+      payload.sttApiKey = values.sttApiKey;
+    } else if (sttKeyCleared) {
+      payload.sttApiKey = "";
+    }
+
    return payload;
  }

@@ -191,6 +221,9 @@ export default function AiProviderSettings() {
    setHasEmbeddingApiKey(updated.hasEmbeddingApiKey);
    setEmbeddingKeyCleared(false);
    form.setFieldValue("embeddingApiKey", "");
+    setHasSttApiKey(updated.hasSttApiKey);
+    setSttKeyCleared(false);
+    form.setFieldValue("sttApiKey", "");
    form.resetDirty();
  }

@@ -206,6 +239,12 @@ export default function AiProviderSettings() {
    form.setFieldValue("embeddingApiKey", "");
  }

+  function handleClearSttKey() { // click handler for the "Clear" link under the STT key input
+    setSttKeyCleared(true); // the next save sends sttApiKey: "" unless a new key is typed
+    setHasSttApiKey(false); // placeholder switches back to the "use the chat API key" hint
+    form.setFieldValue("sttApiKey", "");
+  }
+
  // Optimistic toggle for the "AI chat" feature (settings.ai.chat).
  async function handleToggleChat(value: boolean) {
    setChatToggleLoading(true);
@@ -268,6 +307,34 @@ export default function AiProviderSettings() {
    }
  }

+  // Optimistic toggle for "Voice dictation" (settings.ai.dictation): flip local state, persist, roll back on failure.
+  async function handleToggleDictation(value: boolean) {
+    setDictationToggleLoading(true);
+    const previous = dictationEnabled;
+    setDictationEnabled(value);
+    try {
+      const updated = await updateWorkspace({ aiDictation: value });
+      setWorkspace({
+        ...updated,
+        settings: {
+          ...updated.settings,
+          ai: { ...updated.settings?.ai, dictation: value }, // store the requested value regardless of what `updated` carries
+        },
+      });
+      notifications.show({ message: t("Updated successfully") });
+    } catch (err) {
+      setDictationEnabled(previous);
+      const message = (err as { response?: { data?: { message?: string } } })
+        ?.response?.data?.message;
+      notifications.show({
+        message: message ?? t("Failed to update data"),
+        color: "red",
+      });
+    } finally {
+      setDictationToggleLoading(false);
+    }
+  }
+
  // Admins only — match the previous behavior.
  if (!isAdmin) {
    return (
@@ -294,6 +361,11 @@ export default function AiProviderSettings() {
    "/embeddings",
    form.values.baseUrl,
  );
+  const sttResolved = resolveUrl(
+    form.values.sttBaseUrl,
+    "/audio/transcriptions",
+    form.values.baseUrl,
+  );

  const monoFont = "ui-monospace, Menlo, monospace";

@@ -541,8 +613,8 @@ export default function AiProviderSettings() {
        </Box>
      </Paper>

-      {/* Card 3 — Voice / STT (disabled stub, not wired to the form/backend) */}
-      <Paper withBorder radius="md" p="lg" opacity={0.6}>
+      {/* Card 3 — Voice / STT */}
+      <Paper withBorder radius="md" p="lg">
        <Group justify="space-between" align="center" wrap="nowrap">
          <Group gap="xs" align="center" wrap="nowrap">
            <StatusDot status="idle" />
@@ -551,8 +623,9 @@ export default function AiProviderSettings() {
          <Switch
            label={t("Voice dictation")}
            labelPosition="left"
-            checked={false}
-            disabled
+            checked={dictationEnabled}
+            disabled={dictationToggleLoading}
+            onChange={(e) => handleToggleDictation(e.currentTarget.checked)}
          />
        </Group>
        <Text size="xs" c="dimmed" mt={4} mb="md">
@@ -562,33 +635,46 @@ export default function AiProviderSettings() {
        </Text>

        <Group grow align="flex-start">
-          <TextInput label={t("Model")} value="" disabled readOnly />
-          <PasswordInput label={t("API key")} value="" disabled readOnly />
-        </Group>
-        <TextInput mt="sm" label={t("Base URL")} value="" disabled readOnly />
-
-        <Group mt="md">
-          <Button variant="default" size="sm" disabled>
-            {t("Test endpoint")}
-          </Button>
+          <TextInput
+            label={t("Model")}
+            disabled={isLoading}
+            {...form.getInputProps("sttModel")}
+          />
+          <Stack gap={4}>
+            <PasswordInput
+              label={t("API key")}
+              placeholder={
+                hasSttApiKey
+                  ? t("•••• set")
+                  : t("Leave empty to use the chat API key")
+              }
+              autoComplete="off"
+              {...form.getInputProps("sttApiKey")}
+            />
+            {hasSttApiKey && (
+              <Anchor
+                component="button"
+                type="button"
+                c="red"
+                size="xs"
+                onClick={handleClearSttKey}
+              >
+                {t("Clear")}
+              </Anchor>
+            )}
+          </Stack>
        </Group>

-        <Box
-          mt="md"
-          mx="calc(var(--mantine-spacing-lg) * -1)"
-          mb="calc(var(--mantine-spacing-lg) * -1)"
-          px="lg"
-          py="md"
-          style={{
-            borderTop: "1px solid var(--mantine-color-default-border)",
-            background: "var(--mantine-color-default-hover)",
-            borderRadius: "0 0 var(--mantine-radius-md) var(--mantine-radius-md)",
-          }}
-        >
-          <Text size="xs" c="dimmed">
-            {t("Voice dictation is not available yet.")}
-          </Text>
-        </Box>
+        <TextInput
+          mt="sm"
+          label={t("Base URL")}
+          placeholder={t("Leave empty to use the chat base URL")}
+          disabled={isLoading}
+          {...form.getInputProps("sttBaseUrl")}
+        />
+        <Text size="xs" c="dimmed" mt={4} style={{ fontFamily: monoFont }} truncate>
+          {t("Resolves to {{url}}", { url: sttResolved })}
+        </Text>
      </Paper>

      {/* Nested: external MCP tools the agent calls out to */}
--- a/apps/client/src/features/workspace/services/ai-settings-service.ts
+++ b/apps/client/src/features/workspace/services/ai-settings-service.ts
@@ -16,6 +16,12 @@ export interface IAiSettings {
  systemPrompt?: string;
  hasApiKey: boolean;
  hasEmbeddingApiKey: boolean;
+  // STT-specific settings. `sttBaseUrl` is the RAW stored value (empty means
+  // "uses the chat base URL"). `hasSttApiKey` indicates whether an STT-specific
+  // key is stored (empty means "uses the chat API key").
+  sttModel?: string;
+  sttBaseUrl?: string;
+  hasSttApiKey: boolean;
  // RAG indexing coverage (pages indexed for semantic search).
  indexedPages: number;
  totalPages: number;
@@ -35,6 +41,10 @@ export interface IAiSettingsUpdate {
  systemPrompt?: string;
  apiKey?: string;
  embeddingApiKey?: string;
+  sttModel?: string;
+  sttBaseUrl?: string;
+  // Write-only STT key (same semantics as `apiKey` / `embeddingApiKey`).
+  sttApiKey?: string;
 }

 // Result of a connection test against the configured provider.
--- a/apps/client/src/features/workspace/types/workspace.types.ts
+++ b/apps/client/src/features/workspace/types/workspace.types.ts
@@ -24,6 +24,7 @@ export interface IWorkspace {
  disablePublicSharing?: boolean;
  mcpEnabled?: boolean;
  aiChat?: boolean;
+  aiDictation?: boolean;
  trashRetentionDays?: number;
  restrictApiToAdmins?: boolean;
  allowMemberTemplates?: boolean;
@@ -46,6 +47,7 @@ export interface IWorkspaceAiSettings {
  generative?: boolean;
  mcp?: boolean;
  chat?: boolean;
+  dictation?: boolean;
 }

 export interface IWorkspaceSharingSettings {
--- a/apps/server/src/core/ai-chat/ai-chat.controller.ts
+++ b/apps/server/src/core/ai-chat/ai-chat.controller.ts
@@ -1,4 +1,5 @@
 import {
+  BadRequestException,
  Body,
  Controller,
  ForbiddenException,
@@ -9,6 +10,7 @@ import {
  Req,
  Res,
  UseGuards,
+  UseInterceptors,
 } from '@nestjs/common';
 import { Throttle } from '@nestjs/throttler';
 import { FastifyReply, FastifyRequest } from 'fastify';
@@ -22,7 +24,9 @@ import { AiChatRepo } from '@docmost/db/repos/ai-chat/ai-chat.repo';
 import { AiChatMessageRepo } from '@docmost/db/repos/ai-chat/ai-chat-message.repo';
 import { UserThrottlerGuard } from '../../integrations/throttle/user-throttler.guard';
 import { AI_CHAT_THROTTLER } from '../../integrations/throttle/throttler-names';
+import { FileInterceptor } from '../../common/interceptors/file.interceptor';
 import { AiChatService, AiChatStreamBody } from './ai-chat.service';
+import { AiTranscriptionService } from './ai-transcription.service';
 import {
  ChatIdDto,
  GetChatMessagesDto,
@@ -43,6 +47,7 @@ export class AiChatController {
    private readonly aiChatService: AiChatService,
    private readonly aiChatRepo: AiChatRepo,
    private readonly aiChatMessageRepo: AiChatMessageRepo,
+    private readonly aiTranscription: AiTranscriptionService,
  ) {}

  /** List the requesting user's chats in this workspace (paginated). */
@@ -180,6 +185,74 @@ export class AiChatController {
    }
  }

+  /**
+   * Transcribe an uploaded audio clip to text using the workspace STT model.
+   * Gated by settings.ai.dictation (403 when disabled). A missing, empty,
+   * oversized (>25MB) or non-whitelisted upload gets a 400. Returns { text }.
+   */
+  @HttpCode(HttpStatus.OK)
+  @UseGuards(JwtAuthGuard, UserThrottlerGuard)
+  @Throttle({ [AI_CHAT_THROTTLER]: { limit: 20, ttl: 60000 } })
+  @Post('transcribe')
+  @UseInterceptors(FileInterceptor)
+  async transcribe(
+    @Req() req: any,
+    @AuthWorkspace() workspace: Workspace,
+  ): Promise<{ text: string }> {
+    // Gate: dictation must be explicitly enabled for the workspace.
+    const settings = (workspace.settings ?? {}) as {
+      ai?: { dictation?: boolean };
+    };
+    if (settings.ai?.dictation !== true) {
+      throw new ForbiddenException('Dictation is disabled');
+    }
+
+    let file = null;
+    try {
+      // Whisper hard-caps uploads at 25MB; allow a single file.
+      file = await req.file({ limits: { fileSize: 25 * 1024 * 1024, files: 1 } });
+    } catch (err: any) {
+      if (err?.statusCode === 413) {
+        throw new BadRequestException('Audio file too large (max 25MB)');
+      }
+      throw err;
+    }
+    if (!file) throw new BadRequestException('No audio uploaded');
+
+    // Whitelist audio container types produced by browser MediaRecorder
+    // (Chrome/FF: webm/opus, Safari: mp4) plus common STT-accepted formats.
+    const allowedMime = new Set([
+      'audio/webm',
+      'audio/ogg',
+      'audio/mp4',
+      'audio/mpeg',
+      'audio/wav',
+      'audio/x-wav',
+      'audio/wave',
+      'audio/m4a',
+      'audio/x-m4a',
+    ]);
+    // Strip parameters such as ";codecs=opus"; compare only the base type.
+    const baseMime = file.mimetype.split(';')[0].trim().toLowerCase();
+    if (!allowedMime.has(baseMime)) {
+      throw new BadRequestException('Unsupported audio format');
+    }
+
+    let buf: Buffer;
+    try {
+      buf = await file.toBuffer();
+    } catch (err: any) {
+      // With @fastify/multipart throwFileSizeLimit:true, the 25MB cap is enforced
+      // when the stream is consumed (here), not at req.file().
+      if (err?.statusCode === 413) {
+        throw new BadRequestException('Audio file too large (max 25MB)');
+      }
+      throw err;
+    }
+    if (buf.length === 0) throw new BadRequestException('Audio file is empty');
+    return { text: await this.aiTranscription.transcribe(workspace.id, buf) };
+  }
+
  /**
   * Ensure the chat exists, belongs to this workspace, AND was created by the
   * requesting user (per-user isolation). Throws ForbiddenException otherwise.
--- a/apps/server/src/core/ai-chat/ai-chat.module.ts
+++ b/apps/server/src/core/ai-chat/ai-chat.module.ts
@@ -3,6 +3,7 @@ import { AiModule } from '../../integrations/ai/ai.module';
 import { TokenModule } from '../auth/token.module';
 import { AiChatController } from './ai-chat.controller';
 import { AiChatService } from './ai-chat.service';
+import { AiTranscriptionService } from './ai-transcription.service';
 import { AiChatToolsService } from './tools/ai-chat-tools.service';
 import { EmbeddingModule } from './embedding/embedding.module';
 import { ExternalMcpModule } from './external-mcp/external-mcp.module';
@@ -21,6 +22,6 @@ import { ExternalMcpModule } from './external-mcp/external-mcp.module';
@Module({
  imports: [AiModule, TokenModule, EmbeddingModule, ExternalMcpModule],
  controllers: [AiChatController],
-  providers: [AiChatService, AiChatToolsService],
+  providers: [AiChatService, AiTranscriptionService, AiChatToolsService],
 })
 export class AiChatModule {}
--- a/apps/server/src/core/ai-chat/ai-transcription.service.ts
+++ b/apps/server/src/core/ai-chat/ai-transcription.service.ts
@@ -0,0 +1,20 @@
+import { Injectable } from '@nestjs/common';
+import { experimental_transcribe as transcribe } from 'ai';
+import { AiService } from '../../integrations/ai/ai.service';
+
+/**
+ * Speech-to-text for uploaded audio, using the workspace's own STT model.
+ * Delegates to the AI SDK's experimental_transcribe and does no logging, so
+ * neither the audio nor the key is written out here.
+ */
+@Injectable()
+export class AiTranscriptionService {
+  constructor(private readonly ai: AiService) {}
+
+  // Resolve the workspace STT model, run it on `audio`, return trimmed text.
+  async transcribe(workspaceId: string, audio: Uint8Array): Promise<string> {
+    const sttModel = await this.ai.getTranscriptionModel(workspaceId);
+    const result = await transcribe({ model: sttModel, audio });
+    return result.text.trim();
+  }
+}
--- a/apps/server/src/core/workspace/dto/update-workspace.dto.ts
+++ b/apps/server/src/core/workspace/dto/update-workspace.dto.ts
@@ -49,6 +49,10 @@ export class UpdateWorkspaceDto extends PartialType(CreateWorkspaceDto) {
  @IsBoolean()
  aiChat: boolean;

+  @IsOptional()
+  @IsBoolean()
+  aiDictation: boolean;
+
  @IsOptional()
  @IsInt()
  @Min(1)
--- a/apps/server/src/core/workspace/services/workspace.service.ts
+++ b/apps/server/src/core/workspace/services/workspace.service.ts
@@ -497,6 +497,20 @@ export class WorkspaceService {
        );
      }

+      if (typeof updateWorkspaceDto.aiDictation !== 'undefined') {
+        const prev = settingsBefore?.ai?.dictation ?? false;
+        if (prev !== updateWorkspaceDto.aiDictation) {
+          before.aiDictation = prev;
+          after.aiDictation = updateWorkspaceDto.aiDictation;
+        }
+        await this.workspaceRepo.updateAiSettings(
+          workspaceId,
+          'dictation',
+          updateWorkspaceDto.aiDictation,
+          trx,
+        );
+      }
+
      delete updateWorkspaceDto.restrictApiToAdmins;
      delete updateWorkspaceDto.aiSearch;
      delete updateWorkspaceDto.generativeAi;
@@ -504,6 +518,7 @@ export class WorkspaceService {
      delete updateWorkspaceDto.mcpEnabled;
      delete updateWorkspaceDto.allowMemberTemplates;
      delete updateWorkspaceDto.aiChat;
+      delete updateWorkspaceDto.aiDictation;

      await this.workspaceRepo.updateWorkspace(
        updateWorkspaceDto,
--- a/apps/server/src/database/migrations/20260618T130000-ai-stt-credentials.ts
+++ b/apps/server/src/database/migrations/20260618T130000-ai-stt-credentials.ts
@@ -0,0 +1,18 @@
+import { type Kysely } from 'kysely';
+
+export async function up(db: Kysely<any>): Promise<void> {
+  // Adds the encrypted, STT-specific provider key column. It is separate from
+  // `api_key_enc` (the chat key) so the transcription model can use a
+  // different token. No column modifiers are applied, so the column is
+  // nullable plain `text`. When NULL, the STT model falls back to
+  // `api_key_enc`.
+  const credentials = db.schema.alterTable('ai_provider_credentials');
+  await credentials.addColumn('stt_api_key_enc', 'text').execute();
+}
+
+export async function down(db: Kysely<any>): Promise<void> {
+  // Reverts `up`: removes the STT key column, discarding any values stored
+  // in it.
+  const credentials = db.schema.alterTable('ai_provider_credentials');
+  await credentials.dropColumn('stt_api_key_enc').execute();
+}
--- a/apps/server/src/database/repos/ai-chat/ai-provider-credentials.repo.ts
+++ b/apps/server/src/database/repos/ai-chat/ai-provider-credentials.repo.ts
@@ -98,4 +98,42 @@ export class AiProviderCredentialsRepo {
      .where('driver', '=', driver)
      .execute();
  }
+
+  // Store the encrypted STT key for a (workspaceId, driver) pair. With no
+  // existing row, one is inserted with `apiKeyEnc` left null (that column is
+  // nullable). On conflict only `sttApiKeyEnc` / `updatedAt` are overwritten,
+  // so the chat & embedding keys already stored are kept.
+  async upsertSttKey(
+    workspaceId: string,
+    driver: string,
+    sttApiKeyEnc: string,
+    trx?: KyselyTransaction,
+  ): Promise<AiProviderCredentials> {
+    const executor = dbOrTx(this.db, trx);
+    const insert = executor
+      .insertInto('aiProviderCredentials')
+      .values({ workspaceId, driver, sttApiKeyEnc });
+    // Conflict target is the (workspaceId, driver) pair.
+    const upsert = insert.onConflict((conflict) =>
+      conflict
+        .columns(['workspaceId', 'driver'])
+        .doUpdateSet({ sttApiKeyEnc, updatedAt: new Date() }),
+    );
+    return upsert.returningAll().executeTakeFirst();
+  }
+
+  // Null out only `sttApiKeyEnc` (and bump `updatedAt`) for this workspace/driver.
+  async clearSttKey(
+    workspaceId: string,
+    driver: string,
+    trx?: KyselyTransaction,
+  ): Promise<void> {
+    const executor = dbOrTx(this.db, trx);
+    const update = executor.updateTable('aiProviderCredentials');
+    await update
+      .set({ sttApiKeyEnc: null, updatedAt: new Date() })
+      .where('workspaceId', '=', workspaceId)
+      .where('driver', '=', driver)
+      .execute();
+  }
 }
--- a/apps/server/src/database/repos/workspace/workspace.repo.ts
+++ b/apps/server/src/database/repos/workspace/workspace.repo.ts
@@ -239,7 +239,7 @@ export class WorkspaceRepo {
    // is a real jsonb object, never a double-encoded string. The CASE self-heals
    // workspaces whose settings.ai.provider was previously corrupted into an
    // array/string.
-    const ALLOWED = ['driver', 'chatModel', 'embeddingModel', 'baseUrl', 'embeddingBaseUrl', 'systemPrompt'];
+    const ALLOWED = ['driver', 'chatModel', 'embeddingModel', 'baseUrl', 'embeddingBaseUrl', 'sttModel', 'sttBaseUrl', 'systemPrompt'];
    const entries = Object.entries(provider).filter(
      ([k, v]) => v !== undefined && ALLOWED.includes(k),
    );
--- a/apps/server/src/database/types/ai-provider-credentials.types.ts
+++ b/apps/server/src/database/types/ai-provider-credentials.types.ts
@@ -14,6 +14,8 @@ export interface AiProviderCredentials {
  apiKeyEnc: string | null;
  // Encrypted, embedding-specific provider key. Falls back to apiKeyEnc when null.
  embeddingApiKeyEnc: string | null;
+  // Encrypted, STT-specific provider key. Falls back to apiKeyEnc when null.
+  sttApiKeyEnc: string | null;
  createdAt: Generated<Timestamp>;
  updatedAt: Generated<Timestamp>;
 }
--- a/apps/server/src/integrations/ai/ai-settings.service.ts
+++ b/apps/server/src/integrations/ai/ai-settings.service.ts
@@ -28,6 +28,9 @@ export interface UpdateAiSettingsInput {
  systemPrompt?: string;
  apiKey?: string;
  embeddingApiKey?: string;
+  sttModel?: string;
+  sttBaseUrl?: string;
+  sttApiKey?: string;
 }

 /**
@@ -113,6 +116,7 @@ export class AiSettingsService {
      driver: provider.driver,
      chatModel: provider.chatModel,
      embeddingModel: provider.embeddingModel,
+      sttModel: provider.sttModel,
      baseUrl: provider.baseUrl,
      systemPrompt: provider.systemPrompt,
    };
@@ -122,6 +126,10 @@ export class AiSettingsService {
    // unconditionally.
    config.embeddingBaseUrl = provider.embeddingBaseUrl || provider.baseUrl;

+    // Effective STT base URL: the STT-specific value, else the chat base URL.
+    // Set unconditionally, same rationale as embeddingBaseUrl.
+    config.sttBaseUrl = provider.sttBaseUrl || provider.baseUrl;
+
    if (provider.driver !== 'ollama') {
      const creds = await this.aiProviderCredentialsRepo.find(
        workspaceId,
@@ -134,6 +142,10 @@ export class AiSettingsService {
      config.embeddingApiKey = creds?.embeddingApiKeyEnc
        ? this.secretBox.decryptSecret(creds.embeddingApiKeyEnc)
        : config.apiKey;
+      // Effective STT key: the STT-specific key, else the chat key.
+      config.sttApiKey = creds?.sttApiKeyEnc
+        ? this.secretBox.decryptSecret(creds.sttApiKeyEnc)
+        : config.apiKey;
    }

    return config;
@@ -151,6 +163,7 @@ export class AiSettingsService {

    let hasApiKey = false;
    let hasEmbeddingApiKey = false;
+    let hasSttApiKey = false;
    if (provider.driver) {
      const creds = await this.aiProviderCredentialsRepo.find(
        workspaceId,
@@ -158,6 +171,7 @@ export class AiSettingsService {
      );
      hasApiKey = !!creds?.apiKeyEnc;
      hasEmbeddingApiKey = !!creds?.embeddingApiKeyEnc;
+      hasSttApiKey = !!creds?.sttApiKeyEnc;
    }

    // totalPages now counts only pages with embeddable content (non-empty text
@@ -174,9 +188,12 @@ export class AiSettingsService {
      embeddingModel: provider.embeddingModel,
      baseUrl: provider.baseUrl,
      embeddingBaseUrl: provider.embeddingBaseUrl,
+      sttModel: provider.sttModel,
+      sttBaseUrl: provider.sttBaseUrl,
      systemPrompt: provider.systemPrompt,
      hasApiKey,
      hasEmbeddingApiKey,
+      hasSttApiKey,
      indexedPages,
      totalPages,
    };
@@ -197,7 +214,7 @@ export class AiSettingsService {
    workspaceId: string,
    dto: UpdateAiSettingsInput,
  ): Promise<MaskedAiSettings> {
-    const { apiKey, embeddingApiKey, ...nonSecret } = dto;
+    const { apiKey, embeddingApiKey, sttApiKey, ...nonSecret } = dto;

    // Persist non-secret provider fields (only those present in the partial).
    const providerPatch: Partial<AiProviderSettings> = {};
@@ -207,6 +224,8 @@ export class AiSettingsService {
      'embeddingModel',
      'baseUrl',
      'embeddingBaseUrl',
+      'sttModel',
+      'sttBaseUrl',
      'systemPrompt',
    ] as const) {
      if (nonSecret[key] !== undefined) {
@@ -222,7 +241,11 @@ export class AiSettingsService {

    // Key handling (write-only). Both keys share the same target driver and the
    // same "driver required" guard, resolved once.
-    if (apiKey !== undefined || embeddingApiKey !== undefined) {
+    if (
+      apiKey !== undefined ||
+      embeddingApiKey !== undefined ||
+      sttApiKey !== undefined
+    ) {
      const stored = await this.readProvider(workspaceId);
      const targetDriver = dto.driver ?? stored.driver;
      if (!targetDriver) {
@@ -264,6 +287,23 @@ export class AiSettingsService {
          );
        }
      }
+
+      // STT key.
+      if (sttApiKey !== undefined) {
+        if (sttApiKey === '') {
+          await this.aiProviderCredentialsRepo.clearSttKey(
+            workspaceId,
+            targetDriver,
+          );
+        } else {
+          const enc = this.secretBox.encryptSecret(sttApiKey);
+          await this.aiProviderCredentialsRepo.upsertSttKey(
+            workspaceId,
+            targetDriver,
+            enc,
+          );
+        }
+      }
    }

    return this.getMasked(workspaceId);
--- a/apps/server/src/integrations/ai/ai-stt-not-configured.exception.ts
+++ b/apps/server/src/integrations/ai/ai-stt-not-configured.exception.ts
@@ -0,0 +1,13 @@
+import { ServiceUnavailableException } from '@nestjs/common';
+
+/**
+ * Thrown when no usable STT (speech-to-text) config exists for the workspace
+ * (missing driver / sttModel). Extends ServiceUnavailableException, so it maps
+ * to HTTP 503 with the fixed message below. Distinct from the chat & embedding
+ * variants so transcription can fail independently of chat/embeddings.
+ */
+export class AiSttNotConfiguredException extends ServiceUnavailableException {
+  constructor() {
+    super('AI STT model not configured');
+  }
+}
--- a/apps/server/src/integrations/ai/ai.service.ts
+++ b/apps/server/src/integrations/ai/ai.service.ts
@@ -4,6 +4,7 @@ import {
  generateText,
  type EmbeddingModel,
  type LanguageModel,
+  type TranscriptionModel,
 } from 'ai';
 import { createOpenAI } from '@ai-sdk/openai';
 import { createGoogleGenerativeAI } from '@ai-sdk/google';
@@ -11,6 +12,7 @@ import { createOllama } from 'ai-sdk-ollama';
 import { AiSettingsService } from './ai-settings.service';
 import { AiNotConfiguredException } from './ai-not-configured.exception';
 import { AiEmbeddingNotConfiguredException } from './ai-embedding-not-configured.exception';
+import { AiSttNotConfiguredException } from './ai-stt-not-configured.exception';
 import { describeProviderError } from './ai-error.util';

 /**
@@ -106,6 +108,26 @@ export class AiService {
    }
  }

+  /**
+   * Build the workspace's speech-to-text (STT) model from its resolved config.
+   * Always goes through @ai-sdk/openai, i.e. the OpenAI-compatible
+   * /v1/audio/transcriptions API, whatever the chat driver is. The base URL is
+   * sttBaseUrl, else the chat baseUrl; the key is the resolved sttApiKey
+   * (AiSettingsService.resolve applies the chat-key fallback). Constructed per
+   * call and per workspace; the decrypted key is never logged.
+   * @throws AiSttNotConfiguredException (-> 503) when no STT model is set.
+   */
+  async getTranscriptionModel(workspaceId: string): Promise<TranscriptionModel> {
+    const resolved = await this.aiSettings.resolve(workspaceId);
+    if (!resolved?.sttModel) throw new AiSttNotConfiguredException();
+    const openai = createOpenAI({
+      // apiKey may go unused by keyless self-hosted whisper; pass a placeholder.
+      apiKey: resolved.sttApiKey ?? 'unused',
+      baseURL: resolved.sttBaseUrl || resolved.baseUrl, // STT URL, else chat URL
+    });
+    return openai.transcription(resolved.sttModel);
+  }
+
  /**
   * Embed a batch of texts with the workspace embedding model. Returns one
   * vector per input, in the same order. Thin wrapper over the AI SDK's
--- a/apps/server/src/integrations/ai/ai.types.ts
+++ b/apps/server/src/integrations/ai/ai.types.ts
@@ -21,6 +21,9 @@ export interface AiProviderSettings {
  baseUrl?: string;
  // Embedding-specific base URL. Falls back to `baseUrl` when empty/unset.
  embeddingBaseUrl?: string;
+  sttModel?: string;
+  // STT-specific base URL. Falls back to baseUrl when empty/unset.
+  sttBaseUrl?: string;
  systemPrompt?: string;
 }

@@ -31,12 +34,15 @@ export interface AiProviderSettings {
 *
 * `embeddingBaseUrl` / `embeddingApiKey` are the embedding-specific endpoint and
 * key, already resolved with the chat-value fallback applied by `resolve`.
+ * `sttBaseUrl` / `sttApiKey` are likewise the STT-specific endpoint and key,
+ * already resolved with the chat-value fallback applied by `resolve`.
 */
 export interface ResolvedAiConfig extends Partial<AiProviderSettings> {
  driver?: AiDriver;
  chatModel?: string;
  apiKey?: string;
  embeddingApiKey?: string;
+  sttApiKey?: string;
 }

 /**
@@ -50,9 +56,12 @@ export interface MaskedAiSettings {
  embeddingModel?: string;
  baseUrl?: string;
  embeddingBaseUrl?: string;
+  sttModel?: string;
+  sttBaseUrl?: string;
  systemPrompt?: string;
  hasApiKey: boolean;
  hasEmbeddingApiKey: boolean;
+  hasSttApiKey: boolean;
  // RAG indexing coverage for the settings UI.
  indexedPages: number;
  totalPages: number;
--- a/apps/server/src/integrations/ai/dto/update-ai-settings.dto.ts
+++ b/apps/server/src/integrations/ai/dto/update-ai-settings.dto.ts
@@ -4,10 +4,10 @@ import { AI_DRIVERS, AiDriver } from '../ai.types';
 /**
 * Admin update payload for the workspace AI provider settings.
 *
- * `apiKey` / `embeddingApiKey` are write-only (§8.2): provided → stored
- * encrypted, '' → cleared, absent → left untouched. They are NEVER returned by
- * any endpoint. The global ValidationPipe runs with `whitelist: true`, so
- * unknown fields are stripped.
+ * `apiKey` / `embeddingApiKey` / `sttApiKey` are write-only (§8.2): provided →
+ * stored encrypted, '' → cleared, absent → left untouched. They are NEVER
+ * returned by any endpoint. The global ValidationPipe runs with
+ * `whitelist: true`, so unknown fields are stripped.
 */
 export class UpdateAiSettingsDto {
  @IsOptional()
@@ -41,4 +41,16 @@ export class UpdateAiSettingsDto {
  @IsOptional()
  @IsString()
  embeddingApiKey?: string;
+
+  @IsOptional()
+  @IsString()
+  sttModel?: string;
+
+  @IsOptional()
+  @IsString()
+  sttBaseUrl?: string;
+
+  @IsOptional()
+  @IsString()
+  sttApiKey?: string;
 }