feat(ai): server-side voice dictation (STT) with mic in chat and editor

Add push-to-talk voice dictation that transcribes recorded audio on the
server via the workspace's OpenAI-compatible AI provider (Whisper /
gpt-4o-transcribe / self-hosted whisper), then inserts the text.

Backend:
- New `stt_api_key_enc` column + migration; STT creds parity with chat/
  embeddings (sttModel/sttBaseUrl/sttApiKey, write-only key, fallbacks to
  chat baseUrl/key). Both provider whitelists updated (service + repo).
- AiService.getTranscriptionModel + AiTranscriptionService.
- Gated POST /ai-chat/transcribe (dictation flag → 403, JWT + workspace
  scope + throttle, 25MB cap, MIME whitelist, never logs audio/key).
- New `settings.ai.dictation` workspace flag (DTO + service + audit).

Frontend:
- Wire up the Voice/STT settings card (model/base URL/key) and the
  Voice-dictation toggle.
- New `features/dictation`: useDictation (MediaRecorder state machine),
  MicButton, transcribe service; integrated into the chat composer and a
  new editor-toolbar dictation group, both gated by ai.dictation.
This commit is contained in:
vvzvlad
2026-06-18 18:45:33 +03:00
parent 49eba22201
commit 874bdd021c
24 changed files with 845 additions and 39 deletions

View File

@@ -0,0 +1,76 @@
import { FC } from "react";
import { ActionIcon, Loader, Tooltip } from "@mantine/core";
import { IconMicrophone, IconPlayerStopFilled } from "@tabler/icons-react";
import { useTranslation } from "react-i18next";
import { useDictation } from "@/features/dictation/hooks/use-dictation";
interface MicButtonProps {
onText: (text: string) => void;
onStart?: () => void;
disabled?: boolean;
// Mantine ActionIcon size token; "lg" matches the chat composer, "md" the
// editor toolbar.
size?: "md" | "lg";
}
/**
* Self-contained dictation toggle. Owns its own capture state machine: a click
* starts recording (mic icon), a second click stops it (stop icon), and while
* the audio is being transcribed it shows a spinner and is disabled to prevent
* overlapping requests.
*/
export const MicButton: FC<MicButtonProps> = ({
onText,
onStart,
disabled,
size = "lg",
}) => {
const { t } = useTranslation();
const { status, start, stop } = useDictation({ onText, onStart });
const iconSize = size === "lg" ? 18 : 16;
if (status === "recording") {
return (
<Tooltip label={t("Stop recording")} withArrow>
<ActionIcon
size={size}
color="red"
variant="light"
onClick={stop}
aria-label={t("Stop recording")}
>
<IconPlayerStopFilled size={iconSize} />
</ActionIcon>
</Tooltip>
);
}
if (status === "transcribing" || status === "error") {
return (
<Tooltip label={t("Transcribing…")} withArrow>
<ActionIcon
size={size}
variant="subtle"
disabled
aria-label={t("Transcribing…")}
>
<Loader size="xs" />
</ActionIcon>
</Tooltip>
);
}
return (
<Tooltip label={t("Start dictation")} withArrow>
<ActionIcon
size={size}
variant="subtle"
onClick={() => void start()}
disabled={disabled}
aria-label={t("Start dictation")}
>
<IconMicrophone size={iconSize} />
</ActionIcon>
</Tooltip>
);
};

View File

@@ -0,0 +1,260 @@
import { useCallback, useEffect, useRef, useState } from "react";
import { notifications } from "@mantine/notifications";
import { useTranslation } from "react-i18next";
import { transcribeAudio } from "@/features/dictation/services/dictation-service";
export type DictationStatus = "idle" | "recording" | "transcribing" | "error";
interface UseDictationOptions {
onText: (text: string) => void;
onStart?: () => void;
maxDurationMs?: number;
}
interface UseDictationResult {
status: DictationStatus;
start: () => Promise<void>;
stop: () => void;
cancel: () => void;
}
// Candidate container/codec combinations in preference order. The first one the
// browser supports wins; if none do we let MediaRecorder pick its own default.
const MIME_CANDIDATES = [
"audio/webm;codecs=opus",
"audio/webm",
"audio/mp4",
"audio/ogg;codecs=opus",
"audio/ogg",
];
// Derive a sensible upload filename from the recorded MIME type. The server keys
// off the blob's MIME, so this only affects the part name, but a matching
// extension keeps things tidy.
function filenameForMime(mime: string): string {
if (mime.includes("mp4")) return "speech.mp4";
if (mime.includes("ogg")) return "speech.ogg";
return "speech.webm";
}
function pickMimeType(): string | undefined {
if (typeof MediaRecorder === "undefined") return undefined;
for (const candidate of MIME_CANDIDATES) {
if (MediaRecorder.isTypeSupported?.(candidate)) return candidate;
}
return undefined;
}
/**
* Encapsulates the browser audio-capture state machine: request the mic, record
* with MediaRecorder, then POST the blob for transcription. Refs hold the live
* recorder/stream/chunks/timer/cancel flag so component re-renders never lose
* them, and every exit path stops the MediaStream tracks.
*/
export function useDictation(
options: UseDictationOptions,
): UseDictationResult {
const { t } = useTranslation();
const [status, setStatus] = useState<DictationStatus>("idle");
// Keep the latest callbacks in a ref so the recorder's onstop closure always
// calls the current handlers without re-creating the recorder.
const optionsRef = useRef(options);
optionsRef.current = options;
const recorderRef = useRef<MediaRecorder | null>(null);
const streamRef = useRef<MediaStream | null>(null);
const chunksRef = useRef<Blob[]>([]);
const timerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
const errorTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
const canceledRef = useRef(false);
const startingRef = useRef(false);
const clearTimer = useCallback(() => {
if (timerRef.current !== null) {
clearTimeout(timerRef.current);
timerRef.current = null;
}
}, []);
const stopTracks = useCallback(() => {
streamRef.current?.getTracks().forEach((track) => track.stop());
streamRef.current = null;
}, []);
const start = useCallback(async (): Promise<void> => {
// Synchronous live guard: status is stale between renders, so also block on
// refs to prevent a double-click from opening two MediaStreams (the first
// would leak).
if (startingRef.current || recorderRef.current || streamRef.current) return;
if (status !== "idle") return;
startingRef.current = true;
let stream: MediaStream;
try {
stream = await navigator.mediaDevices.getUserMedia({ audio: true });
} catch (err) {
const name = (err as { name?: string })?.name;
let message: string;
if (name === "NotAllowedError" || name === "SecurityError") {
message = t("Microphone access denied");
} else if (name === "NotFoundError" || name === "OverconstrainedError") {
message = t("No microphone found");
} else {
message = t("Could not start recording");
}
notifications.show({ color: "red", message });
setStatus("idle");
startingRef.current = false;
return;
}
streamRef.current = stream;
chunksRef.current = [];
canceledRef.current = false;
const mimeType = pickMimeType();
let recorder: MediaRecorder;
try {
recorder = new MediaRecorder(
stream,
mimeType ? { mimeType } : undefined,
);
} catch {
// The stream was acquired but the recorder failed to construct; stop the
// tracks so the MediaStream does not leak before bailing out.
stopTracks();
notifications.show({
color: "red",
message: t("Could not start recording"),
});
setStatus("idle");
startingRef.current = false;
return;
}
recorderRef.current = recorder;
recorder.ondataavailable = (e: BlobEvent) => {
if (e.data && e.data.size > 0) chunksRef.current.push(e.data);
};
recorder.onstop = () => {
clearTimer();
const recordedMime = recorder.mimeType || mimeType || "audio/webm";
const wasCanceled = canceledRef.current;
// Stop the mic tracks regardless of how we got here.
stopTracks();
recorderRef.current = null;
if (wasCanceled) {
chunksRef.current = [];
setStatus("idle");
return;
}
const blob = new Blob(chunksRef.current, { type: recordedMime });
chunksRef.current = [];
setStatus("transcribing");
void transcribeAudio(blob, filenameForMime(recordedMime))
.then((text) => {
// Whisper often returns a leading space; insert the trimmed value.
const trimmed = text.trim();
if (trimmed.length > 0) optionsRef.current.onText(trimmed);
setStatus("idle");
})
.catch((err: unknown) => {
const httpStatus = (err as { response?: { status?: number } })
?.response?.status;
// The server returns 503 when dictation is unconfigured and 403 when
// it is disabled server-side; both map to the same "not configured".
const message =
httpStatus === 503 || httpStatus === 403
? t("Voice dictation is not configured")
: t("Transcription failed");
notifications.show({ color: "red", message });
// Surface the error state briefly, then return to idle. Store the
// timer so it can be cleared on unmount.
setStatus("error");
if (errorTimerRef.current !== null) {
clearTimeout(errorTimerRef.current);
}
errorTimerRef.current = setTimeout(() => {
errorTimerRef.current = null;
setStatus("idle");
}, 1500);
});
};
// Notify the caller right when recording begins (before any async work) so
// the editor can snapshot the caret position.
try {
optionsRef.current.onStart?.();
recorder.start();
} catch {
// recorder.start() can synchronously throw (InvalidStateError /
// NotSupportedError); clean up so the button is not left stuck and the
// MediaStream does not leak.
stopTracks();
recorderRef.current = null;
startingRef.current = false;
notifications.show({
color: "red",
message: t("Could not start recording"),
});
setStatus("idle");
return;
}
setStatus("recording");
// Recording has truly begun; release the synchronous start guard.
startingRef.current = false;
const maxDurationMs = optionsRef.current.maxDurationMs ?? 120000;
timerRef.current = setTimeout(() => {
if (recorderRef.current?.state === "recording") {
recorderRef.current.stop();
}
}, maxDurationMs);
}, [status, t, clearTimer, stopTracks]);
const stop = useCallback((): void => {
clearTimer();
const recorder = recorderRef.current;
if (recorder && recorder.state === "recording") {
recorder.stop();
}
}, [clearTimer]);
const cancel = useCallback((): void => {
clearTimer();
canceledRef.current = true;
const recorder = recorderRef.current;
if (recorder && recorder.state === "recording") {
// onstop sees canceledRef and skips transcription; it also stops tracks.
recorder.stop();
} else {
stopTracks();
}
setStatus("idle");
}, [clearTimer, stopTracks]);
// Clean up on unmount: stop any live recorder/stream and clear the timers.
useEffect(() => {
return () => {
clearTimer();
if (errorTimerRef.current !== null) {
clearTimeout(errorTimerRef.current);
errorTimerRef.current = null;
}
const recorder = recorderRef.current;
if (recorder && recorder.state === "recording") {
canceledRef.current = true;
recorder.stop();
}
stopTracks();
};
}, [clearTimer, stopTracks]);
return { status, start, stop, cancel };
}

View File

@@ -0,0 +1,17 @@
import api from "@/lib/api-client";
// POST the recorded audio as multipart/form-data; the server transcribes it with
// the workspace STT model and returns { text } (wrapped in the standard envelope,
// so the value is at req.data.text). `filename` only sets the part name; the
// server keys off the blob's MIME type.
export async function transcribeAudio(
blob: Blob,
filename = "speech.webm",
): Promise<string> {
const form = new FormData();
form.append("file", blob, filename);
const req = await api.post<{ text: string }>("/ai-chat/transcribe", form, {
headers: { "Content-Type": "multipart/form-data" },
});
return req.data.text;
}