feat(ai): server-side voice dictation (STT) with mic in chat and editor

Add push-to-talk voice dictation that transcribes recorded audio on the
server via the workspace's OpenAI-compatible AI provider (Whisper /
gpt-4o-transcribe / self-hosted whisper), then inserts the text.

Backend:
- New `stt_api_key_enc` column + migration; STT credentials have parity
  with chat/embeddings (sttModel/sttBaseUrl/sttApiKey, write-only key,
  falling back to the chat baseUrl/key). Both provider whitelists updated
  (service + repo).
- AiService.getTranscriptionModel + AiTranscriptionService.
- Gated POST /ai-chat/transcribe (dictation flag → 403, JWT + workspace
  scope + throttle, 25MB cap, MIME whitelist, never logs audio/key).
- New `settings.ai.dictation` workspace flag (DTO + service + audit).

Frontend:
- Wire up the Voice/STT settings card (model/base URL/key) and the
  Voice-dictation toggle.
- New `features/dictation`: useDictation (MediaRecorder state machine),
  MicButton, transcribe service; integrated into the chat composer and a
  new editor-toolbar dictation group, both gated by ai.dictation.
This commit is contained in:
vvzvlad
2026-06-18 18:45:33 +03:00
parent 49eba22201
commit 874bdd021c
24 changed files with 845 additions and 39 deletions

View File

@@ -1181,5 +1181,13 @@
"Embeddings": "Embeddings",
"Leave empty to use the chat API key": "Leave empty to use the chat API key",
"Leave empty to use the chat base URL": "Leave empty to use the chat base URL",
"Reindex now": "Reindex now"
"Reindex now": "Reindex now",
"Start dictation": "Start dictation",
"Stop recording": "Stop recording",
"Transcribing…": "Transcribing…",
"Microphone access denied": "Microphone access denied",
"No microphone found": "No microphone found",
"Could not start recording": "Could not start recording",
"Transcription failed": "Transcription failed",
"Voice dictation is not configured": "Voice dictation is not configured"
}

View File

@@ -2,8 +2,10 @@ import { KeyboardEvent } from "react";
import { ActionIcon, Group, Textarea, Tooltip } from "@mantine/core";
import { IconPlayerStopFilled, IconSend } from "@tabler/icons-react";
import { useTranslation } from "react-i18next";
import { useAtom } from "jotai";
import { useAtom, useAtomValue } from "jotai";
import { aiChatDraftAtom } from "@/features/ai-chat/atoms/ai-chat-atom.ts";
import { workspaceAtom } from "@/features/user/atoms/current-user-atom";
import { MicButton } from "@/features/dictation/components/mic-button";
interface ChatInputProps {
onSend: (text: string) => void;
@@ -25,6 +27,8 @@ export default function ChatInput({
}: ChatInputProps) {
const { t } = useTranslation();
const [value, setValue] = useAtom(aiChatDraftAtom);
const workspace = useAtomValue(workspaceAtom);
const isDictationEnabled = workspace?.settings?.ai?.dictation === true;
const send = (): void => {
const text = value.trim();
@@ -57,6 +61,13 @@ export default function ChatInput({
// switch), so a fresh chat lands with the cursor ready in the field.
autoFocus
/>
{isDictationEnabled && (
<MicButton
size="lg"
disabled={isStreaming || disabled}
onText={(text) => setValue((v) => (v ? `${v} ${text}` : text))}
/>
)}
{isStreaming ? (
<Tooltip label={t("Stop")} withArrow>
<ActionIcon

View File

@@ -0,0 +1,76 @@
import { FC } from "react";
import { ActionIcon, Loader, Tooltip } from "@mantine/core";
import { IconMicrophone, IconPlayerStopFilled } from "@tabler/icons-react";
import { useTranslation } from "react-i18next";
import { useDictation } from "@/features/dictation/hooks/use-dictation";
interface MicButtonProps {
onText: (text: string) => void;
onStart?: () => void;
disabled?: boolean;
// Mantine ActionIcon size token; "lg" matches the chat composer, "md" the
// editor toolbar.
size?: "md" | "lg";
}
/**
 * Self-contained dictation toggle built on `useDictation`.
 *
 * The rendered control follows the hook's status:
 * - `idle`: microphone icon; a click starts recording.
 * - `recording`: red stop icon; a click stops the recording.
 * - `transcribing`: disabled spinner, which prevents overlapping requests.
 * - `error`: disabled red microphone labelled "Transcription failed", shown
 *   until the hook reports `idle` again, so a failure is not announced as
 *   ongoing work to pointer or assistive-technology users.
 *
 * `disabled` is only applied to the idle button, so a recording that is
 * already running can always be stopped.
 */
export const MicButton: FC<MicButtonProps> = ({
  onText,
  onStart,
  disabled,
  size = "lg",
}) => {
  const { t } = useTranslation();
  const { status, start, stop } = useDictation({ onText, onStart });
  const iconSize = size === "lg" ? 18 : 16;

  if (status === "recording") {
    return (
      <Tooltip label={t("Stop recording")} withArrow>
        <ActionIcon
          size={size}
          color="red"
          variant="light"
          onClick={stop}
          aria-label={t("Stop recording")}
        >
          <IconPlayerStopFilled size={iconSize} />
        </ActionIcon>
      </Tooltip>
    );
  }

  if (status === "transcribing") {
    return (
      <Tooltip label={t("Transcribing…")} withArrow>
        <ActionIcon
          size={size}
          variant="subtle"
          disabled
          aria-label={t("Transcribing…")}
        >
          <Loader size="xs" />
        </ActionIcon>
      </Tooltip>
    );
  }

  if (status === "error") {
    // Keep the button disabled while the error state is displayed, but label
    // it truthfully instead of reusing the "Transcribing…" spinner.
    return (
      <Tooltip label={t("Transcription failed")} withArrow>
        <ActionIcon
          size={size}
          color="red"
          variant="subtle"
          disabled
          aria-label={t("Transcription failed")}
        >
          <IconMicrophone size={iconSize} />
        </ActionIcon>
      </Tooltip>
    );
  }

  return (
    <Tooltip label={t("Start dictation")} withArrow>
      <ActionIcon
        size={size}
        variant="subtle"
        onClick={() => void start()}
        disabled={disabled}
        aria-label={t("Start dictation")}
      >
        <IconMicrophone size={iconSize} />
      </ActionIcon>
    </Tooltip>
  );
};

View File

@@ -0,0 +1,260 @@
import { useCallback, useEffect, useRef, useState } from "react";
import { notifications } from "@mantine/notifications";
import { useTranslation } from "react-i18next";
import { transcribeAudio } from "@/features/dictation/services/dictation-service";
// Lifecycle of one dictation attempt: waiting for a click, capturing audio,
// waiting for the server transcript, or briefly showing a failed attempt.
export type DictationStatus = "idle" | "recording" | "transcribing" | "error";
// Inputs accepted by `useDictation`.
interface UseDictationOptions {
// Called with the trimmed transcript; not called when the transcript is empty.
onText: (text: string) => void;
// Called right before the recorder is started.
onStart?: () => void;
// Recording is stopped automatically after this many milliseconds
// (the hook defaults to 120000 when omitted).
maxDurationMs?: number;
}
// Values returned by `useDictation`.
interface UseDictationResult {
// Current state of the capture state machine.
status: DictationStatus;
// Requests the microphone and begins recording.
start: () => Promise<void>;
// Stops the recording; the captured audio is then sent for transcription.
stop: () => void;
// Stops the recording and discards the captured audio.
cancel: () => void;
}
// Candidate container/codec combinations in preference order. `pickMimeType`
// returns the first entry that MediaRecorder.isTypeSupported accepts; when none
// is accepted no mimeType option is passed and MediaRecorder picks its own
// default.
const MIME_CANDIDATES = [
"audio/webm;codecs=opus",
"audio/webm",
"audio/mp4",
"audio/ogg;codecs=opus",
"audio/ogg",
];
// Choose an upload filename whose extension matches the recorded MIME type.
// "mp4" is checked before "ogg"; anything else is treated as WebM. The server
// decides by the blob's MIME type, so the name is only there to keep the
// multipart part tidy.
function filenameForMime(mime: string): string {
  const namesByMarker: ReadonlyArray<readonly [string, string]> = [
    ["mp4", "speech.mp4"],
    ["ogg", "speech.ogg"],
  ];
  const match = namesByMarker.find(([marker]) => mime.includes(marker));
  return match ? match[1] : "speech.webm";
}
// Return the first MIME_CANDIDATES entry the browser's MediaRecorder reports
// as supported, or undefined when MediaRecorder is missing or nothing matches.
function pickMimeType(): string | undefined {
  const recorderAvailable = typeof MediaRecorder !== "undefined";
  if (!recorderAvailable) return undefined;
  return MIME_CANDIDATES.find((candidate) =>
    MediaRecorder.isTypeSupported?.(candidate),
  );
}
/**
 * Browser audio-capture state machine for push-to-talk dictation.
 *
 * Flow: `start()` requests the microphone and records with MediaRecorder;
 * `stop()` ends the recording, after which the captured blob is POSTed with
 * `transcribeAudio` and the trimmed text is handed to `options.onText`.
 * `cancel()` ends the recording and discards the audio instead.
 *
 * The live recorder/stream/chunks/timers/flags are kept in refs so component
 * re-renders never lose them, and every exit path stops the MediaStream
 * tracks, including a cancel or an unmount that happens while the microphone
 * request is still pending.
 *
 * @param options Callbacks and limits; see `UseDictationOptions`.
 * @returns The current status plus the `start` / `stop` / `cancel` controls.
 */
export function useDictation(
  options: UseDictationOptions,
): UseDictationResult {
  const { t } = useTranslation();
  const [status, setStatus] = useState<DictationStatus>("idle");

  // Keep the latest callbacks in a ref so the recorder's onstop closure always
  // calls the current handlers without re-creating the recorder.
  const optionsRef = useRef(options);
  optionsRef.current = options;

  const recorderRef = useRef<MediaRecorder | null>(null);
  const streamRef = useRef<MediaStream | null>(null);
  const chunksRef = useRef<Blob[]>([]);
  const timerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
  const errorTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
  // True once the current attempt has been canceled or the hook has unmounted.
  const canceledRef = useRef(false);
  const startingRef = useRef(false);

  // Clear the max-duration timer, if one is armed.
  const clearTimer = useCallback(() => {
    if (timerRef.current !== null) {
      clearTimeout(timerRef.current);
      timerRef.current = null;
    }
  }, []);

  // Release the microphone held in streamRef.
  const stopTracks = useCallback(() => {
    streamRef.current?.getTracks().forEach((track) => track.stop());
    streamRef.current = null;
  }, []);

  const start = useCallback(async (): Promise<void> => {
    // Synchronous live guard: status is stale between renders, so also block on
    // refs to prevent a double-click from opening two MediaStreams (the first
    // would leak).
    if (startingRef.current || recorderRef.current || streamRef.current) return;
    if (status !== "idle") return;
    startingRef.current = true;
    // Reset the flag BEFORE awaiting the microphone so that a cancel() or an
    // unmount that lands while the request is pending is still visible once
    // the promise settles.
    canceledRef.current = false;

    let stream: MediaStream;
    try {
      stream = await navigator.mediaDevices.getUserMedia({ audio: true });
    } catch (err) {
      const name = (err as { name?: string })?.name;
      let message: string;
      if (name === "NotAllowedError" || name === "SecurityError") {
        message = t("Microphone access denied");
      } else if (name === "NotFoundError" || name === "OverconstrainedError") {
        message = t("No microphone found");
      } else {
        message = t("Could not start recording");
      }
      notifications.show({ color: "red", message });
      setStatus("idle");
      startingRef.current = false;
      return;
    }

    if (canceledRef.current) {
      // Canceled or unmounted while the microphone request was pending: nobody
      // is left to stop a recording, so release the microphone right away
      // instead of starting one.
      stream.getTracks().forEach((track) => track.stop());
      startingRef.current = false;
      return;
    }

    streamRef.current = stream;
    chunksRef.current = [];

    const mimeType = pickMimeType();
    let recorder: MediaRecorder;
    try {
      recorder = new MediaRecorder(
        stream,
        mimeType ? { mimeType } : undefined,
      );
    } catch {
      // The stream was acquired but the recorder failed to construct; stop the
      // tracks so the MediaStream does not leak before bailing out.
      stopTracks();
      notifications.show({
        color: "red",
        message: t("Could not start recording"),
      });
      setStatus("idle");
      startingRef.current = false;
      return;
    }
    recorderRef.current = recorder;

    recorder.ondataavailable = (e: BlobEvent) => {
      if (e.data && e.data.size > 0) chunksRef.current.push(e.data);
    };

    recorder.onstop = () => {
      clearTimer();
      const recordedMime = recorder.mimeType || mimeType || "audio/webm";
      const wasCanceled = canceledRef.current;
      // Stop the mic tracks regardless of how we got here.
      stopTracks();
      recorderRef.current = null;

      if (wasCanceled) {
        chunksRef.current = [];
        setStatus("idle");
        return;
      }

      const blob = new Blob(chunksRef.current, { type: recordedMime });
      chunksRef.current = [];
      setStatus("transcribing");

      void transcribeAudio(blob, filenameForMime(recordedMime))
        .then((text) => {
          // Whisper often returns a leading space; insert the trimmed value.
          const trimmed = text.trim();
          if (trimmed.length > 0) optionsRef.current.onText(trimmed);
          setStatus("idle");
        })
        .catch((err: unknown) => {
          const httpStatus = (err as { response?: { status?: number } })
            ?.response?.status;
          // The server returns 503 when dictation is unconfigured and 403 when
          // it is disabled server-side; both map to the same "not configured".
          const message =
            httpStatus === 503 || httpStatus === 403
              ? t("Voice dictation is not configured")
              : t("Transcription failed");
          notifications.show({ color: "red", message });
          // Surface the error state briefly, then return to idle. Store the
          // timer so it can be cleared on unmount.
          setStatus("error");
          if (errorTimerRef.current !== null) {
            clearTimeout(errorTimerRef.current);
          }
          errorTimerRef.current = setTimeout(() => {
            errorTimerRef.current = null;
            setStatus("idle");
          }, 1500);
        });
    };

    // Notify the caller right when recording begins (before any async work) so
    // the editor can snapshot the caret position.
    try {
      optionsRef.current.onStart?.();
      recorder.start();
    } catch {
      // recorder.start() can synchronously throw (InvalidStateError /
      // NotSupportedError); clean up so the button is not left stuck and the
      // MediaStream does not leak.
      stopTracks();
      recorderRef.current = null;
      startingRef.current = false;
      notifications.show({
        color: "red",
        message: t("Could not start recording"),
      });
      setStatus("idle");
      return;
    }

    setStatus("recording");
    // Recording has truly begun; release the synchronous start guard.
    startingRef.current = false;

    // Hard stop after the configured maximum duration.
    const maxDurationMs = optionsRef.current.maxDurationMs ?? 120000;
    timerRef.current = setTimeout(() => {
      if (recorderRef.current?.state === "recording") {
        recorderRef.current.stop();
      }
    }, maxDurationMs);
  }, [status, t, clearTimer, stopTracks]);

  // Stop recording; the recorder's onstop handler then uploads the audio.
  const stop = useCallback((): void => {
    clearTimer();
    const recorder = recorderRef.current;
    if (recorder && recorder.state === "recording") {
      recorder.stop();
    }
  }, [clearTimer]);

  // Abort the current attempt and discard any captured audio.
  const cancel = useCallback((): void => {
    clearTimer();
    canceledRef.current = true;
    const recorder = recorderRef.current;
    if (recorder && recorder.state === "recording") {
      // onstop sees canceledRef and skips transcription; it also stops tracks.
      recorder.stop();
    } else {
      stopTracks();
    }
    setStatus("idle");
  }, [clearTimer, stopTracks]);

  // Clean up on unmount: stop any live recorder/stream and clear the timers.
  useEffect(() => {
    return () => {
      clearTimer();
      if (errorTimerRef.current !== null) {
        clearTimeout(errorTimerRef.current);
        errorTimerRef.current = null;
      }
      // Always mark the attempt as canceled: this makes onstop skip the upload
      // and makes a start() still waiting on getUserMedia release the stream
      // instead of recording after unmount. start() resets the flag itself.
      canceledRef.current = true;
      const recorder = recorderRef.current;
      if (recorder && recorder.state === "recording") {
        recorder.stop();
      }
      stopTracks();
    };
  }, [clearTimer, stopTracks]);

  return { status, start, stop, cancel };
}

View File

@@ -0,0 +1,17 @@
import api from "@/lib/api-client";
// Upload the recorded audio as multipart/form-data to /ai-chat/transcribe and
// resolve with the transcript. The server answers { text } inside the standard
// response envelope, which is why the value is read from `.data.text`.
// `filename` is the file name attached to the "file" part; the server decides
// by the blob's MIME type, not by this name.
export async function transcribeAudio(
  blob: Blob,
  filename = "speech.webm",
): Promise<string> {
  const payload = new FormData();
  payload.append("file", blob, filename);

  const requestConfig = {
    headers: { "Content-Type": "multipart/form-data" },
  };
  const response = await api.post<{ text: string }>(
    "/ai-chat/transcribe",
    payload,
    requestConfig,
  );
  return response.data.text;
}

View File

@@ -13,6 +13,7 @@ import { QuickInsertsGroup } from "./groups/quick-inserts-group";
import { MoreInsertsGroup } from "./groups/more-inserts-group";
import { HistoryGroup } from "./groups/history-group";
import { AskAiGroup } from "./groups/ask-ai-group";
import { DictationGroup } from "./groups/dictation-group";
import { workspaceAtom } from "@/features/user/atoms/current-user-atom";
import classes from "./fixed-toolbar.module.css";
@@ -30,6 +31,7 @@ export const FixedToolbar: FC<FixedToolbarProps> = ({
const state = useToolbarState(editor);
const workspace = useAtomValue(workspaceAtom);
const isGenerativeAiEnabled = workspace?.settings?.ai?.generative === true;
const isDictationEnabled = workspace?.settings?.ai?.dictation === true;
if (!editor || !state) return null;
@@ -65,6 +67,12 @@ export const FixedToolbar: FC<FixedToolbarProps> = ({
<MoreInsertsGroup editor={editor} templateMode={templateMode} />
<div className={classes.divider} />
<HistoryGroup editor={editor} state={state} />
{isDictationEnabled && (
<>
<div className={classes.divider} />
<DictationGroup editor={editor} />
</>
)}
</div>
</div>
<div className={classes.spacer} aria-hidden />

View File

@@ -0,0 +1,61 @@
import { FC, useRef } from "react";
import type { Editor } from "@tiptap/react";
import { MicButton } from "@/features/dictation/components/mic-button";
interface Props {
editor: Editor;
}
/**
 * Editor-toolbar dictation control. Remembers the selection when recording
 * starts and inserts the transcript (followed by a space) at that position
 * once the text arrives.
 */
export const DictationGroup: FC<Props> = ({ editor }) => {
  // Selection captured at the moment recording began; null when none is pending.
  const selectionRef = useRef<{ from: number; to: number } | null>(null);

  const rememberSelection = () => {
    const selection = editor.state.selection;
    selectionRef.current = { from: selection.from, to: selection.to };
  };

  const insertTranscript = (text: string) => {
    // Transcription is asynchronous, so the editor may already be gone; never
    // operate on a destroyed instance.
    if (!editor || editor.isDestroyed) return;

    const remembered = selectionRef.current;
    selectionRef.current = null;

    // The document can shrink while the audio is transcribed (for example via
    // a collaborative edit), so keep the remembered positions inside the
    // current document bounds.
    const upperBound = editor.state.doc.content.size;
    const clamp = (position: number) =>
      Math.min(Math.max(position, 0), upperBound);

    // The hook delivers trimmed text; the trailing space keeps words apart.
    const spaced = `${text} `;
    const insertAtCurrentCaret = () =>
      editor.chain().focus().insertContent(spaced).run();

    try {
      if (!remembered) {
        insertAtCurrentCaret();
        return;
      }
      const target = {
        from: clamp(remembered.from),
        to: clamp(remembered.to),
      };
      editor.chain().focus().insertContentAt(target, spaced).run();
    } catch {
      // The remembered range no longer fits; retry at the current caret.
      try {
        insertAtCurrentCaret();
      } catch {
        // The editor may have been destroyed in the meantime; swallow the
        // error so a dead editor cannot surface an uncaught exception.
      }
    }
  };

  return (
    <MicButton
      size="md"
      onStart={rememberSelection}
      onText={insertTranscript}
      disabled={!editor.isEditable}
    />
  );
};

View File

@@ -47,6 +47,10 @@ const formSchema = z.object({
systemPrompt: z.string(),
apiKey: z.string(),
embeddingApiKey: z.string(),
// STT-specific fields. Empty base URL / key fall back to the chat ones.
sttModel: z.string(),
sttBaseUrl: z.string(),
sttApiKey: z.string(),
});
type FormValues = z.infer<typeof formSchema>;
@@ -101,8 +105,12 @@ export default function AiProviderSettings() {
const [searchEnabled, setSearchEnabled] = useState<boolean>(
workspace?.settings?.ai?.search ?? false,
);
const [dictationEnabled, setDictationEnabled] = useState<boolean>(
workspace?.settings?.ai?.dictation ?? false,
);
const [chatToggleLoading, setChatToggleLoading] = useState(false);
const [searchToggleLoading, setSearchToggleLoading] = useState(false);
const [dictationToggleLoading, setDictationToggleLoading] = useState(false);
// Whether a key is currently stored server-side (drives the placeholder).
const [hasApiKey, setHasApiKey] = useState(false);
@@ -111,6 +119,9 @@ export default function AiProviderSettings() {
// Same, for the embedding-specific key.
const [hasEmbeddingApiKey, setHasEmbeddingApiKey] = useState(false);
const [embeddingKeyCleared, setEmbeddingKeyCleared] = useState(false);
// Same, for the STT-specific key.
const [hasSttApiKey, setHasSttApiKey] = useState(false);
const [sttKeyCleared, setSttKeyCleared] = useState(false);
// Modal for the (large) system message editor.
const [promptOpened, promptHandlers] = useDisclosure(false);
@@ -125,6 +136,9 @@ export default function AiProviderSettings() {
systemPrompt: "",
apiKey: "",
embeddingApiKey: "",
sttModel: "",
sttBaseUrl: "",
sttApiKey: "",
},
});
@@ -140,12 +154,17 @@ export default function AiProviderSettings() {
systemPrompt: settings.systemPrompt ?? "",
apiKey: "",
embeddingApiKey: "",
sttModel: settings.sttModel ?? "",
sttBaseUrl: settings.sttBaseUrl ?? "",
sttApiKey: "",
});
form.resetDirty();
setHasApiKey(settings.hasApiKey);
setKeyCleared(false);
setHasEmbeddingApiKey(settings.hasEmbeddingApiKey);
setEmbeddingKeyCleared(false);
setHasSttApiKey(settings.hasSttApiKey);
setSttKeyCleared(false);
// eslint-disable-next-line react-hooks/exhaustive-deps
}, [settings]);
@@ -160,6 +179,10 @@ export default function AiProviderSettings() {
baseUrl: values.baseUrl,
embeddingBaseUrl: values.embeddingBaseUrl,
systemPrompt: values.systemPrompt,
// The STT base URL is optional; empty falls back to the chat base URL
// server-side.
sttModel: values.sttModel,
sttBaseUrl: values.sttBaseUrl,
};
// Key semantics (never send the stored key back):
@@ -179,6 +202,13 @@ export default function AiProviderSettings() {
payload.embeddingApiKey = "";
}
// Same write-only semantics for the STT-specific key.
if (values.sttApiKey.length > 0) {
payload.sttApiKey = values.sttApiKey;
} else if (sttKeyCleared) {
payload.sttApiKey = "";
}
return payload;
}
@@ -191,6 +221,9 @@ export default function AiProviderSettings() {
setHasEmbeddingApiKey(updated.hasEmbeddingApiKey);
setEmbeddingKeyCleared(false);
form.setFieldValue("embeddingApiKey", "");
setHasSttApiKey(updated.hasSttApiKey);
setSttKeyCleared(false);
form.setFieldValue("sttApiKey", "");
form.resetDirty();
}
@@ -206,6 +239,12 @@ export default function AiProviderSettings() {
form.setFieldValue("embeddingApiKey", "");
}
// Mark the stored STT key for removal on the next save: empty the input,
// hide the "key is set" state, and remember that the key was cleared.
function handleClearSttKey() {
  form.setFieldValue("sttApiKey", "");
  setHasSttApiKey(false);
  setSttKeyCleared(true);
}
// Optimistic toggle for the "AI chat" feature (settings.ai.chat).
async function handleToggleChat(value: boolean) {
setChatToggleLoading(true);
@@ -268,6 +307,34 @@ export default function AiProviderSettings() {
}
}
// Optimistic toggle for the "Voice dictation" feature (settings.ai.dictation):
// flip the switch immediately, persist the change, and roll the switch back if
// the request fails.
async function handleToggleDictation(value: boolean) {
  const rollbackValue = dictationEnabled;
  setDictationToggleLoading(true);
  setDictationEnabled(value);

  try {
    const updated = await updateWorkspace({ aiDictation: value });
    // Write the new flag into the workspace stored in the atom.
    const aiSettings = { ...updated.settings?.ai, dictation: value };
    const mergedSettings = { ...updated.settings, ai: aiSettings };
    setWorkspace({ ...updated, settings: mergedSettings });
    notifications.show({ message: t("Updated successfully") });
  } catch (err) {
    setDictationEnabled(rollbackValue);
    const serverMessage = (
      err as { response?: { data?: { message?: string } } }
    )?.response?.data?.message;
    notifications.show({
      message: serverMessage ?? t("Failed to update data"),
      color: "red",
    });
  } finally {
    setDictationToggleLoading(false);
  }
}
// Admins only — match the previous behavior.
if (!isAdmin) {
return (
@@ -294,6 +361,11 @@ export default function AiProviderSettings() {
"/embeddings",
form.values.baseUrl,
);
const sttResolved = resolveUrl(
form.values.sttBaseUrl,
"/audio/transcriptions",
form.values.baseUrl,
);
const monoFont = "ui-monospace, Menlo, monospace";
@@ -541,8 +613,8 @@ export default function AiProviderSettings() {
</Box>
</Paper>
{/* Card 3 — Voice / STT (disabled stub, not wired to the form/backend) */}
<Paper withBorder radius="md" p="lg" opacity={0.6}>
{/* Card 3 — Voice / STT */}
<Paper withBorder radius="md" p="lg">
<Group justify="space-between" align="center" wrap="nowrap">
<Group gap="xs" align="center" wrap="nowrap">
<StatusDot status="idle" />
@@ -551,8 +623,9 @@ export default function AiProviderSettings() {
<Switch
label={t("Voice dictation")}
labelPosition="left"
checked={false}
disabled
checked={dictationEnabled}
disabled={dictationToggleLoading}
onChange={(e) => handleToggleDictation(e.currentTarget.checked)}
/>
</Group>
<Text size="xs" c="dimmed" mt={4} mb="md">
@@ -562,33 +635,46 @@ export default function AiProviderSettings() {
</Text>
<Group grow align="flex-start">
<TextInput label={t("Model")} value="" disabled readOnly />
<PasswordInput label={t("API key")} value="" disabled readOnly />
</Group>
<TextInput mt="sm" label={t("Base URL")} value="" disabled readOnly />
<Group mt="md">
<Button variant="default" size="sm" disabled>
{t("Test endpoint")}
</Button>
<TextInput
label={t("Model")}
disabled={isLoading}
{...form.getInputProps("sttModel")}
/>
<Stack gap={4}>
<PasswordInput
label={t("API key")}
placeholder={
hasSttApiKey
? t("•••• set")
: t("Leave empty to use the chat API key")
}
autoComplete="off"
{...form.getInputProps("sttApiKey")}
/>
{hasSttApiKey && (
<Anchor
component="button"
type="button"
c="red"
size="xs"
onClick={handleClearSttKey}
>
{t("Clear")}
</Anchor>
)}
</Stack>
</Group>
<Box
mt="md"
mx="calc(var(--mantine-spacing-lg) * -1)"
mb="calc(var(--mantine-spacing-lg) * -1)"
px="lg"
py="md"
style={{
borderTop: "1px solid var(--mantine-color-default-border)",
background: "var(--mantine-color-default-hover)",
borderRadius: "0 0 var(--mantine-radius-md) var(--mantine-radius-md)",
}}
>
<Text size="xs" c="dimmed">
{t("Voice dictation is not available yet.")}
</Text>
</Box>
<TextInput
mt="sm"
label={t("Base URL")}
placeholder={t("Leave empty to use the chat base URL")}
disabled={isLoading}
{...form.getInputProps("sttBaseUrl")}
/>
<Text size="xs" c="dimmed" mt={4} style={{ fontFamily: monoFont }} truncate>
{t("Resolves to {{url}}", { url: sttResolved })}
</Text>
</Paper>
{/* Nested: external MCP tools the agent calls out to */}

View File

@@ -16,6 +16,12 @@ export interface IAiSettings {
systemPrompt?: string;
hasApiKey: boolean;
hasEmbeddingApiKey: boolean;
// STT-specific settings. `sttBaseUrl` is the RAW stored value (empty means
// "uses the chat base URL"). `hasSttApiKey` indicates whether an STT-specific
// key is stored (empty means "uses the chat API key").
sttModel?: string;
sttBaseUrl?: string;
hasSttApiKey: boolean;
// RAG indexing coverage (pages indexed for semantic search).
indexedPages: number;
totalPages: number;
@@ -35,6 +41,10 @@ export interface IAiSettingsUpdate {
systemPrompt?: string;
apiKey?: string;
embeddingApiKey?: string;
sttModel?: string;
sttBaseUrl?: string;
// Write-only STT key (same semantics as `apiKey` / `embeddingApiKey`).
sttApiKey?: string;
}
// Result of a connection test against the configured provider.

View File

@@ -24,6 +24,7 @@ export interface IWorkspace {
disablePublicSharing?: boolean;
mcpEnabled?: boolean;
aiChat?: boolean;
aiDictation?: boolean;
trashRetentionDays?: number;
restrictApiToAdmins?: boolean;
allowMemberTemplates?: boolean;
@@ -46,6 +47,7 @@ export interface IWorkspaceAiSettings {
generative?: boolean;
mcp?: boolean;
chat?: boolean;
dictation?: boolean;
}
export interface IWorkspaceSharingSettings {

View File

@@ -1,4 +1,5 @@
import {
BadRequestException,
Body,
Controller,
ForbiddenException,
@@ -9,6 +10,7 @@ import {
Req,
Res,
UseGuards,
UseInterceptors,
} from '@nestjs/common';
import { Throttle } from '@nestjs/throttler';
import { FastifyReply, FastifyRequest } from 'fastify';
@@ -22,7 +24,9 @@ import { AiChatRepo } from '@docmost/db/repos/ai-chat/ai-chat.repo';
import { AiChatMessageRepo } from '@docmost/db/repos/ai-chat/ai-chat-message.repo';
import { UserThrottlerGuard } from '../../integrations/throttle/user-throttler.guard';
import { AI_CHAT_THROTTLER } from '../../integrations/throttle/throttler-names';
import { FileInterceptor } from '../../common/interceptors/file.interceptor';
import { AiChatService, AiChatStreamBody } from './ai-chat.service';
import { AiTranscriptionService } from './ai-transcription.service';
import {
ChatIdDto,
GetChatMessagesDto,
@@ -43,6 +47,7 @@ export class AiChatController {
private readonly aiChatService: AiChatService,
private readonly aiChatRepo: AiChatRepo,
private readonly aiChatMessageRepo: AiChatMessageRepo,
private readonly aiTranscription: AiTranscriptionService,
) {}
/** List the requesting user's chats in this workspace (paginated). */
@@ -180,6 +185,74 @@ export class AiChatController {
}
}
/**
 * Transcribe an uploaded audio clip to text using the workspace STT model.
 *
 * Responds 403 unless settings.ai.dictation is exactly `true`, and 400 when
 * no file was uploaded, the file exceeds 25MB, or its base MIME type is not
 * on the audio whitelist. Returns { text } on success.
 */
@HttpCode(HttpStatus.OK)
@UseGuards(JwtAuthGuard, UserThrottlerGuard)
@Throttle({ [AI_CHAT_THROTTLER]: { limit: 20, ttl: 60000 } })
@Post('transcribe')
@UseInterceptors(FileInterceptor)
async transcribe(
  @Req() req: any,
  @AuthWorkspace() workspace: Workspace,
): Promise<{ text: string }> {
  // Gate: dictation must be explicitly enabled for the workspace.
  const aiFlags = (
    (workspace.settings ?? {}) as { ai?: { dictation?: boolean } }
  ).ai;
  if (aiFlags?.dictation !== true) {
    throw new ForbiddenException('Dictation is disabled');
  }

  // Translate the multipart size-limit error (413) into a 400; every other
  // error is passed through untouched.
  const toClientError = (err: any) =>
    err?.statusCode === 413
      ? new BadRequestException('Audio file too large (max 25MB)')
      : err;

  let upload = null;
  try {
    // Whisper hard-caps uploads at 25MB; allow a single file.
    upload = await req.file({
      limits: { fileSize: 25 * 1024 * 1024, files: 1 },
    });
  } catch (err: any) {
    throw toClientError(err);
  }
  if (!upload) throw new BadRequestException('No audio uploaded');

  // Audio container types produced by browser MediaRecorder (Chrome/FF:
  // webm/opus, Safari: mp4) plus common STT-accepted formats.
  const acceptedTypes = [
    'audio/webm',
    'audio/ogg',
    'audio/mp4',
    'audio/mpeg',
    'audio/wav',
    'audio/x-wav',
    'audio/wave',
    'audio/m4a',
    'audio/x-m4a',
  ];
  // MediaRecorder mimetypes carry parameters (e.g. "audio/webm;codecs=opus");
  // only the base type is compared.
  const [typeWithoutParams] = upload.mimetype.split(';');
  const normalizedType = typeWithoutParams.trim().toLowerCase();
  if (!acceptedTypes.includes(normalizedType)) {
    throw new BadRequestException('Unsupported audio format');
  }

  let audio: Buffer;
  try {
    // With @fastify/multipart throwFileSizeLimit:true, the 25MB cap is
    // enforced while the stream is consumed (here), not at req.file().
    audio = await upload.toBuffer();
  } catch (err: any) {
    throw toClientError(err);
  }

  const text = await this.aiTranscription.transcribe(workspace.id, audio);
  return { text };
}
/**
* Ensure the chat exists, belongs to this workspace, AND was created by the
* requesting user (per-user isolation). Throws ForbiddenException otherwise.

View File

@@ -3,6 +3,7 @@ import { AiModule } from '../../integrations/ai/ai.module';
import { TokenModule } from '../auth/token.module';
import { AiChatController } from './ai-chat.controller';
import { AiChatService } from './ai-chat.service';
import { AiTranscriptionService } from './ai-transcription.service';
import { AiChatToolsService } from './tools/ai-chat-tools.service';
import { EmbeddingModule } from './embedding/embedding.module';
import { ExternalMcpModule } from './external-mcp/external-mcp.module';
@@ -21,6 +22,6 @@ import { ExternalMcpModule } from './external-mcp/external-mcp.module';
@Module({
imports: [AiModule, TokenModule, EmbeddingModule, ExternalMcpModule],
controllers: [AiChatController],
providers: [AiChatService, AiChatToolsService],
providers: [AiChatService, AiTranscriptionService, AiChatToolsService],
})
export class AiChatModule {}

View File

@@ -0,0 +1,20 @@
import { Injectable } from '@nestjs/common';
import { experimental_transcribe as transcribe } from 'ai';
import { AiService } from '../../integrations/ai/ai.service';
/**
 * Speech-to-text for uploaded audio, using the per-workspace STT model.
 * A thin wrapper around the AI SDK's experimental_transcribe that performs no
 * logging of its own, so neither the audio nor the key is written to logs
 * here.
 */
@Injectable()
export class AiTranscriptionService {
  constructor(private readonly ai: AiService) {}

  /**
   * Transcribe an audio buffer with the workspace's transcription model.
   *
   * @param workspaceId Workspace whose STT model is resolved through AiService.
   * @param audio Raw bytes of the uploaded audio.
   * @returns The transcript with surrounding whitespace removed.
   */
  async transcribe(workspaceId: string, audio: Uint8Array): Promise<string> {
    const sttModel = await this.ai.getTranscriptionModel(workspaceId);
    const result = await transcribe({ model: sttModel, audio });
    return result.text.trim();
  }
}

View File

@@ -49,6 +49,10 @@ export class UpdateWorkspaceDto extends PartialType(CreateWorkspaceDto) {
@IsBoolean()
aiChat: boolean;
@IsOptional()
@IsBoolean()
aiDictation: boolean;
@IsOptional()
@IsInt()
@Min(1)

View File

@@ -497,6 +497,20 @@ export class WorkspaceService {
);
}
if (typeof updateWorkspaceDto.aiDictation !== 'undefined') {
const prev = settingsBefore?.ai?.dictation ?? false;
if (prev !== updateWorkspaceDto.aiDictation) {
before.aiDictation = prev;
after.aiDictation = updateWorkspaceDto.aiDictation;
}
await this.workspaceRepo.updateAiSettings(
workspaceId,
'dictation',
updateWorkspaceDto.aiDictation,
trx,
);
}
delete updateWorkspaceDto.restrictApiToAdmins;
delete updateWorkspaceDto.aiSearch;
delete updateWorkspaceDto.generativeAi;
@@ -504,6 +518,7 @@ export class WorkspaceService {
delete updateWorkspaceDto.mcpEnabled;
delete updateWorkspaceDto.allowMemberTemplates;
delete updateWorkspaceDto.aiChat;
delete updateWorkspaceDto.aiDictation;
await this.workspaceRepo.updateWorkspace(
updateWorkspaceDto,

View File

@@ -0,0 +1,18 @@
import { type Kysely } from 'kysely';
// Add the nullable `stt_api_key_enc` column to `ai_provider_credentials`.
// It stores the encrypted, STT-specific provider key separately from
// `api_key_enc` (the chat key) so the transcription model can use a different
// token. When NULL, the STT model falls back to `api_key_enc`.
export async function up(db: Kysely<any>): Promise<void> {
  const credentialsTable = db.schema.alterTable('ai_provider_credentials');
  // No column builder is passed: the column carries no constraints or default.
  await credentialsTable.addColumn('stt_api_key_enc', 'text').execute();
}
// Revert the migration by dropping `stt_api_key_enc` from
// `ai_provider_credentials`.
export async function down(db: Kysely<any>): Promise<void> {
  const credentialsTable = db.schema.alterTable('ai_provider_credentials');
  await credentialsTable.dropColumn('stt_api_key_enc').execute();
}

View File

@@ -98,4 +98,42 @@ export class AiProviderCredentialsRepo {
.where('driver', '=', driver)
.execute();
}
// Store the STT-specific encrypted key for a (workspace, driver) pair.
// When no row exists one is inserted with only workspaceId, driver and
// sttApiKeyEnc supplied (`apiKeyEnc` is nullable). On a (workspaceId, driver)
// conflict only `sttApiKeyEnc` and `updatedAt` are written, so the chat and
// embedding keys stay as they are. Resolves with the stored row.
async upsertSttKey(
  workspaceId: string,
  driver: string,
  sttApiKeyEnc: string,
  trx?: KyselyTransaction,
): Promise<AiProviderCredentials> {
  const executor = dbOrTx(this.db, trx);
  const upsertQuery = executor
    .insertInto('aiProviderCredentials')
    .values({ workspaceId, driver, sttApiKeyEnc })
    .onConflict((conflict) =>
      conflict
        .columns(['workspaceId', 'driver'])
        .doUpdateSet({ sttApiKeyEnc, updatedAt: new Date() }),
    )
    .returningAll();
  return upsertQuery.executeTakeFirst();
}
// Null out the STT-specific key for a (workspace, driver) pair and bump
// `updatedAt`. No other column is written, so the chat & embedding keys are
// kept. Runs on `trx` when one is supplied.
async clearSttKey(
  workspaceId: string,
  driver: string,
  trx?: KyselyTransaction,
): Promise<void> {
  const executor = dbOrTx(this.db, trx);
  const reset = executor
    .updateTable('aiProviderCredentials')
    .set({ sttApiKeyEnc: null, updatedAt: new Date() })
    .where('workspaceId', '=', workspaceId)
    .where('driver', '=', driver);
  await reset.execute();
}
}

View File

@@ -239,7 +239,7 @@ export class WorkspaceRepo {
// is a real jsonb object, never a double-encoded string. The CASE self-heals
// workspaces whose settings.ai.provider was previously corrupted into an
// array/string.
const ALLOWED = ['driver', 'chatModel', 'embeddingModel', 'baseUrl', 'embeddingBaseUrl', 'systemPrompt'];
const ALLOWED = ['driver', 'chatModel', 'embeddingModel', 'baseUrl', 'embeddingBaseUrl', 'sttModel', 'sttBaseUrl', 'systemPrompt'];
const entries = Object.entries(provider).filter(
([k, v]) => v !== undefined && ALLOWED.includes(k),
);

View File

@@ -14,6 +14,8 @@ export interface AiProviderCredentials {
apiKeyEnc: string | null;
// Encrypted, embedding-specific provider key. Falls back to apiKeyEnc when null.
embeddingApiKeyEnc: string | null;
// Encrypted, STT-specific provider key. Falls back to apiKeyEnc when null.
sttApiKeyEnc: string | null;
createdAt: Generated<Timestamp>;
updatedAt: Generated<Timestamp>;
}

View File

@@ -28,6 +28,9 @@ export interface UpdateAiSettingsInput {
systemPrompt?: string;
apiKey?: string;
embeddingApiKey?: string;
sttModel?: string;
sttBaseUrl?: string;
sttApiKey?: string;
}
/**
@@ -113,6 +116,7 @@ export class AiSettingsService {
driver: provider.driver,
chatModel: provider.chatModel,
embeddingModel: provider.embeddingModel,
sttModel: provider.sttModel,
baseUrl: provider.baseUrl,
systemPrompt: provider.systemPrompt,
};
@@ -122,6 +126,10 @@ export class AiSettingsService {
// unconditionally.
config.embeddingBaseUrl = provider.embeddingBaseUrl || provider.baseUrl;
// Effective STT base URL: the STT-specific value, else the chat base URL.
// Set unconditionally, same rationale as embeddingBaseUrl.
config.sttBaseUrl = provider.sttBaseUrl || provider.baseUrl;
if (provider.driver !== 'ollama') {
const creds = await this.aiProviderCredentialsRepo.find(
workspaceId,
@@ -134,6 +142,10 @@ export class AiSettingsService {
config.embeddingApiKey = creds?.embeddingApiKeyEnc
? this.secretBox.decryptSecret(creds.embeddingApiKeyEnc)
: config.apiKey;
// Effective STT key: the STT-specific key, else the chat key.
config.sttApiKey = creds?.sttApiKeyEnc
? this.secretBox.decryptSecret(creds.sttApiKeyEnc)
: config.apiKey;
}
return config;
@@ -151,6 +163,7 @@ export class AiSettingsService {
let hasApiKey = false;
let hasEmbeddingApiKey = false;
let hasSttApiKey = false;
if (provider.driver) {
const creds = await this.aiProviderCredentialsRepo.find(
workspaceId,
@@ -158,6 +171,7 @@ export class AiSettingsService {
);
hasApiKey = !!creds?.apiKeyEnc;
hasEmbeddingApiKey = !!creds?.embeddingApiKeyEnc;
hasSttApiKey = !!creds?.sttApiKeyEnc;
}
// totalPages now counts only pages with embeddable content (non-empty text
@@ -174,9 +188,12 @@ export class AiSettingsService {
embeddingModel: provider.embeddingModel,
baseUrl: provider.baseUrl,
embeddingBaseUrl: provider.embeddingBaseUrl,
sttModel: provider.sttModel,
sttBaseUrl: provider.sttBaseUrl,
systemPrompt: provider.systemPrompt,
hasApiKey,
hasEmbeddingApiKey,
hasSttApiKey,
indexedPages,
totalPages,
};
@@ -197,7 +214,7 @@ export class AiSettingsService {
workspaceId: string,
dto: UpdateAiSettingsInput,
): Promise<MaskedAiSettings> {
const { apiKey, embeddingApiKey, ...nonSecret } = dto;
const { apiKey, embeddingApiKey, sttApiKey, ...nonSecret } = dto;
// Persist non-secret provider fields (only those present in the partial).
const providerPatch: Partial<AiProviderSettings> = {};
@@ -207,6 +224,8 @@ export class AiSettingsService {
'embeddingModel',
'baseUrl',
'embeddingBaseUrl',
'sttModel',
'sttBaseUrl',
'systemPrompt',
] as const) {
if (nonSecret[key] !== undefined) {
@@ -222,7 +241,11 @@ export class AiSettingsService {
// Key handling (write-only). All keys (chat, embedding, STT) share the same
// target driver and the same "driver required" guard, resolved once.
if (apiKey !== undefined || embeddingApiKey !== undefined) {
if (
apiKey !== undefined ||
embeddingApiKey !== undefined ||
sttApiKey !== undefined
) {
const stored = await this.readProvider(workspaceId);
const targetDriver = dto.driver ?? stored.driver;
if (!targetDriver) {
@@ -264,6 +287,23 @@ export class AiSettingsService {
);
}
}
// STT key.
if (sttApiKey !== undefined) {
if (sttApiKey === '') {
await this.aiProviderCredentialsRepo.clearSttKey(
workspaceId,
targetDriver,
);
} else {
const enc = this.secretBox.encryptSecret(sttApiKey);
await this.aiProviderCredentialsRepo.upsertSttKey(
workspaceId,
targetDriver,
enc,
);
}
}
}
return this.getMasked(workspaceId);

View File

@@ -0,0 +1,13 @@
import { ServiceUnavailableException } from '@nestjs/common';
/**
 * Signals that the workspace has no usable speech-to-text (STT) configuration.
 *
 * It extends ServiceUnavailableException and carries a fixed message. Keeping
 * it separate from the chat and embedding "not configured" exceptions lets the
 * transcription endpoint report its own unavailability (503) whether or not
 * chat/embeddings are configured.
 */
export class AiSttNotConfiguredException extends ServiceUnavailableException {
  constructor() {
    super('AI STT model not configured');
  }
}

View File

@@ -4,6 +4,7 @@ import {
generateText,
type EmbeddingModel,
type LanguageModel,
type TranscriptionModel,
} from 'ai';
import { createOpenAI } from '@ai-sdk/openai';
import { createGoogleGenerativeAI } from '@ai-sdk/google';
@@ -11,6 +12,7 @@ import { createOllama } from 'ai-sdk-ollama';
import { AiSettingsService } from './ai-settings.service';
import { AiNotConfiguredException } from './ai-not-configured.exception';
import { AiEmbeddingNotConfiguredException } from './ai-embedding-not-configured.exception';
import { AiSttNotConfiguredException } from './ai-stt-not-configured.exception';
import { describeProviderError } from './ai-error.util';
/**
@@ -106,6 +108,26 @@ export class AiService {
}
}
/**
 * Build the speech-to-text (transcription) model for a workspace.
 *
 * The resolved workspace AI config supplies the model id, endpoint and key.
 * The model is always created through the OpenAI provider factory (only
 * @ai-sdk/openai exposes .transcription()), whatever chat driver is
 * configured, so the target endpoint must speak the OpenAI-compatible
 * /v1/audio/transcriptions API. The endpoint is the STT base URL, else the
 * chat base URL; the key is the resolved STT key (AiSettingsService.resolve
 * already applies the chat-key fallback). Nothing is cached here: the model
 * is created on each call, and the decrypted key is never logged.
 *
 * @param workspaceId workspace whose AI settings are resolved
 * @returns a transcription model bound to the workspace's STT settings
 * @throws AiSttNotConfiguredException (-> 503) when no STT model is set
 */
async getTranscriptionModel(workspaceId: string): Promise<TranscriptionModel> {
  const config = await this.aiSettings.resolve(workspaceId);
  if (!config?.sttModel) {
    throw new AiSttNotConfiguredException();
  }
  const provider = createOpenAI({
    // Keyless self-hosted whisper needs no key; pass a placeholder instead.
    apiKey: config.sttApiKey ?? 'unused',
    // STT-specific endpoint first, chat endpoint otherwise.
    baseURL: config.sttBaseUrl || config.baseUrl,
  });
  return provider.transcription(config.sttModel);
}
/**
* Embed a batch of texts with the workspace embedding model. Returns one
* vector per input, in the same order. Thin wrapper over the AI SDK's

View File

@@ -21,6 +21,9 @@ export interface AiProviderSettings {
baseUrl?: string;
// Embedding-specific base URL. Falls back to `baseUrl` when empty/unset.
embeddingBaseUrl?: string;
sttModel?: string;
// STT-specific base URL. Falls back to baseUrl when empty/unset.
sttBaseUrl?: string;
systemPrompt?: string;
}
@@ -31,12 +34,15 @@ export interface AiProviderSettings {
*
* `embeddingBaseUrl` / `embeddingApiKey` are the embedding-specific endpoint and
* key, already resolved with the chat-value fallback applied by `resolve`.
* `sttBaseUrl` / `sttApiKey` are likewise the STT-specific endpoint and key,
* already resolved with the chat-value fallback applied by `resolve`.
*/
export interface ResolvedAiConfig extends Partial<AiProviderSettings> {
driver?: AiDriver;
chatModel?: string;
apiKey?: string;
embeddingApiKey?: string;
sttApiKey?: string;
}
/**
@@ -50,9 +56,12 @@ export interface MaskedAiSettings {
embeddingModel?: string;
baseUrl?: string;
embeddingBaseUrl?: string;
sttModel?: string;
sttBaseUrl?: string;
systemPrompt?: string;
hasApiKey: boolean;
hasEmbeddingApiKey: boolean;
hasSttApiKey: boolean;
// RAG indexing coverage for the settings UI.
indexedPages: number;
totalPages: number;

View File

@@ -4,10 +4,10 @@ import { AI_DRIVERS, AiDriver } from '../ai.types';
/**
* Admin update payload for the workspace AI provider settings.
*
* `apiKey` / `embeddingApiKey` are write-only (§8.2): provided → stored
* encrypted, '' → cleared, absent → left untouched. They are NEVER returned by
* any endpoint. The global ValidationPipe runs with `whitelist: true`, so
* unknown fields are stripped.
* `apiKey` / `embeddingApiKey` / `sttApiKey` are write-only (§8.2): provided →
* stored encrypted, '' → cleared, absent → left untouched. They are NEVER
* returned by any endpoint. The global ValidationPipe runs with
* `whitelist: true`, so unknown fields are stripped.
*/
export class UpdateAiSettingsDto {
@IsOptional()
@@ -41,4 +41,16 @@ export class UpdateAiSettingsDto {
@IsOptional()
@IsString()
embeddingApiKey?: string;
@IsOptional()
@IsString()
sttModel?: string;
@IsOptional()
@IsString()
sttBaseUrl?: string;
@IsOptional()
@IsString()
sttApiKey?: string;
}