diff --git a/apps/client/public/locales/en-US/translation.json b/apps/client/public/locales/en-US/translation.json index 847b67a4..591b362a 100644 --- a/apps/client/public/locales/en-US/translation.json +++ b/apps/client/public/locales/en-US/translation.json @@ -1181,5 +1181,13 @@ "Embeddings": "Embeddings", "Leave empty to use the chat API key": "Leave empty to use the chat API key", "Leave empty to use the chat base URL": "Leave empty to use the chat base URL", - "Reindex now": "Reindex now" + "Reindex now": "Reindex now", + "Start dictation": "Start dictation", + "Stop recording": "Stop recording", + "Transcribing…": "Transcribing…", + "Microphone access denied": "Microphone access denied", + "No microphone found": "No microphone found", + "Could not start recording": "Could not start recording", + "Transcription failed": "Transcription failed", + "Voice dictation is not configured": "Voice dictation is not configured" } diff --git a/apps/client/src/features/ai-chat/components/chat-input.tsx b/apps/client/src/features/ai-chat/components/chat-input.tsx index 42649aa6..3bb67535 100644 --- a/apps/client/src/features/ai-chat/components/chat-input.tsx +++ b/apps/client/src/features/ai-chat/components/chat-input.tsx @@ -2,8 +2,10 @@ import { KeyboardEvent } from "react"; import { ActionIcon, Group, Textarea, Tooltip } from "@mantine/core"; import { IconPlayerStopFilled, IconSend } from "@tabler/icons-react"; import { useTranslation } from "react-i18next"; -import { useAtom } from "jotai"; +import { useAtom, useAtomValue } from "jotai"; import { aiChatDraftAtom } from "@/features/ai-chat/atoms/ai-chat-atom.ts"; +import { workspaceAtom } from "@/features/user/atoms/current-user-atom"; +import { MicButton } from "@/features/dictation/components/mic-button"; interface ChatInputProps { onSend: (text: string) => void; @@ -25,6 +27,8 @@ export default function ChatInput({ }: ChatInputProps) { const { t } = useTranslation(); const [value, setValue] = useAtom(aiChatDraftAtom); + const workspace = useAtomValue(workspaceAtom); + const isDictationEnabled = workspace?.settings?.ai?.dictation === true; const send = (): void => { const text = value.trim(); @@ -57,6 +61,13 @@ export default function ChatInput({ // switch), so a fresh chat lands with the cursor ready in the field. autoFocus /> + {isDictationEnabled && ( + setValue((v) => (v ? `${v} ${text}` : text))} + /> + )} {isStreaming ? ( void; + onStart?: () => void; + disabled?: boolean; + // Mantine ActionIcon size token; "lg" matches the chat composer, "md" the + // editor toolbar. + size?: "md" | "lg"; +} + +/** + * Self-contained dictation toggle. Owns its own capture state machine: a click + * starts recording (mic icon), a second click stops it (stop icon), and while + * the audio is being transcribed it shows a spinner and is disabled to prevent + * overlapping requests. + */ +export const MicButton: FC = ({ + onText, + onStart, + disabled, + size = "lg", +}) => { + const { t } = useTranslation(); + const { status, start, stop } = useDictation({ onText, onStart }); + const iconSize = size === "lg" ? 18 : 16; + + if (status === "recording") { + return ( + + + + + + ); + } + + if (status === "transcribing" || status === "error") { + return ( + + + + + + ); + } + + return ( + + void start()} + disabled={disabled} + aria-label={t("Start dictation")} + > + + + + ); +}; diff --git a/apps/client/src/features/dictation/hooks/use-dictation.ts b/apps/client/src/features/dictation/hooks/use-dictation.ts new file mode 100644 index 00000000..059949f0 --- /dev/null +++ b/apps/client/src/features/dictation/hooks/use-dictation.ts @@ -0,0 +1,260 @@ +import { useCallback, useEffect, useRef, useState } from "react"; +import { notifications } from "@mantine/notifications"; +import { useTranslation } from "react-i18next"; +import { transcribeAudio } from "@/features/dictation/services/dictation-service"; + +export type DictationStatus = "idle" | "recording" | "transcribing" | "error"; + +interface UseDictationOptions { + onText: (text: string) => void; + onStart?: () => void; + maxDurationMs?: number; +} + +interface UseDictationResult { + status: DictationStatus; + start: () => Promise; + stop: () => void; + cancel: () => void; +} + +// Candidate container/codec combinations in preference order. The first one the +// browser supports wins; if none do we let MediaRecorder pick its own default. +const MIME_CANDIDATES = [ + "audio/webm;codecs=opus", + "audio/webm", + "audio/mp4", + "audio/ogg;codecs=opus", + "audio/ogg", +]; + +// Derive a sensible upload filename from the recorded MIME type. The server keys +// off the blob's MIME, so this only affects the part name, but a matching +// extension keeps things tidy. +function filenameForMime(mime: string): string { + if (mime.includes("mp4")) return "speech.mp4"; + if (mime.includes("ogg")) return "speech.ogg"; + return "speech.webm"; +} + +function pickMimeType(): string | undefined { + if (typeof MediaRecorder === "undefined") return undefined; + for (const candidate of MIME_CANDIDATES) { + if (MediaRecorder.isTypeSupported?.(candidate)) return candidate; + } + return undefined; +} + +/** + * Encapsulates the browser audio-capture state machine: request the mic, record + * with MediaRecorder, then POST the blob for transcription. Refs hold the live + * recorder/stream/chunks/timer/cancel flag so component re-renders never lose + * them, and every exit path stops the MediaStream tracks. + */ +export function useDictation( + options: UseDictationOptions, +): UseDictationResult { + const { t } = useTranslation(); + const [status, setStatus] = useState("idle"); + + // Keep the latest callbacks in a ref so the recorder's onstop closure always + // calls the current handlers without re-creating the recorder. + const optionsRef = useRef(options); + optionsRef.current = options; + + const recorderRef = useRef(null); + const streamRef = useRef(null); + const chunksRef = useRef([]); + const timerRef = useRef | null>(null); + const errorTimerRef = useRef | null>(null); + const canceledRef = useRef(false); + const startingRef = useRef(false); + + const clearTimer = useCallback(() => { + if (timerRef.current !== null) { + clearTimeout(timerRef.current); + timerRef.current = null; + } + }, []); + + const stopTracks = useCallback(() => { + streamRef.current?.getTracks().forEach((track) => track.stop()); + streamRef.current = null; + }, []); + + const start = useCallback(async (): Promise => { + // Synchronous live guard: status is stale between renders, so also block on + // refs to prevent a double-click from opening two MediaStreams (the first + // would leak). + if (startingRef.current || recorderRef.current || streamRef.current) return; + if (status !== "idle") return; + startingRef.current = true; + + let stream: MediaStream; + try { + stream = await navigator.mediaDevices.getUserMedia({ audio: true }); + } catch (err) { + const name = (err as { name?: string })?.name; + let message: string; + if (name === "NotAllowedError" || name === "SecurityError") { + message = t("Microphone access denied"); + } else if (name === "NotFoundError" || name === "OverconstrainedError") { + message = t("No microphone found"); + } else { + message = t("Could not start recording"); + } + notifications.show({ color: "red", message }); + setStatus("idle"); + startingRef.current = false; + return; + } + + streamRef.current = stream; + chunksRef.current = []; + canceledRef.current = false; + + const mimeType = pickMimeType(); + let recorder: MediaRecorder; + try { + recorder = new MediaRecorder( + stream, + mimeType ? { mimeType } : undefined, + ); + } catch { + // The stream was acquired but the recorder failed to construct; stop the + // tracks so the MediaStream does not leak before bailing out. + stopTracks(); + notifications.show({ + color: "red", + message: t("Could not start recording"), + }); + setStatus("idle"); + startingRef.current = false; + return; + } + recorderRef.current = recorder; + + recorder.ondataavailable = (e: BlobEvent) => { + if (e.data && e.data.size > 0) chunksRef.current.push(e.data); + }; + + recorder.onstop = () => { + clearTimer(); + const recordedMime = recorder.mimeType || mimeType || "audio/webm"; + const wasCanceled = canceledRef.current; + + // Stop the mic tracks regardless of how we got here. + stopTracks(); + recorderRef.current = null; + + if (wasCanceled) { + chunksRef.current = []; + setStatus("idle"); + return; + } + + const blob = new Blob(chunksRef.current, { type: recordedMime }); + chunksRef.current = []; + + setStatus("transcribing"); + void transcribeAudio(blob, filenameForMime(recordedMime)) + .then((text) => { + // Whisper often returns a leading space; insert the trimmed value. + const trimmed = text.trim(); + if (trimmed.length > 0) optionsRef.current.onText(trimmed); + setStatus("idle"); + }) + .catch((err: unknown) => { + const httpStatus = (err as { response?: { status?: number } }) + ?.response?.status; + // The server returns 503 when dictation is unconfigured and 403 when + // it is disabled server-side; both map to the same "not configured". + const message = + httpStatus === 503 || httpStatus === 403 + ? t("Voice dictation is not configured") + : t("Transcription failed"); + notifications.show({ color: "red", message }); + // Surface the error state briefly, then return to idle. Store the + // timer so it can be cleared on unmount. + setStatus("error"); + if (errorTimerRef.current !== null) { + clearTimeout(errorTimerRef.current); + } + errorTimerRef.current = setTimeout(() => { + errorTimerRef.current = null; + setStatus("idle"); + }, 1500); + }); + }; + + // Notify the caller right when recording begins (before any async work) so + // the editor can snapshot the caret position. + try { + optionsRef.current.onStart?.(); + recorder.start(); + } catch { + // recorder.start() can synchronously throw (InvalidStateError / + // NotSupportedError); clean up so the button is not left stuck and the + // MediaStream does not leak. + stopTracks(); + recorderRef.current = null; + startingRef.current = false; + notifications.show({ + color: "red", + message: t("Could not start recording"), + }); + setStatus("idle"); + return; + } + setStatus("recording"); + // Recording has truly begun; release the synchronous start guard. + startingRef.current = false; + + const maxDurationMs = optionsRef.current.maxDurationMs ?? 120000; + timerRef.current = setTimeout(() => { + if (recorderRef.current?.state === "recording") { + recorderRef.current.stop(); + } + }, maxDurationMs); + }, [status, t, clearTimer, stopTracks]); + + const stop = useCallback((): void => { + clearTimer(); + const recorder = recorderRef.current; + if (recorder && recorder.state === "recording") { + recorder.stop(); + } + }, [clearTimer]); + + const cancel = useCallback((): void => { + clearTimer(); + canceledRef.current = true; + const recorder = recorderRef.current; + if (recorder && recorder.state === "recording") { + // onstop sees canceledRef and skips transcription; it also stops tracks. + recorder.stop(); + } else { + stopTracks(); + } + setStatus("idle"); + }, [clearTimer, stopTracks]); + + // Clean up on unmount: stop any live recorder/stream and clear the timers. + useEffect(() => { + return () => { + clearTimer(); + if (errorTimerRef.current !== null) { + clearTimeout(errorTimerRef.current); + errorTimerRef.current = null; + } + const recorder = recorderRef.current; + if (recorder && recorder.state === "recording") { + canceledRef.current = true; + recorder.stop(); + } + stopTracks(); + }; + }, [clearTimer, stopTracks]); + + return { status, start, stop, cancel }; +} diff --git a/apps/client/src/features/dictation/services/dictation-service.ts b/apps/client/src/features/dictation/services/dictation-service.ts new file mode 100644 index 00000000..01dfbca7 --- /dev/null +++ b/apps/client/src/features/dictation/services/dictation-service.ts @@ -0,0 +1,17 @@ +import api from "@/lib/api-client"; + +// POST the recorded audio as multipart/form-data; the server transcribes it with +// the workspace STT model and returns { text } (wrapped in the standard envelope, +// so the value is at req.data.text). `filename` only sets the part name; the +// server keys off the blob's MIME type. +export async function transcribeAudio( + blob: Blob, + filename = "speech.webm", +): Promise { + const form = new FormData(); + form.append("file", blob, filename); + const req = await api.post<{ text: string }>("/ai-chat/transcribe", form, { + headers: { "Content-Type": "multipart/form-data" }, + }); + return req.data.text; +} diff --git a/apps/client/src/features/editor/components/fixed-toolbar/fixed-toolbar.tsx b/apps/client/src/features/editor/components/fixed-toolbar/fixed-toolbar.tsx index b425753e..e59f9863 100644 --- a/apps/client/src/features/editor/components/fixed-toolbar/fixed-toolbar.tsx +++ b/apps/client/src/features/editor/components/fixed-toolbar/fixed-toolbar.tsx @@ -13,6 +13,7 @@ import { QuickInsertsGroup } from "./groups/quick-inserts-group"; import { MoreInsertsGroup } from "./groups/more-inserts-group"; import { HistoryGroup } from "./groups/history-group"; import { AskAiGroup } from "./groups/ask-ai-group"; +import { DictationGroup } from "./groups/dictation-group"; import { workspaceAtom } from "@/features/user/atoms/current-user-atom"; import classes from "./fixed-toolbar.module.css"; @@ -30,6 +31,7 @@ export const FixedToolbar: FC = ({ const state = useToolbarState(editor); const workspace = useAtomValue(workspaceAtom); const isGenerativeAiEnabled = workspace?.settings?.ai?.generative === true; + const isDictationEnabled = workspace?.settings?.ai?.dictation === true; if (!editor || !state) return null; @@ -65,6 +67,12 @@ export const FixedToolbar: FC = ({
+ {isDictationEnabled && ( + <> +
+ + + )}
diff --git a/apps/client/src/features/editor/components/fixed-toolbar/groups/dictation-group.tsx b/apps/client/src/features/editor/components/fixed-toolbar/groups/dictation-group.tsx new file mode 100644 index 00000000..8a88f0e3 --- /dev/null +++ b/apps/client/src/features/editor/components/fixed-toolbar/groups/dictation-group.tsx @@ -0,0 +1,61 @@ +import { FC, useRef } from "react"; +import type { Editor } from "@tiptap/react"; +import { MicButton } from "@/features/dictation/components/mic-button"; + +interface Props { + editor: Editor; +} + +export const DictationGroup: FC = ({ editor }) => { + const rangeRef = useRef<{ from: number; to: number } | null>(null); + + const handleStart = () => { + const { from, to } = editor.state.selection; + rangeRef.current = { from, to }; + }; + + const handleText = (text: string) => { + // The editor may be gone by the time async transcription returns; bail out + // instead of operating on a destroyed instance. + if (!editor || editor.isDestroyed) return; + const snapshot = rangeRef.current; + rangeRef.current = null; + // The document may have shrunk during transcription (e.g. a collaborative + // edit), so clamp the snapshot into the current bounds before inserting. + const docSize = editor.state.doc.content.size; + const clamp = (p: number) => Math.max(0, Math.min(p, docSize)); + try { + if (snapshot) { + // Insert at the snapshotted caret; a trailing space keeps words + // separated (the hook already trims the transcribed text). + editor + .chain() + .focus() + .insertContentAt( + { from: clamp(snapshot.from), to: clamp(snapshot.to) }, + `${text} `, + ) + .run(); + } else { + editor.chain().focus().insertContent(`${text} `).run(); + } + } catch { + // The snapshot drifted out of range; fall back to the current caret. + try { + editor.chain().focus().insertContent(`${text} `).run(); + } catch { + // The editor may have been destroyed; ignore so a dead editor can't + // surface an uncaught error. + } + } + }; + + return ( + + ); +}; diff --git a/apps/client/src/features/workspace/components/settings/components/ai-provider-settings.tsx b/apps/client/src/features/workspace/components/settings/components/ai-provider-settings.tsx index 827297ea..e39176fd 100644 --- a/apps/client/src/features/workspace/components/settings/components/ai-provider-settings.tsx +++ b/apps/client/src/features/workspace/components/settings/components/ai-provider-settings.tsx @@ -47,6 +47,10 @@ const formSchema = z.object({ systemPrompt: z.string(), apiKey: z.string(), embeddingApiKey: z.string(), + // STT-specific fields. Empty base URL / key fall back to the chat ones. + sttModel: z.string(), + sttBaseUrl: z.string(), + sttApiKey: z.string(), }); type FormValues = z.infer; @@ -101,8 +105,12 @@ export default function AiProviderSettings() { const [searchEnabled, setSearchEnabled] = useState( workspace?.settings?.ai?.search ?? false, ); + const [dictationEnabled, setDictationEnabled] = useState( + workspace?.settings?.ai?.dictation ?? false, + ); const [chatToggleLoading, setChatToggleLoading] = useState(false); const [searchToggleLoading, setSearchToggleLoading] = useState(false); + const [dictationToggleLoading, setDictationToggleLoading] = useState(false); // Whether a key is currently stored server-side (drives the placeholder). const [hasApiKey, setHasApiKey] = useState(false); @@ -111,6 +119,9 @@ export default function AiProviderSettings() { // Same, for the embedding-specific key. const [hasEmbeddingApiKey, setHasEmbeddingApiKey] = useState(false); const [embeddingKeyCleared, setEmbeddingKeyCleared] = useState(false); + // Same, for the STT-specific key. + const [hasSttApiKey, setHasSttApiKey] = useState(false); + const [sttKeyCleared, setSttKeyCleared] = useState(false); // Modal for the (large) system message editor. const [promptOpened, promptHandlers] = useDisclosure(false); @@ -125,6 +136,9 @@ export default function AiProviderSettings() { systemPrompt: "", apiKey: "", embeddingApiKey: "", + sttModel: "", + sttBaseUrl: "", + sttApiKey: "", }, }); @@ -140,12 +154,17 @@ export default function AiProviderSettings() { systemPrompt: settings.systemPrompt ?? "", apiKey: "", embeddingApiKey: "", + sttModel: settings.sttModel ?? "", + sttBaseUrl: settings.sttBaseUrl ?? "", + sttApiKey: "", }); form.resetDirty(); setHasApiKey(settings.hasApiKey); setKeyCleared(false); setHasEmbeddingApiKey(settings.hasEmbeddingApiKey); setEmbeddingKeyCleared(false); + setHasSttApiKey(settings.hasSttApiKey); + setSttKeyCleared(false); // eslint-disable-next-line react-hooks/exhaustive-deps }, [settings]); @@ -160,6 +179,10 @@ export default function AiProviderSettings() { baseUrl: values.baseUrl, embeddingBaseUrl: values.embeddingBaseUrl, systemPrompt: values.systemPrompt, + // The STT base URL is optional; empty falls back to the chat base URL + // server-side. + sttModel: values.sttModel, + sttBaseUrl: values.sttBaseUrl, }; // Key semantics (never send the stored key back): @@ -179,6 +202,13 @@ export default function AiProviderSettings() { payload.embeddingApiKey = ""; } + // Same write-only semantics for the STT-specific key. + if (values.sttApiKey.length > 0) { + payload.sttApiKey = values.sttApiKey; + } else if (sttKeyCleared) { + payload.sttApiKey = ""; + } + return payload; } @@ -191,6 +221,9 @@ export default function AiProviderSettings() { setHasEmbeddingApiKey(updated.hasEmbeddingApiKey); setEmbeddingKeyCleared(false); form.setFieldValue("embeddingApiKey", ""); + setHasSttApiKey(updated.hasSttApiKey); + setSttKeyCleared(false); + form.setFieldValue("sttApiKey", ""); form.resetDirty(); } @@ -206,6 +239,12 @@ export default function AiProviderSettings() { form.setFieldValue("embeddingApiKey", ""); } + function handleClearSttKey() { + setSttKeyCleared(true); + setHasSttApiKey(false); + form.setFieldValue("sttApiKey", ""); + } + // Optimistic toggle for the "AI chat" feature (settings.ai.chat). async function handleToggleChat(value: boolean) { setChatToggleLoading(true); @@ -268,6 +307,34 @@ export default function AiProviderSettings() { } } + // Optimistic toggle for the "Voice dictation" feature (settings.ai.dictation). + async function handleToggleDictation(value: boolean) { + setDictationToggleLoading(true); + const previous = dictationEnabled; + setDictationEnabled(value); + try { + const updated = await updateWorkspace({ aiDictation: value }); + setWorkspace({ + ...updated, + settings: { + ...updated.settings, + ai: { ...updated.settings?.ai, dictation: value }, + }, + }); + notifications.show({ message: t("Updated successfully") }); + } catch (err) { + setDictationEnabled(previous); + const message = (err as { response?: { data?: { message?: string } } }) + ?.response?.data?.message; + notifications.show({ + message: message ?? t("Failed to update data"), + color: "red", + }); + } finally { + setDictationToggleLoading(false); + } + } + // Admins only — match the previous behavior. if (!isAdmin) { return ( @@ -294,6 +361,11 @@ export default function AiProviderSettings() { "/embeddings", form.values.baseUrl, ); + const sttResolved = resolveUrl( + form.values.sttBaseUrl, + "/audio/transcriptions", + form.values.baseUrl, + ); const monoFont = "ui-monospace, Menlo, monospace"; @@ -541,8 +613,8 @@ export default function AiProviderSettings() { - {/* Card 3 — Voice / STT (disabled stub, not wired to the form/backend) */} - + {/* Card 3 — Voice / STT */} + @@ -551,8 +623,9 @@ export default function AiProviderSettings() { handleToggleDictation(e.currentTarget.checked)} /> @@ -562,33 +635,46 @@ export default function AiProviderSettings() { - - - - - - - + + + + {hasSttApiKey && ( + + {t("Clear")} + + )} + - - - {t("Voice dictation is not available yet.")} - - + + + {t("Resolves to {{url}}", { url: sttResolved })} + {/* Nested: external MCP tools the agent calls out to */} diff --git a/apps/client/src/features/workspace/services/ai-settings-service.ts b/apps/client/src/features/workspace/services/ai-settings-service.ts index 19f473ec..53809ab9 100644 --- a/apps/client/src/features/workspace/services/ai-settings-service.ts +++ b/apps/client/src/features/workspace/services/ai-settings-service.ts @@ -16,6 +16,12 @@ export interface IAiSettings { systemPrompt?: string; hasApiKey: boolean; hasEmbeddingApiKey: boolean; + // STT-specific settings. `sttBaseUrl` is the RAW stored value (empty means + // "uses the chat base URL"). `hasSttApiKey` indicates whether an STT-specific + // key is stored (empty means "uses the chat API key"). + sttModel?: string; + sttBaseUrl?: string; + hasSttApiKey: boolean; // RAG indexing coverage (pages indexed for semantic search). indexedPages: number; totalPages: number; @@ -35,6 +41,10 @@ export interface IAiSettingsUpdate { systemPrompt?: string; apiKey?: string; embeddingApiKey?: string; + sttModel?: string; + sttBaseUrl?: string; + // Write-only STT key (same semantics as `apiKey` / `embeddingApiKey`). + sttApiKey?: string; } // Result of a connection test against the configured provider. diff --git a/apps/client/src/features/workspace/types/workspace.types.ts b/apps/client/src/features/workspace/types/workspace.types.ts index 7ea544cc..9a44ed8d 100644 --- a/apps/client/src/features/workspace/types/workspace.types.ts +++ b/apps/client/src/features/workspace/types/workspace.types.ts @@ -24,6 +24,7 @@ export interface IWorkspace { disablePublicSharing?: boolean; mcpEnabled?: boolean; aiChat?: boolean; + aiDictation?: boolean; trashRetentionDays?: number; restrictApiToAdmins?: boolean; allowMemberTemplates?: boolean; @@ -46,6 +47,7 @@ export interface IWorkspaceAiSettings { generative?: boolean; mcp?: boolean; chat?: boolean; + dictation?: boolean; } export interface IWorkspaceSharingSettings { diff --git a/apps/server/src/core/ai-chat/ai-chat.controller.ts b/apps/server/src/core/ai-chat/ai-chat.controller.ts index 206627fe..d1007a78 100644 --- a/apps/server/src/core/ai-chat/ai-chat.controller.ts +++ b/apps/server/src/core/ai-chat/ai-chat.controller.ts @@ -1,4 +1,5 @@ import { + BadRequestException, Body, Controller, ForbiddenException, @@ -9,6 +10,7 @@ import { Req, Res, UseGuards, + UseInterceptors, } from '@nestjs/common'; import { Throttle } from '@nestjs/throttler'; import { FastifyReply, FastifyRequest } from 'fastify'; @@ -22,7 +24,9 @@ import { AiChatRepo } from '@docmost/db/repos/ai-chat/ai-chat.repo'; import { AiChatMessageRepo } from '@docmost/db/repos/ai-chat/ai-chat-message.repo'; import { UserThrottlerGuard } from '../../integrations/throttle/user-throttler.guard'; import { AI_CHAT_THROTTLER } from '../../integrations/throttle/throttler-names'; +import { FileInterceptor } from '../../common/interceptors/file.interceptor'; import { AiChatService, AiChatStreamBody } from './ai-chat.service'; +import { AiTranscriptionService } from './ai-transcription.service'; import { ChatIdDto, GetChatMessagesDto, @@ -43,6 +47,7 @@ export class AiChatController { private readonly aiChatService: AiChatService, private readonly aiChatRepo: AiChatRepo, private readonly aiChatMessageRepo: AiChatMessageRepo, + private readonly aiTranscription: AiTranscriptionService, ) {} /** List the requesting user's chats in this workspace (paginated). */ @@ -180,6 +185,74 @@ export class AiChatController { } } + /** + * Transcribe an uploaded audio clip to text using the workspace STT model. + * Gated by settings.ai.dictation (403 when disabled). Returns { text }. + */ + @HttpCode(HttpStatus.OK) + @UseGuards(JwtAuthGuard, UserThrottlerGuard) + @Throttle({ [AI_CHAT_THROTTLER]: { limit: 20, ttl: 60000 } }) + @Post('transcribe') + @UseInterceptors(FileInterceptor) + async transcribe( + @Req() req: any, + @AuthWorkspace() workspace: Workspace, + ): Promise<{ text: string }> { + // Gate: dictation must be explicitly enabled for the workspace. + const settings = (workspace.settings ?? {}) as { + ai?: { dictation?: boolean }; + }; + if (settings.ai?.dictation !== true) { + throw new ForbiddenException('Dictation is disabled'); + } + + let file = null; + try { + // Whisper hard-caps uploads at 25MB; allow a single file. + file = await req.file({ limits: { fileSize: 25 * 1024 * 1024, files: 1 } }); + } catch (err: any) { + if (err?.statusCode === 413) { + throw new BadRequestException('Audio file too large (max 25MB)'); + } + throw err; + } + if (!file) throw new BadRequestException('No audio uploaded'); + + // Whitelist audio container types produced by browser MediaRecorder + // (Chrome/FF: webm/opus, Safari: mp4) plus common STT-accepted formats. + const allowedMime = new Set([ + 'audio/webm', + 'audio/ogg', + 'audio/mp4', + 'audio/mpeg', + 'audio/wav', + 'audio/x-wav', + 'audio/wave', + 'audio/m4a', + 'audio/x-m4a', + ]); + // MediaRecorder mimetypes carry parameters (e.g. "audio/webm;codecs=opus"); + // compare only the base type. + const baseMime = file.mimetype.split(';')[0].trim().toLowerCase(); + if (!allowedMime.has(baseMime)) { + throw new BadRequestException('Unsupported audio format'); + } + + let buf: Buffer; + try { + buf = await file.toBuffer(); + } catch (err: any) { + // With @fastify/multipart throwFileSizeLimit:true, the 25MB cap is enforced + // when the stream is consumed (here), not at req.file(). + if (err?.statusCode === 413) { + throw new BadRequestException('Audio file too large (max 25MB)'); + } + throw err; + } + const text = await this.aiTranscription.transcribe(workspace.id, buf); + return { text }; + } + /** * Ensure the chat exists, belongs to this workspace, AND was created by the * requesting user (per-user isolation). Throws ForbiddenException otherwise. diff --git a/apps/server/src/core/ai-chat/ai-chat.module.ts b/apps/server/src/core/ai-chat/ai-chat.module.ts index 5a5f2f7e..c8e863fb 100644 --- a/apps/server/src/core/ai-chat/ai-chat.module.ts +++ b/apps/server/src/core/ai-chat/ai-chat.module.ts @@ -3,6 +3,7 @@ import { AiModule } from '../../integrations/ai/ai.module'; import { TokenModule } from '../auth/token.module'; import { AiChatController } from './ai-chat.controller'; import { AiChatService } from './ai-chat.service'; +import { AiTranscriptionService } from './ai-transcription.service'; import { AiChatToolsService } from './tools/ai-chat-tools.service'; import { EmbeddingModule } from './embedding/embedding.module'; import { ExternalMcpModule } from './external-mcp/external-mcp.module'; @@ -21,6 +22,6 @@ import { ExternalMcpModule } from './external-mcp/external-mcp.module'; @Module({ imports: [AiModule, TokenModule, EmbeddingModule, ExternalMcpModule], controllers: [AiChatController], - providers: [AiChatService, AiChatToolsService], + providers: [AiChatService, AiTranscriptionService, AiChatToolsService], }) export class AiChatModule {} diff --git a/apps/server/src/core/ai-chat/ai-transcription.service.ts b/apps/server/src/core/ai-chat/ai-transcription.service.ts new file mode 100644 index 00000000..72d3ea9f --- /dev/null +++ b/apps/server/src/core/ai-chat/ai-transcription.service.ts @@ -0,0 +1,20 @@ +import { Injectable } from '@nestjs/common'; +import { experimental_transcribe as transcribe } from 'ai'; +import { AiService } from '../../integrations/ai/ai.service'; + +/** + * Transcribes uploaded audio to text using the per-workspace STT model. + * Thin wrapper over the AI SDK's experimental_transcribe; never logs the + * audio or the key. + */ +@Injectable() +export class AiTranscriptionService { + constructor(private readonly ai: AiService) {} + + // Transcribe an uploaded audio buffer using the workspace STT model. + async transcribe(workspaceId: string, audio: Uint8Array): Promise { + const model = await this.ai.getTranscriptionModel(workspaceId); + const { text } = await transcribe({ model, audio }); + return text.trim(); + } +} diff --git a/apps/server/src/core/workspace/dto/update-workspace.dto.ts b/apps/server/src/core/workspace/dto/update-workspace.dto.ts index 25697a4b..08ba967d 100644 --- a/apps/server/src/core/workspace/dto/update-workspace.dto.ts +++ b/apps/server/src/core/workspace/dto/update-workspace.dto.ts @@ -49,6 +49,10 @@ export class UpdateWorkspaceDto extends PartialType(CreateWorkspaceDto) { @IsBoolean() aiChat: boolean; + @IsOptional() + @IsBoolean() + aiDictation: boolean; + @IsOptional() @IsInt() @Min(1) diff --git a/apps/server/src/core/workspace/services/workspace.service.ts b/apps/server/src/core/workspace/services/workspace.service.ts index fd6e430a..ec419fba 100644 --- a/apps/server/src/core/workspace/services/workspace.service.ts +++ b/apps/server/src/core/workspace/services/workspace.service.ts @@ -497,6 +497,20 @@ export class WorkspaceService { ); } + if (typeof updateWorkspaceDto.aiDictation !== 'undefined') { + const prev = settingsBefore?.ai?.dictation ?? false; + if (prev !== updateWorkspaceDto.aiDictation) { + before.aiDictation = prev; + after.aiDictation = updateWorkspaceDto.aiDictation; + } + await this.workspaceRepo.updateAiSettings( + workspaceId, + 'dictation', + updateWorkspaceDto.aiDictation, + trx, + ); + } + delete updateWorkspaceDto.restrictApiToAdmins; delete updateWorkspaceDto.aiSearch; delete updateWorkspaceDto.generativeAi; @@ -504,6 +518,7 @@ export class WorkspaceService { delete updateWorkspaceDto.mcpEnabled; delete updateWorkspaceDto.allowMemberTemplates; delete updateWorkspaceDto.aiChat; + delete updateWorkspaceDto.aiDictation; await this.workspaceRepo.updateWorkspace( updateWorkspaceDto, diff --git a/apps/server/src/database/migrations/20260618T130000-ai-stt-credentials.ts b/apps/server/src/database/migrations/20260618T130000-ai-stt-credentials.ts new file mode 100644 index 00000000..def19dfd --- /dev/null +++ b/apps/server/src/database/migrations/20260618T130000-ai-stt-credentials.ts @@ -0,0 +1,18 @@ +import { type Kysely } from 'kysely'; + +export async function up(db: Kysely): Promise { + // Encrypted, STT-specific provider key. Separate from `api_key_enc` + // (the chat key) so the transcription model can use a different token. + // When NULL, the STT model falls back to `api_key_enc`. + await db.schema + .alterTable('ai_provider_credentials') + .addColumn('stt_api_key_enc', 'text', (col) => col) + .execute(); +} + +export async function down(db: Kysely): Promise { + await db.schema + .alterTable('ai_provider_credentials') + .dropColumn('stt_api_key_enc') + .execute(); +} diff --git a/apps/server/src/database/repos/ai-chat/ai-provider-credentials.repo.ts b/apps/server/src/database/repos/ai-chat/ai-provider-credentials.repo.ts index 4709ba96..e977fc5a 100644 --- a/apps/server/src/database/repos/ai-chat/ai-provider-credentials.repo.ts +++ b/apps/server/src/database/repos/ai-chat/ai-provider-credentials.repo.ts @@ -98,4 +98,42 @@ export class AiProviderCredentialsRepo { .where('driver', '=', driver) .execute(); } + + // Upsert the STT-specific encrypted key. If no row exists yet this inserts one + // with `apiKeyEnc` left null (the column is nullable). On conflict only + // `sttApiKeyEnc` / `updatedAt` are touched, so the chat & embedding keys are kept. + async upsertSttKey( + workspaceId: string, + driver: string, + sttApiKeyEnc: string, + trx?: KyselyTransaction, + ): Promise { + const db = dbOrTx(this.db, trx); + return db + .insertInto('aiProviderCredentials') + .values({ workspaceId, driver, sttApiKeyEnc }) + .onConflict((oc) => + oc.columns(['workspaceId', 'driver']).doUpdateSet({ + sttApiKeyEnc, + updatedAt: new Date(), + }), + ) + .returningAll() + .executeTakeFirst(); + } + + // Clear only the STT-specific key; the chat & embedding keys are kept. + async clearSttKey( + workspaceId: string, + driver: string, + trx?: KyselyTransaction, + ): Promise { + const db = dbOrTx(this.db, trx); + await db + .updateTable('aiProviderCredentials') + .set({ sttApiKeyEnc: null, updatedAt: new Date() }) + .where('workspaceId', '=', workspaceId) + .where('driver', '=', driver) + .execute(); + } } diff --git a/apps/server/src/database/repos/workspace/workspace.repo.ts b/apps/server/src/database/repos/workspace/workspace.repo.ts index f61ce9db..2f8e1b08 100644 --- a/apps/server/src/database/repos/workspace/workspace.repo.ts +++ b/apps/server/src/database/repos/workspace/workspace.repo.ts @@ -239,7 +239,7 @@ export class WorkspaceRepo { // is a real jsonb object, never a double-encoded string. The CASE self-heals // workspaces whose settings.ai.provider was previously corrupted into an // array/string. - const ALLOWED = ['driver', 'chatModel', 'embeddingModel', 'baseUrl', 'embeddingBaseUrl', 'systemPrompt']; + const ALLOWED = ['driver', 'chatModel', 'embeddingModel', 'baseUrl', 'embeddingBaseUrl', 'sttModel', 'sttBaseUrl', 'systemPrompt']; const entries = Object.entries(provider).filter( ([k, v]) => v !== undefined && ALLOWED.includes(k), ); diff --git a/apps/server/src/database/types/ai-provider-credentials.types.ts b/apps/server/src/database/types/ai-provider-credentials.types.ts index 5bd7db33..edc6e491 100644 --- a/apps/server/src/database/types/ai-provider-credentials.types.ts +++ b/apps/server/src/database/types/ai-provider-credentials.types.ts @@ -14,6 +14,8 @@ export interface AiProviderCredentials { apiKeyEnc: string | null; // Encrypted, embedding-specific provider key. Falls back to apiKeyEnc when null. embeddingApiKeyEnc: string | null; + // Encrypted, STT-specific provider key. Falls back to apiKeyEnc when null. + sttApiKeyEnc: string | null; createdAt: Generated; updatedAt: Generated; } diff --git a/apps/server/src/integrations/ai/ai-settings.service.ts b/apps/server/src/integrations/ai/ai-settings.service.ts index 8e4d493a..315ff380 100644 --- a/apps/server/src/integrations/ai/ai-settings.service.ts +++ b/apps/server/src/integrations/ai/ai-settings.service.ts @@ -28,6 +28,9 @@ export interface UpdateAiSettingsInput { systemPrompt?: string; apiKey?: string; embeddingApiKey?: string; + sttModel?: string; + sttBaseUrl?: string; + sttApiKey?: string; } /** @@ -113,6 +116,7 @@ export class AiSettingsService { driver: provider.driver, chatModel: provider.chatModel, embeddingModel: provider.embeddingModel, + sttModel: provider.sttModel, baseUrl: provider.baseUrl, systemPrompt: provider.systemPrompt, }; @@ -122,6 +126,10 @@ export class AiSettingsService { // unconditionally. config.embeddingBaseUrl = provider.embeddingBaseUrl || provider.baseUrl; + // Effective STT base URL: the STT-specific value, else the chat base URL. + // Set unconditionally, same rationale as embeddingBaseUrl. + config.sttBaseUrl = provider.sttBaseUrl || provider.baseUrl; + if (provider.driver !== 'ollama') { const creds = await this.aiProviderCredentialsRepo.find( workspaceId, @@ -134,6 +142,10 @@ export class AiSettingsService { config.embeddingApiKey = creds?.embeddingApiKeyEnc ? this.secretBox.decryptSecret(creds.embeddingApiKeyEnc) : config.apiKey; + // Effective STT key: the STT-specific key, else the chat key. + config.sttApiKey = creds?.sttApiKeyEnc + ? this.secretBox.decryptSecret(creds.sttApiKeyEnc) + : config.apiKey; } return config; @@ -151,6 +163,7 @@ export class AiSettingsService { let hasApiKey = false; let hasEmbeddingApiKey = false; + let hasSttApiKey = false; if (provider.driver) { const creds = await this.aiProviderCredentialsRepo.find( workspaceId, @@ -158,6 +171,7 @@ export class AiSettingsService { ); hasApiKey = !!creds?.apiKeyEnc; hasEmbeddingApiKey = !!creds?.embeddingApiKeyEnc; + hasSttApiKey = !!creds?.sttApiKeyEnc; } // totalPages now counts only pages with embeddable content (non-empty text @@ -174,9 +188,12 @@ export class AiSettingsService { embeddingModel: provider.embeddingModel, baseUrl: provider.baseUrl, embeddingBaseUrl: provider.embeddingBaseUrl, + sttModel: provider.sttModel, + sttBaseUrl: provider.sttBaseUrl, systemPrompt: provider.systemPrompt, hasApiKey, hasEmbeddingApiKey, + hasSttApiKey, indexedPages, totalPages, }; @@ -197,7 +214,7 @@ export class AiSettingsService { workspaceId: string, dto: UpdateAiSettingsInput, ): Promise { - const { apiKey, embeddingApiKey, ...nonSecret } = dto; + const { apiKey, embeddingApiKey, sttApiKey, ...nonSecret } = dto; // Persist non-secret provider fields (only those present in the partial). const providerPatch: Partial = {}; @@ -207,6 +224,8 @@ export class AiSettingsService { 'embeddingModel', 'baseUrl', 'embeddingBaseUrl', + 'sttModel', + 'sttBaseUrl', 'systemPrompt', ] as const) { if (nonSecret[key] !== undefined) { @@ -222,7 +241,11 @@ export class AiSettingsService { // Key handling (write-only). Both keys share the same target driver and the // same "driver required" guard, resolved once. - if (apiKey !== undefined || embeddingApiKey !== undefined) { + if ( + apiKey !== undefined || + embeddingApiKey !== undefined || + sttApiKey !== undefined + ) { const stored = await this.readProvider(workspaceId); const targetDriver = dto.driver ?? stored.driver; if (!targetDriver) { @@ -264,6 +287,23 @@ export class AiSettingsService { ); } } + + // STT key. + if (sttApiKey !== undefined) { + if (sttApiKey === '') { + await this.aiProviderCredentialsRepo.clearSttKey( + workspaceId, + targetDriver, + ); + } else { + const enc = this.secretBox.encryptSecret(sttApiKey); + await this.aiProviderCredentialsRepo.upsertSttKey( + workspaceId, + targetDriver, + enc, + ); + } + } } return this.getMasked(workspaceId); diff --git a/apps/server/src/integrations/ai/ai-stt-not-configured.exception.ts b/apps/server/src/integrations/ai/ai-stt-not-configured.exception.ts new file mode 100644 index 00000000..e7bbc8ae --- /dev/null +++ b/apps/server/src/integrations/ai/ai-stt-not-configured.exception.ts @@ -0,0 +1,13 @@ +import { ServiceUnavailableException } from '@nestjs/common'; + +/** + * Thrown when no usable STT (speech-to-text) config exists for the workspace + * (missing driver / sttModel). Distinct from the chat & embedding variants so + * the transcription endpoint can 503 independently of chat/embeddings being + * configured. + */ +export class AiSttNotConfiguredException extends ServiceUnavailableException { + constructor() { + super('AI STT model not configured'); + } +} diff --git a/apps/server/src/integrations/ai/ai.service.ts b/apps/server/src/integrations/ai/ai.service.ts index 599406fa..b93416d0 100644 --- a/apps/server/src/integrations/ai/ai.service.ts +++ b/apps/server/src/integrations/ai/ai.service.ts @@ -4,6 +4,7 @@ import { generateText, type EmbeddingModel, type LanguageModel, + type TranscriptionModel, } from 'ai'; import { createOpenAI } from '@ai-sdk/openai'; import { createGoogleGenerativeAI } from '@ai-sdk/google'; @@ -11,6 +12,7 @@ import { createOllama } from 'ai-sdk-ollama'; import { AiSettingsService } from './ai-settings.service'; import { AiNotConfiguredException } from './ai-not-configured.exception'; import { AiEmbeddingNotConfiguredException } from './ai-embedding-not-configured.exception'; +import { AiSttNotConfiguredException } from './ai-stt-not-configured.exception'; import { describeProviderError } from './ai-error.util'; /** @@ -106,6 +108,26 @@ export class AiService { } } + /** + * Resolve the workspace config and build the transcription (STT) model. + * STT always speaks the OpenAI-compatible /v1/audio/transcriptions API + * (only @ai-sdk/openai exposes .transcription()), regardless of the chat + * driver. sttBaseUrl falls back to the chat baseUrl; the API key falls back + * to the chat key (resolved by AiSettingsService.resolve). Built PER WORKSPACE + * on demand; the decrypted key is never logged. + * + * Throws AiSttNotConfiguredException (-> 503) when no STT model is set. + */ + async getTranscriptionModel(workspaceId: string): Promise { + const cfg = await this.aiSettings.resolve(workspaceId); + if (!cfg?.sttModel) throw new AiSttNotConfiguredException(); + const baseURL = cfg.sttBaseUrl || cfg.baseUrl; // stt-specific, else chat + // apiKey may be unused for keyless self-hosted whisper; pass a placeholder. + return createOpenAI({ apiKey: cfg.sttApiKey ?? 'unused', baseURL }).transcription( + cfg.sttModel, + ); + } + /** * Embed a batch of texts with the workspace embedding model. Returns one * vector per input, in the same order. Thin wrapper over the AI SDK's diff --git a/apps/server/src/integrations/ai/ai.types.ts b/apps/server/src/integrations/ai/ai.types.ts index 3e52ec05..32f043c8 100644 --- a/apps/server/src/integrations/ai/ai.types.ts +++ b/apps/server/src/integrations/ai/ai.types.ts @@ -21,6 +21,9 @@ export interface AiProviderSettings { baseUrl?: string; // Embedding-specific base URL. Falls back to `baseUrl` when empty/unset. embeddingBaseUrl?: string; + sttModel?: string; + // STT-specific base URL. Falls back to baseUrl when empty/unset. + sttBaseUrl?: string; systemPrompt?: string; } @@ -31,12 +34,15 @@ export interface AiProviderSettings { * * `embeddingBaseUrl` / `embeddingApiKey` are the embedding-specific endpoint and * key, already resolved with the chat-value fallback applied by `resolve`. + * `sttBaseUrl` / `sttApiKey` are likewise the STT-specific endpoint and key, + * already resolved with the chat-value fallback applied by `resolve`. */ export interface ResolvedAiConfig extends Partial { driver?: AiDriver; chatModel?: string; apiKey?: string; embeddingApiKey?: string; + sttApiKey?: string; } /** @@ -50,9 +56,12 @@ export interface MaskedAiSettings { embeddingModel?: string; baseUrl?: string; embeddingBaseUrl?: string; + sttModel?: string; + sttBaseUrl?: string; systemPrompt?: string; hasApiKey: boolean; hasEmbeddingApiKey: boolean; + hasSttApiKey: boolean; // RAG indexing coverage for the settings UI. indexedPages: number; totalPages: number; diff --git a/apps/server/src/integrations/ai/dto/update-ai-settings.dto.ts b/apps/server/src/integrations/ai/dto/update-ai-settings.dto.ts index 9bdd3762..49199bc0 100644 --- a/apps/server/src/integrations/ai/dto/update-ai-settings.dto.ts +++ b/apps/server/src/integrations/ai/dto/update-ai-settings.dto.ts @@ -4,10 +4,10 @@ import { AI_DRIVERS, AiDriver } from '../ai.types'; /** * Admin update payload for the workspace AI provider settings. * - * `apiKey` / `embeddingApiKey` are write-only (§8.2): provided → stored - * encrypted, '' → cleared, absent → left untouched. They are NEVER returned by - * any endpoint. The global ValidationPipe runs with `whitelist: true`, so - * unknown fields are stripped. + * `apiKey` / `embeddingApiKey` / `sttApiKey` are write-only (§8.2): provided → + * stored encrypted, '' → cleared, absent → left untouched. They are NEVER + * returned by any endpoint. The global ValidationPipe runs with + * `whitelist: true`, so unknown fields are stripped. */ export class UpdateAiSettingsDto { @IsOptional() @@ -41,4 +41,16 @@ export class UpdateAiSettingsDto { @IsOptional() @IsString() embeddingApiKey?: string; + + @IsOptional() + @IsString() + sttModel?: string; + + @IsOptional() + @IsString() + sttBaseUrl?: string; + + @IsOptional() + @IsString() + sttApiKey?: string; }