feat(ai): server-side voice dictation (STT) with mic in chat and editor
Add push-to-talk voice dictation that transcribes recorded audio on the server via the workspace's OpenAI-compatible AI provider (Whisper / gpt-4o-transcribe / self-hosted whisper), then inserts the text. Backend: - New `stt_api_key_enc` column + migration; STT creds parity with chat/embeddings (sttModel/sttBaseUrl/sttApiKey, write-only key, fallbacks to chat baseUrl/key). Both provider whitelists updated (service + repo). - AiService.getTranscriptionModel + AiTranscriptionService. - Gated POST /ai-chat/transcribe (dictation flag → 403, JWT + workspace scope + throttle, 25MB cap, MIME whitelist, never logs audio/key). - New `settings.ai.dictation` workspace flag (DTO + service + audit). Frontend: - Wire up the Voice/STT settings card (model/base URL/key) and the Voice-dictation toggle. - New `features/dictation`: useDictation (MediaRecorder state machine), MicButton, transcribe service; integrated into the chat composer and a new editor-toolbar dictation group, both gated by ai.dictation.
This commit is contained in:
@@ -1181,5 +1181,13 @@
|
||||
"Embeddings": "Embeddings",
|
||||
"Leave empty to use the chat API key": "Leave empty to use the chat API key",
|
||||
"Leave empty to use the chat base URL": "Leave empty to use the chat base URL",
|
||||
"Reindex now": "Reindex now"
|
||||
"Reindex now": "Reindex now",
|
||||
"Start dictation": "Start dictation",
|
||||
"Stop recording": "Stop recording",
|
||||
"Transcribing…": "Transcribing…",
|
||||
"Microphone access denied": "Microphone access denied",
|
||||
"No microphone found": "No microphone found",
|
||||
"Could not start recording": "Could not start recording",
|
||||
"Transcription failed": "Transcription failed",
|
||||
"Voice dictation is not configured": "Voice dictation is not configured"
|
||||
}
|
||||
|
||||
@@ -2,8 +2,10 @@ import { KeyboardEvent } from "react";
|
||||
import { ActionIcon, Group, Textarea, Tooltip } from "@mantine/core";
|
||||
import { IconPlayerStopFilled, IconSend } from "@tabler/icons-react";
|
||||
import { useTranslation } from "react-i18next";
|
||||
import { useAtom } from "jotai";
|
||||
import { useAtom, useAtomValue } from "jotai";
|
||||
import { aiChatDraftAtom } from "@/features/ai-chat/atoms/ai-chat-atom.ts";
|
||||
import { workspaceAtom } from "@/features/user/atoms/current-user-atom";
|
||||
import { MicButton } from "@/features/dictation/components/mic-button";
|
||||
|
||||
interface ChatInputProps {
|
||||
onSend: (text: string) => void;
|
||||
@@ -25,6 +27,8 @@ export default function ChatInput({
|
||||
}: ChatInputProps) {
|
||||
const { t } = useTranslation();
|
||||
const [value, setValue] = useAtom(aiChatDraftAtom);
|
||||
const workspace = useAtomValue(workspaceAtom);
|
||||
const isDictationEnabled = workspace?.settings?.ai?.dictation === true;
|
||||
|
||||
const send = (): void => {
|
||||
const text = value.trim();
|
||||
@@ -57,6 +61,13 @@ export default function ChatInput({
|
||||
// switch), so a fresh chat lands with the cursor ready in the field.
|
||||
autoFocus
|
||||
/>
|
||||
{isDictationEnabled && (
|
||||
<MicButton
|
||||
size="lg"
|
||||
disabled={isStreaming || disabled}
|
||||
onText={(text) => setValue((v) => (v ? `${v} ${text}` : text))}
|
||||
/>
|
||||
)}
|
||||
{isStreaming ? (
|
||||
<Tooltip label={t("Stop")} withArrow>
|
||||
<ActionIcon
|
||||
|
||||
76
apps/client/src/features/dictation/components/mic-button.tsx
Normal file
76
apps/client/src/features/dictation/components/mic-button.tsx
Normal file
@@ -0,0 +1,76 @@
|
||||
import { FC } from "react";
|
||||
import { ActionIcon, Loader, Tooltip } from "@mantine/core";
|
||||
import { IconMicrophone, IconPlayerStopFilled } from "@tabler/icons-react";
|
||||
import { useTranslation } from "react-i18next";
|
||||
import { useDictation } from "@/features/dictation/hooks/use-dictation";
|
||||
|
||||
/** Props accepted by the dictation mic button. */
interface MicButtonProps {
  /** Handler for the transcribed text; forwarded to the dictation hook. */
  onText: (text: string) => void;
  /** Optional hook-in that is forwarded to the dictation hook's `onStart`. */
  onStart?: () => void;
  /** Disables the idle ("Start dictation") button when true. */
  disabled?: boolean;
  /**
   * Mantine ActionIcon size token: "lg" matches the chat composer, "md" the
   * editor toolbar.
   */
  size?: "md" | "lg";
}
|
||||
|
||||
/**
 * Self-contained dictation toggle. The capture state machine lives in
 * `useDictation`; this component only maps its status to a button:
 *
 * - idle: mic icon, a click starts recording (honours `disabled`);
 * - recording: red stop icon, a click stops recording. `disabled` is
 *   deliberately ignored here so a running capture can always be stopped;
 * - transcribing: disabled spinner, which prevents overlapping requests;
 * - error: disabled mic labelled "Transcription failed" for as long as the
 *   hook reports the error status, so the tooltip and the accessible name no
 *   longer claim that transcription is still in progress.
 */
export const MicButton: FC<MicButtonProps> = ({
  onText,
  onStart,
  disabled,
  size = "lg",
}) => {
  const { t } = useTranslation();
  const { status, start, stop } = useDictation({ onText, onStart });
  const iconSize = size === "lg" ? 18 : 16;

  if (status === "recording") {
    return (
      <Tooltip label={t("Stop recording")} withArrow>
        <ActionIcon
          size={size}
          color="red"
          variant="light"
          onClick={stop}
          aria-label={t("Stop recording")}
        >
          <IconPlayerStopFilled size={iconSize} />
        </ActionIcon>
      </Tooltip>
    );
  }

  if (status === "transcribing") {
    return (
      <Tooltip label={t("Transcribing…")} withArrow>
        <ActionIcon
          size={size}
          variant="subtle"
          disabled
          aria-label={t("Transcribing…")}
        >
          <Loader size="xs" />
        </ActionIcon>
      </Tooltip>
    );
  }

  if (status === "error") {
    // The request already failed; keep the button disabled while the hook
    // holds the error status, but describe the failure instead of a pending
    // transcription.
    return (
      <Tooltip label={t("Transcription failed")} withArrow>
        <ActionIcon
          size={size}
          variant="subtle"
          disabled
          aria-label={t("Transcription failed")}
        >
          <IconMicrophone size={iconSize} />
        </ActionIcon>
      </Tooltip>
    );
  }

  return (
    <Tooltip label={t("Start dictation")} withArrow>
      <ActionIcon
        size={size}
        variant="subtle"
        onClick={() => void start()}
        disabled={disabled}
        aria-label={t("Start dictation")}
      >
        <IconMicrophone size={iconSize} />
      </ActionIcon>
    </Tooltip>
  );
};
|
||||
260
apps/client/src/features/dictation/hooks/use-dictation.ts
Normal file
260
apps/client/src/features/dictation/hooks/use-dictation.ts
Normal file
@@ -0,0 +1,260 @@
|
||||
import { useCallback, useEffect, useRef, useState } from "react";
|
||||
import { notifications } from "@mantine/notifications";
|
||||
import { useTranslation } from "react-i18next";
|
||||
import { transcribeAudio } from "@/features/dictation/services/dictation-service";
|
||||
|
||||
/** Phases of the dictation capture state machine exposed by `useDictation`. */
export type DictationStatus =
  | "idle"
  | "recording"
  | "transcribing"
  | "error";
|
||||
|
||||
/** Options accepted by `useDictation`. */
interface UseDictationOptions {
  /** Called with the trimmed transcription when it is non-empty. */
  onText: (text: string) => void;
  /** Called immediately before the recorder is started. */
  onStart?: () => void;
  /** Recording is stopped automatically after this long; defaults to 120000. */
  maxDurationMs?: number;
}
|
||||
|
||||
/** Controls and state returned by `useDictation`. */
interface UseDictationResult {
  /** Current phase of the capture state machine. */
  status: DictationStatus;
  /** Requests the microphone and begins recording. */
  start: () => Promise<void>;
  /** Stops an active recording; the recorded audio is then transcribed. */
  stop: () => void;
  /** Aborts the capture and returns to idle without transcribing it. */
  cancel: () => void;
}
|
||||
|
||||
// Recording formats to probe, most preferred first. The first entry the
// browser reports as supported is used; when nothing matches, MediaRecorder is
// constructed without an explicit type and falls back to its own default.
const MIME_CANDIDATES: string[] = [
  "audio/webm;codecs=opus",
  "audio/webm",
  "audio/mp4",
  "audio/ogg;codecs=opus",
  "audio/ogg",
];
|
||||
|
||||
// Map a recorded MIME type to an upload filename with a matching extension.
// Per the service-side note the server keys off the blob's MIME type, so the
// name is cosmetic. Markers are checked in order ("mp4" before "ogg"); anything
// else is treated as WebM.
function filenameForMime(mime: string): string {
  const filenameByMarker: ReadonlyArray<readonly [string, string]> = [
    ["mp4", "speech.mp4"],
    ["ogg", "speech.ogg"],
  ];
  const hit = filenameByMarker.find(([marker]) => mime.includes(marker));
  return hit ? hit[1] : "speech.webm";
}
|
||||
|
||||
/**
 * Returns the first entry of MIME_CANDIDATES that the browser's MediaRecorder
 * reports as supported, or `undefined` when MediaRecorder is unavailable or
 * none of the candidates is supported.
 */
function pickMimeType(): string | undefined {
  const recorderMissing = typeof MediaRecorder === "undefined";
  if (recorderMissing) return undefined;
  // isTypeSupported is called optionally, so a missing static yields no match.
  return MIME_CANDIDATES.find((candidate) =>
    MediaRecorder.isTypeSupported?.(candidate),
  );
}
|
||||
|
||||
/**
 * Encapsulates the browser audio-capture state machine: request the mic,
 * record with MediaRecorder, then POST the blob for transcription.
 *
 * Refs hold the live recorder/stream/chunks/timers so re-renders never lose
 * them, and every exit path stops the MediaStream tracks.
 *
 * Each call to `start()` opens a numbered capture session. `cancel()` and an
 * unmount while the mic is being acquired or is recording invalidate the
 * session, and every asynchronous continuation (the `getUserMedia` result, the
 * recorder's `onstop`, the transcription response) checks the id before acting.
 * This ensures that:
 * - a stream granted after the component unmounted (or after `cancel()`) is
 *   released immediately instead of recording in the background;
 * - a canceled transcription can neither call `onText` nor overwrite the status
 *   of a newer session.
 *
 * A transcription that is already in flight when the component unmounts is
 * still delivered to `onText`.
 *
 * @param options Callbacks and limits; read through a ref, so the latest
 *   values are always used.
 * @returns The current status plus `start` / `stop` / `cancel` controls.
 */
export function useDictation(
  options: UseDictationOptions,
): UseDictationResult {
  const { t } = useTranslation();
  const [status, setStatus] = useState<DictationStatus>("idle");

  // Keep the latest callbacks in a ref so the recorder's onstop closure always
  // calls the current handlers without re-creating the recorder.
  const optionsRef = useRef(options);
  optionsRef.current = options;

  const recorderRef = useRef<MediaRecorder | null>(null);
  const streamRef = useRef<MediaStream | null>(null);
  const chunksRef = useRef<Blob[]>([]);
  const timerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
  const errorTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
  const startingRef = useRef(false);
  // Id of the current capture session; bumped to invalidate in-flight work.
  const sessionRef = useRef(0);

  const clearTimer = useCallback(() => {
    if (timerRef.current !== null) {
      clearTimeout(timerRef.current);
      timerRef.current = null;
    }
  }, []);

  const clearErrorTimer = useCallback(() => {
    if (errorTimerRef.current !== null) {
      clearTimeout(errorTimerRef.current);
      errorTimerRef.current = null;
    }
  }, []);

  const stopTracks = useCallback(() => {
    streamRef.current?.getTracks().forEach((track) => track.stop());
    streamRef.current = null;
  }, []);

  const start = useCallback(async (): Promise<void> => {
    // Synchronous live guard: status is stale between renders, so also block on
    // refs to prevent a double-click from opening two MediaStreams (the first
    // would leak).
    if (startingRef.current || recorderRef.current || streamRef.current) return;
    if (status !== "idle") return;
    startingRef.current = true;
    sessionRef.current += 1;
    const session = sessionRef.current;

    let stream: MediaStream;
    try {
      stream = await navigator.mediaDevices.getUserMedia({ audio: true });
    } catch (err) {
      const name = (err as { name?: string })?.name;
      let message: string;
      if (name === "NotAllowedError" || name === "SecurityError") {
        message = t("Microphone access denied");
      } else if (name === "NotFoundError" || name === "OverconstrainedError") {
        message = t("No microphone found");
      } else {
        message = t("Could not start recording");
      }
      notifications.show({ color: "red", message });
      setStatus("idle");
      startingRef.current = false;
      return;
    }

    if (sessionRef.current !== session) {
      // cancel() ran or the component unmounted while the permission prompt was
      // open. Nobody owns this stream any more, so release the microphone right
      // away instead of starting a recording that nothing can stop.
      stream.getTracks().forEach((track) => track.stop());
      startingRef.current = false;
      return;
    }

    streamRef.current = stream;
    chunksRef.current = [];

    const mimeType = pickMimeType();
    let recorder: MediaRecorder;
    try {
      recorder = new MediaRecorder(
        stream,
        mimeType ? { mimeType } : undefined,
      );
    } catch {
      // The stream was acquired but the recorder failed to construct; stop the
      // tracks so the MediaStream does not leak before bailing out.
      stopTracks();
      notifications.show({
        color: "red",
        message: t("Could not start recording"),
      });
      setStatus("idle");
      startingRef.current = false;
      return;
    }
    recorderRef.current = recorder;

    recorder.ondataavailable = (e: BlobEvent) => {
      if (e.data && e.data.size > 0) chunksRef.current.push(e.data);
    };

    recorder.onstop = () => {
      clearTimer();
      const recordedMime = recorder.mimeType || mimeType || "audio/webm";
      // The session id changes when cancel() or an unmount invalidated us.
      const wasCanceled = sessionRef.current !== session;

      // Stop the mic tracks regardless of how we got here.
      stopTracks();
      recorderRef.current = null;

      if (wasCanceled) {
        chunksRef.current = [];
        setStatus("idle");
        return;
      }

      const blob = new Blob(chunksRef.current, { type: recordedMime });
      chunksRef.current = [];

      if (blob.size === 0) {
        // Nothing was captured (e.g. stopped immediately after starting);
        // uploading an empty file could only produce a server error.
        setStatus("idle");
        return;
      }

      setStatus("transcribing");
      void transcribeAudio(blob, filenameForMime(recordedMime))
        .then((text) => {
          // Canceled while in flight: drop the result and leave the status to
          // whoever owns the current session.
          if (sessionRef.current !== session) return;
          // Whisper often returns a leading space; insert the trimmed value.
          const trimmed = text.trim();
          if (trimmed.length > 0) optionsRef.current.onText(trimmed);
          setStatus("idle");
        })
        .catch((err: unknown) => {
          if (sessionRef.current !== session) return;
          const httpStatus = (err as { response?: { status?: number } })
            ?.response?.status;
          // The server returns 503 when dictation is unconfigured and 403 when
          // it is disabled server-side; both map to the same "not configured".
          const message =
            httpStatus === 503 || httpStatus === 403
              ? t("Voice dictation is not configured")
              : t("Transcription failed");
          notifications.show({ color: "red", message });
          // Surface the error state briefly, then return to idle. Store the
          // timer so it can be cleared on cancel/unmount.
          setStatus("error");
          clearErrorTimer();
          errorTimerRef.current = setTimeout(() => {
            errorTimerRef.current = null;
            setStatus("idle");
          }, 1500);
        });
    };

    // Notify the caller right when recording begins (before any async work) so
    // the editor can snapshot the caret position.
    try {
      optionsRef.current.onStart?.();
      recorder.start();
    } catch {
      // recorder.start() can synchronously throw (InvalidStateError /
      // NotSupportedError); clean up so the button is not left stuck and the
      // MediaStream does not leak.
      stopTracks();
      recorderRef.current = null;
      startingRef.current = false;
      notifications.show({
        color: "red",
        message: t("Could not start recording"),
      });
      setStatus("idle");
      return;
    }
    setStatus("recording");
    // Recording has truly begun; release the synchronous start guard.
    startingRef.current = false;

    const maxDurationMs = optionsRef.current.maxDurationMs ?? 120000;
    timerRef.current = setTimeout(() => {
      if (recorderRef.current?.state === "recording") {
        recorderRef.current.stop();
      }
    }, maxDurationMs);
  }, [status, t, clearTimer, clearErrorTimer, stopTracks]);

  const stop = useCallback((): void => {
    clearTimer();
    const recorder = recorderRef.current;
    if (recorder && recorder.state === "recording") {
      recorder.stop();
    }
  }, [clearTimer]);

  const cancel = useCallback((): void => {
    clearTimer();
    clearErrorTimer();
    // Invalidate the session so a pending getUserMedia result, the recorder's
    // onstop and any in-flight transcription all discard their work.
    sessionRef.current += 1;
    const recorder = recorderRef.current;
    if (recorder && recorder.state === "recording") {
      // onstop sees the stale session and skips transcription; it also stops
      // the tracks.
      recorder.stop();
    } else {
      stopTracks();
    }
    setStatus("idle");
  }, [clearTimer, clearErrorTimer, stopTracks]);

  // Clean up on unmount: stop any live recorder/stream and clear the timers.
  useEffect(() => {
    return () => {
      clearTimer();
      clearErrorTimer();
      const recorder = recorderRef.current;
      const isRecording = recorder !== null && recorder.state === "recording";
      if (startingRef.current || isRecording) {
        // Invalidate the capture in progress: a stream granted later is
        // released by start(), and onstop discards the recorded audio.
        sessionRef.current += 1;
      }
      if (recorder !== null && isRecording) {
        recorder.stop();
      }
      stopTracks();
    };
  }, [clearTimer, clearErrorTimer, stopTracks]);

  return { status, start, stop, cancel };
}
|
||||
@@ -0,0 +1,17 @@
|
||||
import api from "@/lib/api-client";
|
||||
|
||||
/**
 * Uploads a recorded audio blob to `/ai-chat/transcribe` as
 * multipart/form-data and resolves with the transcribed text.
 *
 * The response is read as `{ text }` from `.data` of the API client's result;
 * per the original note the server wraps it in the standard envelope and the
 * value ends up at `data.text` (client behaviour is not visible from here).
 *
 * @param blob Recorded audio. The original note states the server keys off the
 *   blob's MIME type.
 * @param filename Name given to the uploaded file part; cosmetic only.
 * @returns The `text` field of the transcription response.
 */
export async function transcribeAudio(
  blob: Blob,
  filename = "speech.webm",
): Promise<string> {
  const payload = new FormData();
  payload.append("file", blob, filename);

  const response = await api.post<{ text: string }>(
    "/ai-chat/transcribe",
    payload,
    { headers: { "Content-Type": "multipart/form-data" } },
  );
  return response.data.text;
}
|
||||
@@ -13,6 +13,7 @@ import { QuickInsertsGroup } from "./groups/quick-inserts-group";
|
||||
import { MoreInsertsGroup } from "./groups/more-inserts-group";
|
||||
import { HistoryGroup } from "./groups/history-group";
|
||||
import { AskAiGroup } from "./groups/ask-ai-group";
|
||||
import { DictationGroup } from "./groups/dictation-group";
|
||||
import { workspaceAtom } from "@/features/user/atoms/current-user-atom";
|
||||
import classes from "./fixed-toolbar.module.css";
|
||||
|
||||
@@ -30,6 +31,7 @@ export const FixedToolbar: FC<FixedToolbarProps> = ({
|
||||
const state = useToolbarState(editor);
|
||||
const workspace = useAtomValue(workspaceAtom);
|
||||
const isGenerativeAiEnabled = workspace?.settings?.ai?.generative === true;
|
||||
const isDictationEnabled = workspace?.settings?.ai?.dictation === true;
|
||||
|
||||
if (!editor || !state) return null;
|
||||
|
||||
@@ -65,6 +67,12 @@ export const FixedToolbar: FC<FixedToolbarProps> = ({
|
||||
<MoreInsertsGroup editor={editor} templateMode={templateMode} />
|
||||
<div className={classes.divider} />
|
||||
<HistoryGroup editor={editor} state={state} />
|
||||
{isDictationEnabled && (
|
||||
<>
|
||||
<div className={classes.divider} />
|
||||
<DictationGroup editor={editor} />
|
||||
</>
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
<div className={classes.spacer} aria-hidden />
|
||||
|
||||
@@ -0,0 +1,61 @@
|
||||
import { FC, useRef } from "react";
|
||||
import type { Editor } from "@tiptap/react";
|
||||
import { MicButton } from "@/features/dictation/components/mic-button";
|
||||
|
||||
/** Props for the editor-toolbar dictation group. */
interface Props {
  /** Editor instance that receives the dictated text. */
  editor: Editor;
}
|
||||
|
||||
/**
 * Editor-toolbar dictation control. Remembers the selection at the moment
 * recording starts and, once the transcription arrives, inserts the text (plus
 * a trailing space) over that remembered range.
 */
export const DictationGroup: FC<Props> = ({ editor }) => {
  const savedSelection = useRef<{ from: number; to: number } | null>(null);

  // Snapshot the selection when recording begins.
  const rememberSelection = () => {
    const selection = editor.state.selection;
    savedSelection.current = { from: selection.from, to: selection.to };
  };

  const insertTranscript = (text: string) => {
    // The editor may be gone by the time async transcription returns; bail out
    // instead of operating on a destroyed instance.
    if (!editor || editor.isDestroyed) return;

    const target = savedSelection.current;
    savedSelection.current = null;

    // The document may have shrunk during transcription (e.g. a collaborative
    // edit), so the remembered positions are clamped into the current bounds.
    const upperBound = editor.state.doc.content.size;
    const withinDoc = (pos: number) => Math.max(0, Math.min(pos, upperBound));

    // A trailing space keeps words separated (the hook already trims the
    // transcribed text).
    const insertAtCaret = () =>
      editor.chain().focus().insertContent(`${text} `).run();

    try {
      if (target === null) {
        insertAtCaret();
        return;
      }
      editor
        .chain()
        .focus()
        .insertContentAt(
          { from: withinDoc(target.from), to: withinDoc(target.to) },
          `${text} `,
        )
        .run();
    } catch {
      // The remembered range drifted out of range; fall back to the current
      // caret.
      try {
        insertAtCaret();
      } catch {
        // The editor may have been destroyed; ignore so a dead editor can't
        // surface an uncaught error.
      }
    }
  };

  return (
    <MicButton
      size="md"
      onStart={rememberSelection}
      onText={insertTranscript}
      disabled={!editor.isEditable}
    />
  );
};
|
||||
@@ -47,6 +47,10 @@ const formSchema = z.object({
|
||||
systemPrompt: z.string(),
|
||||
apiKey: z.string(),
|
||||
embeddingApiKey: z.string(),
|
||||
// STT-specific fields. Empty base URL / key fall back to the chat ones.
|
||||
sttModel: z.string(),
|
||||
sttBaseUrl: z.string(),
|
||||
sttApiKey: z.string(),
|
||||
});
|
||||
|
||||
type FormValues = z.infer<typeof formSchema>;
|
||||
@@ -101,8 +105,12 @@ export default function AiProviderSettings() {
|
||||
const [searchEnabled, setSearchEnabled] = useState<boolean>(
|
||||
workspace?.settings?.ai?.search ?? false,
|
||||
);
|
||||
const [dictationEnabled, setDictationEnabled] = useState<boolean>(
|
||||
workspace?.settings?.ai?.dictation ?? false,
|
||||
);
|
||||
const [chatToggleLoading, setChatToggleLoading] = useState(false);
|
||||
const [searchToggleLoading, setSearchToggleLoading] = useState(false);
|
||||
const [dictationToggleLoading, setDictationToggleLoading] = useState(false);
|
||||
|
||||
// Whether a key is currently stored server-side (drives the placeholder).
|
||||
const [hasApiKey, setHasApiKey] = useState(false);
|
||||
@@ -111,6 +119,9 @@ export default function AiProviderSettings() {
|
||||
// Same, for the embedding-specific key.
|
||||
const [hasEmbeddingApiKey, setHasEmbeddingApiKey] = useState(false);
|
||||
const [embeddingKeyCleared, setEmbeddingKeyCleared] = useState(false);
|
||||
// Same, for the STT-specific key.
|
||||
const [hasSttApiKey, setHasSttApiKey] = useState(false);
|
||||
const [sttKeyCleared, setSttKeyCleared] = useState(false);
|
||||
|
||||
// Modal for the (large) system message editor.
|
||||
const [promptOpened, promptHandlers] = useDisclosure(false);
|
||||
@@ -125,6 +136,9 @@ export default function AiProviderSettings() {
|
||||
systemPrompt: "",
|
||||
apiKey: "",
|
||||
embeddingApiKey: "",
|
||||
sttModel: "",
|
||||
sttBaseUrl: "",
|
||||
sttApiKey: "",
|
||||
},
|
||||
});
|
||||
|
||||
@@ -140,12 +154,17 @@ export default function AiProviderSettings() {
|
||||
systemPrompt: settings.systemPrompt ?? "",
|
||||
apiKey: "",
|
||||
embeddingApiKey: "",
|
||||
sttModel: settings.sttModel ?? "",
|
||||
sttBaseUrl: settings.sttBaseUrl ?? "",
|
||||
sttApiKey: "",
|
||||
});
|
||||
form.resetDirty();
|
||||
setHasApiKey(settings.hasApiKey);
|
||||
setKeyCleared(false);
|
||||
setHasEmbeddingApiKey(settings.hasEmbeddingApiKey);
|
||||
setEmbeddingKeyCleared(false);
|
||||
setHasSttApiKey(settings.hasSttApiKey);
|
||||
setSttKeyCleared(false);
|
||||
// eslint-disable-next-line react-hooks/exhaustive-deps
|
||||
}, [settings]);
|
||||
|
||||
@@ -160,6 +179,10 @@ export default function AiProviderSettings() {
|
||||
baseUrl: values.baseUrl,
|
||||
embeddingBaseUrl: values.embeddingBaseUrl,
|
||||
systemPrompt: values.systemPrompt,
|
||||
// The STT base URL is optional; empty falls back to the chat base URL
|
||||
// server-side.
|
||||
sttModel: values.sttModel,
|
||||
sttBaseUrl: values.sttBaseUrl,
|
||||
};
|
||||
|
||||
// Key semantics (never send the stored key back):
|
||||
@@ -179,6 +202,13 @@ export default function AiProviderSettings() {
|
||||
payload.embeddingApiKey = "";
|
||||
}
|
||||
|
||||
// Same write-only semantics for the STT-specific key.
|
||||
if (values.sttApiKey.length > 0) {
|
||||
payload.sttApiKey = values.sttApiKey;
|
||||
} else if (sttKeyCleared) {
|
||||
payload.sttApiKey = "";
|
||||
}
|
||||
|
||||
return payload;
|
||||
}
|
||||
|
||||
@@ -191,6 +221,9 @@ export default function AiProviderSettings() {
|
||||
setHasEmbeddingApiKey(updated.hasEmbeddingApiKey);
|
||||
setEmbeddingKeyCleared(false);
|
||||
form.setFieldValue("embeddingApiKey", "");
|
||||
setHasSttApiKey(updated.hasSttApiKey);
|
||||
setSttKeyCleared(false);
|
||||
form.setFieldValue("sttApiKey", "");
|
||||
form.resetDirty();
|
||||
}
|
||||
|
||||
@@ -206,6 +239,12 @@ export default function AiProviderSettings() {
|
||||
form.setFieldValue("embeddingApiKey", "");
|
||||
}
|
||||
|
||||
// Flags the stored STT key for removal: empties the key input, hides the
// "key is set" indicator and records that the key was cleared so the save
// payload sends an empty `sttApiKey`.
function handleClearSttKey() {
  form.setFieldValue("sttApiKey", "");
  setHasSttApiKey(false);
  setSttKeyCleared(true);
}
|
||||
|
||||
// Optimistic toggle for the "AI chat" feature (settings.ai.chat).
|
||||
async function handleToggleChat(value: boolean) {
|
||||
setChatToggleLoading(true);
|
||||
@@ -268,6 +307,34 @@ export default function AiProviderSettings() {
|
||||
}
|
||||
}
|
||||
|
||||
// Optimistic toggle for the "Voice dictation" feature (settings.ai.dictation):
// the switch flips immediately, the workspace is updated via `aiDictation`,
// and on failure the switch is rolled back and the server's message (or a
// generic one) is shown. The loading flag is cleared in every case.
async function handleToggleDictation(value: boolean) {
  const rollbackValue = dictationEnabled;
  setDictationToggleLoading(true);
  setDictationEnabled(value);

  try {
    const saved = await updateWorkspace({ aiDictation: value });
    // Force the flag in the cached workspace to the value that was requested.
    const aiSettings = { ...saved.settings?.ai, dictation: value };
    setWorkspace({
      ...saved,
      settings: { ...saved.settings, ai: aiSettings },
    });
    notifications.show({ message: t("Updated successfully") });
  } catch (err) {
    setDictationEnabled(rollbackValue);
    const serverMessage = (
      err as { response?: { data?: { message?: string } } }
    )?.response?.data?.message;
    notifications.show({
      message: serverMessage ?? t("Failed to update data"),
      color: "red",
    });
  } finally {
    setDictationToggleLoading(false);
  }
}
|
||||
|
||||
// Admins only — match the previous behavior.
|
||||
if (!isAdmin) {
|
||||
return (
|
||||
@@ -294,6 +361,11 @@ export default function AiProviderSettings() {
|
||||
"/embeddings",
|
||||
form.values.baseUrl,
|
||||
);
|
||||
const sttResolved = resolveUrl(
|
||||
form.values.sttBaseUrl,
|
||||
"/audio/transcriptions",
|
||||
form.values.baseUrl,
|
||||
);
|
||||
|
||||
const monoFont = "ui-monospace, Menlo, monospace";
|
||||
|
||||
@@ -541,8 +613,8 @@ export default function AiProviderSettings() {
|
||||
</Box>
|
||||
</Paper>
|
||||
|
||||
{/* Card 3 — Voice / STT (disabled stub, not wired to the form/backend) */}
|
||||
<Paper withBorder radius="md" p="lg" opacity={0.6}>
|
||||
{/* Card 3 — Voice / STT */}
|
||||
<Paper withBorder radius="md" p="lg">
|
||||
<Group justify="space-between" align="center" wrap="nowrap">
|
||||
<Group gap="xs" align="center" wrap="nowrap">
|
||||
<StatusDot status="idle" />
|
||||
@@ -551,8 +623,9 @@ export default function AiProviderSettings() {
|
||||
<Switch
|
||||
label={t("Voice dictation")}
|
||||
labelPosition="left"
|
||||
checked={false}
|
||||
disabled
|
||||
checked={dictationEnabled}
|
||||
disabled={dictationToggleLoading}
|
||||
onChange={(e) => handleToggleDictation(e.currentTarget.checked)}
|
||||
/>
|
||||
</Group>
|
||||
<Text size="xs" c="dimmed" mt={4} mb="md">
|
||||
@@ -562,33 +635,46 @@ export default function AiProviderSettings() {
|
||||
</Text>
|
||||
|
||||
<Group grow align="flex-start">
|
||||
<TextInput label={t("Model")} value="" disabled readOnly />
|
||||
<PasswordInput label={t("API key")} value="" disabled readOnly />
|
||||
</Group>
|
||||
<TextInput mt="sm" label={t("Base URL")} value="" disabled readOnly />
|
||||
|
||||
<Group mt="md">
|
||||
<Button variant="default" size="sm" disabled>
|
||||
{t("Test endpoint")}
|
||||
</Button>
|
||||
<TextInput
|
||||
label={t("Model")}
|
||||
disabled={isLoading}
|
||||
{...form.getInputProps("sttModel")}
|
||||
/>
|
||||
<Stack gap={4}>
|
||||
<PasswordInput
|
||||
label={t("API key")}
|
||||
placeholder={
|
||||
hasSttApiKey
|
||||
? t("•••• set")
|
||||
: t("Leave empty to use the chat API key")
|
||||
}
|
||||
autoComplete="off"
|
||||
{...form.getInputProps("sttApiKey")}
|
||||
/>
|
||||
{hasSttApiKey && (
|
||||
<Anchor
|
||||
component="button"
|
||||
type="button"
|
||||
c="red"
|
||||
size="xs"
|
||||
onClick={handleClearSttKey}
|
||||
>
|
||||
{t("Clear")}
|
||||
</Anchor>
|
||||
)}
|
||||
</Stack>
|
||||
</Group>
|
||||
|
||||
<Box
|
||||
mt="md"
|
||||
mx="calc(var(--mantine-spacing-lg) * -1)"
|
||||
mb="calc(var(--mantine-spacing-lg) * -1)"
|
||||
px="lg"
|
||||
py="md"
|
||||
style={{
|
||||
borderTop: "1px solid var(--mantine-color-default-border)",
|
||||
background: "var(--mantine-color-default-hover)",
|
||||
borderRadius: "0 0 var(--mantine-radius-md) var(--mantine-radius-md)",
|
||||
}}
|
||||
>
|
||||
<Text size="xs" c="dimmed">
|
||||
{t("Voice dictation is not available yet.")}
|
||||
</Text>
|
||||
</Box>
|
||||
<TextInput
|
||||
mt="sm"
|
||||
label={t("Base URL")}
|
||||
placeholder={t("Leave empty to use the chat base URL")}
|
||||
disabled={isLoading}
|
||||
{...form.getInputProps("sttBaseUrl")}
|
||||
/>
|
||||
<Text size="xs" c="dimmed" mt={4} style={{ fontFamily: monoFont }} truncate>
|
||||
{t("Resolves to {{url}}", { url: sttResolved })}
|
||||
</Text>
|
||||
</Paper>
|
||||
|
||||
{/* Nested: external MCP tools the agent calls out to */}
|
||||
|
||||
@@ -16,6 +16,12 @@ export interface IAiSettings {
|
||||
systemPrompt?: string;
|
||||
hasApiKey: boolean;
|
||||
hasEmbeddingApiKey: boolean;
|
||||
// STT-specific settings. `sttBaseUrl` is the RAW stored value (empty means
|
||||
// "uses the chat base URL"). `hasSttApiKey` indicates whether an STT-specific
|
||||
// key is stored (empty means "uses the chat API key").
|
||||
sttModel?: string;
|
||||
sttBaseUrl?: string;
|
||||
hasSttApiKey: boolean;
|
||||
// RAG indexing coverage (pages indexed for semantic search).
|
||||
indexedPages: number;
|
||||
totalPages: number;
|
||||
@@ -35,6 +41,10 @@ export interface IAiSettingsUpdate {
|
||||
systemPrompt?: string;
|
||||
apiKey?: string;
|
||||
embeddingApiKey?: string;
|
||||
sttModel?: string;
|
||||
sttBaseUrl?: string;
|
||||
// Write-only STT key (same semantics as `apiKey` / `embeddingApiKey`).
|
||||
sttApiKey?: string;
|
||||
}
|
||||
|
||||
// Result of a connection test against the configured provider.
|
||||
|
||||
@@ -24,6 +24,7 @@ export interface IWorkspace {
|
||||
disablePublicSharing?: boolean;
|
||||
mcpEnabled?: boolean;
|
||||
aiChat?: boolean;
|
||||
aiDictation?: boolean;
|
||||
trashRetentionDays?: number;
|
||||
restrictApiToAdmins?: boolean;
|
||||
allowMemberTemplates?: boolean;
|
||||
@@ -46,6 +47,7 @@ export interface IWorkspaceAiSettings {
|
||||
generative?: boolean;
|
||||
mcp?: boolean;
|
||||
chat?: boolean;
|
||||
dictation?: boolean;
|
||||
}
|
||||
|
||||
export interface IWorkspaceSharingSettings {
|
||||
|
||||
Reference in New Issue
Block a user