feat(ai): server-side voice dictation (STT) with mic in chat and editor
Add push-to-talk voice dictation that transcribes recorded audio on the server via the workspace's OpenAI-compatible AI provider (Whisper / gpt-4o-transcribe / self-hosted whisper), then inserts the text. Backend: - New `stt_api_key_enc` column + migration; STT creds parity with chat/ embeddings (sttModel/sttBaseUrl/sttApiKey, write-only key, fallbacks to chat baseUrl/key). Both provider whitelists updated (service + repo). - AiService.getTranscriptionModel + AiTranscriptionService. - Gated POST /ai-chat/transcribe (dictation flag → 403, JWT + workspace scope + throttle, 25MB cap, MIME whitelist, never logs audio/key). - New `settings.ai.dictation` workspace flag (DTO + service + audit). Frontend: - Wire up the Voice/STT settings card (model/base URL/key) and the Voice-dictation toggle. - New `features/dictation`: useDictation (MediaRecorder state machine), MicButton, transcribe service; integrated into the chat composer and a new editor-toolbar dictation group, both gated by ai.dictation.
This commit is contained in:
@@ -47,6 +47,10 @@ const formSchema = z.object({
|
||||
systemPrompt: z.string(),
|
||||
apiKey: z.string(),
|
||||
embeddingApiKey: z.string(),
|
||||
// STT-specific fields. Empty base URL / key fall back to the chat ones.
|
||||
sttModel: z.string(),
|
||||
sttBaseUrl: z.string(),
|
||||
sttApiKey: z.string(),
|
||||
});
|
||||
|
||||
type FormValues = z.infer<typeof formSchema>;
|
||||
@@ -101,8 +105,12 @@ export default function AiProviderSettings() {
|
||||
const [searchEnabled, setSearchEnabled] = useState<boolean>(
|
||||
workspace?.settings?.ai?.search ?? false,
|
||||
);
|
||||
const [dictationEnabled, setDictationEnabled] = useState<boolean>(
|
||||
workspace?.settings?.ai?.dictation ?? false,
|
||||
);
|
||||
const [chatToggleLoading, setChatToggleLoading] = useState(false);
|
||||
const [searchToggleLoading, setSearchToggleLoading] = useState(false);
|
||||
const [dictationToggleLoading, setDictationToggleLoading] = useState(false);
|
||||
|
||||
// Whether a key is currently stored server-side (drives the placeholder).
|
||||
const [hasApiKey, setHasApiKey] = useState(false);
|
||||
@@ -111,6 +119,9 @@ export default function AiProviderSettings() {
|
||||
// Same, for the embedding-specific key.
|
||||
const [hasEmbeddingApiKey, setHasEmbeddingApiKey] = useState(false);
|
||||
const [embeddingKeyCleared, setEmbeddingKeyCleared] = useState(false);
|
||||
// Same, for the STT-specific key.
|
||||
const [hasSttApiKey, setHasSttApiKey] = useState(false);
|
||||
const [sttKeyCleared, setSttKeyCleared] = useState(false);
|
||||
|
||||
// Modal for the (large) system message editor.
|
||||
const [promptOpened, promptHandlers] = useDisclosure(false);
|
||||
@@ -125,6 +136,9 @@ export default function AiProviderSettings() {
|
||||
systemPrompt: "",
|
||||
apiKey: "",
|
||||
embeddingApiKey: "",
|
||||
sttModel: "",
|
||||
sttBaseUrl: "",
|
||||
sttApiKey: "",
|
||||
},
|
||||
});
|
||||
|
||||
@@ -140,12 +154,17 @@ export default function AiProviderSettings() {
|
||||
systemPrompt: settings.systemPrompt ?? "",
|
||||
apiKey: "",
|
||||
embeddingApiKey: "",
|
||||
sttModel: settings.sttModel ?? "",
|
||||
sttBaseUrl: settings.sttBaseUrl ?? "",
|
||||
sttApiKey: "",
|
||||
});
|
||||
form.resetDirty();
|
||||
setHasApiKey(settings.hasApiKey);
|
||||
setKeyCleared(false);
|
||||
setHasEmbeddingApiKey(settings.hasEmbeddingApiKey);
|
||||
setEmbeddingKeyCleared(false);
|
||||
setHasSttApiKey(settings.hasSttApiKey);
|
||||
setSttKeyCleared(false);
|
||||
// eslint-disable-next-line react-hooks/exhaustive-deps
|
||||
}, [settings]);
|
||||
|
||||
@@ -160,6 +179,10 @@ export default function AiProviderSettings() {
|
||||
baseUrl: values.baseUrl,
|
||||
embeddingBaseUrl: values.embeddingBaseUrl,
|
||||
systemPrompt: values.systemPrompt,
|
||||
// The STT base URL is optional; empty falls back to the chat base URL
|
||||
// server-side.
|
||||
sttModel: values.sttModel,
|
||||
sttBaseUrl: values.sttBaseUrl,
|
||||
};
|
||||
|
||||
// Key semantics (never send the stored key back):
|
||||
@@ -179,6 +202,13 @@ export default function AiProviderSettings() {
|
||||
payload.embeddingApiKey = "";
|
||||
}
|
||||
|
||||
// Same write-only semantics for the STT-specific key.
|
||||
if (values.sttApiKey.length > 0) {
|
||||
payload.sttApiKey = values.sttApiKey;
|
||||
} else if (sttKeyCleared) {
|
||||
payload.sttApiKey = "";
|
||||
}
|
||||
|
||||
return payload;
|
||||
}
|
||||
|
||||
@@ -191,6 +221,9 @@ export default function AiProviderSettings() {
|
||||
setHasEmbeddingApiKey(updated.hasEmbeddingApiKey);
|
||||
setEmbeddingKeyCleared(false);
|
||||
form.setFieldValue("embeddingApiKey", "");
|
||||
setHasSttApiKey(updated.hasSttApiKey);
|
||||
setSttKeyCleared(false);
|
||||
form.setFieldValue("sttApiKey", "");
|
||||
form.resetDirty();
|
||||
}
|
||||
|
||||
@@ -206,6 +239,12 @@ export default function AiProviderSettings() {
|
||||
form.setFieldValue("embeddingApiKey", "");
|
||||
}
|
||||
|
||||
function handleClearSttKey() {
|
||||
setSttKeyCleared(true);
|
||||
setHasSttApiKey(false);
|
||||
form.setFieldValue("sttApiKey", "");
|
||||
}
|
||||
|
||||
// Optimistic toggle for the "AI chat" feature (settings.ai.chat).
|
||||
async function handleToggleChat(value: boolean) {
|
||||
setChatToggleLoading(true);
|
||||
@@ -268,6 +307,34 @@ export default function AiProviderSettings() {
|
||||
}
|
||||
}
|
||||
|
||||
// Optimistic toggle for the "Voice dictation" feature (settings.ai.dictation).
|
||||
async function handleToggleDictation(value: boolean) {
|
||||
setDictationToggleLoading(true);
|
||||
const previous = dictationEnabled;
|
||||
setDictationEnabled(value);
|
||||
try {
|
||||
const updated = await updateWorkspace({ aiDictation: value });
|
||||
setWorkspace({
|
||||
...updated,
|
||||
settings: {
|
||||
...updated.settings,
|
||||
ai: { ...updated.settings?.ai, dictation: value },
|
||||
},
|
||||
});
|
||||
notifications.show({ message: t("Updated successfully") });
|
||||
} catch (err) {
|
||||
setDictationEnabled(previous);
|
||||
const message = (err as { response?: { data?: { message?: string } } })
|
||||
?.response?.data?.message;
|
||||
notifications.show({
|
||||
message: message ?? t("Failed to update data"),
|
||||
color: "red",
|
||||
});
|
||||
} finally {
|
||||
setDictationToggleLoading(false);
|
||||
}
|
||||
}
|
||||
|
||||
// Admins only — match the previous behavior.
|
||||
if (!isAdmin) {
|
||||
return (
|
||||
@@ -294,6 +361,11 @@ export default function AiProviderSettings() {
|
||||
"/embeddings",
|
||||
form.values.baseUrl,
|
||||
);
|
||||
const sttResolved = resolveUrl(
|
||||
form.values.sttBaseUrl,
|
||||
"/audio/transcriptions",
|
||||
form.values.baseUrl,
|
||||
);
|
||||
|
||||
const monoFont = "ui-monospace, Menlo, monospace";
|
||||
|
||||
@@ -541,8 +613,8 @@ export default function AiProviderSettings() {
|
||||
</Box>
|
||||
</Paper>
|
||||
|
||||
{/* Card 3 — Voice / STT (disabled stub, not wired to the form/backend) */}
|
||||
<Paper withBorder radius="md" p="lg" opacity={0.6}>
|
||||
{/* Card 3 — Voice / STT */}
|
||||
<Paper withBorder radius="md" p="lg">
|
||||
<Group justify="space-between" align="center" wrap="nowrap">
|
||||
<Group gap="xs" align="center" wrap="nowrap">
|
||||
<StatusDot status="idle" />
|
||||
@@ -551,8 +623,9 @@ export default function AiProviderSettings() {
|
||||
<Switch
|
||||
label={t("Voice dictation")}
|
||||
labelPosition="left"
|
||||
checked={false}
|
||||
disabled
|
||||
checked={dictationEnabled}
|
||||
disabled={dictationToggleLoading}
|
||||
onChange={(e) => handleToggleDictation(e.currentTarget.checked)}
|
||||
/>
|
||||
</Group>
|
||||
<Text size="xs" c="dimmed" mt={4} mb="md">
|
||||
@@ -562,33 +635,46 @@ export default function AiProviderSettings() {
|
||||
</Text>
|
||||
|
||||
<Group grow align="flex-start">
|
||||
<TextInput label={t("Model")} value="" disabled readOnly />
|
||||
<PasswordInput label={t("API key")} value="" disabled readOnly />
|
||||
</Group>
|
||||
<TextInput mt="sm" label={t("Base URL")} value="" disabled readOnly />
|
||||
|
||||
<Group mt="md">
|
||||
<Button variant="default" size="sm" disabled>
|
||||
{t("Test endpoint")}
|
||||
</Button>
|
||||
<TextInput
|
||||
label={t("Model")}
|
||||
disabled={isLoading}
|
||||
{...form.getInputProps("sttModel")}
|
||||
/>
|
||||
<Stack gap={4}>
|
||||
<PasswordInput
|
||||
label={t("API key")}
|
||||
placeholder={
|
||||
hasSttApiKey
|
||||
? t("•••• set")
|
||||
: t("Leave empty to use the chat API key")
|
||||
}
|
||||
autoComplete="off"
|
||||
{...form.getInputProps("sttApiKey")}
|
||||
/>
|
||||
{hasSttApiKey && (
|
||||
<Anchor
|
||||
component="button"
|
||||
type="button"
|
||||
c="red"
|
||||
size="xs"
|
||||
onClick={handleClearSttKey}
|
||||
>
|
||||
{t("Clear")}
|
||||
</Anchor>
|
||||
)}
|
||||
</Stack>
|
||||
</Group>
|
||||
|
||||
<Box
|
||||
mt="md"
|
||||
mx="calc(var(--mantine-spacing-lg) * -1)"
|
||||
mb="calc(var(--mantine-spacing-lg) * -1)"
|
||||
px="lg"
|
||||
py="md"
|
||||
style={{
|
||||
borderTop: "1px solid var(--mantine-color-default-border)",
|
||||
background: "var(--mantine-color-default-hover)",
|
||||
borderRadius: "0 0 var(--mantine-radius-md) var(--mantine-radius-md)",
|
||||
}}
|
||||
>
|
||||
<Text size="xs" c="dimmed">
|
||||
{t("Voice dictation is not available yet.")}
|
||||
</Text>
|
||||
</Box>
|
||||
<TextInput
|
||||
mt="sm"
|
||||
label={t("Base URL")}
|
||||
placeholder={t("Leave empty to use the chat base URL")}
|
||||
disabled={isLoading}
|
||||
{...form.getInputProps("sttBaseUrl")}
|
||||
/>
|
||||
<Text size="xs" c="dimmed" mt={4} style={{ fontFamily: monoFont }} truncate>
|
||||
{t("Resolves to {{url}}", { url: sttResolved })}
|
||||
</Text>
|
||||
</Paper>
|
||||
|
||||
{/* Nested: external MCP tools the agent calls out to */}
|
||||
|
||||
@@ -16,6 +16,12 @@ export interface IAiSettings {
|
||||
systemPrompt?: string;
|
||||
hasApiKey: boolean;
|
||||
hasEmbeddingApiKey: boolean;
|
||||
// STT-specific settings. `sttBaseUrl` is the RAW stored value (empty means
|
||||
// "uses the chat base URL"). `hasSttApiKey` indicates whether an STT-specific
|
||||
// key is stored (empty means "uses the chat API key").
|
||||
sttModel?: string;
|
||||
sttBaseUrl?: string;
|
||||
hasSttApiKey: boolean;
|
||||
// RAG indexing coverage (pages indexed for semantic search).
|
||||
indexedPages: number;
|
||||
totalPages: number;
|
||||
@@ -35,6 +41,10 @@ export interface IAiSettingsUpdate {
|
||||
systemPrompt?: string;
|
||||
apiKey?: string;
|
||||
embeddingApiKey?: string;
|
||||
sttModel?: string;
|
||||
sttBaseUrl?: string;
|
||||
// Write-only STT key (same semantics as `apiKey` / `embeddingApiKey`).
|
||||
sttApiKey?: string;
|
||||
}
|
||||
|
||||
// Result of a connection test against the configured provider.
|
||||
|
||||
@@ -24,6 +24,7 @@ export interface IWorkspace {
|
||||
disablePublicSharing?: boolean;
|
||||
mcpEnabled?: boolean;
|
||||
aiChat?: boolean;
|
||||
aiDictation?: boolean;
|
||||
trashRetentionDays?: number;
|
||||
restrictApiToAdmins?: boolean;
|
||||
allowMemberTemplates?: boolean;
|
||||
@@ -46,6 +47,7 @@ export interface IWorkspaceAiSettings {
|
||||
generative?: boolean;
|
||||
mcp?: boolean;
|
||||
chat?: boolean;
|
||||
dictation?: boolean;
|
||||
}
|
||||
|
||||
export interface IWorkspaceSharingSettings {
|
||||
|
||||
Reference in New Issue
Block a user