feat(dictation): add realtime streaming STT (live dictation)
Layer an optional realtime speech-to-text path on top of the existing batch dictation, so transcribed text appears as the user speaks.

Transport A2: browser <-> our server (Socket.IO `/ai-realtime`) <-> OpenAI Realtime (raw ws). The provider API key never leaves the server; the upstream URL is SSRF-checked before connecting; the gateway enforces the dictation+dictationRealtime gate, cookie-JWT auth and per-user/per-workspace concurrency caps. Implemented against the GA (2026) OpenAI Realtime transcription contract (session.update / audio.input.format / server_vad), not the now-removed beta shape.

Editor UI B2: interim text is shown as a meta-only ProseMirror ghost decoration (no Yjs/history noise); only completed segments are committed. Chat shows interim as a dimmed tail. The mic button switches realtime vs batch by the workspace flag; batch remains the default and fallback.

Server:
- AiRealtimeService (upstream ws proxy, normalized events, idle/max-duration timeouts, idempotent teardown) + parseUpstreamEvent unit tests
- AiRealtimeGateway (Socket.IO `/ai-realtime`) wired into AiChatModule
- admin-gated POST /ai-chat/realtime/test connectivity probe
- config: settings.ai.dictationRealtime + provider sttRealtimeModel/sttRealtimeBaseUrl (realtime key reuses sttApiKey; no new secret)

Client:
- pcm16 AudioWorklet (24kHz mono PCM16), RealtimeDictationClient, use-realtime-dictation hook (status/start/stop/cancel + onInterim/onFinal)
- RealtimeMicButton + dictation-interim ProseMirror decoration
- editor/chat integration + AI settings UI (toggle, model, test endpoint)

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
committed by
claude code agent 227
parent
74e2b7ad7f
commit
7db3f007cb
@@ -1,11 +1,19 @@
|
||||
import { KeyboardEvent } from "react";
|
||||
import { ActionIcon, Group, Textarea, Tooltip } from "@mantine/core";
|
||||
import { KeyboardEvent, useState } from "react";
|
||||
import {
|
||||
ActionIcon,
|
||||
Group,
|
||||
Stack,
|
||||
Text,
|
||||
Textarea,
|
||||
Tooltip,
|
||||
} from "@mantine/core";
|
||||
import { IconPlayerStopFilled, IconSend } from "@tabler/icons-react";
|
||||
import { useTranslation } from "react-i18next";
|
||||
import { useAtom, useAtomValue } from "jotai";
|
||||
import { aiChatDraftAtom } from "@/features/ai-chat/atoms/ai-chat-atom.ts";
|
||||
import { workspaceAtom } from "@/features/user/atoms/current-user-atom";
|
||||
import { MicButton } from "@/features/dictation/components/mic-button";
|
||||
import { RealtimeMicButton } from "@/features/dictation/components/realtime-mic-button";
|
||||
|
||||
interface ChatInputProps {
|
||||
onSend: (text: string) => void;
|
||||
@@ -29,12 +37,17 @@ export default function ChatInput({
|
||||
const [value, setValue] = useAtom(aiChatDraftAtom);
|
||||
const workspace = useAtomValue(workspaceAtom);
|
||||
const isDictationEnabled = workspace?.settings?.ai?.dictation === true;
|
||||
const isRealtime = workspace?.settings?.ai?.dictationRealtime === true;
|
||||
// Live interim (partial) transcript shown as a dimmed tail under the input.
|
||||
const [interim, setInterim] = useState("");
|
||||
|
||||
const send = (): void => {
|
||||
const text = value.trim();
|
||||
if (!text || isStreaming || disabled) return;
|
||||
onSend(text);
|
||||
setValue("");
|
||||
// Drop any leftover partial when a message is sent.
|
||||
setInterim("");
|
||||
};
|
||||
|
||||
const handleKeyDown = (e: KeyboardEvent<HTMLTextAreaElement>): void => {
|
||||
@@ -45,7 +58,8 @@ export default function ChatInput({
|
||||
};
|
||||
|
||||
return (
|
||||
<Group gap="xs" align="flex-end" wrap="nowrap">
|
||||
<Stack gap="xs">
|
||||
<Group gap="xs" align="flex-end" wrap="nowrap">
|
||||
<Textarea
|
||||
style={{ flex: 1 }}
|
||||
placeholder={t("Ask the AI agent…")}
|
||||
@@ -61,13 +75,24 @@ export default function ChatInput({
|
||||
// switch), so a fresh chat lands with the cursor ready in the field.
|
||||
autoFocus
|
||||
/>
|
||||
{isDictationEnabled && (
|
||||
<MicButton
|
||||
size="lg"
|
||||
disabled={isStreaming || disabled}
|
||||
onText={(text) => setValue((v) => (v ? `${v} ${text}` : text))}
|
||||
/>
|
||||
)}
|
||||
{isDictationEnabled &&
|
||||
(isRealtime ? (
|
||||
<RealtimeMicButton
|
||||
size="lg"
|
||||
disabled={isStreaming || disabled}
|
||||
onInterim={(text) => setInterim(text)}
|
||||
onFinal={(text) => {
|
||||
setValue((v) => (v ? `${v} ${text}` : text));
|
||||
setInterim("");
|
||||
}}
|
||||
/>
|
||||
) : (
|
||||
<MicButton
|
||||
size="lg"
|
||||
disabled={isStreaming || disabled}
|
||||
onText={(text) => setValue((v) => (v ? `${v} ${text}` : text))}
|
||||
/>
|
||||
))}
|
||||
{isStreaming ? (
|
||||
<Tooltip label={t("Stop")} withArrow>
|
||||
<ActionIcon
|
||||
@@ -93,6 +118,12 @@ export default function ChatInput({
|
||||
</ActionIcon>
|
||||
</Tooltip>
|
||||
)}
|
||||
</Group>
|
||||
</Group>
|
||||
{interim && (
|
||||
<Text size="sm" c="dimmed">
|
||||
{interim}
|
||||
</Text>
|
||||
)}
|
||||
</Stack>
|
||||
);
|
||||
}
|
||||
|
||||
33
apps/client/src/features/dictation/audio/audio-worklet.d.ts
vendored
Normal file
33
apps/client/src/features/dictation/audio/audio-worklet.d.ts
vendored
Normal file
@@ -0,0 +1,33 @@
|
||||
// Minimal ambient declarations for the AudioWorklet global scope.
|
||||
//
|
||||
// The client tsconfig only pulls in the DOM libs (no "webworker"/"audioworklet"
|
||||
// lib), so the symbols available inside an AudioWorkletProcessor module are not
|
||||
// known to `tsc`. These declarations are intentionally narrow: just enough for
|
||||
// `pcm16-worklet.ts` to typecheck, matching the Web Audio API spec shapes used
|
||||
// by that processor. They describe the worklet global scope, not the main thread.
|
||||
|
||||
/**
 * Ambient shape of the base class every worklet processor extends. Only the
 * members used by `pcm16-worklet.ts` are declared.
 */
declare abstract class AudioWorkletProcessor {
  /** Message port back to the main thread (used to transfer PCM16 buffers). */
  readonly port: MessagePort;

  constructor();

  /**
   * Called for each render quantum.
   *
   * @param inputs Channel data indexed as [input][channel][sample].
   * @param outputs Channel data indexed as [output][channel][sample].
   * @param parameters AudioParam name mapped to its per-sample (or
   *   single-value) Float32Array.
   * @returns `true` to keep the processor alive.
   */
  abstract process(
    inputs: Array<Array<Float32Array>>,
    outputs: Array<Array<Float32Array>>,
    parameters: { [name: string]: Float32Array },
  ): boolean;
}
|
||||
|
||||
/**
 * Registers a processor class under a name usable from `new AudioWorkletNode`.
 *
 * @param name Identifier later passed to the `AudioWorkletNode` constructor.
 * @param processorCtor Zero-argument constructor of the processor class.
 */
declare function registerProcessor(
  name: string,
  processorCtor: { new (): AudioWorkletProcessor },
): void;
|
||||
|
||||
// The render context's sample rate, in Hz, available in the worklet global
// scope. `pcm16-worklet.ts` reads it to derive its resampling ratio.
declare const sampleRate: number;
|
||||
123
apps/client/src/features/dictation/audio/pcm16-worklet.ts
Normal file
123
apps/client/src/features/dictation/audio/pcm16-worklet.ts
Normal file
@@ -0,0 +1,123 @@
|
||||
// Self-contained AudioWorkletProcessor that turns the live microphone stream into
|
||||
// PCM16 (signed 16-bit, little-endian), mono, 24000 Hz chunks for the realtime STT
|
||||
// upstream. It runs in the AudioWorklet global scope, so it MUST NOT import anything
|
||||
// (the worklet module has no module graph / bundler runtime around it).
|
||||
//
|
||||
// Per `process()` call the host hands us a render quantum (typically 128 frames) at
|
||||
// the context sample rate. We read the first input channel (mono), linearly resample
|
||||
// to 24000 Hz while carrying the fractional read position across calls (so we never
|
||||
// assume a particular input rate, e.g. 44.1k or 48k), accumulate the resampled
|
||||
// samples, and once we have ~150 ms worth (3600 samples) we emit them as an
|
||||
// Int16 ArrayBuffer transferred to the main thread.
|
||||
|
||||
// Output sample rate, in Hz, required by the upstream transcription contract.
// Every emitted PCM16 chunk is resampled to this rate.
const TARGET_RATE = 24000;
// Samples per posted message: ~150 ms of audio at the target rate
// (24000 * 0.15 = 3600). Also the capacity of the processor's accumulator.
const FRAME_SAMPLES = Math.round(TARGET_RATE * 0.15);
|
||||
|
||||
/**
 * Worklet processor that converts the first input channel into fixed-size
 * PCM16 (little-endian) frames at TARGET_RATE and posts each frame's
 * ArrayBuffer to the main thread.
 */
class Pcm16Worklet extends AudioWorkletProcessor {
  // Read cursor, in input-sample units, relative to the start of the quantum
  // being processed. It survives across `process()` calls so the resampler has
  // no seams; a carried value in [-1, 0) means the next output sample lies
  // between the previous quantum's last sample and this quantum's first one.
  private resamplePos = 0;

  // Last input sample of the previous quantum (the conceptual index -1).
  private prevSample = 0;

  // Set once the first non-empty quantum has been seen.
  private primed = false;

  // Resampled Float32 samples waiting to be converted and posted.
  private acc: Float32Array = new Float32Array(FRAME_SAMPLES);
  private accLen = 0;

  /**
   * Consumes one render quantum. Nothing is written to the outputs, and
   * quanta without channel data are skipped.
   *
   * @param inputs Channel data indexed as [input][channel][sample].
   * @returns Always `true`, so the node stays alive.
   */
  process(inputs: Float32Array[][]): boolean {
    const mono = inputs[0]?.[0];
    if (!mono || mono.length === 0) {
      return true;
    }
    this.resampleAndAccumulate(mono);
    return true;
  }

  /**
   * Linearly resamples `channel` from the context `sampleRate` to TARGET_RATE
   * and feeds every produced sample to the accumulator.
   */
  private resampleAndAccumulate(channel: Float32Array): void {
    // Input samples consumed per produced output sample.
    const step = sampleRate / TARGET_RATE;
    const frames = channel.length;
    const lastIndex = frames - 1;

    if (!this.primed) {
      // First quantum: no real predecessor exists, so the virtual index -1 is
      // seeded with this quantum's first sample and reading starts at 0.
      this.primed = true;
      this.prevSample = channel[0];
      this.resamplePos = 0;
    }

    // Produce output only while the right-hand neighbour (left + 1) is inside
    // this quantum, so `channel[frames]` is never read. Whatever is left over
    // is resolved on the next call through the carried cursor.
    let cursor = this.resamplePos;
    for (; cursor < lastIndex; cursor += step) {
      const left = Math.floor(cursor);
      const weight = cursor - left;
      const a = left >= 0 ? channel[left] : this.prevSample;
      const b = channel[left + 1];
      this.pushSample(a + (b - a) * weight);
    }

    // Re-express the cursor relative to the next quantum and remember this
    // quantum's final sample for the boundary interval.
    this.resamplePos = cursor - frames;
    this.prevSample = channel[lastIndex];
  }

  /** Stores one resampled sample and flushes once a full frame is buffered. */
  private pushSample(sample: number): void {
    this.acc[this.accLen] = sample;
    this.accLen += 1;
    if (this.accLen < FRAME_SAMPLES) {
      return;
    }
    this.flush();
  }

  /**
   * Converts the buffered samples to signed 16-bit little-endian and transfers
   * the resulting ArrayBuffer to the main thread. Does nothing when the
   * accumulator is empty.
   */
  private flush(): void {
    const pending = this.accLen;
    if (pending === 0) return;

    const pcm = new ArrayBuffer(pending * 2);
    const writer = new DataView(pcm);
    for (let index = 0, offset = 0; index < pending; index += 1, offset += 2) {
      // Clamp to [-1, 1], then scale to the signed 16-bit range.
      const clamped = Math.max(-1, Math.min(1, this.acc[index]));
      const scaled = clamped < 0 ? clamped * 0x8000 : clamped * 0x7fff;
      // Explicit little-endian write, independent of host endianness.
      writer.setInt16(offset, scaled, true);
    }
    this.accLen = 0;

    this.port.postMessage(pcm, [pcm]);
  }
}

registerProcessor("pcm16-worklet", Pcm16Worklet);
|
||||
@@ -0,0 +1,84 @@
|
||||
import { FC, useEffect, useRef } from "react";
|
||||
import { ActionIcon, Tooltip } from "@mantine/core";
|
||||
import { IconMicrophone, IconPlayerStopFilled } from "@tabler/icons-react";
|
||||
import { useTranslation } from "react-i18next";
|
||||
import {
|
||||
useRealtimeDictation,
|
||||
type RealtimeDictationStatus,
|
||||
} from "@/features/dictation/hooks/use-realtime-dictation";
|
||||
|
||||
/** Props accepted by RealtimeMicButton. */
interface RealtimeMicButtonProps {
  // Receives the latest partial transcript. The button also calls it with ""
  // once when recording ends, so the consumer can clear leftover interim text.
  onInterim: (text: string) => void;
  // Receives each completed transcript segment.
  onFinal: (text: string) => void;
  // Optional hook forwarded to the dictation hook; fired when capture begins.
  onStart?: () => void;
  // Disables the start (mic) button. The stop button shown while recording is
  // not affected.
  disabled?: boolean;
  // Mantine ActionIcon size token; "lg" matches the chat composer, "md" the
  // editor toolbar.
  size?: "md" | "lg";
}
|
||||
|
||||
/**
 * Realtime (streaming) counterpart of MicButton. One click starts capture and
 * the button turns into a red stop control; a second click stops it.
 * Transcripts are delivered through `onInterim` / `onFinal` while recording,
 * so no separate "transcribing" state or spinner exists.
 */
export const RealtimeMicButton: FC<RealtimeMicButtonProps> = ({
  onInterim,
  onFinal,
  onStart,
  disabled,
  size = "lg",
}) => {
  const { t } = useTranslation();
  const dictation = useRealtimeDictation({ onInterim, onFinal, onStart });
  const { status } = dictation;
  const glyphSize = size === "lg" ? 18 : 16;

  // Remember the status from the previous effect run so the consumer's interim
  // text is cleared exactly once, on the recording -> idle/error transition.
  const lastStatus = useRef<RealtimeDictationStatus>(status);
  useEffect(() => {
    const recordingJustEnded =
      lastStatus.current === "recording" && status !== "recording";
    if (recordingJustEnded) {
      onInterim("");
    }
    lastStatus.current = status;
  }, [status, onInterim]);

  const beginDictation = (): void => {
    void dictation.start();
  };

  if (status !== "recording") {
    // idle / error: subtle mic that (re)starts dictation.
    const startLabel = t("Start dictation");
    return (
      <Tooltip label={startLabel} withArrow>
        <ActionIcon
          size={size}
          variant="subtle"
          onClick={beginDictation}
          disabled={disabled}
          aria-label={startLabel}
        >
          <IconMicrophone size={glyphSize} />
        </ActionIcon>
      </Tooltip>
    );
  }

  const stopLabel = t("Stop recording");
  return (
    <Tooltip label={stopLabel} withArrow>
      <ActionIcon
        size={size}
        color="red"
        variant="light"
        onClick={dictation.stop}
        aria-label={stopLabel}
      >
        <IconPlayerStopFilled size={glyphSize} />
      </ActionIcon>
    </Tooltip>
  );
};
|
||||
@@ -0,0 +1,427 @@
|
||||
import { useCallback, useEffect, useRef, useState } from "react";
|
||||
import { notifications } from "@mantine/notifications";
|
||||
import { useTranslation } from "react-i18next";
|
||||
import { RealtimeDictationClient } from "@/features/dictation/services/realtime-dictation-client";
|
||||
|
||||
// URL of the PCM16 worklet module, resolved relative to this file via
// `new URL(..., import.meta.url)` so the processor is loaded as its own module
// (it must run in the AudioWorklet global scope, outside the main bundle).
// Built once at module load; the hook passes it to `audioWorklet.addModule()`.
// NOTE(review): the target is a `.ts` file. Confirm the production build emits
// it as transpiled JavaScript — if the bundler copies `new URL()` assets
// verbatim, `addModule()` would be handed TypeScript source and fail.
const PCM16_WORKLET_URL = new URL(
  "../audio/pcm16-worklet.ts",
  import.meta.url,
);
|
||||
|
||||
// Lifecycle of the realtime dictation hook. "error" is transient: the hook
// resets it to "idle" 1500 ms after a failure is surfaced.
export type RealtimeDictationStatus = "idle" | "recording" | "error";
|
||||
|
||||
/** Callbacks and limits supplied by the consumer of `useRealtimeDictation`. */
export interface UseRealtimeDictationOptions {
  /** Receives the latest partial transcript for the live segment. */
  onInterim: (text: string) => void;
  /** Receives a completed segment, trimmed; empty segments are not reported. */
  onFinal: (text: string) => void;
  /**
   * Fired once the capture graph is built, just before the socket is opened
   * (e.g. so the editor can snapshot the caret). Exceptions are logged and
   * swallowed.
   */
  onStart?: () => void;
  /** Maximum recording time in ms before an automatic stop; default 120000. */
  maxDurationMs?: number;
}
|
||||
|
||||
/** Value returned by `useRealtimeDictation`. */
export interface UseRealtimeDictationResult {
  /** Current lifecycle state of the hook. */
  status: RealtimeDictationStatus;
  /**
   * Requests the microphone, builds the capture graph and opens the realtime
   * session. Failures are reported through notifications; a call made while a
   * session is starting or live, or while status is not "idle", is a no-op.
   */
  start: () => Promise<void>;
  /** Tears the session down and returns to "idle"; no-op when nothing is live. */
  stop: () => void;
  /**
   * Like `stop`, but first marks the session canceled so late socket/worklet
   * callbacks are ignored.
   */
  cancel: () => void;
}
|
||||
|
||||
/**
 * Resolves the AudioContext constructor for the current environment.
 *
 * AudioContext is webkit-prefixed on some older Safari builds, so the prefixed
 * name is used as a fallback. The lookup goes through `globalThis` rather than
 * `window`, so it returns `undefined` instead of throwing a ReferenceError
 * where `window` does not exist (tests, server-side rendering).
 *
 * @returns The standard constructor when defined, otherwise the
 *   `webkitAudioContext` fallback, otherwise `undefined`.
 */
function getAudioContextCtor(): typeof AudioContext | undefined {
  if (typeof AudioContext !== "undefined") return AudioContext;
  // In a browser `globalThis` is the window object, so this reads the same
  // property the prefixed builds expose.
  const scope = globalThis as unknown as {
    webkitAudioContext?: typeof AudioContext;
  };
  return scope.webkitAudioContext;
}
|
||||
|
||||
/**
|
||||
* Streaming sibling of `use-dictation`. Captures the mic, resamples to PCM16
|
||||
* 24 kHz in an AudioWorklet, and streams it over the normalized `/ai-realtime`
|
||||
* Socket.IO namespace, surfacing interim/final transcripts as they arrive.
|
||||
*
|
||||
* Mirrors `use-dictation`'s conventions: refs hold the live graph/client/timers
|
||||
* so re-renders never lose them, getUserMedia errors map to the same Mantine
|
||||
* notifications, and every exit path stops the MediaStream tracks and closes the
|
||||
* AudioContext. There is no `transcribing` state — final text arrives
|
||||
* incrementally while `recording`.
|
||||
*/
|
||||
export function useRealtimeDictation(
|
||||
options: UseRealtimeDictationOptions,
|
||||
): UseRealtimeDictationResult {
|
||||
const { t, i18n } = useTranslation();
|
||||
const [status, setStatus] = useState<RealtimeDictationStatus>("idle");
|
||||
|
||||
// Keep the latest callbacks in a ref so async socket handlers always call the
|
||||
// current handlers without re-creating the capture graph.
|
||||
const optionsRef = useRef(options);
|
||||
optionsRef.current = options;
|
||||
|
||||
const streamRef = useRef<MediaStream | null>(null);
|
||||
const audioContextRef = useRef<AudioContext | null>(null);
|
||||
const sourceRef = useRef<MediaStreamAudioSourceNode | null>(null);
|
||||
const workletRef = useRef<AudioWorkletNode | null>(null);
|
||||
const clientRef = useRef<RealtimeDictationClient | null>(null);
|
||||
|
||||
const timerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
|
||||
const errorTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
|
||||
|
||||
const canceledRef = useRef(false);
|
||||
const startingRef = useRef(false);
|
||||
// True once the server emits `ready`; audio is buffered until then, then flushed.
|
||||
const readyRef = useRef(false);
|
||||
// PCM16 chunks captured before the upstream session is ready.
|
||||
const pendingAudioRef = useRef<ArrayBuffer[]>([]);
|
||||
// Stable ref to the latest stop() so the max-duration timer (armed inside the
|
||||
// start closure) can invoke the current version without re-arming every render.
|
||||
const stopRef = useRef<() => void>(() => undefined);
|
||||
|
||||
const clearTimer = useCallback(() => {
|
||||
if (timerRef.current !== null) {
|
||||
clearTimeout(timerRef.current);
|
||||
timerRef.current = null;
|
||||
}
|
||||
}, []);
|
||||
|
||||
const stopTracks = useCallback(() => {
|
||||
streamRef.current?.getTracks().forEach((track) => track.stop());
|
||||
streamRef.current = null;
|
||||
}, []);
|
||||
|
||||
// Tear down the audio graph (worklet node, source, context). Never throws on a
|
||||
// half-built or already-closed graph.
|
||||
const teardownAudio = useCallback(() => {
|
||||
const worklet = workletRef.current;
|
||||
if (worklet) {
|
||||
worklet.port.onmessage = null;
|
||||
try {
|
||||
worklet.disconnect();
|
||||
} catch {
|
||||
// Node may already be disconnected; ignore.
|
||||
}
|
||||
workletRef.current = null;
|
||||
}
|
||||
|
||||
const source = sourceRef.current;
|
||||
if (source) {
|
||||
try {
|
||||
source.disconnect();
|
||||
} catch {
|
||||
// Ignore disconnect of an already-detached node.
|
||||
}
|
||||
sourceRef.current = null;
|
||||
}
|
||||
|
||||
const ctx = audioContextRef.current;
|
||||
if (ctx) {
|
||||
audioContextRef.current = null;
|
||||
if (ctx.state !== "closed") {
|
||||
// close() returns a promise; swallow rejections so teardown never throws.
|
||||
void ctx.close().catch(() => undefined);
|
||||
}
|
||||
}
|
||||
}, []);
|
||||
|
||||
// Full teardown shared by stop/cancel/unmount. Order: stop streaming upstream,
|
||||
// disconnect the socket, then dismantle the local audio graph and tracks, then
|
||||
// clear timers and reset the ready/pending state.
|
||||
const teardown = useCallback(() => {
|
||||
const client = clientRef.current;
|
||||
if (client) {
|
||||
clientRef.current = null;
|
||||
try {
|
||||
client.stop();
|
||||
} catch {
|
||||
// Socket may already be gone; ignore.
|
||||
}
|
||||
client.disconnect();
|
||||
}
|
||||
|
||||
teardownAudio();
|
||||
stopTracks();
|
||||
clearTimer();
|
||||
|
||||
readyRef.current = false;
|
||||
pendingAudioRef.current = [];
|
||||
startingRef.current = false;
|
||||
}, [teardownAudio, stopTracks, clearTimer]);
|
||||
|
||||
// Surface a concrete failure: log it, notify, flip to "error", and reset to
|
||||
// "idle" after a short delay (mirrors use-dictation's error timer).
|
||||
const handleError = useCallback(
|
||||
(message: string, err?: unknown) => {
|
||||
if (canceledRef.current) return;
|
||||
// Never log audio — only the textual reason.
|
||||
console.error("[realtime-dictation]", message, err ?? "");
|
||||
notifications.show({ color: "red", message });
|
||||
teardown();
|
||||
setStatus("error");
|
||||
if (errorTimerRef.current !== null) {
|
||||
clearTimeout(errorTimerRef.current);
|
||||
}
|
||||
errorTimerRef.current = setTimeout(() => {
|
||||
errorTimerRef.current = null;
|
||||
setStatus("idle");
|
||||
}, 1500);
|
||||
},
|
||||
[teardown],
|
||||
);
|
||||
|
||||
const start = useCallback(async (): Promise<void> => {
|
||||
// Synchronous live guard: status is stale between renders, so also block on
|
||||
// refs to prevent a double-click from opening two MediaStreams / sockets.
|
||||
if (
|
||||
startingRef.current ||
|
||||
streamRef.current ||
|
||||
audioContextRef.current ||
|
||||
clientRef.current
|
||||
) {
|
||||
return;
|
||||
}
|
||||
if (status !== "idle") return;
|
||||
startingRef.current = true;
|
||||
canceledRef.current = false;
|
||||
readyRef.current = false;
|
||||
pendingAudioRef.current = [];
|
||||
|
||||
if (!navigator.mediaDevices?.getUserMedia) {
|
||||
const reason =
|
||||
"navigator.mediaDevices.getUserMedia is unavailable in this context";
|
||||
console.error("[realtime-dictation] " + reason);
|
||||
notifications.show({
|
||||
color: "red",
|
||||
message: t("Audio recording is not available in this browser/context"),
|
||||
});
|
||||
setStatus("idle");
|
||||
startingRef.current = false;
|
||||
return;
|
||||
}
|
||||
|
||||
let stream: MediaStream;
|
||||
try {
|
||||
stream = await navigator.mediaDevices.getUserMedia({ audio: true });
|
||||
} catch (err) {
|
||||
// Always log the full error for diagnosis (name, message, stack).
|
||||
console.error("[realtime-dictation] getUserMedia failed", err);
|
||||
const name = (err as { name?: string })?.name;
|
||||
const detail = (err as { message?: string })?.message ?? String(err);
|
||||
let message: string;
|
||||
if (name === "NotAllowedError" || name === "SecurityError") {
|
||||
message = t("Microphone access denied");
|
||||
} else if (name === "NotFoundError" || name === "OverconstrainedError") {
|
||||
message = t("No microphone found");
|
||||
} else if (name === "NotReadableError" || name === "AbortError") {
|
||||
message = t("Microphone is unavailable or already in use");
|
||||
} else {
|
||||
// Unknown failure: show the real reason instead of a generic string.
|
||||
message = `${t("Could not start recording")}: ${name ? `${name}: ` : ""}${detail}`;
|
||||
}
|
||||
notifications.show({ color: "red", message });
|
||||
setStatus("idle");
|
||||
startingRef.current = false;
|
||||
return;
|
||||
}
|
||||
|
||||
// If a cancel landed during the await, drop the stream and bail out.
|
||||
if (canceledRef.current) {
|
||||
stream.getTracks().forEach((track) => track.stop());
|
||||
startingRef.current = false;
|
||||
setStatus("idle");
|
||||
return;
|
||||
}
|
||||
streamRef.current = stream;
|
||||
|
||||
// Build the capture graph. The worklet still resamples robustly if the browser
|
||||
// ignores the 24 kHz hint, so any actual context rate is handled correctly.
|
||||
const AudioCtx = getAudioContextCtor();
|
||||
if (!AudioCtx) {
|
||||
stopTracks();
|
||||
notifications.show({
|
||||
color: "red",
|
||||
message: t("Audio recording is not available in this browser/context"),
|
||||
});
|
||||
setStatus("idle");
|
||||
startingRef.current = false;
|
||||
return;
|
||||
}
|
||||
|
||||
let audioContext: AudioContext;
|
||||
try {
|
||||
audioContext = new AudioCtx({ sampleRate: 24000 });
|
||||
audioContextRef.current = audioContext;
|
||||
// AudioWorklet requires a secure context (https/localhost), same constraint
|
||||
// as getUserMedia. A failure here means the UI should fall back to batch.
|
||||
await audioContext.audioWorklet.addModule(PCM16_WORKLET_URL);
|
||||
} catch (err) {
|
||||
console.error("[realtime-dictation] audio worklet setup failed", err);
|
||||
teardownAudio();
|
||||
stopTracks();
|
||||
const detail = (err as { message?: string })?.message ?? String(err);
|
||||
notifications.show({
|
||||
color: "red",
|
||||
message: `${t("Could not start recording")}: ${detail}`,
|
||||
});
|
||||
setStatus("idle");
|
||||
startingRef.current = false;
|
||||
return;
|
||||
}
|
||||
|
||||
// Another cancel could have landed during addModule().
|
||||
if (canceledRef.current) {
|
||||
teardownAudio();
|
||||
stopTracks();
|
||||
startingRef.current = false;
|
||||
setStatus("idle");
|
||||
return;
|
||||
}
|
||||
|
||||
let source: MediaStreamAudioSourceNode;
|
||||
let worklet: AudioWorkletNode;
|
||||
try {
|
||||
source = audioContext.createMediaStreamSource(stream);
|
||||
worklet = new AudioWorkletNode(audioContext, "pcm16-worklet");
|
||||
sourceRef.current = source;
|
||||
workletRef.current = worklet;
|
||||
// MediaStreamSource → worklet → destination. The worklet emits silence, so
|
||||
// connecting to destination drives the render graph without echoing the mic.
|
||||
source.connect(worklet);
|
||||
worklet.connect(audioContext.destination);
|
||||
} catch (err) {
|
||||
console.error("[realtime-dictation] audio graph wiring failed", err);
|
||||
teardownAudio();
|
||||
stopTracks();
|
||||
const detail = (err as { message?: string })?.message ?? String(err);
|
||||
notifications.show({
|
||||
color: "red",
|
||||
message: `${t("Could not start recording")}: ${detail}`,
|
||||
});
|
||||
setStatus("idle");
|
||||
startingRef.current = false;
|
||||
return;
|
||||
}
|
||||
|
||||
// Each worklet message is a PCM16 ArrayBuffer. Forward it once the upstream
|
||||
// session is ready; until then buffer so no leading audio is dropped.
|
||||
worklet.port.onmessage = (event: MessageEvent) => {
|
||||
if (canceledRef.current) return;
|
||||
const buf = event.data as ArrayBuffer;
|
||||
if (!(buf instanceof ArrayBuffer)) return;
|
||||
if (readyRef.current && clientRef.current) {
|
||||
clientRef.current.sendAudio(buf);
|
||||
} else {
|
||||
pendingAudioRef.current.push(buf);
|
||||
}
|
||||
};
|
||||
|
||||
// Wire the realtime transport. The server replies `ready` once the upstream
|
||||
// STT session is live; we then flush any buffered audio.
|
||||
const client = new RealtimeDictationClient({
|
||||
onReady: () => {
|
||||
if (canceledRef.current) return;
|
||||
readyRef.current = true;
|
||||
const pending = pendingAudioRef.current;
|
||||
pendingAudioRef.current = [];
|
||||
for (const buf of pending) clientRef.current?.sendAudio(buf);
|
||||
},
|
||||
onInterim: (_itemId, text) => {
|
||||
if (canceledRef.current) return;
|
||||
optionsRef.current.onInterim(text);
|
||||
},
|
||||
onFinal: (_itemId, text) => {
|
||||
if (canceledRef.current) return;
|
||||
const trimmed = text.trim();
|
||||
if (trimmed.length > 0) optionsRef.current.onFinal(trimmed);
|
||||
},
|
||||
onError: (message) => {
|
||||
handleError(message);
|
||||
},
|
||||
onClosed: () => {
|
||||
// The server ended the session (idle/max-duration or graceful upstream
|
||||
// close). Skip if a cancel already tore everything down, or if an error
|
||||
// path already owns the status (its error→idle timer is pending), or if a
|
||||
// local stop already cleared the live refs. Otherwise tear down the capture
|
||||
// graph + socket and return to idle so the mic/AudioContext don't leak and
|
||||
// the button doesn't stay stuck on "recording".
|
||||
if (canceledRef.current) return;
|
||||
if (errorTimerRef.current !== null) return;
|
||||
if (
|
||||
!clientRef.current &&
|
||||
!audioContextRef.current &&
|
||||
!streamRef.current
|
||||
) {
|
||||
return;
|
||||
}
|
||||
teardown();
|
||||
setStatus("idle");
|
||||
},
|
||||
});
|
||||
clientRef.current = client;
|
||||
|
||||
// Notify the caller right when capture begins (before opening the socket) so
|
||||
// the editor can snapshot the caret position.
|
||||
try {
|
||||
optionsRef.current.onStart?.();
|
||||
} catch (err) {
|
||||
console.error("[realtime-dictation] onStart callback threw", err);
|
||||
}
|
||||
|
||||
// Open the socket, then ask the server to open the upstream session. The
|
||||
// language hint is the base subtag of the resolved UI language (e.g. "en-US"
|
||||
// → "en"), since the upstream transcription model expects an ISO language
|
||||
// code, not a region-tagged locale; the server omits it upstream when absent.
|
||||
client.connect();
|
||||
const locale = i18n.resolvedLanguage || i18n.language || "";
|
||||
const language = locale.split("-")[0] || undefined;
|
||||
client.start({ language });
|
||||
|
||||
setStatus("recording");
|
||||
// Capture has truly begun; release the synchronous start guard.
|
||||
startingRef.current = false;
|
||||
|
||||
const maxDurationMs = optionsRef.current.maxDurationMs ?? 120000;
|
||||
timerRef.current = setTimeout(() => {
|
||||
// Reuse stop() so the upstream is flushed/closed gracefully.
|
||||
stopRef.current?.();
|
||||
}, maxDurationMs);
|
||||
}, [status, t, i18n, stopTracks, teardownAudio, handleError]);
|
||||
|
||||
const stop = useCallback((): void => {
|
||||
// Nothing live → no-op (never crash on an idle/destroyed state).
|
||||
if (
|
||||
!clientRef.current &&
|
||||
!audioContextRef.current &&
|
||||
!streamRef.current &&
|
||||
!startingRef.current
|
||||
) {
|
||||
return;
|
||||
}
|
||||
teardown();
|
||||
setStatus("idle");
|
||||
}, [teardown]);
|
||||
|
||||
// Keep the stop ref pointed at the latest stop() for the max-duration timer.
|
||||
stopRef.current = stop;
|
||||
|
||||
const cancel = useCallback((): void => {
|
||||
// Mark canceled first so any late socket/worklet callbacks are ignored.
|
||||
canceledRef.current = true;
|
||||
teardown();
|
||||
setStatus("idle");
|
||||
}, [teardown]);
|
||||
|
||||
// Effect with no setup work; it only registers a cleanup. The cleanup marks
// the session canceled, drops any pending error timer, and delegates the
// release of the remaining resources to teardown().
// NOTE(review): the cleanup also runs whenever `teardown` changes identity,
// not only on unmount — presumably `teardown` is stable; confirm.
useEffect(() => {
  const releaseEverything = (): void => {
    canceledRef.current = true;

    const pendingErrorTimer = errorTimerRef.current;
    if (pendingErrorTimer !== null) {
      clearTimeout(pendingErrorTimer);
      errorTimerRef.current = null;
    }

    teardown();
  };

  return releaseEverything;
}, [teardown]);
|
||||
|
||||
return { status, start, stop, cancel };
|
||||
}
|
||||
@@ -0,0 +1,124 @@
|
||||
import { io, Socket } from "socket.io-client";
|
||||
import { SOCKET_URL } from "@/features/websocket/types";
|
||||
|
||||
/**
 * Callbacks supplied by the hook. The client maps the normalized
 * `/ai-realtime` Socket.IO events onto these and holds no React state itself;
 * it is a thin transport wrapper.
 */
export interface RealtimeDictationHandlers {
  /** The upstream STT session is established; audio may be sent from now on. */
  onReady: () => void;
  /** Latest partial transcript of the segment that is still in progress. */
  onInterim: (itemId: string, text: string) => void;
  /** Transcript of a completed segment. */
  onFinal: (itemId: string, text: string) => void;
  /** Concrete failure reason (connect error or an error surfaced by the server). */
  onError: (message: string) => void;
  /** The session ended (graceful stop or the upstream closed). */
  onClosed: () => void;
}
|
||||
|
||||
/** Options accepted by `RealtimeDictationClient.start()`. */
interface StartOptions {
  /** Optional language hint, forwarded unchanged in the `start` payload. */
  language?: string;
}
|
||||
|
||||
/**
 * Thin wrapper around the dedicated `/ai-realtime` Socket.IO namespace.
 *
 * Credentials ride the handshake via `withCredentials` (no bearer token is
 * attached here). The socket is created with `autoConnect: false` so every
 * listener is registered before the handshake starts and no early event is
 * missed.
 *
 * Failures are reported through `handlers.onError` at most once per
 * connection, whatever their origin: a server `error` event, a
 * `connect_error`, or an unexpected transport drop (`disconnect`).
 */
export class RealtimeDictationClient {
  private socket: Socket | null = null;

  // onError must fire at most once per session: several failure signals can
  // arrive for one incident (e.g. a server error followed by a failed
  // reconnect), and the hook's error→idle flow must not run twice.
  private erroredFlag = false;

  // Set once the server announced `closed`. A transport drop after that point
  // is the normal end of the session and must not be reported as a failure.
  private closedFlag = false;

  constructor(private readonly handlers: RealtimeDictationHandlers) {}

  /** Forward the first failure reason only; later ones are swallowed. */
  private emitError(message: string): void {
    if (this.erroredFlag) return;
    this.erroredFlag = true;
    this.handlers.onError(message);
  }

  /**
   * Create the socket, register all listeners, then open the connection.
   * A call while a socket already exists is a no-op.
   */
  connect(): void {
    if (this.socket) return;

    // Fresh socket → fresh per-connection flags.
    this.erroredFlag = false;
    this.closedFlag = false;

    // With a falsy SOCKET_URL the namespace is addressed relative to the page
    // origin; otherwise it is appended to the configured base URL.
    const url = SOCKET_URL ? `${SOCKET_URL}/ai-realtime` : "/ai-realtime";
    const socket: Socket = io(url, {
      transports: ["websocket"],
      withCredentials: true,
      autoConnect: false,
    });
    this.socket = socket;

    socket.on("ready", () => this.handlers.onReady());

    socket.on("interim", (payload: { itemId: string; text: string }) => {
      this.handlers.onInterim(payload?.itemId ?? "", payload?.text ?? "");
    });

    socket.on("final", (payload: { itemId: string; text: string }) => {
      this.handlers.onFinal(payload?.itemId ?? "", payload?.text ?? "");
    });

    // Server-surfaced error: accepts either a bare string or `{ message }`.
    socket.on("error", (payload: { message?: string } | string) => {
      const message =
        typeof payload === "string"
          ? payload
          : payload?.message || "Realtime dictation error";
      this.emitError(message);
    });

    socket.on("closed", () => {
      this.closedFlag = true;
      this.handlers.onClosed();
    });

    // Low-level transport failure (handshake/auth/proxy). Surface a concrete cause.
    socket.on("connect_error", (err: Error) => {
      const message = err?.message
        ? `Realtime connection failed: ${err.message}`
        : "Realtime connection failed";
      this.emitError(message);
    });

    // Transport dropped after the handshake. Without this listener a mid-session
    // loss (server restart, network drop, ping timeout) would reach the caller
    // as neither `error` nor `closed`, leaving it believing the session is live
    // while emitted audio accumulates in the socket's send buffer. A drop that
    // follows a server `closed`, or one this client initiated, is expected.
    socket.on("disconnect", (reason: string) => {
      if (this.closedFlag || reason === "io client disconnect") return;
      this.emitError(
        reason
          ? `Realtime connection lost: ${reason}`
          : "Realtime connection lost",
      );
    });

    socket.connect();
  }

  /** Ask the server to resolve config and open the upstream STT session. */
  start(opts: StartOptions): void {
    this.socket?.emit("start", { language: opts.language });
  }

  /** Forward a raw audio chunk as the payload of an `audio` event. */
  sendAudio(buf: ArrayBuffer): void {
    this.socket?.emit("audio", buf);
  }

  /** Request a graceful flush/close of the upstream session. */
  stop(): void {
    this.socket?.emit("stop");
  }

  /**
   * Tear down: drop every listener, then close the connection. Idempotent.
   * Listeners are removed before disconnecting so the resulting `disconnect`
   * event never reaches the handlers.
   */
  disconnect(): void {
    const socket = this.socket;
    if (!socket) return;
    this.socket = null;

    // Reset so a later connect() on this instance starts from a clean slate.
    this.erroredFlag = false;
    this.closedFlag = false;

    socket.removeAllListeners();
    socket.disconnect();
  }
}
|
||||
@@ -1,12 +1,21 @@
|
||||
import { FC, useRef } from "react";
|
||||
import type { Editor } from "@tiptap/react";
|
||||
import { useAtomValue } from "jotai";
|
||||
import { MicButton } from "@/features/dictation/components/mic-button";
|
||||
import { RealtimeMicButton } from "@/features/dictation/components/realtime-mic-button";
|
||||
import { workspaceAtom } from "@/features/user/atoms/current-user-atom";
|
||||
import {
|
||||
setDictationInterim,
|
||||
clearDictationInterim,
|
||||
} from "@/features/editor/extensions/dictation-interim/dictation-interim.ts";
|
||||
|
||||
interface Props {
|
||||
editor: Editor;
|
||||
}
|
||||
|
||||
export const DictationGroup: FC<Props> = ({ editor }) => {
|
||||
const workspace = useAtomValue(workspaceAtom);
|
||||
const isRealtime = workspace?.settings?.ai?.dictationRealtime === true;
|
||||
const rangeRef = useRef<{ from: number; to: number } | null>(null);
|
||||
|
||||
const handleStart = () => {
|
||||
@@ -50,6 +59,33 @@ export const DictationGroup: FC<Props> = ({ editor }) => {
|
||||
}
|
||||
};
|
||||
|
||||
// Realtime path: each final segment is committed at the LIVE caret (inserts
// happen while recording, so no fixed snapshot is needed); the interim text is
// shown through the ghost decoration only and never written to the document.
if (isRealtime) {
  return (
    <RealtimeMicButton
      size="md"
      disabled={!editor.isEditable}
      onStart={() => {
        // Drop any ghost left over from a previous session.
        if (editor && !editor.isDestroyed) clearDictationInterim(editor);
      }}
      onInterim={(text) => {
        if (editor && !editor.isDestroyed) setDictationInterim(editor, text);
      }}
      onFinal={(text) => {
        if (!editor || editor.isDestroyed) return;
        clearDictationInterim(editor);
        // An empty or whitespace-only segment has nothing to commit; inserting
        // it would add a stray space to the document and steal focus.
        if (!text?.trim()) return;
        try {
          editor.chain().focus().insertContent(`${text} `).run();
        } catch {
          // The editor may have been destroyed mid-stream; ignore.
        }
      }}
    />
  );
}
|
||||
|
||||
return (
|
||||
<MicButton
|
||||
size="md"
|
||||
|
||||
@@ -0,0 +1,97 @@
|
||||
import { Extension } from "@tiptap/core";
|
||||
import type { Editor } from "@tiptap/core";
|
||||
import { Plugin, PluginKey } from "@tiptap/pm/state";
|
||||
import { Decoration, DecorationSet } from "@tiptap/pm/view";
|
||||
|
||||
/** Plugin state of the dictation-interim plugin. */
interface DictationInterimState {
  /** Current interim (partial) transcript; the empty string means "no ghost". */
  text: string;
}

// One key for both the plugin and the imperative helpers in this file, so the
// helpers' meta transactions address the very plugin state the plugin reads.
const dictationInterimKey = new PluginKey<DictationInterimState>(
  "dictationInterim",
);
|
||||
|
||||
/**
 * B2 editor decoration: shows the realtime interim (partial) transcript as a
 * ghost widget at the caret. The interim text lives ONLY in this plugin's
 * state (set through transaction meta) and is rendered as a widget
 * Decoration — this plugin never adds a step to the document. Committing
 * final segments is left to the consumers of the helpers below.
 */
export const DictationInterim = Extension.create({
  name: "dictationInterim",

  addProseMirrorPlugins() {
    return [
      new Plugin<DictationInterimState>({
        key: dictationInterimKey,
        state: {
          init: (): DictationInterimState => ({ text: "" }),
          apply: (tr, value): DictationInterimState => {
            const meta = tr.getMeta(dictationInterimKey) as
              | DictationInterimState
              | undefined;
            // A transaction carrying our meta replaces the interim text; any
            // other transaction keeps it. The ghost still follows the caret
            // because the decoration is recomputed from the live selection.
            if (meta) {
              return { text: meta.text };
            }
            return value;
          },
        },
        props: {
          decorations(state) {
            const pluginState = dictationInterimKey.getState(state);
            const text = pluginState?.text ?? "";
            if (!text) {
              return null;
            }

            // Render the interim as an inline ghost at the caret. Inline styles
            // keep this self-contained — no global CSS is required.
            const widget = Decoration.widget(
              state.selection.head,
              () => {
                const span = document.createElement("span");
                span.textContent = text;
                span.setAttribute("contenteditable", "false");
                span.style.opacity = "0.5";
                span.style.fontStyle = "italic";
                return span;
              },
              {
                side: 1,
                ignoreSelection: true,
                // This decoration is rebuilt on every state update. The key
                // lets ProseMirror treat two widgets with the same text as
                // interchangeable and reuse the rendered node instead of
                // recreating it on each transaction.
                key: `dictation-interim:${text}`,
              },
            );

            return DecorationSet.create(state.doc, [widget]);
          },
        },
      }),
    ];
  },
});
|
||||
|
||||
/**
 * Replace the interim ghost text.
 *
 * Dispatches a transaction that carries only plugin meta — it contains no
 * document steps.
 *
 * @param editor Editor whose view receives the transaction.
 * @param text   New interim text; an empty string hides the ghost.
 */
export function setDictationInterim(editor: Editor, text: string): void {
  const metaOnlyTr = editor.state.tr.setMeta(dictationInterimKey, { text });
  editor.view.dispatch(metaOnlyTr);
}
|
||||
|
||||
/**
 * Hide the interim ghost by resetting its text to the empty string.
 *
 * Like setDictationInterim, this dispatches a transaction that carries only
 * plugin meta and no document steps.
 *
 * @param editor Editor whose view receives the transaction.
 */
export function clearDictationInterim(editor: Editor): void {
  const emptyInterim = { text: "" };
  const metaOnlyTr = editor.state.tr.setMeta(dictationInterimKey, emptyInterim);
  editor.view.dispatch(metaOnlyTr);
}
|
||||
|
||||
export default DictationInterim;
|
||||
@@ -123,6 +123,7 @@ import { countWords } from "alfaaz";
|
||||
import AutoJoiner from "@/features/editor/extensions/autojoiner.ts";
|
||||
import GlobalDragHandle from "@/features/editor/extensions/drag-handle.ts";
|
||||
import { CleanStyles } from "@/features/editor/extensions/clean-styles.ts";
|
||||
import { DictationInterim } from "@/features/editor/extensions/dictation-interim/dictation-interim.ts";
|
||||
|
||||
const lowlight = createLowlight(common);
|
||||
lowlight.register("mermaid", plaintext);
|
||||
@@ -343,6 +344,7 @@ export const mainExtensions = [
|
||||
},
|
||||
}),
|
||||
Selection,
|
||||
DictationInterim,
|
||||
Attachment.configure({
|
||||
view: AttachmentView,
|
||||
}),
|
||||
|
||||
@@ -32,6 +32,7 @@ import {
|
||||
useAiSettingsQuery,
|
||||
useReindexAiEmbeddingsMutation,
|
||||
useTestAiConnectionMutation,
|
||||
useTestRealtimeConnectionMutation,
|
||||
useUpdateAiSettingsMutation,
|
||||
} from "@/features/workspace/queries/ai-settings-query.ts";
|
||||
import {
|
||||
@@ -62,6 +63,10 @@ const formSchema = z.object({
|
||||
// STT-specific fields. Empty base URL / key fall back to the chat ones.
|
||||
sttModel: z.string(),
|
||||
sttBaseUrl: z.string(),
|
||||
// Realtime (streaming) STT fields. Empty model falls back to sttModel and
|
||||
// empty base URL falls back to the STT base URL server-side.
|
||||
sttRealtimeModel: z.string(),
|
||||
sttRealtimeBaseUrl: z.string(),
|
||||
sttApiStyle: z.enum(["multipart", "json"]),
|
||||
sttApiKey: z.string(),
|
||||
});
|
||||
@@ -176,6 +181,8 @@ export default function AiProviderSettings() {
|
||||
const chatTest = useTestAiConnectionMutation();
|
||||
const embedTest = useTestAiConnectionMutation();
|
||||
const sttTest = useTestAiConnectionMutation();
|
||||
// Realtime probe hits a separate /ai-chat/realtime/test route (admin-gated).
|
||||
const realtimeTest = useTestRealtimeConnectionMutation();
|
||||
|
||||
// Agent roles drive the public-share assistant identity picker. Admin-gated
|
||||
// (the component returns early for non-admins), same as the AI settings query.
|
||||
@@ -192,6 +199,8 @@ export default function AiProviderSettings() {
|
||||
const [dictationEnabled, setDictationEnabled] = useState<boolean>(
|
||||
workspace?.settings?.ai?.dictation ?? false,
|
||||
);
|
||||
const [realtimeDictationEnabled, setRealtimeDictationEnabled] =
|
||||
useState<boolean>(workspace?.settings?.ai?.dictationRealtime ?? false);
|
||||
const [publicShareAssistantEnabled, setPublicShareAssistantEnabled] =
|
||||
useState<boolean>(
|
||||
workspace?.settings?.ai?.publicShareAssistant ?? false,
|
||||
@@ -199,6 +208,10 @@ export default function AiProviderSettings() {
|
||||
const [chatToggleLoading, setChatToggleLoading] = useState(false);
|
||||
const [searchToggleLoading, setSearchToggleLoading] = useState(false);
|
||||
const [dictationToggleLoading, setDictationToggleLoading] = useState(false);
|
||||
const [
|
||||
realtimeDictationToggleLoading,
|
||||
setRealtimeDictationToggleLoading,
|
||||
] = useState(false);
|
||||
const [
|
||||
publicShareAssistantToggleLoading,
|
||||
setPublicShareAssistantToggleLoading,
|
||||
@@ -232,6 +245,8 @@ export default function AiProviderSettings() {
|
||||
embeddingApiKey: "",
|
||||
sttModel: "",
|
||||
sttBaseUrl: "",
|
||||
sttRealtimeModel: "",
|
||||
sttRealtimeBaseUrl: "",
|
||||
sttApiStyle: "multipart" as SttApiStyle,
|
||||
sttApiKey: "",
|
||||
},
|
||||
@@ -253,6 +268,8 @@ export default function AiProviderSettings() {
|
||||
embeddingApiKey: "",
|
||||
sttModel: settings.sttModel ?? "",
|
||||
sttBaseUrl: settings.sttBaseUrl ?? "",
|
||||
sttRealtimeModel: settings.sttRealtimeModel ?? "",
|
||||
sttRealtimeBaseUrl: settings.sttRealtimeBaseUrl ?? "",
|
||||
sttApiStyle: settings.sttApiStyle ?? "multipart",
|
||||
sttApiKey: "",
|
||||
});
|
||||
@@ -287,6 +304,10 @@ export default function AiProviderSettings() {
|
||||
// server-side.
|
||||
sttModel: values.sttModel,
|
||||
sttBaseUrl: values.sttBaseUrl,
|
||||
// Realtime STT: empty model falls back to sttModel, empty base URL falls
|
||||
// back to the STT base URL server-side.
|
||||
sttRealtimeModel: values.sttRealtimeModel,
|
||||
sttRealtimeBaseUrl: values.sttRealtimeBaseUrl,
|
||||
sttApiStyle: values.sttApiStyle,
|
||||
};
|
||||
|
||||
@@ -434,6 +455,35 @@ export default function AiProviderSettings() {
|
||||
}
|
||||
}
|
||||
|
||||
// Optimistic toggle for "Realtime dictation" (settings.ai.dictationRealtime).
// The switch flips immediately; the value is then persisted through
// updateWorkspace and mirrored into the workspace atom. On failure the switch
// is rolled back and the server's message (when present) is shown.
async function handleToggleRealtimeDictation(value: boolean) {
  const rollbackValue = realtimeDictationEnabled;
  setRealtimeDictationToggleLoading(true);
  setRealtimeDictationEnabled(value);

  try {
    const saved = await updateWorkspace({ aiDictationRealtime: value });
    // Write the flag into the nested settings explicitly rather than relying
    // on the response to carry it.
    const ai = { ...saved.settings?.ai, dictationRealtime: value };
    const settings = { ...saved.settings, ai };
    setWorkspace({ ...saved, settings });
    notifications.show({ message: t("Updated successfully") });
  } catch (err) {
    setRealtimeDictationEnabled(rollbackValue);
    const serverMessage = (
      err as { response?: { data?: { message?: string } } }
    )?.response?.data?.message;
    notifications.show({
      message: serverMessage ?? t("Failed to update data"),
      color: "red",
    });
  } finally {
    setRealtimeDictationToggleLoading(false);
  }
}
|
||||
|
||||
// Optimistic toggle for the anonymous public-share AI assistant
|
||||
// (settings.ai.publicShareAssistant). When off, the public endpoint 404s.
|
||||
async function handleTogglePublicShareAssistant(value: boolean) {
|
||||
@@ -853,13 +903,24 @@ export default function AiProviderSettings() {
|
||||
<StatusDot status={sttStatus} label={cardStatusLabel(sttStatus, t)} />
|
||||
<Text fw={600}>{t("Voice / STT")}</Text>
|
||||
</Group>
|
||||
<Switch
|
||||
label={t("Voice dictation")}
|
||||
labelPosition="left"
|
||||
checked={dictationEnabled}
|
||||
disabled={dictationToggleLoading}
|
||||
onChange={(e) => handleToggleDictation(e.currentTarget.checked)}
|
||||
/>
|
||||
<Group gap="md" align="center" wrap="nowrap">
|
||||
<Switch
|
||||
label={t("Voice dictation")}
|
||||
labelPosition="left"
|
||||
checked={dictationEnabled}
|
||||
disabled={dictationToggleLoading}
|
||||
onChange={(e) => handleToggleDictation(e.currentTarget.checked)}
|
||||
/>
|
||||
<Switch
|
||||
label={t("Realtime dictation")}
|
||||
labelPosition="left"
|
||||
checked={realtimeDictationEnabled}
|
||||
disabled={realtimeDictationToggleLoading}
|
||||
onChange={(e) =>
|
||||
handleToggleRealtimeDictation(e.currentTarget.checked)
|
||||
}
|
||||
/>
|
||||
</Group>
|
||||
</Group>
|
||||
<Text size="xs" c="dimmed" mt={4} mb="md">
|
||||
{t(
|
||||
@@ -954,6 +1015,58 @@ export default function AiProviderSettings() {
|
||||
</Text>
|
||||
))}
|
||||
</Group>
|
||||
|
||||
{/* Realtime (streaming) dictation: layered on top of batch STT and only
|
||||
shown when the workspace toggle is on. Model falls back to the STT
|
||||
model and the endpoint falls back to the STT base URL server-side. */}
|
||||
{realtimeDictationEnabled && (
|
||||
<>
|
||||
<Text size="xs" c="dimmed" mt="md" mb="xs">
|
||||
{t(
|
||||
"Streams audio live and inserts text as you speak (requires an OpenAI-compatible Realtime endpoint)",
|
||||
)}
|
||||
</Text>
|
||||
|
||||
<TextInput
|
||||
label={t("Realtime model")}
|
||||
placeholder="gpt-4o-mini-transcribe"
|
||||
disabled={isLoading}
|
||||
{...form.getInputProps("sttRealtimeModel")}
|
||||
/>
|
||||
|
||||
<TextInput
|
||||
mt="sm"
|
||||
label={t("Realtime endpoint")}
|
||||
description={t(
|
||||
"Leave empty to use the STT base URL",
|
||||
)}
|
||||
placeholder={t("Leave empty to use the STT base URL")}
|
||||
disabled={isLoading}
|
||||
{...form.getInputProps("sttRealtimeBaseUrl")}
|
||||
/>
|
||||
|
||||
<Group mt="md" align="center">
|
||||
<Button
|
||||
variant="default"
|
||||
size="sm"
|
||||
loading={realtimeTest.isPending}
|
||||
onClick={() => realtimeTest.mutate()}
|
||||
>
|
||||
{t("Test endpoint")}
|
||||
</Button>
|
||||
{realtimeTest.data &&
|
||||
(realtimeTest.data.ok ? (
|
||||
<Text size="sm" c="green">
|
||||
{t("Connection successful")}
|
||||
</Text>
|
||||
) : (
|
||||
<Text size="sm" c="red">
|
||||
{realtimeTest.data.error || t("Connection failed")}
|
||||
</Text>
|
||||
))}
|
||||
</Group>
|
||||
</>
|
||||
)}
|
||||
</Paper>
|
||||
|
||||
{/* Nested: external MCP tools the agent calls out to */}
|
||||
|
||||
@@ -8,6 +8,7 @@ import {
|
||||
getAiSettings,
|
||||
updateAiSettings,
|
||||
testAiConnection,
|
||||
testRealtimeConnection,
|
||||
reindexAiEmbeddings,
|
||||
IAiSettings,
|
||||
IAiSettingsUpdate,
|
||||
@@ -55,6 +56,12 @@ export function useTestAiConnectionMutation() {
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Mutation hook that runs the realtime connectivity probe.
 * Takes no variables and resolves with the probe's `IAiTestResult`.
 */
export function useTestRealtimeConnectionMutation() {
  const runProbe = (): Promise<IAiTestResult> => testRealtimeConnection();
  return useMutation<IAiTestResult, Error, void>({ mutationFn: runProbe });
}
|
||||
|
||||
export function useReindexAiEmbeddingsMutation() {
|
||||
const { t } = useTranslation();
|
||||
const queryClient = useQueryClient();
|
||||
|
||||
@@ -32,6 +32,8 @@ export interface IAiSettings {
|
||||
// key is stored (empty means "uses the chat API key").
|
||||
sttModel?: string;
|
||||
sttBaseUrl?: string;
|
||||
sttRealtimeModel?: string;
|
||||
sttRealtimeBaseUrl?: string;
|
||||
sttApiStyle?: SttApiStyle;
|
||||
hasSttApiKey: boolean;
|
||||
// RAG indexing coverage (pages indexed for semantic search).
|
||||
@@ -59,6 +61,8 @@ export interface IAiSettingsUpdate {
|
||||
embeddingApiKey?: string;
|
||||
sttModel?: string;
|
||||
sttBaseUrl?: string;
|
||||
sttRealtimeModel?: string;
|
||||
sttRealtimeBaseUrl?: string;
|
||||
sttApiStyle?: SttApiStyle;
|
||||
// Write-only STT key (same semantics as `apiKey` / `embeddingApiKey`).
|
||||
sttApiKey?: string;
|
||||
@@ -95,6 +99,14 @@ export async function testAiConnection(
|
||||
return req.data;
|
||||
}
|
||||
|
||||
/**
 * Probe the realtime (streaming STT) endpoint.
 *
 * Posts to `/ai-chat/realtime/test` with no request body and returns the
 * response payload. Note the route sits under `/ai-chat`, not
 * `/workspace/ai-settings`.
 */
export async function testRealtimeConnection(): Promise<IAiTestResult> {
  const { data } = await api.post<IAiTestResult>("/ai-chat/realtime/test");
  return data;
}
|
||||
|
||||
export async function reindexAiEmbeddings(): Promise<IAiSettings> {
|
||||
const req = await api.post<IAiSettings>("/workspace/ai-settings/reindex");
|
||||
return req.data;
|
||||
|
||||
@@ -25,6 +25,7 @@ export interface IWorkspace {
|
||||
mcpEnabled?: boolean;
|
||||
aiChat?: boolean;
|
||||
aiDictation?: boolean;
|
||||
aiDictationRealtime?: boolean;
|
||||
aiPublicShareAssistant?: boolean;
|
||||
trashRetentionDays?: number;
|
||||
restrictApiToAdmins?: boolean;
|
||||
@@ -62,6 +63,7 @@ export interface IWorkspaceAiSettings {
|
||||
mcp?: boolean;
|
||||
chat?: boolean;
|
||||
dictation?: boolean;
|
||||
dictationRealtime?: boolean;
|
||||
publicShareAssistant?: boolean;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user