diff --git a/apps/client/src/features/dictation/components/mic-button.tsx b/apps/client/src/features/dictation/components/mic-button.tsx index 8c0974ae..70ead74e 100644 --- a/apps/client/src/features/dictation/components/mic-button.tsx +++ b/apps/client/src/features/dictation/components/mic-button.tsx @@ -75,15 +75,23 @@ export const MicButton: FC = ({ ); } - if (status === "transcribing" || status === "error") { + if ( + status === "loading" || + status === "transcribing" || + status === "error" + ) { + // "loading" (streaming hook fetching the VAD model on first use) shows the + // same spinner+disabled state so the first click is visibly acknowledged and + // a confusing second click can't fire while the model loads. + const label = status === "loading" ? t("Preparing…") : t("Transcribing…"); return ( - + diff --git a/apps/client/src/features/dictation/hooks/use-dictation.ts b/apps/client/src/features/dictation/hooks/use-dictation.ts index 0d32402f..4d8c451d 100644 --- a/apps/client/src/features/dictation/hooks/use-dictation.ts +++ b/apps/client/src/features/dictation/hooks/use-dictation.ts @@ -3,7 +3,15 @@ import { notifications } from "@mantine/notifications"; import { useTranslation } from "react-i18next"; import { transcribeAudio } from "@/features/dictation/services/dictation-service"; -export type DictationStatus = "idle" | "recording" | "transcribing" | "error"; +// "loading" is set only by the streaming hook while it lazily loads the VAD +// model on first use; the batch hook never sets it. It exists so the streaming +// hook and the mic button can show immediate feedback during that load. 
+export type DictationStatus = + | "idle" + | "recording" + | "transcribing" + | "error" + | "loading"; interface UseDictationOptions { onText: (text: string) => void; diff --git a/apps/client/src/features/dictation/hooks/use-streaming-dictation.ts b/apps/client/src/features/dictation/hooks/use-streaming-dictation.ts index 8128df91..b086747c 100644 --- a/apps/client/src/features/dictation/hooks/use-streaming-dictation.ts +++ b/apps/client/src/features/dictation/hooks/use-streaming-dictation.ts @@ -67,6 +67,9 @@ export function useStreamingDictation( optionsRef.current = options; const vadRef = useRef(null); + // AudioContext we create+resume inside the click gesture and inject into + // MicVAD (see start()). We own it; MicVAD does not close an injected context. + const audioContextRef = useRef(null); const timerRef = useRef | null>(null); const canceledRef = useRef(false); const startingRef = useRef(false); @@ -250,6 +253,26 @@ export function useStreamingDictation( inFlightRef.current = 0; resetLevel(); + // Create and resume the AudioContext NOW, inside the click gesture, before + // the (first-time-slow) model load below. A context first touched outside a + // user gesture stays "suspended" and the VAD audio worklet never runs — that + // is exactly why the first click did nothing and only the second (model + // already cached, so MicVAD.new was fast enough to create the context inside + // the gesture) started recording. We own this context and inject it into + // MicVAD (which then will NOT close it); it is reused across start/stop and + // closed only on unmount. + const AudioCtor = + window.AudioContext || + (window as unknown as { webkitAudioContext?: typeof AudioContext }) + .webkitAudioContext; + if (AudioCtor && !audioContextRef.current) { + audioContextRef.current = new AudioCtor(); + } + // Resume within the gesture; swallow rejection (e.g. already running/closed). 
+ void audioContextRef.current?.resume().catch(() => {}); + // Show immediate feedback (mic button spinner + disabled state) while the model loads. + setStatus("loading"); + let vad: MicVADInstance; try { // Lazy import so the heavy onnx model/worklet are only fetched on first use @@ -265,6 +288,12 @@ export function useStreamingDictation( // mic is opened only by the explicit vad.start() below, where the real // getUserMedia errors are caught and mapped. startOnLoad: false, + // Inject the AudioContext we created+resumed inside the click gesture so + // the VAD worklet runs on a "running" context. When provided, the library + // uses it and does NOT take ownership/close it. + ...(audioContextRef.current + ? { audioContext: audioContextRef.current } + : {}), // Only pass asset paths when defined; otherwise the library uses its // bundled CDN defaults. ...(VAD_BASE_ASSET_PATH !== undefined @@ -430,6 +459,14 @@ export function useStreamingDictation( activeRef.current = false; canceledRef.current = true; destroyVad(); + // Close the AudioContext we own (MicVAD never closes an injected one). + if ( + audioContextRef.current && + audioContextRef.current.state !== "closed" + ) { + void audioContextRef.current.close().catch(() => {}); + } + audioContextRef.current = null; }; }, [clearTimer, destroyVad]);