fix(dictation): start streaming dictation on the first click

The streaming mic button only began recording on the SECOND click. The VAD library creates its AudioContext inside vad.start() and never resumes it. On the first click the lazy model load (import + MicVAD.new) ran first, so the context was created after the user-gesture window had expired and it started in the "suspended" state — the audio worklet never ran, so nothing happened. The second click was fast (model already cached), so the context was created inside the gesture and worked.

Create and resume our own AudioContext synchronously at the top of start() (inside the click gesture, before the model load) and inject it into MicVAD, which then does not take ownership of it; the context is reused across start/stop and closed only on unmount.

Add a "loading" status so the first click is shown as a disabled spinner while the model loads, which also blocks a confusing second click.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-22 18:39:34 +03:00
parent 14e26aab70
commit 8f01a01122
3 changed files with 57 additions and 4 deletions
--- a/apps/client/src/features/dictation/components/mic-button.tsx
+++ b/apps/client/src/features/dictation/components/mic-button.tsx
@@ -75,15 +75,23 @@ export const MicButton: FC<MicButtonProps> = ({
    );
  }

-  if (status === "transcribing" || status === "error") {
+  if (
+    status === "loading" ||
+    status === "transcribing" ||
+    status === "error"
+  ) {
+    // "loading" (streaming hook fetching the VAD model on first use) shows the
+    // same spinner+disabled state so the first click is visibly acknowledged and
+    // a confusing second click can't fire while the model loads.
+    const label = status === "loading" ? t("Preparing…") : t("Transcribing…");
    return (
-      <Tooltip label={t("Transcribing…")} withArrow>
+      <Tooltip label={label} withArrow>
        <ActionIcon
          size={size}
          variant="subtle"
          color={color}
          disabled
-          aria-label={t("Transcribing…")}
+          aria-label={label}
        >
          <Loader size="xs" />
        </ActionIcon>
--- a/apps/client/src/features/dictation/hooks/use-dictation.ts
+++ b/apps/client/src/features/dictation/hooks/use-dictation.ts
@@ -3,7 +3,15 @@ import { notifications } from "@mantine/notifications";
 import { useTranslation } from "react-i18next";
 import { transcribeAudio } from "@/features/dictation/services/dictation-service";

-export type DictationStatus = "idle" | "recording" | "transcribing" | "error";
+// "loading" is set only by the streaming hook while it lazily loads the VAD
+// model on first use; the batch hook never sets it. It exists so the streaming
+// hook and the mic button can show immediate feedback during that load.
+export type DictationStatus =
+  | "idle"
+  | "recording"
+  | "transcribing"
+  | "error"
+  | "loading";

 interface UseDictationOptions {
  onText: (text: string) => void;
--- a/apps/client/src/features/dictation/hooks/use-streaming-dictation.ts
+++ b/apps/client/src/features/dictation/hooks/use-streaming-dictation.ts
@@ -67,6 +67,9 @@ export function useStreamingDictation(
  optionsRef.current = options;

  const vadRef = useRef<MicVADInstance | null>(null);
+  // AudioContext we create+resume inside the click gesture and inject into
+  // MicVAD (see start()). We own it; MicVAD does not close an injected context.
+  const audioContextRef = useRef<AudioContext | null>(null);
  const timerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
  const canceledRef = useRef(false);
  const startingRef = useRef(false);
@@ -250,6 +253,26 @@ export function useStreamingDictation(
    inFlightRef.current = 0;
    resetLevel();

+    // Create and resume the AudioContext NOW, inside the click gesture, before
+    // the (first-time-slow) model load below. A context first touched outside a
+    // user gesture stays "suspended" and the VAD audio worklet never runs — that
+    // is exactly why the first click did nothing and only the second (model
+    // already cached, so MicVAD.new was fast enough to create the context inside
+    // the gesture) started recording. We own this context and inject it into
+    // MicVAD (which then will NOT close it); it is reused across start/stop and
+    // closed only on unmount.
+    const AudioCtor =
+      window.AudioContext ||
+      (window as unknown as { webkitAudioContext?: typeof AudioContext })
+        .webkitAudioContext;
+    if (AudioCtor && !audioContextRef.current) {
+      audioContextRef.current = new AudioCtor();
+    }
+    // Resume within the gesture (a no-op if already running); swallow rejection (e.g. context closed).
+    void audioContextRef.current?.resume().catch(() => {});
+    // Show immediate feedback (the mic button renders a disabled spinner) while the model loads.
+    setStatus("loading");
+
    let vad: MicVADInstance;
    try {
      // Lazy import so the heavy onnx model/worklet are only fetched on first use
@@ -265,6 +288,12 @@ export function useStreamingDictation(
        // mic is opened only by the explicit vad.start() below, where the real
        // getUserMedia errors are caught and mapped.
        startOnLoad: false,
+        // Inject the AudioContext we created+resumed inside the click gesture so
+        // the VAD worklet runs on a "running" context. When provided, the library
+        // uses it and does NOT take ownership/close it.
+        ...(audioContextRef.current
+          ? { audioContext: audioContextRef.current }
+          : {}),
        // Only pass asset paths when defined; otherwise the library uses its
        // bundled CDN defaults.
        ...(VAD_BASE_ASSET_PATH !== undefined
@@ -430,6 +459,14 @@ export function useStreamingDictation(
      activeRef.current = false;
      canceledRef.current = true;
      destroyVad();
+      // Close the AudioContext we own (MicVAD never closes an injected one).
+      if (
+        audioContextRef.current &&
+        audioContextRef.current.state !== "closed"
+      ) {
+        void audioContextRef.current.close().catch(() => {});
+      }
+      audioContextRef.current = null;
    };
  }, [clearTimer, destroyVad]);