feat(dictation): streaming STT via silence cut (Silero VAD)

Add a lightweight "streaming" dictation mode as a simpler alternative to the realtime-websocket path: detect speech with Silero VAD (@ricky0123/vad-web), cut each segment on a pause, and POST it to the existing /ai-chat/transcribe endpoint so that text appears progressively. No server changes.

- new useStreamingDictation hook (same API as useDictation): lazy-loads the VAD, emits segments in sequence order, and uses a session-epoch guard against stop->start races
- new encodeWavPcm16 util (Float32 -> mono PCM16 WAV, accepted by the server)
- MicButton gains a `streaming` prop; enabled in the editor toolbar and chat
- VAD tuning: redemptionMs 640 / preSpeechPadMs 320 / minSpeechMs 96
- batch dictation kept as the fallback (streaming=false)
- deps: @ricky0123/vad-web@0.0.30, onnxruntime-web@1.27.0

Note: VAD assets load from the library CDN by default; for self-hosted/offline use, set VAD_BASE_ASSET_PATH/VAD_ONNX_WASM_BASE_PATH and copy the assets to public/vad/.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-22 16:52:05 +03:00
parent 7ce1a24f82
commit 4f0da42d88
7 changed files with 555 additions and 16 deletions
--- a/apps/client/package.json
+++ b/apps/client/package.json
@@ -28,6 +28,7 @@
    "@mantine/modals": "8.3.18",
    "@mantine/notifications": "8.3.18",
    "@mantine/spotlight": "8.3.18",
+    "@ricky0123/vad-web": "^0.0.30",
    "@slidoapp/emoji-mart": "5.8.7",
    "@slidoapp/emoji-mart-data": "1.2.4",
    "@slidoapp/emoji-mart-react": "1.1.5",
@@ -53,6 +54,7 @@
    "mantine-form-zod-resolver": "1.3.0",
    "mermaid": "11.15.0",
    "mitt": "3.0.1",
+    "onnxruntime-web": "^1.27.0",
    "posthog-js": "1.372.2",
    "react": "18.3.1",
    "react-clear-modal": "^2.0.18",
--- a/apps/client/src/features/ai-chat/components/chat-input.tsx
+++ b/apps/client/src/features/ai-chat/components/chat-input.tsx
@@ -64,6 +64,7 @@ export default function ChatInput({
      {isDictationEnabled && (
        <MicButton
          size="lg"
+          streaming
          disabled={isStreaming || disabled}
          onText={(text) => setValue((v) => (v ? `${v} ${text}` : text))}
        />
--- a/apps/client/src/features/dictation/components/mic-button.tsx
+++ b/apps/client/src/features/dictation/components/mic-button.tsx
@@ -3,6 +3,7 @@ import { ActionIcon, Loader, Tooltip } from "@mantine/core";
 import { IconMicrophone, IconPlayerStopFilled } from "@tabler/icons-react";
 import { useTranslation } from "react-i18next";
 import { useDictation } from "@/features/dictation/hooks/use-dictation";
+import { useStreamingDictation } from "@/features/dictation/hooks/use-streaming-dictation";
 import classes from "./mic-button.module.css";

 interface MicButtonProps {
@@ -17,6 +18,9 @@ interface MicButtonProps {
  color?: string;
  // Optional explicit glyph size override; defaults to the size-token value.
  iconSize?: number;
+  // When true, use the streaming (Silero-VAD) dictation controller, which emits
+  // text progressively as the user pauses; otherwise use the batch controller.
+  streaming?: boolean;
 }

 /**
@@ -32,9 +36,17 @@ export const MicButton: FC<MicButtonProps> = ({
  size = "lg",
  color,
  iconSize,
+  streaming = false,
 }) => {
  const { t } = useTranslation();
-  const { status, start, stop, audioLevel } = useDictation({ onText, onStart });
+  // Call BOTH hooks unconditionally to respect the rules of hooks: which one is
+  // active is a render-time choice, but both must be invoked every render. This
+  // is safe because both controllers are inert until start() is called — neither
+  // opens the mic on mount — so the unused one costs nothing.
+  const batchCtl = useDictation({ onText, onStart });
+  const streamingCtl = useStreamingDictation({ onText, onStart });
+  const ctl = streaming ? streamingCtl : batchCtl;
+  const { status, start, stop, audioLevel } = ctl;
  const resolvedIconSize = iconSize ?? (size === "lg" ? 18 : 16);

  if (status === "recording") {
--- a/apps/client/src/features/dictation/hooks/use-streaming-dictation.ts
+++ b/apps/client/src/features/dictation/hooks/use-streaming-dictation.ts
@@ -0,0 +1,429 @@
+import { useCallback, useEffect, useRef, useState } from "react";
+import { notifications } from "@mantine/notifications";
+import { useTranslation } from "react-i18next";
+import { transcribeAudio } from "@/features/dictation/services/dictation-service";
+import { encodeWavPcm16 } from "@/features/dictation/utils/encode-wav";
+import type { DictationStatus } from "@/features/dictation/hooks/use-dictation";
+
+// Minimal structural view of the lazily-imported MicVAD instance: only the
+// lifecycle methods this hook needs. The runtime import happens inside start()
+// so the heavy onnxruntime-web / Silero model is code-split out of the main
+// bundle and only fetched when the user actually begins dictation.
+type MicVADInstance = {
+  start: () => Promise<void>; // awaited once per session, right after the instance is created
+  pause: () => Promise<void>; // part of the shape; not called by the code in this file
+  destroy: () => Promise<void>; // invoked by destroyVad() on every teardown path
+};
+
+// Options accepted by useStreamingDictation. The hook reads them through a ref
+// that is refreshed every render, so callers may pass fresh closures each time.
+interface UseStreamingDictationOptions {
+  onText: (text: string) => void; // called once per segment with trimmed, non-empty text, in the order the segments ended
+  onStart?: () => void; // called synchronously at the top of start(), before any async work
+  maxDurationMs?: number; // auto-stop after this many ms of recording; defaults to 120000 when omitted
+}
+
+// Controller returned by useStreamingDictation.
+interface UseStreamingDictationResult {
+  status: DictationStatus; // this hook sets "idle", "recording" and "transcribing"
+  start: () => Promise<void>; // VAD-init and mic errors are caught and shown as notifications rather than rejected
+  stop: () => void; // ends recording; status stays "transcribing" while requests are still in flight
+  cancel: () => void; // discards buffered and in-flight results and returns to "idle"
+  // Smoothed live speech level in the 0..1 range while recording (0 when idle).
+  audioLevel: number;
+}
+
+// Sample rate of the audio MicVAD hands to onSpeechEnd (Silero VAD runs at 16k).
+const VAD_SAMPLE_RATE = 16000;
+
+// Asset paths for the VAD worklet and the onnxruntime WASM binaries. For this
+// prototype they are left undefined so the library loads its bundled assets from
+// its default CDN — this avoids fragile rolldown asset-copy config. For a
+// self-hosted / offline / privacy build, copy the vad-web `dist` worklet + the
+// `*.onnx` model and the onnxruntime-web `*.wasm` files into
+// `apps/client/public/vad/` and set these to that local path (e.g. "/vad/").
+const VAD_BASE_ASSET_PATH: string | undefined = undefined;
+const VAD_ONNX_WASM_BASE_PATH: string | undefined = undefined;
+
+/**
+ * Streaming variant of useDictation. Detects speech with a real (Silero) VAD and,
+ * each time the speaker pauses, cuts that speech segment and POSTs it to the same
+ * batch transcription endpoint, so text appears progressively as the user speaks.
+ *
+ * Returns the SAME shape as useDictation ({ status, start, stop, cancel,
+ * audioLevel }) so MicButton can use either interchangeably. Refs hold the live
+ * VAD instance / counters / timer so component re-renders never lose them, and
+ * every exit path destroys the VAD and stops the MediaStream.
+ */
+export function useStreamingDictation(
+  options: UseStreamingDictationOptions,
+): UseStreamingDictationResult {
+  const { t } = useTranslation();
+  const [status, setStatus] = useState<DictationStatus>("idle");
+  const [audioLevel, setAudioLevel] = useState(0);
+
+  // Keep the latest callbacks in a ref so async VAD/HTTP closures always call the
+  // current handlers without re-creating the VAD.
+  const optionsRef = useRef(options);
+  optionsRef.current = options;
+
+  const vadRef = useRef<MicVADInstance | null>(null);
+  const timerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
+  const canceledRef = useRef(false);
+  const startingRef = useRef(false);
+  // True while a recording session is active (VAD listening). Used to ignore late
+  // VAD callbacks that fire after stop()/cancel().
+  const activeRef = useRef(false);
+
+  // In-order emission: each segment gets a monotonically increasing seq when its
+  // speech ends; completed transcriptions are buffered by seq and flushed in
+  // order so out-of-order HTTP responses can't scramble the text.
+  const nextSeqRef = useRef(0);
+  const nextEmitSeqRef = useRef(0);
+  const resultsRef = useRef<Map<number, string>>(new Map());
+  // Number of transcription requests still in flight.
+  const inFlightRef = useRef(0);
+  // Session epoch: bumped when a NEW session starts (start) or everything is
+  // hard-discarded (cancel). Each in-flight request captures the epoch at send
+  // time; if the epoch has since changed, the request is stale and its
+  // then/catch/finally are skipped so old text can't leak into a new session and
+  // the in-flight counter can't be driven negative across sessions.
+  const epochRef = useRef(0);
+
+  // Exponentially smoothed speech level, and the last value pushed to React state.
+  const smoothedLevelRef = useRef(0);
+  const emittedLevelRef = useRef(0);
+
+  // Disarm the max-duration timer, if one is currently pending.
+  const clearTimer = useCallback(() => {
+    const pending = timerRef.current;
+    if (pending === null) return;
+    clearTimeout(pending);
+    timerRef.current = null;
+  }, []);
+
+  // Zero the level meter: the smoothing ref, the last-emitted ref and the React
+  // state that mirrors them.
+  const resetLevel = useCallback(() => {
+    emittedLevelRef.current = 0;
+    smoothedLevelRef.current = 0;
+    setAudioLevel(0);
+  }, []);
+
+  // Tear down the tracked VAD instance, if any, and forget it. Idempotent: the
+  // ref is cleared before destroy() is invoked, so repeated calls are no-ops.
+  // Never throws — both a synchronous throw and an asynchronous rejection from
+  // destroy() are downgraded to a console warning.
+  const destroyVad = useCallback(() => {
+    const instance = vadRef.current;
+    vadRef.current = null;
+    if (!instance) return;
+
+    const warn = (err: unknown): void => {
+      console.warn("[dictation] VAD teardown failed", err);
+    };
+    try {
+      // destroy() returns a promise; the try/catch only sees synchronous throws,
+      // so the rejection path needs its own handler to avoid an unhandled
+      // rejection.
+      void instance.destroy().catch(warn);
+    } catch (err) {
+      warn(err);
+    }
+  }, []);
+
+  // Pick the post-recording status: "transcribing" while any request is still
+  // in flight, "idle" once nothing is pending.
+  const settleAfterStop = useCallback(() => {
+    const hasPending = inFlightRef.current > 0;
+    setStatus(hasPending ? "transcribing" : "idle");
+  }, []);
+
+  // Flush buffered transcriptions in sequence order. Starting at the next
+  // expected seq, each available entry is removed, the cursor is advanced, and
+  // the trimmed text is handed to onText unless it is empty. Stops at the first
+  // gap. Called after every resolved request.
+  const drainResults = useCallback(() => {
+    const buffered = resultsRef.current;
+    for (;;) {
+      const seq = nextEmitSeqRef.current;
+      const raw = buffered.get(seq);
+      if (raw === undefined) break;
+      buffered.delete(seq);
+      nextEmitSeqRef.current = seq + 1;
+      // Whisper often returns a leading space; emit the trimmed value.
+      const cleaned = raw.trim();
+      if (cleaned.length > 0) optionsRef.current.onText(cleaned);
+    }
+  }, []);
+
+  // Map a transcription error to a user-facing message, mirroring the batch hook.
+  //
+  // Resolution order:
+  //   1. a non-blank string `response.data.message` from the server, verbatim;
+  //   2. HTTP 503 / 403 -> "Voice dictation is not configured";
+  //   3. otherwise "Transcription failed: <error message>".
+  //
+  // This runs inside the segment .catch handler, so it must never throw: a throw
+  // here would skip the "advance past the failed seq" step and stall in-order
+  // emission for the rest of the session. The error shape is only *asserted*,
+  // not validated, so the server message is checked with typeof before .trim()
+  // is called — a non-string payload (e.g. an array of validation messages)
+  // simply falls through to the generic branches.
+  const transcriptionErrorMessage = useCallback(
+    (err: unknown): string => {
+      const resp = (
+        err as { response?: { status?: number; data?: { message?: unknown } } }
+      )?.response;
+      const serverMsg = resp?.data?.message;
+      if (typeof serverMsg === "string" && serverMsg.trim().length > 0) {
+        // The server already explains the cause (e.g. provider 404, bad format,
+        // STT not configured) — show it verbatim.
+        return serverMsg;
+      }
+      if (resp?.status === 503 || resp?.status === 403) {
+        return t("Voice dictation is not configured");
+      }
+      return `${t("Transcription failed")}: ${(err as { message?: string })?.message ?? String(err)}`;
+    },
+    [t],
+  );
+
+  // Process one finished speech segment: reserve its seq, encode it as WAV and
+  // send it for transcription. The outcome is buffered under the seq and flushed
+  // in order. A failed segment is logged, reported with one notification, and
+  // recorded as empty text so later segments are not blocked behind it.
+  const handleSegment = useCallback(
+    (audio: Float32Array) => {
+      const seq = nextSeqRef.current;
+      nextSeqRef.current = seq + 1;
+      inFlightRef.current += 1;
+      // Snapshot the session epoch synchronously at send time; once it differs
+      // from the live value this request belongs to an old session and must not
+      // touch any current-session state.
+      const sentEpoch = epochRef.current;
+      const isStale = (): boolean => sentEpoch !== epochRef.current;
+
+      const request = transcribeAudio(
+        encodeWavPcm16(audio, VAD_SAMPLE_RATE),
+        "speech.wav",
+      );
+
+      void (async () => {
+        try {
+          const text = await request;
+          if (isStale()) return;
+          // Defend against a non-string server value before drainResults trims.
+          resultsRef.current.set(seq, typeof text === "string" ? text : "");
+          drainResults();
+        } catch (err: unknown) {
+          if (isStale()) return;
+          // Log the full error for diagnosis (status + body + stack).
+          console.error("[dictation] segment transcription failed", err);
+          notifications.show({
+            color: "red",
+            message: transcriptionErrorMessage(err),
+          });
+          // Record the failed seq as empty text: draining then steps over it
+          // (nothing is emitted for blank text) and releases any later segments
+          // that were waiting behind it.
+          resultsRef.current.set(seq, "");
+          drainResults();
+        } finally {
+          if (!isStale()) {
+            inFlightRef.current -= 1;
+            // If recording already stopped, flip to idle once everything drained.
+            if (!activeRef.current && inFlightRef.current === 0) {
+              setStatus("idle");
+            }
+          }
+        }
+      })();
+    },
+    [drainResults, transcriptionErrorMessage],
+  );
+
+  // Begin a dictation session: lazily load the VAD, open the mic and flip the
+  // status to "recording". VAD-init and mic errors are caught and surfaced as a
+  // red notification; the returned promise still resolves in those cases.
+  //
+  // Two robustness rules are enforced here:
+  //  - The start guard (startingRef) is released in a `finally`, so no exit path
+  //    — including a throwing onStart callback — can leave it stuck and block
+  //    every future start().
+  //  - Each `await` is a window in which cancel(), stop() or an unmount can run.
+  //    The session is re-validated after each one; a VAD that finished loading
+  //    or starting for an abandoned session is destroyed instead of being used,
+  //    so the mic is not left open and the status is not flipped to "recording"
+  //    for a session nobody owns any more. (stop() while the model is still
+  //    loading remains a no-op, since nothing is running yet at that point.)
+  const start = useCallback(async (): Promise<void> => {
+    // Synchronous live guard: status is stale between renders, so also block on
+    // refs to prevent a double-click from creating two VAD instances (the first
+    // would leak its mic stream).
+    if (startingRef.current || vadRef.current || activeRef.current) return;
+    if (status !== "idle") return;
+    startingRef.current = true;
+
+    // Best-effort teardown for an instance that is not (or no longer) tracked in
+    // vadRef. Never throws; failures are only logged.
+    const discardVad = (instance: MicVADInstance): void => {
+      try {
+        void instance
+          .destroy()
+          .catch((err: unknown) =>
+            console.warn("[dictation] VAD teardown failed", err),
+          );
+      } catch (err) {
+        console.warn("[dictation] VAD teardown failed", err);
+      }
+    };
+
+    try {
+      // Notify the caller right when dictation begins (before any async work) so
+      // the editor can snapshot the caret position.
+      optionsRef.current.onStart?.();
+
+      // Reset per-session in-order emission state. Bump the epoch so any request
+      // still in flight from a previous (stopped) session becomes stale and its
+      // then/catch/finally are skipped — it can neither emit old text into this
+      // new session nor decrement this session's freshly-zeroed in-flight
+      // counter.
+      epochRef.current += 1;
+      canceledRef.current = false;
+      nextSeqRef.current = 0;
+      nextEmitSeqRef.current = 0;
+      resultsRef.current = new Map();
+      inFlightRef.current = 0;
+      resetLevel();
+
+      let vad: MicVADInstance;
+      try {
+        // Lazy import so the heavy onnx model/worklet are only fetched on first
+        // use and code-split out of the main bundle.
+        const { MicVAD } = await import("@ricky0123/vad-web");
+
+        vad = await MicVAD.new({
+          // Silero v5 model (smaller/faster than the legacy model).
+          model: "v5",
+          // vad-web 0.0.30 defaults startOnLoad:true, which opens the mic (calls
+          // getUserMedia) inside new() and leaves the later vad.start() a no-op —
+          // making its mic-permission error handling dead code. Force it off so
+          // the mic is opened only by the explicit vad.start() below, where the
+          // real getUserMedia errors are caught and mapped.
+          startOnLoad: false,
+          // Only pass asset paths when defined; otherwise the library uses its
+          // bundled CDN defaults.
+          ...(VAD_BASE_ASSET_PATH !== undefined
+            ? { baseAssetPath: VAD_BASE_ASSET_PATH }
+            : {}),
+          ...(VAD_ONNX_WASM_BASE_PATH !== undefined
+            ? { onnxWASMBasePath: VAD_ONNX_WASM_BASE_PATH }
+            : {}),
+          // --- VAD tuning (all tunable) ---
+          // Probability over which a frame counts as speech.
+          positiveSpeechThreshold: 0.5,
+          // Probability under which a frame counts as non-speech (~0.15 below the
+          // positive threshold, per Silero guidance).
+          negativeSpeechThreshold: 0.35,
+          // Silence to wait through before ending a segment (the "don't cut
+          // immediately" delay) — ~0.6s. NOTE: vad-web 0.0.30 takes this in ms,
+          // not frames (one Silero frame is ~32ms at 16k).
+          redemptionMs: 640,
+          // Audio kept before speech start (left padding so the first word isn't
+          // clipped) — ~0.3s.
+          preSpeechPadMs: 320,
+          // Ignore sub-100ms blips like clicks.
+          minSpeechMs: 96,
+          onFrameProcessed: (probabilities: { isSpeech: number }) => {
+            // Drive the level meter from the speech probability. Light
+            // exponential smoothing + a throttle so React state isn't updated
+            // every frame; this powers the existing button halo. Reuses the
+            // VAD's own frame probabilities — no second
+            // AudioContext/AnalyserNode.
+            if (!activeRef.current) return;
+            const level = Math.min(1, Math.max(0, probabilities.isSpeech));
+            smoothedLevelRef.current = smoothedLevelRef.current * 0.8 + level * 0.2;
+            if (Math.abs(smoothedLevelRef.current - emittedLevelRef.current) > 0.01) {
+              emittedLevelRef.current = smoothedLevelRef.current;
+              setAudioLevel(smoothedLevelRef.current);
+            }
+          },
+          onSpeechStart: () => {
+            // No-op: the segment is only handled once it ends.
+          },
+          onSpeechEnd: (audio: Float32Array) => {
+            // A pause was detected — cut this segment and transcribe it. Ignore
+            // late callbacks that fire after stop()/cancel().
+            if (!activeRef.current || canceledRef.current) return;
+            handleSegment(audio);
+          },
+        });
+      } catch (err) {
+        // With startOnLoad:false, new() loads the model/worklet/wasm but does NOT
+        // open the mic, so a throw here is an asset/init failure (model fetch,
+        // worklet, onnxruntime wasm), not a mic-permission error. Map it as a
+        // generic "could not start" with the underlying detail. (The
+        // mic-permission name checks are kept in the vad.start() catch below,
+        // where getUserMedia actually runs.)
+        console.error("[dictation] VAD init failed", err);
+        const detail = (err as { message?: string })?.message ?? String(err);
+        notifications.show({
+          color: "red",
+          message: `${t("Could not start recording")}: ${detail}`,
+        });
+        // Defensive: if MicVAD.new partially succeeded before throwing, make sure
+        // we don't leak it.
+        destroyVad();
+        setStatus("idle");
+        return;
+      }
+
+      // cancel() or an unmount ran while the model was loading (both set
+      // canceledRef, which was cleared at the top of this session). The instance
+      // was never published to vadRef, so nobody else can destroy it: do it here
+      // and abandon the session without opening the mic.
+      if (canceledRef.current) {
+        discardVad(vad);
+        return;
+      }
+
+      vadRef.current = vad;
+      // Accept frames once start() resolves; the VAD callbacks already guard on
+      // activeRef, so setting it before start() is safe.
+      activeRef.current = true;
+
+      // True once stop()/cancel()/unmount has torn this session down: each of
+      // them clears vadRef through destroyVad(), and cancel/unmount also set
+      // canceledRef.
+      const tornDown = (): boolean =>
+        canceledRef.current || vadRef.current !== vad;
+
+      try {
+        // With startOnLoad:false this is where getUserMedia actually runs, so map
+        // mic-permission errors here the same way the batch hook does; otherwise
+        // fall back to a generic "could not start" message.
+        await vad.start();
+      } catch (err) {
+        // Always log the full error for diagnosis (name, message, stack).
+        console.error("[dictation] VAD.start failed", err);
+        // The session was already ended while start() was pending; the failure
+        // is not actionable for the user and teardown has already happened.
+        if (tornDown()) return;
+        const name = (err as { name?: string })?.name;
+        const detail = (err as { message?: string })?.message ?? String(err);
+        let message: string;
+        if (name === "NotAllowedError" || name === "SecurityError") {
+          message = t("Microphone access denied");
+        } else if (name === "NotFoundError" || name === "OverconstrainedError") {
+          message = t("No microphone found");
+        } else if (name === "NotReadableError" || name === "AbortError") {
+          message = t("Microphone is unavailable or already in use");
+        } else {
+          message = `${t("Could not start recording")}: ${detail}`;
+        }
+        notifications.show({ color: "red", message });
+        activeRef.current = false;
+        destroyVad();
+        setStatus("idle");
+        return;
+      }
+
+      // The session was torn down while the mic was being opened. destroy() was
+      // already requested, but start() may have completed afterwards, so request
+      // teardown once more and leave the status alone instead of reporting a
+      // recording that is not running.
+      if (tornDown()) {
+        discardVad(vad);
+        return;
+      }
+
+      setStatus("recording");
+
+      // Optional overall safety cap: auto-stop after maxDurationMs like the batch
+      // hook does.
+      const maxDurationMs = optionsRef.current.maxDurationMs ?? 120000;
+      timerRef.current = setTimeout(() => {
+        if (activeRef.current) stopRef.current();
+      }, maxDurationMs);
+    } finally {
+      // Release the synchronous start guard on every exit path.
+      startingRef.current = false;
+    }
+  }, [status, t, resetLevel, destroyVad, handleSegment]);
+
+  // End the recording part of the session. Segments already sent keep
+  // transcribing (status becomes "transcribing" until they finish); a segment
+  // that has not ended yet is dropped.
+  const stop = useCallback((): void => {
+    clearTimer();
+
+    const somethingRunning = activeRef.current || vadRef.current !== null;
+    if (!somethingRunning) {
+      // Nothing to tear down; just make sure the UI shows idle.
+      setStatus("idle");
+      return;
+    }
+
+    // Flip the active flag before teardown so late onSpeechEnd /
+    // onFrameProcessed callbacks are ignored. Stopping mid-utterance therefore
+    // discards the unfinished segment — acceptable for v1; users normally pause
+    // before stopping.
+    activeRef.current = false;
+    destroyVad();
+    resetLevel();
+    settleAfterStop();
+  }, [clearTimer, destroyVad, resetLevel, settleAfterStop]);
+
+  // Keep stop() reachable from the maxDuration timer closure (which is created
+  // before stop is defined) without re-creating the VAD.
+  const stopRef = useRef(stop);
+  stopRef.current = stop;
+
+  // Abort the session and throw away everything: buffered results, the seq
+  // counters and any request still in flight.
+  const cancel = useCallback((): void => {
+    clearTimer();
+    activeRef.current = false;
+    canceledRef.current = true;
+    // Hard discard: advancing the epoch makes every in-flight request stale, so
+    // it is ignored the moment it settles (no emit, no counter touch).
+    epochRef.current += 1;
+    // Start from a clean slate: empty buffer, counters back at zero.
+    inFlightRef.current = 0;
+    nextEmitSeqRef.current = 0;
+    nextSeqRef.current = 0;
+    resultsRef.current = new Map();
+    destroyVad();
+    resetLevel();
+    setStatus("idle");
+  }, [clearTimer, destroyVad, resetLevel]);
+
+  // Unmount cleanup: mark the session canceled/inactive, disarm the timer and
+  // destroy the VAD. destroyVad carries its own try/catch, so this teardown
+  // cannot throw.
+  useEffect(() => {
+    const teardown = (): void => {
+      canceledRef.current = true;
+      activeRef.current = false;
+      clearTimer();
+      destroyVad();
+    };
+    return teardown;
+  }, [clearTimer, destroyVad]);
+
+  return { status, start, stop, cancel, audioLevel };
+}
--- a/apps/client/src/features/dictation/utils/encode-wav.ts
+++ b/apps/client/src/features/dictation/utils/encode-wav.ts
@@ -0,0 +1,32 @@
+// Serialise mono Float32 PCM samples into a 16-bit PCM WAV blob (audio/wav):
+// a 44-byte RIFF/WAVE header followed by little-endian int16 samples. Each
+// sample is clamped to [-1, 1] and scaled asymmetrically (negative values by
+// 0x8000, the rest by 0x7fff). The server STT endpoint whitelists audio/wav, so
+// the result is sent as-is.
+export function encodeWavPcm16(samples: Float32Array, sampleRate = 16000): Blob {
+  const HEADER_BYTES = 44;
+  const BYTES_PER_SAMPLE = 2;
+  const frameBytes = BYTES_PER_SAMPLE; // one channel per frame (mono)
+  const payloadBytes = samples.length * BYTES_PER_SAMPLE;
+
+  const bytes = new ArrayBuffer(HEADER_BYTES + payloadBytes);
+  const wav = new DataView(bytes);
+  const putTag = (at: number, tag: string): void => {
+    tag.split("").forEach((ch, k) => wav.setUint8(at + k, ch.charCodeAt(0)));
+  };
+
+  // RIFF container header.
+  putTag(0, "RIFF");
+  wav.setUint32(4, HEADER_BYTES - 8 + payloadBytes, true); // size of everything after this field
+  putTag(8, "WAVE");
+
+  // "fmt " chunk describing uncompressed mono 16-bit PCM.
+  putTag(12, "fmt ");
+  wav.setUint32(16, 16, true); // PCM fmt chunk size
+  wav.setUint16(20, 1, true); // audio format = PCM
+  wav.setUint16(22, 1, true); // channels = mono
+  wav.setUint32(24, sampleRate, true);
+  wav.setUint32(28, sampleRate * frameBytes, true); // byte rate
+  wav.setUint16(32, frameBytes, true); // block align
+  wav.setUint16(34, 16, true); // bits per sample
+
+  // "data" chunk: header, then the samples themselves.
+  putTag(36, "data");
+  wav.setUint32(40, payloadBytes, true);
+  samples.forEach((sample, index) => {
+    const bounded = Math.min(1, Math.max(-1, sample));
+    const scale = bounded < 0 ? 0x8000 : 0x7fff;
+    wav.setInt16(HEADER_BYTES + index * BYTES_PER_SAMPLE, bounded * scale, true);
+  });
+
+  return new Blob([bytes], { type: "audio/wav" });
+}
--- a/apps/client/src/features/editor/components/fixed-toolbar/groups/dictation-group.tsx
+++ b/apps/client/src/features/editor/components/fixed-toolbar/groups/dictation-group.tsx
@@ -9,42 +9,57 @@ interface Props {
 }

 export const DictationGroup: FC<Props> = ({ editor, color, iconSize }) => {
+  // Caret snapshot taken when dictation starts (where the first segment lands).
  const rangeRef = useRef<{ from: number; to: number } | null>(null);
+  // Running insertion point: after each inserted segment we remember the caret
+  // end so the NEXT segment appends right after it, contiguously, regardless of
+  // where the user's caret currently is. Null until the first segment lands.
+  const insertPosRef = useRef<number | null>(null);

  const handleStart = () => {
    const { from, to } = editor.state.selection;
    rangeRef.current = { from, to };
+    // New session: forget any insertion point from a previous dictation so the
+    // first segment uses the fresh snapshot above.
+    insertPosRef.current = null;
  };

  const handleText = (text: string) => {
    // The editor may be gone by the time async transcription returns; bail out
    // instead of operating on a destroyed instance.
    if (!editor || editor.isDestroyed) return;
-    const snapshot = rangeRef.current;
-    rangeRef.current = null;
    // The document may have shrunk during transcription (e.g. a collaborative
-    // edit), so clamp the snapshot into the current bounds before inserting.
+    // edit), so clamp any position into the current bounds before inserting.
    const docSize = editor.state.doc.content.size;
    const clamp = (p: number) => Math.max(0, Math.min(p, docSize));
+    // First segment lands at the snapshotted caret range; subsequent segments
+    // land at a zero-length range at the running insertion point so they stay
+    // contiguous even if the user clicked elsewhere mid-dictation.
+    const snapshot = rangeRef.current;
+    const range =
+      insertPosRef.current !== null
+        ? { from: clamp(insertPosRef.current), to: clamp(insertPosRef.current) }
+        : snapshot
+          ? { from: clamp(snapshot.from), to: clamp(snapshot.to) }
+          : null;
    try {
-      if (snapshot) {
-        // Insert at the snapshotted caret; a trailing space keeps words
-        // separated (the hook already trims the transcribed text).
-        editor
-          .chain()
-          .focus()
-          .insertContentAt(
-            { from: clamp(snapshot.from), to: clamp(snapshot.to) },
-            `${text} `,
-          )
-          .run();
+      if (range) {
+        // Insert at the resolved range; a trailing space keeps words separated
+        // (the hook already trims the transcribed text).
+        editor.chain().focus().insertContentAt(range, `${text} `).run();
      } else {
+        // No snapshot and no running point (shouldn't happen normally) — fall
+        // back to the current caret.
        editor.chain().focus().insertContent(`${text} `).run();
      }
+      // Remember where the inserted text ends so the next segment appends right
+      // after it, independent of later user caret moves.
+      insertPosRef.current = editor.state.selection.to;
    } catch {
-      // The snapshot drifted out of range; fall back to the current caret.
+      // The range drifted out of bounds; fall back to the current caret.
      try {
        editor.chain().focus().insertContent(`${text} `).run();
+        insertPosRef.current = editor.state.selection.to;
      } catch {
        // The editor may have been destroyed; ignore so a dead editor can't
        // surface an uncaught error.
@@ -55,6 +70,7 @@ export const DictationGroup: FC<Props> = ({ editor, color, iconSize }) => {
  return (
    <MicButton
      size="md"
+      streaming
      onStart={handleStart}
      onText={handleText}
      disabled={!editor.isEditable}