Add a lightweight "streaming" dictation mode as a simpler alternative to the realtime-websocket path: detect speech with Silero VAD (@ricky0123/vad-web), cut each segment on a pause, and POST it to the existing /ai-chat/transcribe endpoint, so text appears progressively. No server changes.

- new useStreamingDictation hook (same API as useDictation), lazy-loads VAD, in-order seq emission, session-epoch guard against stop->start races
- new encodeWavPcm16 util (Float32 -> mono PCM16 WAV, accepted by the server)
- MicButton gains a `streaming` prop; enabled in the editor toolbar and chat
- VAD tuning: redemptionMs 640 / preSpeechPadMs 320 / minSpeechMs 96
- batch dictation kept as the fallback (streaming=false)
- deps: @ricky0123/vad-web@0.0.30, onnxruntime-web@1.27.0

Note: VAD assets load from the library CDN by default; for self-hosted/offline use, set VAD_BASE_ASSET_PATH/VAD_ONNX_WASM_BASE_PATH and copy the assets to public/vad/.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
109 lines
3.6 KiB
TypeScript
109 lines
3.6 KiB
TypeScript
import { FC } from "react";
|
|
import { ActionIcon, Loader, Tooltip } from "@mantine/core";
|
|
import { IconMicrophone, IconPlayerStopFilled } from "@tabler/icons-react";
|
|
import { useTranslation } from "react-i18next";
|
|
import { useDictation } from "@/features/dictation/hooks/use-dictation";
|
|
import { useStreamingDictation } from "@/features/dictation/hooks/use-streaming-dictation";
|
|
import classes from "./mic-button.module.css";
|
|
|
|
/** Props accepted by the {@link MicButton} dictation toggle. */
interface MicButtonProps {
  /**
   * Handed to the dictation hooks; presumably invoked with each piece of
   * transcribed text (confirm against the hooks).
   */
  onText: (text: string) => void;

  /**
   * Handed to the dictation hooks; presumably fired when capture begins
   * (confirm against the hooks).
   */
  onStart?: () => void;

  /** Disables the idle "Start dictation" button. */
  disabled?: boolean;

  /**
   * Mantine ActionIcon size token; "lg" matches the chat composer, "md" the
   * editor toolbar.
   */
  size?: "md" | "lg";

  /**
   * Optional Mantine color override for the idle/transcribing states (the
   * recording state stays red). Defaults to the theme primary when omitted.
   */
  color?: string;

  /** Optional explicit glyph size override; defaults to the size-token value. */
  iconSize?: number;

  /**
   * When true, use the streaming (Silero-VAD) dictation controller, which
   * emits text progressively as the user pauses; otherwise use the batch
   * controller.
   */
  streaming?: boolean;
}
|
|
|
|
/**
 * Self-contained dictation toggle. Owns its own capture state machine: a click
 * starts recording (mic icon), a second click stops it (stop icon), and while
 * the audio is being transcribed it shows a spinner and is disabled to prevent
 * overlapping requests.
 *
 * The `streaming` prop selects which dictation controller drives the button:
 * the streaming one when true, the batch one otherwise. The `disabled` prop
 * only affects the idle state, so a recording in progress can always be
 * stopped.
 */
export const MicButton: FC<MicButtonProps> = ({
  onText,
  onStart,
  disabled,
  size = "lg",
  color,
  iconSize,
  streaming = false,
}) => {
  const { t } = useTranslation();

  // Call BOTH hooks unconditionally to respect the rules of hooks: which one is
  // active is a render-time choice, but both must be invoked every render. This
  // is safe because both controllers are inert until start() is called — neither
  // opens the mic on mount — so the unused one costs nothing.
  const batchCtl = useDictation({ onText, onStart });
  const streamingCtl = useStreamingDictation({ onText, onStart });
  const { status, start, stop, audioLevel } = streaming
    ? streamingCtl
    : batchCtl;

  // An explicit iconSize wins; otherwise derive the glyph size from the token.
  const resolvedIconSize = iconSize ?? (size === "lg" ? 18 : 16);

  if (status === "recording") {
    // Live volume-driven halo: the scale follows the current mic level. The
    // level is clamped to [0, 1] and NaN is treated as silence, so the halo
    // never shrinks below the button or produces an invalid CSS transform.
    const clampedLevel = Math.min(1, Math.max(0, audioLevel));
    const haloScale = 1 + (Number.isNaN(clampedLevel) ? 0 : clampedLevel) * 0.9;

    return (
      <Tooltip label={t("Stop recording")} withArrow>
        <span className={classes.recordingWrap}>
          <span
            className={classes.pulse}
            style={{ transform: `scale(${haloScale})` }}
            aria-hidden="true"
          />
          <ActionIcon
            size={size}
            color="red"
            variant="light"
            // Wrapped (like start below) so the click event is never forwarded
            // to the controller as an argument.
            onClick={() => void stop()}
            aria-label={t("Stop recording")}
            style={{ position: "relative", zIndex: 1 }}
          >
            <IconPlayerStopFilled size={resolvedIconSize} />
          </ActionIcon>
        </span>
      </Tooltip>
    );
  }

  // NOTE(review): "error" renders the same disabled spinner as "transcribing".
  // Presumably the controllers leave the "error" status on their own — confirm
  // against the hooks, otherwise the button would stay stuck in this state.
  if (status === "transcribing" || status === "error") {
    return (
      <Tooltip label={t("Transcribing…")} withArrow>
        <ActionIcon
          size={size}
          variant="subtle"
          color={color}
          disabled
          aria-label={t("Transcribing…")}
        >
          <Loader size="xs" />
        </ActionIcon>
      </Tooltip>
    );
  }

  // Idle (any remaining status): offer to start a new dictation.
  return (
    <Tooltip label={t("Start dictation")} withArrow>
      <ActionIcon
        size={size}
        variant="subtle"
        color={color}
        onClick={() => void start()}
        disabled={disabled}
        aria-label={t("Start dictation")}
      >
        <IconMicrophone size={resolvedIconSize} />
      </ActionIcon>
    </Tooltip>
  );
};
|