feat(dictation): streaming STT via silence cut (Silero VAD)
Add a lightweight "streaming" dictation mode as a simpler alternative to the realtime-websocket path: detect speech with Silero VAD (@ricky0123/vad-web), cut each segment on a pause and POST it to the existing /ai-chat/transcribe endpoint, so text appears progressively. No server changes.

- new useStreamingDictation hook (same API as useDictation), lazy-loads VAD, in-order seq emission, session-epoch guard against stop->start races
- new encodeWavPcm16 util (Float32 -> mono PCM16 WAV, accepted by the server)
- MicButton gains a `streaming` prop; enabled in the editor toolbar and chat
- VAD tuning: redemptionMs 640 / preSpeechPadMs 320 / minSpeechMs 96
- batch dictation kept as the fallback (streaming=false)
- deps: @ricky0123/vad-web@0.0.30, onnxruntime-web@1.27.0

Note: VAD assets load from the library CDN by default; for self-hosted/offline set VAD_BASE_ASSET_PATH/VAD_ONNX_WASM_BASE_PATH and copy assets to public/vad/.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -9,42 +9,57 @@ interface Props {
|
||||
}
|
||||
|
||||
export const DictationGroup: FC<Props> = ({ editor, color, iconSize }) => {
|
||||
// Caret snapshot taken when dictation starts (where the first segment lands).
|
||||
const rangeRef = useRef<{ from: number; to: number } | null>(null);
|
||||
// Running insertion point: after each inserted segment we remember the caret
|
||||
// end so the NEXT segment appends right after it, contiguously, regardless of
|
||||
// where the user's caret currently is. Null until the first segment lands.
|
||||
const insertPosRef = useRef<number | null>(null);
|
||||
|
||||
const handleStart = () => {
|
||||
const { from, to } = editor.state.selection;
|
||||
rangeRef.current = { from, to };
|
||||
// New session: forget any insertion point from a previous dictation so the
|
||||
// first segment uses the fresh snapshot above.
|
||||
insertPosRef.current = null;
|
||||
};
|
||||
|
||||
const handleText = (text: string) => {
|
||||
// The editor may be gone by the time async transcription returns; bail out
|
||||
// instead of operating on a destroyed instance.
|
||||
if (!editor || editor.isDestroyed) return;
|
||||
const snapshot = rangeRef.current;
|
||||
rangeRef.current = null;
|
||||
// The document may have shrunk during transcription (e.g. a collaborative
|
||||
// edit), so clamp the snapshot into the current bounds before inserting.
|
||||
// edit), so clamp any position into the current bounds before inserting.
|
||||
const docSize = editor.state.doc.content.size;
|
||||
const clamp = (p: number) => Math.max(0, Math.min(p, docSize));
|
||||
// First segment lands at the snapshotted caret range; subsequent segments
|
||||
// land at a zero-length range at the running insertion point so they stay
|
||||
// contiguous even if the user clicked elsewhere mid-dictation.
|
||||
const snapshot = rangeRef.current;
|
||||
const range =
|
||||
insertPosRef.current !== null
|
||||
? { from: clamp(insertPosRef.current), to: clamp(insertPosRef.current) }
|
||||
: snapshot
|
||||
? { from: clamp(snapshot.from), to: clamp(snapshot.to) }
|
||||
: null;
|
||||
try {
|
||||
if (snapshot) {
|
||||
// Insert at the snapshotted caret; a trailing space keeps words
|
||||
// separated (the hook already trims the transcribed text).
|
||||
editor
|
||||
.chain()
|
||||
.focus()
|
||||
.insertContentAt(
|
||||
{ from: clamp(snapshot.from), to: clamp(snapshot.to) },
|
||||
`${text} `,
|
||||
)
|
||||
.run();
|
||||
if (range) {
|
||||
// Insert at the resolved range; a trailing space keeps words separated
|
||||
// (the hook already trims the transcribed text).
|
||||
editor.chain().focus().insertContentAt(range, `${text} `).run();
|
||||
} else {
|
||||
// No snapshot and no running point (shouldn't happen normally) — fall
|
||||
// back to the current caret.
|
||||
editor.chain().focus().insertContent(`${text} `).run();
|
||||
}
|
||||
// Remember where the inserted text ends so the next segment appends right
|
||||
// after it, independent of later user caret moves.
|
||||
insertPosRef.current = editor.state.selection.to;
|
||||
} catch {
|
||||
// The snapshot drifted out of range; fall back to the current caret.
|
||||
// The range drifted out of bounds; fall back to the current caret.
|
||||
try {
|
||||
editor.chain().focus().insertContent(`${text} `).run();
|
||||
insertPosRef.current = editor.state.selection.to;
|
||||
} catch {
|
||||
// The editor may have been destroyed; ignore so a dead editor can't
|
||||
// surface an uncaught error.
|
||||
@@ -55,6 +70,7 @@ export const DictationGroup: FC<Props> = ({ editor, color, iconSize }) => {
|
||||
return (
|
||||
<MicButton
|
||||
size="md"
|
||||
streaming
|
||||
onStart={handleStart}
|
||||
onText={handleText}
|
||||
disabled={!editor.isEditable}
|
||||
|
||||
Reference in New Issue
Block a user