diff --git a/apps/client/package.json b/apps/client/package.json index 0433c97f..3ccdea68 100644 --- a/apps/client/package.json +++ b/apps/client/package.json @@ -28,6 +28,7 @@ "@mantine/modals": "8.3.18", "@mantine/notifications": "8.3.18", "@mantine/spotlight": "8.3.18", + "@ricky0123/vad-web": "^0.0.30", "@slidoapp/emoji-mart": "5.8.7", "@slidoapp/emoji-mart-data": "1.2.4", "@slidoapp/emoji-mart-react": "1.1.5", @@ -53,6 +54,7 @@ "mantine-form-zod-resolver": "1.3.0", "mermaid": "11.15.0", "mitt": "3.0.1", + "onnxruntime-web": "^1.27.0", "posthog-js": "1.372.2", "react": "18.3.1", "react-clear-modal": "^2.0.18", diff --git a/apps/client/src/features/ai-chat/components/chat-input.tsx b/apps/client/src/features/ai-chat/components/chat-input.tsx index 3bb67535..9713fea9 100644 --- a/apps/client/src/features/ai-chat/components/chat-input.tsx +++ b/apps/client/src/features/ai-chat/components/chat-input.tsx @@ -64,6 +64,7 @@ export default function ChatInput({ {isDictationEnabled && ( setValue((v) => (v ? `${v} ${text}` : text))} /> diff --git a/apps/client/src/features/dictation/components/mic-button.tsx b/apps/client/src/features/dictation/components/mic-button.tsx index 9546167f..8c0974ae 100644 --- a/apps/client/src/features/dictation/components/mic-button.tsx +++ b/apps/client/src/features/dictation/components/mic-button.tsx @@ -3,6 +3,7 @@ import { ActionIcon, Loader, Tooltip } from "@mantine/core"; import { IconMicrophone, IconPlayerStopFilled } from "@tabler/icons-react"; import { useTranslation } from "react-i18next"; import { useDictation } from "@/features/dictation/hooks/use-dictation"; +import { useStreamingDictation } from "@/features/dictation/hooks/use-streaming-dictation"; import classes from "./mic-button.module.css"; interface MicButtonProps { @@ -17,6 +18,9 @@ interface MicButtonProps { color?: string; // Optional explicit glyph size override; defaults to the size-token value. 
iconSize?: number; + // When true, use the streaming (Silero-VAD) dictation controller, which emits + // text progressively as the user pauses; otherwise use the batch controller. + streaming?: boolean; } /** @@ -32,9 +36,17 @@ export const MicButton: FC = ({ size = "lg", color, iconSize, + streaming = false, }) => { const { t } = useTranslation(); - const { status, start, stop, audioLevel } = useDictation({ onText, onStart }); + // Call BOTH hooks unconditionally to respect the rules of hooks: which one is + // active is a render-time choice, but both must be invoked every render. This + // is safe because both controllers are inert until start() is called — neither + // opens the mic on mount — so the unused one costs nothing. + const batchCtl = useDictation({ onText, onStart }); + const streamingCtl = useStreamingDictation({ onText, onStart }); + const ctl = streaming ? streamingCtl : batchCtl; + const { status, start, stop, audioLevel } = ctl; const resolvedIconSize = iconSize ?? (size === "lg" ? 18 : 16); if (status === "recording") { diff --git a/apps/client/src/features/dictation/hooks/use-streaming-dictation.ts b/apps/client/src/features/dictation/hooks/use-streaming-dictation.ts new file mode 100644 index 00000000..658e2e55 --- /dev/null +++ b/apps/client/src/features/dictation/hooks/use-streaming-dictation.ts @@ -0,0 +1,429 @@ +import { useCallback, useEffect, useRef, useState } from "react"; +import { notifications } from "@mantine/notifications"; +import { useTranslation } from "react-i18next"; +import { transcribeAudio } from "@/features/dictation/services/dictation-service"; +import { encodeWavPcm16 } from "@/features/dictation/utils/encode-wav"; +import type { DictationStatus } from "@/features/dictation/hooks/use-dictation"; + +// Lazily-imported MicVAD type. The runtime import happens inside start() so the +// heavy onnxruntime-web / Silero model is code-split out of the main bundle and +// only fetched when the user actually begins dictation. 
/**
 * Minimal structural view of the lazily-imported MicVAD instance: only the
 * lifecycle methods this hook calls.
 */
type MicVADInstance = {
  start: () => Promise<void>;
  pause: () => Promise<void>;
  destroy: () => Promise<void>;
};

interface UseStreamingDictationOptions {
  /** Receives each transcribed (trimmed, non-empty) segment, in speech order. */
  onText: (text: string) => void;
  /** Invoked synchronously when start() begins, before any async work. */
  onStart?: () => void;
  /** Overall safety cap for one recording session (defaults to 120000 ms). */
  maxDurationMs?: number;
}

interface UseStreamingDictationResult {
  status: DictationStatus;
  start: () => Promise<void>;
  stop: () => void;
  cancel: () => void;
  // Smoothed live speech level in the 0..1 range while recording (0 when idle).
  audioLevel: number;
}

// Sample rate of the audio MicVAD hands to onSpeechEnd (Silero VAD runs at 16k).
const VAD_SAMPLE_RATE = 16000;

// Asset paths for the VAD worklet and the onnxruntime WASM binaries. For this
// prototype they are left undefined so the library loads its bundled assets from
// its default CDN — this avoids fragile rolldown asset-copy config. For a
// self-hosted / offline / privacy build, copy the vad-web `dist` worklet + the
// `*.onnx` model and the onnxruntime-web `*.wasm` files into
// `apps/client/public/vad/` and set these to that local path (e.g. "/vad/").
const VAD_BASE_ASSET_PATH: string | undefined = undefined;
const VAD_ONNX_WASM_BASE_PATH: string | undefined = undefined;

// Best-effort human-readable detail for an unknown thrown value.
function errorDetail(err: unknown): string {
  const message = (err as { message?: unknown } | null | undefined)?.message;
  return typeof message === "string" ? message : String(err);
}

// Tear down one VAD instance. destroy() returns a promise, so both synchronous
// throws and asynchronous rejections are swallowed (and logged): cleanup must
// never throw nor surface as an unhandled rejection.
function disposeVad(vad: MicVADInstance): void {
  try {
    void vad
      .destroy()
      .catch((err: unknown) =>
        console.warn("[dictation] VAD teardown failed", err),
      );
  } catch (err) {
    console.warn("[dictation] VAD teardown failed", err);
  }
}

/**
 * Streaming variant of useDictation. Detects speech with a Silero VAD and, each
 * time the speaker pauses, cuts that speech segment and POSTs it to the batch
 * transcription endpoint, so text appears progressively as the user speaks.
 *
 * Returns the same shape as useDictation ({ status, start, stop, cancel,
 * audioLevel }) so MicButton can use either interchangeably. Refs hold the live
 * VAD instance / counters / timer so component re-renders never lose them.
 *
 * Every exit path destroys the VAD — including a stop(), cancel() or unmount
 * that happens while start() is still awaiting the model load or the
 * microphone, in which case the half-started session is abandoned instead of
 * being left recording with nothing to stop it.
 *
 * @param options Callbacks and optional session cap; the latest values are
 *   always used, even by closures created earlier.
 * @returns The dictation controller (status, start/stop/cancel, audioLevel).
 */
export function useStreamingDictation(
  options: UseStreamingDictationOptions,
): UseStreamingDictationResult {
  const { t } = useTranslation();
  const [status, setStatus] = useState<DictationStatus>("idle");
  const [audioLevel, setAudioLevel] = useState(0);

  // Keep the latest callbacks in a ref so async VAD/HTTP closures always call the
  // current handlers without re-creating the VAD.
  const optionsRef = useRef(options);
  optionsRef.current = options;

  const vadRef = useRef<MicVADInstance | null>(null);
  const timerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
  // Set by cancel()/unmount, and by stop() while start() is still pending, so
  // the pending start() knows it must abandon the session.
  const canceledRef = useRef(false);
  const startingRef = useRef(false);
  // True while a recording session is active (VAD listening). Used to ignore late
  // VAD callbacks that fire after stop()/cancel().
  const activeRef = useRef(false);

  // In-order emission: each segment gets a monotonically increasing seq when its
  // speech ends; completed transcriptions are buffered by seq and flushed in
  // order so out-of-order HTTP responses can't scramble the text.
  const nextSeqRef = useRef(0);
  const nextEmitSeqRef = useRef(0);
  const resultsRef = useRef<Map<number, string>>(new Map());
  // Number of transcription requests still in flight.
  const inFlightRef = useRef(0);
  // Session epoch: bumped when a NEW session starts (start) or everything is
  // hard-discarded (cancel). Each in-flight request captures the epoch at send
  // time; if the epoch has since changed, the request is stale and its
  // then/catch/finally are skipped so old text can't leak into a new session and
  // the in-flight counter can't be driven negative across sessions.
  const epochRef = useRef(0);

  // Exponentially smoothed speech level, and the last value pushed to React state.
  const smoothedLevelRef = useRef(0);
  const emittedLevelRef = useRef(0);

  const clearTimer = useCallback((): void => {
    if (timerRef.current !== null) {
      clearTimeout(timerRef.current);
      timerRef.current = null;
    }
  }, []);

  // Reset the level meter back to zero (refs + React state).
  const resetLevel = useCallback((): void => {
    smoothedLevelRef.current = 0;
    emittedLevelRef.current = 0;
    setAudioLevel(0);
  }, []);

  // Destroy the live VAD instance, if any. Safe to call multiple times and on
  // any exit path.
  const destroyVad = useCallback((): void => {
    const vad = vadRef.current;
    vadRef.current = null;
    if (vad) disposeVad(vad);
  }, []);

  // Decide the status once recording has ended: stay "transcribing" while
  // requests are in flight, otherwise return to "idle".
  const settleAfterStop = useCallback((): void => {
    setStatus(inFlightRef.current > 0 ? "transcribing" : "idle");
  }, []);

  // Drain the in-order result buffer: while the next expected seq is ready, trim
  // it, emit it if non-empty, and advance. Called after every settled request.
  const drainResults = useCallback((): void => {
    const results = resultsRef.current;
    for (;;) {
      const seq = nextEmitSeqRef.current;
      const text = results.get(seq);
      // Only strings are ever stored, so undefined means "not arrived yet".
      if (text === undefined) break;
      results.delete(seq);
      nextEmitSeqRef.current = seq + 1;
      // Whisper often returns a leading space; emit the trimmed value.
      const trimmed = text.trim();
      if (trimmed.length > 0) optionsRef.current.onText(trimmed);
    }
  }, []);

  // Map a transcription error to a user-facing message, mirroring the batch hook.
  const transcriptionErrorMessage = useCallback(
    (err: unknown): string => {
      const resp = (
        err as { response?: { status?: number; data?: { message?: string } } }
      )?.response;
      const serverMsg = resp?.data?.message;
      if (serverMsg && serverMsg.trim().length > 0) {
        // The server already explains the cause (e.g. provider 404, bad format,
        // STT not configured) — show it verbatim.
        return serverMsg;
      }
      if (resp?.status === 503 || resp?.status === 403) {
        return t("Voice dictation is not configured");
      }
      return `${t("Transcription failed")}: ${errorDetail(err)}`;
    },
    [t],
  );

  // Handle one ended speech segment: encode to WAV and transcribe. Results are
  // buffered by seq and flushed in order. A single failed segment does NOT kill
  // the session: log + one notification, then advance past that seq so later
  // segments still flush.
  const handleSegment = useCallback(
    (audio: Float32Array): void => {
      const seq = nextSeqRef.current;
      nextSeqRef.current += 1;
      inFlightRef.current += 1;
      // Capture the epoch for this request synchronously at send time.
      const epoch = epochRef.current;

      // Route a synchronous throw (WAV allocation, request setup) through the
      // same promise chain, so the seq is still skipped and the in-flight
      // counter is still decremented instead of sticking at "transcribing".
      let request: ReturnType<typeof transcribeAudio>;
      try {
        request = transcribeAudio(
          encodeWavPcm16(audio, VAD_SAMPLE_RATE),
          "speech.wav",
        );
      } catch (err) {
        request = Promise.reject(err);
      }

      void request
        .then((text) => {
          // Stale request from a previous session: drop it without touching any
          // current-session state.
          if (epoch !== epochRef.current) return;
          // Defend against a non-string server value before drainResults trims.
          resultsRef.current.set(seq, typeof text === "string" ? text : "");
          drainResults();
        })
        .catch((err: unknown) => {
          if (epoch !== epochRef.current) return;
          // Log the full error for diagnosis (status + body + stack).
          console.error("[dictation] segment transcription failed", err);
          notifications.show({
            color: "red",
            message: transcriptionErrorMessage(err),
          });
          // Record an empty result for this seq: drainResults emits nothing for
          // it but advances past it, so later segments still flush in order.
          resultsRef.current.set(seq, "");
          drainResults();
        })
        .finally(() => {
          if (epoch !== epochRef.current) return;
          inFlightRef.current -= 1;
          // If recording already stopped, flip to idle once everything drained.
          if (!activeRef.current && inFlightRef.current === 0) {
            setStatus("idle");
          }
        });
    },
    [drainResults, transcriptionErrorMessage],
  );

  const start = useCallback(async (): Promise<void> => {
    // Synchronous live guard: status is stale between renders, so also block on
    // refs to prevent a double-click from creating two VAD instances (the first
    // would leak its mic stream).
    if (startingRef.current || vadRef.current || activeRef.current) return;
    if (status !== "idle") return;
    startingRef.current = true;

    try {
      // Notify the caller right when dictation begins (before any async work) so
      // the editor can snapshot the caret position.
      optionsRef.current.onStart?.();

      // Reset per-session in-order emission state. Bump the epoch so any request
      // still in flight from a previous (stopped) session becomes stale and its
      // then/catch/finally are skipped.
      epochRef.current += 1;
      const epoch = epochRef.current;
      canceledRef.current = false;
      nextSeqRef.current = 0;
      nextEmitSeqRef.current = 0;
      resultsRef.current = new Map();
      inFlightRef.current = 0;
      resetLevel();

      // True once stop()/cancel()/unmount invalidated this pending start.
      const isAborted = (): boolean =>
        canceledRef.current || epochRef.current !== epoch;

      let vad: MicVADInstance;
      try {
        // Lazy import so the heavy onnx model/worklet are only fetched on first
        // use and code-split out of the main bundle.
        const { MicVAD } = await import("@ricky0123/vad-web");

        vad = await MicVAD.new({
          // Silero v5 model (smaller/faster than the legacy model).
          model: "v5",
          // vad-web 0.0.30 defaults startOnLoad:true, which opens the mic (calls
          // getUserMedia) inside new() and leaves the later vad.start() a no-op —
          // making its mic-permission error handling dead code. Force it off so
          // the mic is opened only by the explicit vad.start() below, where the
          // real getUserMedia errors are caught and mapped.
          startOnLoad: false,
          // Only pass asset paths when defined; otherwise the library uses its
          // bundled CDN defaults.
          ...(VAD_BASE_ASSET_PATH !== undefined
            ? { baseAssetPath: VAD_BASE_ASSET_PATH }
            : {}),
          ...(VAD_ONNX_WASM_BASE_PATH !== undefined
            ? { onnxWASMBasePath: VAD_ONNX_WASM_BASE_PATH }
            : {}),
          // --- VAD tuning (all tunable) ---
          // Probability over which a frame counts as speech.
          positiveSpeechThreshold: 0.5,
          // Probability under which a frame counts as non-speech (~0.15 below
          // the positive threshold, per Silero guidance).
          negativeSpeechThreshold: 0.35,
          // Silence to wait through before ending a segment (the "don't cut
          // immediately" delay) — ~0.6s. NOTE: vad-web 0.0.30 takes this in ms,
          // not frames (one Silero frame is ~32ms at 16k).
          redemptionMs: 640,
          // Audio kept before speech start (left padding so the first word isn't
          // clipped) — ~0.3s.
          preSpeechPadMs: 320,
          // Ignore sub-100ms blips like clicks.
          minSpeechMs: 96,
          onFrameProcessed: (probabilities: { isSpeech: number }) => {
            // Drive the level meter from the speech probability. Light
            // exponential smoothing + a change threshold so React state isn't
            // updated every frame. Reuses the VAD's own frame probabilities —
            // no second AudioContext/AnalyserNode.
            if (!activeRef.current) return;
            const level = Math.min(1, Math.max(0, probabilities.isSpeech));
            smoothedLevelRef.current =
              smoothedLevelRef.current * 0.8 + level * 0.2;
            if (
              Math.abs(smoothedLevelRef.current - emittedLevelRef.current) > 0.01
            ) {
              emittedLevelRef.current = smoothedLevelRef.current;
              setAudioLevel(smoothedLevelRef.current);
            }
          },
          onSpeechStart: () => {
            // No-op: the segment is only handled once it ends.
          },
          onSpeechEnd: (audio: Float32Array) => {
            // A pause was detected — cut this segment and transcribe it. Ignore
            // late callbacks that fire after stop()/cancel().
            if (!activeRef.current || canceledRef.current) return;
            handleSegment(audio);
          },
        });
      } catch (err) {
        // With startOnLoad:false, new() loads the model/worklet/wasm but does NOT
        // open the mic, so a throw here is an asset/init failure, not a
        // mic-permission error.
        console.error("[dictation] VAD init failed", err);
        // The session was already abandoned: nobody is waiting for it, so do
        // not raise a user-facing error for it.
        if (isAborted()) return;
        notifications.show({
          color: "red",
          message: `${t("Could not start recording")}: ${errorDetail(err)}`,
        });
        // Defensive: make sure no instance is left behind.
        destroyVad();
        setStatus("idle");
        return;
      }

      // stop()/cancel()/unmount ran while the model was loading: the instance
      // was never published in vadRef, so nothing else can destroy it. Tear it
      // down here instead of opening the microphone.
      if (isAborted()) {
        disposeVad(vad);
        return;
      }

      vadRef.current = vad;
      // The VAD callbacks guard on activeRef, so setting it before start()
      // resolves is safe.
      activeRef.current = true;

      try {
        // With startOnLoad:false this is where getUserMedia actually runs, so
        // map mic-permission errors here the same way the batch hook does.
        await vad.start();
      } catch (err) {
        // Always log the full error for diagnosis (name, message, stack).
        console.error("[dictation] VAD.start failed", err);
        if (isAborted() || vadRef.current !== vad) {
          // Torn down while the mic was opening; the rejection is a consequence
          // of that teardown, not something to report to the user.
          disposeVad(vad);
          return;
        }
        const name = (err as { name?: string })?.name;
        let message: string;
        if (name === "NotAllowedError" || name === "SecurityError") {
          message = t("Microphone access denied");
        } else if (name === "NotFoundError" || name === "OverconstrainedError") {
          message = t("No microphone found");
        } else if (name === "NotReadableError" || name === "AbortError") {
          message = t("Microphone is unavailable or already in use");
        } else {
          message = `${t("Could not start recording")}: ${errorDetail(err)}`;
        }
        notifications.show({ color: "red", message });
        activeRef.current = false;
        destroyVad();
        setStatus("idle");
        return;
      }

      // stop()/cancel()/unmount ran while the mic was opening. They already
      // settled the status; destroy again (best effort) in case start() re-opened
      // the stream on the torn-down instance, and do NOT flip to "recording".
      if (isAborted() || vadRef.current !== vad) {
        disposeVad(vad);
        return;
      }

      setStatus("recording");

      // Overall safety cap: auto-stop after maxDurationMs like the batch hook.
      const maxDurationMs = optionsRef.current.maxDurationMs ?? 120000;
      clearTimer();
      timerRef.current = setTimeout(() => {
        if (activeRef.current) stopRef.current();
      }, maxDurationMs);
    } finally {
      // Always release the synchronous start guard — even if onStart threw —
      // so a failed start can never permanently disable dictation.
      startingRef.current = false;
    }
  }, [status, t, clearTimer, resetLevel, destroyVad, handleSegment]);

  const stop = useCallback((): void => {
    clearTimer();
    // A start() is still loading the model / opening the mic: flag it so it
    // abandons the session when its await resumes.
    if (startingRef.current) canceledRef.current = true;
    if (!activeRef.current && !vadRef.current) {
      // Nothing is recording. Keep "transcribing" while requests from the
      // just-stopped session are still in flight, so their text is not
      // discarded by a premature new session; otherwise go idle.
      settleAfterStop();
      return;
    }
    // Mark inactive first so late onSpeechEnd/onFrameProcessed callbacks are
    // ignored. Any speech segment that has NOT yet ended (user clicks Stop
    // mid-utterance) is dropped — acceptable for v1; users normally pause before
    // stopping.
    activeRef.current = false;
    destroyVad();
    resetLevel();
    settleAfterStop();
  }, [clearTimer, destroyVad, resetLevel, settleAfterStop]);

  // Keep stop() reachable from the maxDuration timer closure (which is created
  // before stop is defined) without re-creating the VAD.
  const stopRef = useRef(stop);
  stopRef.current = stop;

  const cancel = useCallback((): void => {
    clearTimer();
    canceledRef.current = true;
    activeRef.current = false;
    // Hard discard: bump the epoch so any in-flight request becomes stale and is
    // ignored the moment it resolves (no emit, no counter touch).
    epochRef.current += 1;
    resultsRef.current = new Map();
    nextSeqRef.current = 0;
    nextEmitSeqRef.current = 0;
    inFlightRef.current = 0;
    destroyVad();
    resetLevel();
    setStatus("idle");
  }, [clearTimer, destroyVad, resetLevel]);

  // Clean up on unmount: destroy the VAD, clear the timer, and flag any pending
  // start() (via canceledRef) so it tears its instance down instead of recording.
  useEffect(() => {
    return () => {
      clearTimer();
      activeRef.current = false;
      canceledRef.current = true;
      destroyVad();
    };
  }, [clearTimer, destroyVad]);

  return { status, start, stop, cancel, audioLevel };
}
/**
 * Encode mono Float32 PCM samples into a 16-bit little-endian PCM WAV blob.
 *
 * Layout: a canonical 44-byte RIFF/WAVE header ("fmt " chunk of 16 bytes, PCM
 * format tag 1, one channel) followed by the sample data.
 *
 * Each sample is clamped to [-1, 1], scaled asymmetrically (negative values by
 * 0x8000, positive by 0x7fff so both extremes are reachable) and rounded to the
 * NEAREST integer — not truncated toward zero — which halves the worst-case
 * quantization error. NaN samples are written as silence (0).
 *
 * @param samples Mono PCM samples, nominally in the -1..1 range.
 * @param sampleRate Sample rate in Hz written to the header (default 16000).
 * @returns A Blob of type "audio/wav" of size `44 + 2 * samples.length` bytes.
 * @throws RangeError if `sampleRate` is not a positive integer that fits the
 *   header's 32-bit sample-rate and byte-rate fields.
 */
export function encodeWavPcm16(samples: Float32Array, sampleRate = 16000): Blob {
  const HEADER_SIZE = 44;
  const BYTES_PER_SAMPLE = 2;
  const CHANNELS = 1;
  const blockAlign = BYTES_PER_SAMPLE * CHANNELS;

  // The header stores the rate and the byte rate as unsigned 32-bit integers;
  // anything else would be silently wrapped/truncated into a corrupt header.
  if (
    !Number.isInteger(sampleRate) ||
    sampleRate <= 0 ||
    sampleRate * blockAlign > 0xffffffff
  ) {
    throw new RangeError(`Invalid WAV sample rate: ${sampleRate}`);
  }

  const dataSize = samples.length * BYTES_PER_SAMPLE;
  const buffer = new ArrayBuffer(HEADER_SIZE + dataSize);
  const view = new DataView(buffer);

  const writeAscii = (offset: number, tag: string): void => {
    for (let i = 0; i < tag.length; i++) {
      view.setUint8(offset + i, tag.charCodeAt(i));
    }
  };

  writeAscii(0, "RIFF");
  view.setUint32(4, 36 + dataSize, true); // RIFF chunk size = file size - 8
  writeAscii(8, "WAVE");
  writeAscii(12, "fmt ");
  view.setUint32(16, 16, true); // PCM fmt chunk size
  view.setUint16(20, 1, true); // audio format = PCM
  view.setUint16(22, CHANNELS, true); // channels = mono
  view.setUint32(24, sampleRate, true);
  view.setUint32(28, sampleRate * blockAlign, true); // byte rate
  view.setUint16(32, blockAlign, true);
  view.setUint16(34, BYTES_PER_SAMPLE * 8, true); // bits per sample
  writeAscii(36, "data");
  view.setUint32(40, dataSize, true);

  samples.forEach((raw, index) => {
    // NaN survives Math.min/Math.max, so map it to silence explicitly.
    const sample = Number.isNaN(raw) ? 0 : Math.max(-1, Math.min(1, raw));
    const scaled = sample < 0 ? sample * 0x8000 : sample * 0x7fff;
    view.setInt16(
      HEADER_SIZE + index * BYTES_PER_SAMPLE,
      Math.round(scaled),
      true,
    );
  });

  return new Blob([buffer], { type: "audio/wav" });
}
const rangeRef = useRef<{ from: number; to: number } | null>(null); + // Running insertion point: after each inserted segment we remember the caret + // end so the NEXT segment appends right after it, contiguously, regardless of + // where the user's caret currently is. Null until the first segment lands. + const insertPosRef = useRef(null); const handleStart = () => { const { from, to } = editor.state.selection; rangeRef.current = { from, to }; + // New session: forget any insertion point from a previous dictation so the + // first segment uses the fresh snapshot above. + insertPosRef.current = null; }; const handleText = (text: string) => { // The editor may be gone by the time async transcription returns; bail out // instead of operating on a destroyed instance. if (!editor || editor.isDestroyed) return; - const snapshot = rangeRef.current; - rangeRef.current = null; // The document may have shrunk during transcription (e.g. a collaborative - // edit), so clamp the snapshot into the current bounds before inserting. + // edit), so clamp any position into the current bounds before inserting. const docSize = editor.state.doc.content.size; const clamp = (p: number) => Math.max(0, Math.min(p, docSize)); + // First segment lands at the snapshotted caret range; subsequent segments + // land at a zero-length range at the running insertion point so they stay + // contiguous even if the user clicked elsewhere mid-dictation. + const snapshot = rangeRef.current; + const range = + insertPosRef.current !== null + ? { from: clamp(insertPosRef.current), to: clamp(insertPosRef.current) } + : snapshot + ? { from: clamp(snapshot.from), to: clamp(snapshot.to) } + : null; try { - if (snapshot) { - // Insert at the snapshotted caret; a trailing space keeps words - // separated (the hook already trims the transcribed text). 
- editor - .chain() - .focus() - .insertContentAt( - { from: clamp(snapshot.from), to: clamp(snapshot.to) }, - `${text} `, - ) - .run(); + if (range) { + // Insert at the resolved range; a trailing space keeps words separated + // (the hook already trims the transcribed text). + editor.chain().focus().insertContentAt(range, `${text} `).run(); } else { + // No snapshot and no running point (shouldn't happen normally) — fall + // back to the current caret. editor.chain().focus().insertContent(`${text} `).run(); } + // Remember where the inserted text ends so the next segment appends right + // after it, independent of later user caret moves. + insertPosRef.current = editor.state.selection.to; } catch { - // The snapshot drifted out of range; fall back to the current caret. + // The range drifted out of bounds; fall back to the current caret. try { editor.chain().focus().insertContent(`${text} `).run(); + insertPosRef.current = editor.state.selection.to; } catch { // The editor may have been destroyed; ignore so a dead editor can't // surface an uncaught error. 
@@ -55,6 +70,7 @@ export const DictationGroup: FC = ({ editor, color, iconSize }) => { return ( =22.12.0} @@ -5253,6 +5262,7 @@ packages: '@ungap/structured-clone@1.3.0': resolution: {integrity: sha512-WmoN8qaIAo7WTYWbAZuG8PYEhn5fkz7dZrqTBZ7dtt//lL2Gwms1IcnQ5yHqjDfX8Ft5j4YzDM23f87zBfDe9g==} + deprecated: Potential CWE-502 - Update to 1.3.1 or higher '@unrs/resolver-binding-android-arm-eabi@1.11.1': resolution: {integrity: sha512-ppLRUgHVaGRWUx0R0Ut06Mjo9gBaBkg3v/8AxusGLhsIotbBLuRk51rAzqLC8gq6NyyAojEXglNjzf6R948DNw==} @@ -7026,6 +7036,9 @@ packages: resolution: {integrity: sha512-b6suED+5/3rTpUBdG1gupIl8MPFCAMA0QXwmljLhvCUKcUvdE4gWky9zpuGCcXHOsz4J9wPGNWq6OKpmIzz3hQ==} hasBin: true + flatbuffers@25.9.23: + resolution: {integrity: sha512-MI1qs7Lo4Syw0EOzUl0xjs2lsoeqFku44KpngfIduHBYvzm8h2+7K8YMQh1JtVVVrUvhLpNwqVi4DERegUJhPQ==} + flatted@3.4.2: resolution: {integrity: sha512-PjDse7RzhcPkIJwy5t7KPWQSZ9cAbzQXcafsetQoD7sOJRQlGikNbx7yZp2OotDnJyrDcbyRq3Ttb18iYOqkxA==} @@ -7188,6 +7201,9 @@ packages: graceful-fs@4.2.11: resolution: {integrity: sha512-RbJ5/jmFcNNCcDV5o9eTnBLJ/HszWV0P73bc+Ff4nS/rJj+YaS6IGyiOL0VoBYX+l1Wrl3k63h/KrH+nhJ0XvQ==} + guid-typescript@1.0.9: + resolution: {integrity: sha512-Y8T4vYhEfwJOTbouREvG+3XDsjr8E3kIr7uf+JZ0BYloFsttiHU0WfvANVsR7TxNUJa/WpCnw/Ino/p+DeBhBQ==} + hachure-fill@0.5.2: resolution: {integrity: sha512-3GKBOn+m2LX9iq+JC1064cSFprJY4jL1jCXTcpnfER5HYE2l/4EfWSGzkPa/ZDBmYI0ZOEj5VHV/eKnPGkHuOg==} @@ -8623,6 +8639,12 @@ packages: resolution: {integrity: sha512-kbpaSSGJTWdAY5KPVeMOKXSrPtr8C8C7wodJbcsd51jRnmD+GZu8Y0VoU6Dm5Z4vWr0Ig/1NKuWRKf7j5aaYSg==} engines: {node: '>=6'} + onnxruntime-common@1.27.0: + resolution: {integrity: sha512-3KxL5wIVqa8Ex08jxSzncm9CMgw8CjOFyOQ7SxvG9o0cVLlhTNKXyIQuTbtX4tGPJEf73OER2xrjt4HJSBL4ow==} + + onnxruntime-web@1.27.0: + resolution: {integrity: sha512-ogDLsqIozHZwifPuN37OproAo0byX6t43/bP8GzeZWBWD6MOGExswFAx3up4NS/vvWBOg2u2PXomDt3rMmdQSg==} + open@8.4.2: resolution: {integrity: 
sha512-7x81NCL719oNbsq/3mh+hVrAWmFuEYUqrq/Iw3kUzH8ReypT9QQ0BLoJS7/G9k6N81XjW4qHWtjWwe/9eLy1EQ==} engines: {node: '>=12'} @@ -8912,6 +8934,9 @@ packages: pkg-types@1.3.1: resolution: {integrity: sha512-/Jm5M4RvtBFVkKWRu2BLUTNP8/M2a+UwuAX+ae4770q1qVGtfjG+WTCupoZixokjmHiry8uI+dlY8KXYV5HVVQ==} + platform@1.3.6: + resolution: {integrity: sha512-fnWVljUchTro6RiCFvCXBbNhJc2NijN7oIQxbwsyL0buWJPG85v81ehlHI9fXrJsMNgTofEoWIQeClKpgxFLrg==} + pluralize@8.0.0: resolution: {integrity: sha512-Nc3IT5yHzflTfbjgqWcCPpo7DaKy4FnpB0l/zCAW0Tc7jxAiuqSxHasntB3D7887LSrA93kDJ9IXovxJYxyLCA==} engines: {node: '>=4'} @@ -9645,6 +9670,7 @@ packages: sliced@1.0.1: resolution: {integrity: sha512-VZBmZP8WU3sMOZm1bdgTadsQbcscK0UM8oKxKVBs4XAhUo2Xxzm/OFMGBkPusxw9xL3Uy8LrzEqGqJhclsr0yA==} + deprecated: Unsupported socket.io-adapter@2.5.4: resolution: {integrity: sha512-wDNHGXGewWAjQPt3pyeYBtpWSq9cLE5UW1ZUPL/2eGK9jtse/FpXib7epSTsz0Q0m+6sg6Y4KtcFTlah1bdOVg==} @@ -14568,6 +14594,10 @@ snapshots: '@remirror/core-constants@3.0.0': {} + '@ricky0123/vad-web@0.0.30': + dependencies: + onnxruntime-web: 1.27.0 + '@rolldown/binding-android-arm64@1.0.0-rc.12': optional: true @@ -17812,6 +17842,8 @@ snapshots: flat@5.0.2: {} + flatbuffers@25.9.23: {} + flatted@3.4.2: {} follow-redirects@1.16.0: {} @@ -17970,6 +18002,8 @@ snapshots: graceful-fs@4.2.11: {} + guid-typescript@1.0.9: {} + hachure-fill@0.5.2: {} handlebars@4.7.9: @@ -19587,6 +19621,17 @@ snapshots: dependencies: mimic-fn: 2.1.0 + onnxruntime-common@1.27.0: {} + + onnxruntime-web@1.27.0: + dependencies: + flatbuffers: 25.9.23 + guid-typescript: 1.0.9 + long: 5.3.2 + onnxruntime-common: 1.27.0 + platform: 1.3.6 + protobufjs: 7.5.8 + open@8.4.2: dependencies: define-lazy-prop: 2.0.0 @@ -19911,6 +19956,8 @@ snapshots: mlly: 1.8.0 pathe: 2.0.3 + platform@1.3.6: {} + pluralize@8.0.0: {} png-chunk-text@1.0.0: {}