From 0b3d595572967e8ee9cd4f0481d446317abf821e Mon Sep 17 00:00:00 2001 From: claude_code Date: Sun, 21 Jun 2026 14:47:28 +0300 Subject: [PATCH] feat(dictation): add realtime streaming STT (live dictation) Layer an optional realtime speech-to-text path on top of the existing batch dictation, so transcribed text appears as the user speaks. Transport A2: browser <-> our server (Socket.IO `/ai-realtime`) <-> OpenAI Realtime (raw ws). The provider API key never leaves the server; the upstream URL is SSRF-checked before connecting; the gateway enforces the dictation+dictationRealtime gate, cookie-JWT auth and per-user/per-workspace concurrency caps. Implemented against the GA (2026) OpenAI Realtime transcription contract (session.update / audio.input.format / server_vad), not the now-removed beta shape. Editor UI B2: interim text is shown as a meta-only ProseMirror ghost decoration (no Yjs/history noise); only completed segments are committed. Chat shows interim as a dimmed tail. The mic button switches realtime vs batch by the workspace flag; batch remains the default and fallback. 
Server: - AiRealtimeService (upstream ws proxy, normalized events, idle/max-duration timeouts, idempotent teardown) + parseUpstreamEvent unit tests - AiRealtimeGateway (Socket.IO `/ai-realtime`) wired into AiChatModule - admin-gated POST /ai-chat/realtime/test connectivity probe - config: settings.ai.dictationRealtime + provider sttRealtimeModel/sttRealtimeBaseUrl (realtime key reuses sttApiKey; no new secret) Client: - pcm16 AudioWorklet (24kHz mono PCM16), RealtimeDictationClient, use-realtime-dictation hook (status/start/stop/cancel + onInterim/onFinal) - RealtimeMicButton + dictation-interim ProseMirror decoration - editor/chat integration + AI settings UI (toggle, model, test endpoint) Co-Authored-By: Claude Opus 4.8 --- .../public/locales/en-US/translation.json | 5 + .../ai-chat/components/chat-input.tsx | 53 +- .../dictation/audio/audio-worklet.d.ts | 33 ++ .../features/dictation/audio/pcm16-worklet.ts | 123 +++++ .../components/realtime-mic-button.tsx | 84 +++ .../dictation/hooks/use-realtime-dictation.ts | 427 +++++++++++++++ .../services/realtime-dictation-client.ts | 124 +++++ .../fixed-toolbar/groups/dictation-group.tsx | 36 ++ .../dictation-interim/dictation-interim.ts | 97 ++++ .../features/editor/extensions/extensions.ts | 2 + .../components/ai-provider-settings.tsx | 127 ++++- .../workspace/queries/ai-settings-query.ts | 7 + .../workspace/services/ai-settings-service.ts | 12 + .../workspace/types/workspace.types.ts | 2 + .../src/core/ai-chat/ai-chat.controller.ts | 38 ++ .../server/src/core/ai-chat/ai-chat.module.ts | 7 + .../ai-chat/realtime/ai-realtime.gateway.ts | 236 +++++++++ .../realtime/ai-realtime.service.spec.ts | 186 +++++++ .../ai-chat/realtime/ai-realtime.service.ts | 485 ++++++++++++++++++ .../workspace/dto/update-workspace.dto.ts | 4 + .../workspace/services/workspace.service.ts | 15 + .../repos/workspace/workspace.repo.ts | 2 +- .../integrations/ai/ai-settings.service.ts | 10 + apps/server/src/integrations/ai/ai.types.ts | 7 + 
.../ai/dto/update-ai-settings.dto.ts | 8 + 25 files changed, 2111 insertions(+), 19 deletions(-) create mode 100644 apps/client/src/features/dictation/audio/audio-worklet.d.ts create mode 100644 apps/client/src/features/dictation/audio/pcm16-worklet.ts create mode 100644 apps/client/src/features/dictation/components/realtime-mic-button.tsx create mode 100644 apps/client/src/features/dictation/hooks/use-realtime-dictation.ts create mode 100644 apps/client/src/features/dictation/services/realtime-dictation-client.ts create mode 100644 apps/client/src/features/editor/extensions/dictation-interim/dictation-interim.ts create mode 100644 apps/server/src/core/ai-chat/realtime/ai-realtime.gateway.ts create mode 100644 apps/server/src/core/ai-chat/realtime/ai-realtime.service.spec.ts create mode 100644 apps/server/src/core/ai-chat/realtime/ai-realtime.service.ts diff --git a/apps/client/public/locales/en-US/translation.json b/apps/client/public/locales/en-US/translation.json index 70353fee..c5fcbe74 100644 --- a/apps/client/public/locales/en-US/translation.json +++ b/apps/client/public/locales/en-US/translation.json @@ -1179,6 +1179,11 @@ "Semantic search": "Semantic search", "Voice / STT": "Voice / STT", "Voice dictation": "Voice dictation", + "Realtime dictation": "Realtime dictation", + "Realtime model": "Realtime model", + "Realtime endpoint": "Realtime endpoint", + "Streams audio live and inserts text as you speak (requires an OpenAI-compatible Realtime endpoint)": "Streams audio live and inserts text as you speak (requires an OpenAI-compatible Realtime endpoint)", + "Leave empty to use the STT base URL": "Leave empty to use the STT base URL", "Voice dictation is not available yet.": "Voice dictation is not available yet.", "Test endpoint": "Test endpoint", "Save endpoints": "Save endpoints", diff --git a/apps/client/src/features/ai-chat/components/chat-input.tsx b/apps/client/src/features/ai-chat/components/chat-input.tsx index 3bb67535..2728e7cf 100644 --- 
a/apps/client/src/features/ai-chat/components/chat-input.tsx +++ b/apps/client/src/features/ai-chat/components/chat-input.tsx @@ -1,11 +1,19 @@ -import { KeyboardEvent } from "react"; -import { ActionIcon, Group, Textarea, Tooltip } from "@mantine/core"; +import { KeyboardEvent, useState } from "react"; +import { + ActionIcon, + Group, + Stack, + Text, + Textarea, + Tooltip, +} from "@mantine/core"; import { IconPlayerStopFilled, IconSend } from "@tabler/icons-react"; import { useTranslation } from "react-i18next"; import { useAtom, useAtomValue } from "jotai"; import { aiChatDraftAtom } from "@/features/ai-chat/atoms/ai-chat-atom.ts"; import { workspaceAtom } from "@/features/user/atoms/current-user-atom"; import { MicButton } from "@/features/dictation/components/mic-button"; +import { RealtimeMicButton } from "@/features/dictation/components/realtime-mic-button"; interface ChatInputProps { onSend: (text: string) => void; @@ -29,12 +37,17 @@ export default function ChatInput({ const [value, setValue] = useAtom(aiChatDraftAtom); const workspace = useAtomValue(workspaceAtom); const isDictationEnabled = workspace?.settings?.ai?.dictation === true; + const isRealtime = workspace?.settings?.ai?.dictationRealtime === true; + // Live interim (partial) transcript shown as a dimmed tail under the input. + const [interim, setInterim] = useState(""); const send = (): void => { const text = value.trim(); if (!text || isStreaming || disabled) return; onSend(text); setValue(""); + // Drop any leftover partial when a message is sent. + setInterim(""); }; const handleKeyDown = (e: KeyboardEvent): void => { @@ -45,7 +58,8 @@ export default function ChatInput({ }; return ( - + +