feat(dictation): add realtime streaming STT (live dictation)

Layer an optional realtime speech-to-text path on top of the existing
batch dictation, so transcribed text appears as the user speaks.

Transport A2: browser <-> our server (Socket.IO `/ai-realtime`) <->
OpenAI Realtime (raw ws). The provider API key never leaves the server;
the upstream URL is SSRF-checked before connecting; the gateway enforces
the dictation+dictationRealtime gate, cookie-JWT auth and per-user/
per-workspace concurrency caps. Implemented against the GA (2026) OpenAI
Realtime transcription contract (session.update / audio.input.format /
server_vad), not the now-removed beta shape.

Editor UI B2: interim text is shown as a meta-only ProseMirror ghost
decoration (no Yjs/history noise); only completed segments are committed.
Chat shows interim as a dimmed tail. The mic button switches realtime vs
batch by the workspace flag; batch remains the default and fallback.

Server:
- AiRealtimeService (upstream ws proxy, normalized events, idle/max-
  duration timeouts, idempotent teardown) + parseUpstreamEvent unit tests
- AiRealtimeGateway (Socket.IO `/ai-realtime`) wired into AiChatModule
- admin-gated POST /ai-chat/realtime/test connectivity probe
- config: settings.ai.dictationRealtime + provider sttRealtimeModel/
  sttRealtimeBaseUrl (realtime key reuses sttApiKey; no new secret)

Client:
- pcm16 AudioWorklet (24kHz mono PCM16), RealtimeDictationClient,
  use-realtime-dictation hook (status/start/stop/cancel + onInterim/onFinal)
- RealtimeMicButton + dictation-interim ProseMirror decoration
- editor/chat integration + AI settings UI (toggle, model, test endpoint)

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
claude_code
2026-06-21 14:47:28 +03:00
committed by claude code agent 227
parent 74e2b7ad7f
commit 7db3f007cb
25 changed files with 2111 additions and 19 deletions

View File

@@ -1181,6 +1181,11 @@
"Semantic search": "Semantic search", "Semantic search": "Semantic search",
"Voice / STT": "Voice / STT", "Voice / STT": "Voice / STT",
"Voice dictation": "Voice dictation", "Voice dictation": "Voice dictation",
"Realtime dictation": "Realtime dictation",
"Realtime model": "Realtime model",
"Realtime endpoint": "Realtime endpoint",
"Streams audio live and inserts text as you speak (requires an OpenAI-compatible Realtime endpoint)": "Streams audio live and inserts text as you speak (requires an OpenAI-compatible Realtime endpoint)",
"Leave empty to use the STT base URL": "Leave empty to use the STT base URL",
"Voice dictation is not available yet.": "Voice dictation is not available yet.", "Voice dictation is not available yet.": "Voice dictation is not available yet.",
"Test endpoint": "Test endpoint", "Test endpoint": "Test endpoint",
"Save endpoints": "Save endpoints", "Save endpoints": "Save endpoints",

View File

@@ -1,11 +1,19 @@
import { KeyboardEvent } from "react"; import { KeyboardEvent, useState } from "react";
import { ActionIcon, Group, Textarea, Tooltip } from "@mantine/core"; import {
ActionIcon,
Group,
Stack,
Text,
Textarea,
Tooltip,
} from "@mantine/core";
import { IconPlayerStopFilled, IconSend } from "@tabler/icons-react"; import { IconPlayerStopFilled, IconSend } from "@tabler/icons-react";
import { useTranslation } from "react-i18next"; import { useTranslation } from "react-i18next";
import { useAtom, useAtomValue } from "jotai"; import { useAtom, useAtomValue } from "jotai";
import { aiChatDraftAtom } from "@/features/ai-chat/atoms/ai-chat-atom.ts"; import { aiChatDraftAtom } from "@/features/ai-chat/atoms/ai-chat-atom.ts";
import { workspaceAtom } from "@/features/user/atoms/current-user-atom"; import { workspaceAtom } from "@/features/user/atoms/current-user-atom";
import { MicButton } from "@/features/dictation/components/mic-button"; import { MicButton } from "@/features/dictation/components/mic-button";
import { RealtimeMicButton } from "@/features/dictation/components/realtime-mic-button";
interface ChatInputProps { interface ChatInputProps {
onSend: (text: string) => void; onSend: (text: string) => void;
@@ -29,12 +37,17 @@ export default function ChatInput({
const [value, setValue] = useAtom(aiChatDraftAtom); const [value, setValue] = useAtom(aiChatDraftAtom);
const workspace = useAtomValue(workspaceAtom); const workspace = useAtomValue(workspaceAtom);
const isDictationEnabled = workspace?.settings?.ai?.dictation === true; const isDictationEnabled = workspace?.settings?.ai?.dictation === true;
const isRealtime = workspace?.settings?.ai?.dictationRealtime === true;
// Live interim (partial) transcript shown as a dimmed tail under the input.
const [interim, setInterim] = useState("");
const send = (): void => { const send = (): void => {
const text = value.trim(); const text = value.trim();
if (!text || isStreaming || disabled) return; if (!text || isStreaming || disabled) return;
onSend(text); onSend(text);
setValue(""); setValue("");
// Drop any leftover partial when a message is sent.
setInterim("");
}; };
const handleKeyDown = (e: KeyboardEvent<HTMLTextAreaElement>): void => { const handleKeyDown = (e: KeyboardEvent<HTMLTextAreaElement>): void => {
@@ -45,7 +58,8 @@ export default function ChatInput({
}; };
return ( return (
<Group gap="xs" align="flex-end" wrap="nowrap"> <Stack gap="xs">
<Group gap="xs" align="flex-end" wrap="nowrap">
<Textarea <Textarea
style={{ flex: 1 }} style={{ flex: 1 }}
placeholder={t("Ask the AI agent…")} placeholder={t("Ask the AI agent…")}
@@ -61,13 +75,24 @@ export default function ChatInput({
// switch), so a fresh chat lands with the cursor ready in the field. // switch), so a fresh chat lands with the cursor ready in the field.
autoFocus autoFocus
/> />
{isDictationEnabled && ( {isDictationEnabled &&
<MicButton (isRealtime ? (
size="lg" <RealtimeMicButton
disabled={isStreaming || disabled} size="lg"
onText={(text) => setValue((v) => (v ? `${v} ${text}` : text))} disabled={isStreaming || disabled}
/> onInterim={(text) => setInterim(text)}
)} onFinal={(text) => {
setValue((v) => (v ? `${v} ${text}` : text));
setInterim("");
}}
/>
) : (
<MicButton
size="lg"
disabled={isStreaming || disabled}
onText={(text) => setValue((v) => (v ? `${v} ${text}` : text))}
/>
))}
{isStreaming ? ( {isStreaming ? (
<Tooltip label={t("Stop")} withArrow> <Tooltip label={t("Stop")} withArrow>
<ActionIcon <ActionIcon
@@ -93,6 +118,12 @@ export default function ChatInput({
</ActionIcon> </ActionIcon>
</Tooltip> </Tooltip>
)} )}
</Group> </Group>
{interim && (
<Text size="sm" c="dimmed">
{interim}
</Text>
)}
</Stack>
); );
} }

View File

@@ -0,0 +1,33 @@
// Minimal ambient declarations for the AudioWorklet global scope.
//
// The client tsconfig only pulls in the DOM libs (no "webworker"/"audioworklet"
// lib), so the symbols available inside an AudioWorkletProcessor module are not
// known to `tsc`. These declarations are intentionally narrow: just enough for
// `pcm16-worklet.ts` to typecheck, matching the Web Audio API spec shapes used
// by that processor. They describe the worklet global scope, not the main thread.
/**
 * Ambient shape of the base class that worklet processors extend. It is
 * declared `abstract`, so every subclass must supply its own `process`.
 */
declare abstract class AudioWorkletProcessor {
// Message channel back to the main thread (used to transfer PCM16 buffers).
readonly port: MessagePort;
// Zero-argument constructor only; this is the constructor shape that
// `registerProcessor` accepts (`new () => AudioWorkletProcessor`).
constructor();
// Called for each render quantum. `inputs`/`outputs` are channel arrays
// indexed as [input][channel][sample]; `parameters` maps AudioParam names to
// their per-sample (or single-value) Float32Array. Return `true` to keep the
// processor alive.
abstract process(
inputs: Float32Array[][],
outputs: Float32Array[][],
parameters: Record<string, Float32Array>,
): boolean;
}
// Registers a processor class under a name usable from `new AudioWorkletNode`.
// `name` is the identifier the node is later created with; `processorCtor` must
// be constructible with no arguments and produce an AudioWorkletProcessor.
declare function registerProcessor(
name: string,
processorCtor: new () => AudioWorkletProcessor,
): void;
// The render context's sample rate, in Hz, available in the worklet global scope.
// Declared as a read-only `const`: processor code reads it, never assigns it.
declare const sampleRate: number;

View File

@@ -0,0 +1,123 @@
// Self-contained AudioWorkletProcessor that turns the live microphone stream into
// PCM16 (signed 16-bit, little-endian), mono, 24000 Hz chunks for the realtime STT
// upstream. It runs in the AudioWorklet global scope, so it MUST NOT import anything
// (the worklet module has no module graph / bundler runtime around it).
//
// Per `process()` call the host hands us a render quantum (typically 128 frames) at
// the context sample rate. We read the first input channel (mono), linearly resample
// to 24000 Hz while carrying the fractional read position across calls (so we never
// assume a particular input rate, e.g. 44.1k or 48k), accumulate the resampled
// samples, and once we have ~150 ms worth (3600 samples) we emit them as an
// Int16 ArrayBuffer transferred to the main thread.
// Target output rate required by the upstream transcription contract.
const TARGET_RATE = 24000;
// ~150 ms of audio at the target rate: 24000 * 0.15 = 3600 samples per message.
// The product is already integral; Math.round only guards against float error
// and keeps the value usable as a typed-array length.
const FRAME_SAMPLES = Math.round(TARGET_RATE * 0.15);
/**
 * Worklet processor that turns the first channel of the first input into mono
 * PCM16 at TARGET_RATE and posts it to the main thread in FRAME_SAMPLES-sized
 * frames.
 *
 * Resampling is linear interpolation. The read cursor and the last sample of
 * each render quantum are carried over to the next call, so interpolation is
 * continuous across quantum boundaries and works for any context sample rate.
 */
class Pcm16Worklet extends AudioWorkletProcessor {
  // Read cursor in input-sample units, relative to the start of the quantum
  // being processed. Between calls it is rebased onto the next quantum, so a
  // value in [-1, 0) means "between the previous quantum's last sample and the
  // next quantum's first sample".
  private cursor = 0;

  // Last sample of the previous quantum: the conceptual sample at index -1.
  private lastSample = 0;

  // Becomes true with the first non-empty quantum; before that `lastSample`
  // holds no real audio.
  private hasHistory = false;

  // Resampled Float32 samples waiting to be converted and posted.
  private pending: Float32Array = new Float32Array(FRAME_SAMPLES);
  private pendingCount = 0;

  /**
   * Render callback. Only `inputs` is read; `outputs` is never written, so the
   * node contributes whatever the host pre-filled there (presumably silence).
   * Always returns `true` so the processor stays alive, including while no
   * input is connected or a quantum arrives empty.
   */
  process(inputs: Float32Array[][]): boolean {
    const mono = inputs[0]?.[0];
    if (mono !== undefined && mono.length > 0) {
      this.resampleAndAccumulate(mono);
    }
    return true;
  }

  /**
   * Linearly resamples one quantum from the context `sampleRate` to
   * TARGET_RATE, appending each produced sample to the pending frame and
   * flushing whenever the frame is full.
   */
  private resampleAndAccumulate(channel: Float32Array): void {
    // Input samples consumed per produced output sample.
    const step = sampleRate / TARGET_RATE;
    const length = channel.length;
    const lastIndex = length - 1;

    if (!this.hasHistory) {
      // Very first quantum: no real predecessor exists, so the virtual sample at
      // index -1 is seeded with the first sample and reading starts at 0.
      this.lastSample = channel[0];
      this.hasHistory = true;
      this.cursor = 0;
    }

    // Only produce samples whose right-hand neighbour (left + 1) lies inside
    // this quantum, i.e. while the cursor is below the last index. That keeps
    // `channel[length]` from ever being read; whatever is left over is resolved
    // on the next call through the carried cursor.
    let at = this.cursor;
    for (; at < lastIndex; at += step) {
      const left = Math.floor(at);
      const weight = at - left;
      // left === -1 refers to the sample carried over from the previous quantum.
      const a = left < 0 ? this.lastSample : channel[left];
      const b = channel[left + 1];

      this.pending[this.pendingCount] = a + (b - a) * weight;
      this.pendingCount += 1;
      if (this.pendingCount >= FRAME_SAMPLES) {
        this.flush();
      }
    }

    // Rebase the cursor onto the next quantum and remember this quantum's last
    // sample as the left neighbour for the boundary interval.
    this.cursor = at - length;
    this.lastSample = channel[lastIndex];
  }

  /**
   * Converts the pending samples to little-endian Int16 and posts the resulting
   * ArrayBuffer to the main thread, listing it as a transferable. Does nothing
   * when no samples are pending.
   */
  private flush(): void {
    const total = this.pendingCount;
    if (total === 0) return;

    const bytes = new ArrayBuffer(total * 2);
    const writer = new DataView(bytes);
    for (let index = 0; index < total; index += 1) {
      // Clamp to [-1, 1], then scale asymmetrically onto the signed 16-bit range.
      const clamped = Math.max(-1, Math.min(1, this.pending[index]));
      const scaled = clamped < 0 ? clamped * 0x8000 : clamped * 0x7fff;
      // Explicit little-endian write, independent of host byte order.
      writer.setInt16(index * 2, scaled, true);
    }

    this.pendingCount = 0;
    this.port.postMessage(bytes, [bytes]);
  }
}
// Register under the name the main thread uses when it creates the node
// (`new AudioWorkletNode(audioContext, "pcm16-worklet")`); both strings must
// stay in sync.
registerProcessor("pcm16-worklet", Pcm16Worklet);

View File

@@ -0,0 +1,84 @@
import { FC, useEffect, useRef } from "react";
import { ActionIcon, Tooltip } from "@mantine/core";
import { IconMicrophone, IconPlayerStopFilled } from "@tabler/icons-react";
import { useTranslation } from "react-i18next";
import {
useRealtimeDictation,
type RealtimeDictationStatus,
} from "@/features/dictation/hooks/use-realtime-dictation";
// Props for RealtimeMicButton.
interface RealtimeMicButtonProps {
// Receives the latest partial transcript. The button also calls it with ""
// once when the status leaves "recording", so consumers can clear their tail.
onInterim: (text: string) => void;
// Receives each completed segment; forwarded to the dictation hook.
onFinal: (text: string) => void;
// Optional; forwarded to the dictation hook.
onStart?: () => void;
// Disables only the start (mic) button; the stop button is never disabled.
disabled?: boolean;
// Mantine ActionIcon size token; "lg" matches the chat composer, "md" the
// editor toolbar.
size?: "md" | "lg";
}
/**
 * Realtime (streaming) counterpart of MicButton.
 *
 * Idle or error state renders a subtle microphone button that starts dictation;
 * while recording it renders a red stop button. Partial and completed
 * transcripts are handed to `onInterim` / `onFinal` as the hook reports them.
 * There is no spinner because the realtime flow has no separate transcribing
 * phase. When the status leaves "recording", `onInterim("")` is called once so
 * the consumer can drop a leftover partial.
 */
export const RealtimeMicButton: FC<RealtimeMicButtonProps> = ({
  onInterim,
  onFinal,
  onStart,
  disabled,
  size = "lg",
}) => {
  const { t } = useTranslation();
  const { status, start, stop } = useRealtimeDictation({
    onInterim,
    onFinal,
    onStart,
  });

  // Remember the status seen by the previous effect run so the clear below
  // happens only on the recording -> not-recording edge, not on every render.
  const lastStatusRef = useRef<RealtimeDictationStatus>(status);
  useEffect(() => {
    const wasRecording = lastStatusRef.current === "recording";
    const isRecordingNow = status === "recording";
    if (wasRecording && !isRecordingNow) {
      onInterim("");
    }
    lastStatusRef.current = status;
  }, [status, onInterim]);

  const glyphSize = size === "lg" ? 18 : 16;

  if (status !== "recording") {
    // idle / error: offer to (re)start. `disabled` applies to this button only.
    const startLabel = t("Start dictation");
    return (
      <Tooltip label={startLabel} withArrow>
        <ActionIcon
          size={size}
          variant="subtle"
          onClick={() => void start()}
          disabled={disabled}
          aria-label={startLabel}
        >
          <IconMicrophone size={glyphSize} />
        </ActionIcon>
      </Tooltip>
    );
  }

  // recording: a second click stops the session.
  const stopLabel = t("Stop recording");
  return (
    <Tooltip label={stopLabel} withArrow>
      <ActionIcon
        size={size}
        color="red"
        variant="light"
        onClick={stop}
        aria-label={stopLabel}
      >
        <IconPlayerStopFilled size={glyphSize} />
      </ActionIcon>
    </Tooltip>
  );
};

View File

@@ -0,0 +1,427 @@
import { useCallback, useEffect, useRef, useState } from "react";
import { notifications } from "@mantine/notifications";
import { useTranslation } from "react-i18next";
import { RealtimeDictationClient } from "@/features/dictation/services/realtime-dictation-client";
// The worklet module URL is produced via `new URL(..., import.meta.url)`; the
// intent is that the bundler serves the processor as its own module, since it
// must run in the AudioWorklet global scope, outside the main bundle. Built once
// at module load and passed to `audioContext.audioWorklet.addModule()`.
// NOTE(review): the target is a `.ts` file. Confirm with a production build that
// the emitted file is transpiled JavaScript — if the bundler treats this pattern
// as plain static-asset resolution, the raw TypeScript source would be served
// and `addModule()` would fail to load it. TODO verify.
const PCM16_WORKLET_URL = new URL(
"../audio/pcm16-worklet.ts",
import.meta.url,
);
// Hook state: "idle" (not capturing), "recording" (capture + streaming live),
// "error" (transient; the hook resets it to "idle" after a short delay).
export type RealtimeDictationStatus = "idle" | "recording" | "error";
// Callbacks and limits supplied by the consumer of `useRealtimeDictation`.
// The hook always invokes the callbacks from the most recent render.
export interface UseRealtimeDictationOptions {
onInterim: (text: string) => void; // latest partial for the live segment
onFinal: (text: string) => void; // a completed segment (trimmed); never called with an empty string
onStart?: () => void; // fired right when capture begins (caret snapshot); exceptions are caught and logged
maxDurationMs?: number; // default 120000; recording is stopped automatically after this long
}
// What `useRealtimeDictation` returns to the component.
export interface UseRealtimeDictationResult {
// Current state of the dictation session.
status: RealtimeDictationStatus;
// Requests the mic and begins streaming; resolves once setup finished or failed.
start: () => Promise<void>;
// Tears the session down and returns to "idle"; a no-op when nothing is live.
stop: () => void;
// Like stop, but additionally marks the session canceled so late socket and
// worklet callbacks are ignored.
cancel: () => void;
}
/**
 * Resolves the AudioContext constructor for the current environment.
 *
 * Prefers the standard global; when that name is not defined it falls back to
 * the webkit-prefixed constructor found on `window` (older Safari builds).
 *
 * @returns The constructor, or `undefined` when neither variant exists.
 */
function getAudioContextCtor(): typeof AudioContext | undefined {
  const standardMissing = typeof AudioContext === "undefined";
  if (standardMissing) {
    const prefixedHost = window as unknown as {
      webkitAudioContext?: typeof AudioContext;
    };
    return prefixedHost.webkitAudioContext;
  }
  return AudioContext;
}
/**
* Streaming sibling of `use-dictation`. Captures the mic, resamples to PCM16
* 24 kHz in an AudioWorklet, and streams it over the normalized `/ai-realtime`
* Socket.IO namespace, surfacing interim/final transcripts as they arrive.
*
* Mirrors `use-dictation`'s conventions: refs hold the live graph/client/timers
* so re-renders never lose them, getUserMedia errors map to the same Mantine
* notifications, and every exit path stops the MediaStream tracks and closes the
* AudioContext. There is no `transcribing` state — final text arrives
* incrementally while `recording`.
*
* @param options Consumer callbacks (`onInterim`, `onFinal`, optional `onStart`)
*   and the optional `maxDurationMs` limit. Read through a ref, so the latest
*   render's values are always used.
* @returns The current `status` plus the `start`, `stop` and `cancel` controls.
*/
export function useRealtimeDictation(
options: UseRealtimeDictationOptions,
): UseRealtimeDictationResult {
const { t, i18n } = useTranslation();
const [status, setStatus] = useState<RealtimeDictationStatus>("idle");
// Keep the latest callbacks in a ref so async socket handlers always call the
// current handlers without re-creating the capture graph.
const optionsRef = useRef(options);
optionsRef.current = options;
const streamRef = useRef<MediaStream | null>(null);
const audioContextRef = useRef<AudioContext | null>(null);
const sourceRef = useRef<MediaStreamAudioSourceNode | null>(null);
const workletRef = useRef<AudioWorkletNode | null>(null);
const clientRef = useRef<RealtimeDictationClient | null>(null);
// Max-duration timer (armed at the end of start()).
const timerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
// error→idle reset timer (armed by handleError()).
const errorTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
// Set by cancel()/unmount; late async callbacks check it and bail out.
const canceledRef = useRef(false);
// True while start() is between its entry guard and the end of setup.
const startingRef = useRef(false);
// True once the server emits `ready`; audio is buffered until then, then flushed.
const readyRef = useRef(false);
// PCM16 chunks captured before the upstream session is ready.
// NOTE(review): this buffer has no size cap; it grows until `ready` arrives or
// the session is torn down. Confirm that is acceptable if `ready` is delayed.
const pendingAudioRef = useRef<ArrayBuffer[]>([]);
// Stable ref to the latest stop() so the max-duration timer (armed inside the
// start closure) can invoke the current version without re-arming every render.
const stopRef = useRef<() => void>(() => undefined);
const clearTimer = useCallback(() => {
if (timerRef.current !== null) {
clearTimeout(timerRef.current);
timerRef.current = null;
}
}, []);
const stopTracks = useCallback(() => {
streamRef.current?.getTracks().forEach((track) => track.stop());
streamRef.current = null;
}, []);
// Tear down the audio graph (worklet node, source, context). Never throws on a
// half-built or already-closed graph.
const teardownAudio = useCallback(() => {
const worklet = workletRef.current;
if (worklet) {
// Detach the handler first so no further chunk is forwarded during teardown.
worklet.port.onmessage = null;
try {
worklet.disconnect();
} catch {
// Node may already be disconnected; ignore.
}
workletRef.current = null;
}
const source = sourceRef.current;
if (source) {
try {
source.disconnect();
} catch {
// Ignore disconnect of an already-detached node.
}
sourceRef.current = null;
}
const ctx = audioContextRef.current;
if (ctx) {
audioContextRef.current = null;
if (ctx.state !== "closed") {
// close() returns a promise; swallow rejections so teardown never throws.
void ctx.close().catch(() => undefined);
}
}
}, []);
// Full teardown shared by stop/cancel/unmount. Order: stop streaming upstream,
// disconnect the socket, then dismantle the local audio graph and tracks, then
// clear timers and reset the ready/pending state.
const teardown = useCallback(() => {
const client = clientRef.current;
if (client) {
clientRef.current = null;
try {
client.stop();
} catch {
// Socket may already be gone; ignore.
}
// NOTE(review): disconnect() runs immediately after the `stop` request and
// removes every socket listener, so a `final` the server emits in response
// to `stop` cannot reach onFinal. Confirm dropping the trailing segment on a
// manual stop is intended.
client.disconnect();
}
teardownAudio();
stopTracks();
clearTimer();
readyRef.current = false;
pendingAudioRef.current = [];
startingRef.current = false;
}, [teardownAudio, stopTracks, clearTimer]);
// Surface a concrete failure: log it, notify, flip to "error", and reset to
// "idle" after a short delay (mirrors use-dictation's error timer).
const handleError = useCallback(
(message: string, err?: unknown) => {
if (canceledRef.current) return;
// Never log audio — only the textual reason.
console.error("[realtime-dictation]", message, err ?? "");
notifications.show({ color: "red", message });
teardown();
setStatus("error");
// Restart the reset timer if a previous error is still pending.
if (errorTimerRef.current !== null) {
clearTimeout(errorTimerRef.current);
}
errorTimerRef.current = setTimeout(() => {
errorTimerRef.current = null;
setStatus("idle");
}, 1500);
},
[teardown],
);
// Starts a session: guard against re-entry, acquire the microphone, build the
// AudioContext → worklet graph, wire the realtime client, then flip to
// "recording" and arm the max-duration timer. Every failure path releases what
// was acquired so far, notifies the user and leaves the status at "idle".
const start = useCallback(async (): Promise<void> => {
// Synchronous live guard: status is stale between renders, so also block on
// refs to prevent a double-click from opening two MediaStreams / sockets.
if (
startingRef.current ||
streamRef.current ||
audioContextRef.current ||
clientRef.current
) {
return;
}
// Also refuse while "recording" or during the transient "error" state.
if (status !== "idle") return;
startingRef.current = true;
canceledRef.current = false;
readyRef.current = false;
pendingAudioRef.current = [];
if (!navigator.mediaDevices?.getUserMedia) {
const reason =
"navigator.mediaDevices.getUserMedia is unavailable in this context";
console.error("[realtime-dictation] " + reason);
notifications.show({
color: "red",
message: t("Audio recording is not available in this browser/context"),
});
setStatus("idle");
startingRef.current = false;
return;
}
let stream: MediaStream;
try {
stream = await navigator.mediaDevices.getUserMedia({ audio: true });
} catch (err) {
// Always log the full error for diagnosis (name, message, stack).
console.error("[realtime-dictation] getUserMedia failed", err);
const name = (err as { name?: string })?.name;
const detail = (err as { message?: string })?.message ?? String(err);
let message: string;
if (name === "NotAllowedError" || name === "SecurityError") {
message = t("Microphone access denied");
} else if (name === "NotFoundError" || name === "OverconstrainedError") {
message = t("No microphone found");
} else if (name === "NotReadableError" || name === "AbortError") {
message = t("Microphone is unavailable or already in use");
} else {
// Unknown failure: show the real reason instead of a generic string.
message = `${t("Could not start recording")}: ${name ? `${name}: ` : ""}${detail}`;
}
notifications.show({ color: "red", message });
setStatus("idle");
startingRef.current = false;
return;
}
// If a cancel landed during the await, drop the stream and bail out.
// NOTE(review): cancel() resets startingRef (via teardown), and a new start()
// resets canceledRef to false. If both happen while this await is pending,
// this check cannot tell the stale start from the new one and both would
// proceed. Confirm whether a per-start token is needed.
if (canceledRef.current) {
stream.getTracks().forEach((track) => track.stop());
startingRef.current = false;
setStatus("idle");
return;
}
streamRef.current = stream;
// Build the capture graph. The worklet still resamples robustly if the browser
// ignores the 24 kHz hint, so any actual context rate is handled correctly.
const AudioCtx = getAudioContextCtor();
if (!AudioCtx) {
stopTracks();
notifications.show({
color: "red",
message: t("Audio recording is not available in this browser/context"),
});
setStatus("idle");
startingRef.current = false;
return;
}
let audioContext: AudioContext;
try {
audioContext = new AudioCtx({ sampleRate: 24000 });
// Stored before addModule() so teardownAudio() can close it if that fails.
audioContextRef.current = audioContext;
// AudioWorklet requires a secure context (https/localhost), same constraint
// as getUserMedia. A failure here means the UI should fall back to batch.
await audioContext.audioWorklet.addModule(PCM16_WORKLET_URL);
} catch (err) {
console.error("[realtime-dictation] audio worklet setup failed", err);
teardownAudio();
stopTracks();
const detail = (err as { message?: string })?.message ?? String(err);
notifications.show({
color: "red",
message: `${t("Could not start recording")}: ${detail}`,
});
setStatus("idle");
startingRef.current = false;
return;
}
// Another cancel could have landed during addModule().
if (canceledRef.current) {
teardownAudio();
stopTracks();
startingRef.current = false;
setStatus("idle");
return;
}
let source: MediaStreamAudioSourceNode;
let worklet: AudioWorkletNode;
try {
source = audioContext.createMediaStreamSource(stream);
worklet = new AudioWorkletNode(audioContext, "pcm16-worklet");
sourceRef.current = source;
workletRef.current = worklet;
// MediaStreamSource → worklet → destination. The worklet emits silence, so
// connecting to destination drives the render graph without echoing the mic.
source.connect(worklet);
worklet.connect(audioContext.destination);
} catch (err) {
console.error("[realtime-dictation] audio graph wiring failed", err);
teardownAudio();
stopTracks();
const detail = (err as { message?: string })?.message ?? String(err);
notifications.show({
color: "red",
message: `${t("Could not start recording")}: ${detail}`,
});
setStatus("idle");
startingRef.current = false;
return;
}
// Each worklet message is a PCM16 ArrayBuffer. Forward it once the upstream
// session is ready; until then buffer so no leading audio is dropped.
worklet.port.onmessage = (event: MessageEvent) => {
if (canceledRef.current) return;
const buf = event.data as ArrayBuffer;
// Ignore anything that is not a raw buffer.
if (!(buf instanceof ArrayBuffer)) return;
if (readyRef.current && clientRef.current) {
clientRef.current.sendAudio(buf);
} else {
pendingAudioRef.current.push(buf);
}
};
// Wire the realtime transport. The server replies `ready` once the upstream
// STT session is live; we then flush any buffered audio.
const client = new RealtimeDictationClient({
onReady: () => {
if (canceledRef.current) return;
readyRef.current = true;
// Swap the buffer out before sending so the flush works on a fixed list.
const pending = pendingAudioRef.current;
pendingAudioRef.current = [];
for (const buf of pending) clientRef.current?.sendAudio(buf);
},
onInterim: (_itemId, text) => {
if (canceledRef.current) return;
optionsRef.current.onInterim(text);
},
onFinal: (_itemId, text) => {
if (canceledRef.current) return;
// Whitespace-only segments are dropped rather than committed.
const trimmed = text.trim();
if (trimmed.length > 0) optionsRef.current.onFinal(trimmed);
},
onError: (message) => {
handleError(message);
},
onClosed: () => {
// The server ended the session (idle/max-duration or graceful upstream
// close). Skip if a cancel already tore everything down, or if an error
// path already owns the status (its error→idle timer is pending), or if a
// local stop already cleared the live refs. Otherwise tear down the capture
// graph + socket and return to idle so the mic/AudioContext don't leak and
// the button doesn't stay stuck on "recording".
if (canceledRef.current) return;
if (errorTimerRef.current !== null) return;
if (
!clientRef.current &&
!audioContextRef.current &&
!streamRef.current
) {
return;
}
teardown();
setStatus("idle");
},
});
clientRef.current = client;
// Notify the caller right when capture begins (before opening the socket) so
// the editor can snapshot the caret position.
try {
optionsRef.current.onStart?.();
} catch (err) {
console.error("[realtime-dictation] onStart callback threw", err);
}
// Open the socket, then ask the server to open the upstream session. The
// language hint is the base subtag of the resolved UI language (e.g. "en-US"
// → "en"), since the upstream transcription model expects an ISO language
// code, not a region-tagged locale; the server omits it upstream when absent.
// NOTE(review): `start` is emitted right after connect(), i.e. before the
// handshake can have completed; this presumably relies on the socket client
// queueing emits until it is connected — confirm.
client.connect();
const locale = i18n.resolvedLanguage || i18n.language || "";
const language = locale.split("-")[0] || undefined;
client.start({ language });
setStatus("recording");
// Capture has truly begun; release the synchronous start guard.
startingRef.current = false;
const maxDurationMs = optionsRef.current.maxDurationMs ?? 120000;
timerRef.current = setTimeout(() => {
// Reuse stop() so the upstream is flushed/closed gracefully.
stopRef.current?.();
}, maxDurationMs);
// `teardown` is called by the onClosed handler above but is not listed below;
// its identity is stable because all of its own dependencies are memoized with
// empty dependency lists.
}, [status, t, i18n, stopTracks, teardownAudio, handleError]);
const stop = useCallback((): void => {
// Nothing live → no-op (never crash on an idle/destroyed state).
if (
!clientRef.current &&
!audioContextRef.current &&
!streamRef.current &&
!startingRef.current
) {
return;
}
teardown();
setStatus("idle");
}, [teardown]);
// Keep the stop ref pointed at the latest stop() for the max-duration timer.
stopRef.current = stop;
const cancel = useCallback((): void => {
// Mark canceled first so any late socket/worklet callbacks are ignored.
canceledRef.current = true;
teardown();
setStatus("idle");
}, [teardown]);
// Clean up on unmount: stop tracks, close the context/worklet, disconnect the
// socket, and clear timers.
useEffect(() => {
return () => {
canceledRef.current = true;
// The error→idle timer would otherwise set state after unmount.
if (errorTimerRef.current !== null) {
clearTimeout(errorTimerRef.current);
errorTimerRef.current = null;
}
teardown();
};
}, [teardown]);
return { status, start, stop, cancel };
}

View File

@@ -0,0 +1,124 @@
import { io, Socket } from "socket.io-client";
import { SOCKET_URL } from "@/features/websocket/types";
// Handlers the hook supplies; the client translates the normalized `/ai-realtime`
// Socket.IO events into these callbacks. The client itself owns no React state —
// it is a thin transport wrapper so the hook can stay focused on the audio graph.
// All five handlers are required.
export interface RealtimeDictationHandlers {
// Upstream STT session is established; safe to start sending audio.
onReady: () => void;
// Latest partial transcript for the current (not-yet-final) segment.
// Missing payload fields are passed as empty strings.
onInterim: (itemId: string, text: string) => void;
// A completed segment's transcript. Missing payload fields are passed as
// empty strings.
onFinal: (itemId: string, text: string) => void;
// Concrete failure reason. Invoked at most once per connection.
onError: (message: string) => void;
// Session ended (graceful stop or upstream closed).
onClosed: () => void;
}
// Payload of the `start` request sent to the server.
interface StartOptions {
// Optional language hint, forwarded as-is in the `start` event.
language?: string;
}
/**
 * Thin transport wrapper around the dedicated `/ai-realtime` Socket.IO
 * namespace.
 *
 * Cookie-based auth rides the handshake via `withCredentials` (no bearer
 * token), exactly like the main app socket. The socket is created with
 * `autoConnect: false` so every listener is registered before the handshake
 * fires and no early event is missed.
 *
 * Server events are translated into the supplied handlers:
 * - `ready`, `interim`, `final`, `closed` map 1:1 onto the handlers.
 * - `error` and `connect_error` are reported through `onError`, at most once
 *   per connection.
 * - `disconnect` that was NOT requested locally is reported too, so the owner
 *   never stays in a "recording" state on a dead connection: a server-side
 *   disconnect is treated as a session end (`onClosed`), any other reason
 *   (transport close/error, ping timeout) as a failure (`onError`).
 */
export class RealtimeDictationClient {
  private socket: Socket | null = null;

  // onError must fire at most once per session: the server `error` and socket
  // `connect_error` can both arrive (e.g. an error then a failed reconnect), but
  // the hook owns the error→idle flow and a second call would double-fire it.
  private erroredFlag = false;

  // Set once the end of the session has been reported via `onClosed`, so the
  // socket-level `disconnect` that usually follows is not reported again.
  private closedFlag = false;

  constructor(private readonly handlers: RealtimeDictationHandlers) {}

  // Forward the first error reason only; later errors are swallowed.
  private emitError(message: string): void {
    if (this.erroredFlag) return;
    this.erroredFlag = true;
    this.handlers.onError(message);
  }

  /**
   * Create the socket, register listeners, then open the connection. Safe to
   * call once per client instance; a second call is a no-op while a socket
   * exists.
   */
  connect(): void {
    if (this.socket) return;

    // Fresh socket → allow onError / onClosed reporting again.
    this.erroredFlag = false;
    this.closedFlag = false;

    // SOCKET_URL is undefined in this app (socket.io derives the page origin), so
    // the `/ai-realtime` namespace rides the same `/socket.io` path as the main
    // socket — which the Vite dev server proxies as a websocket.
    const namespaceUrl = SOCKET_URL
      ? `${SOCKET_URL}/ai-realtime`
      : "/ai-realtime";
    const socket: Socket = io(namespaceUrl, {
      transports: ["websocket"],
      withCredentials: true,
      autoConnect: false,
    });
    this.socket = socket;

    socket.on("ready", () => this.handlers.onReady());

    // Payloads come off the wire, so every field is treated as optional.
    socket.on("interim", (payload?: { itemId?: string; text?: string }) => {
      this.handlers.onInterim(payload?.itemId ?? "", payload?.text ?? "");
    });
    socket.on("final", (payload?: { itemId?: string; text?: string }) => {
      this.handlers.onFinal(payload?.itemId ?? "", payload?.text ?? "");
    });

    socket.on("error", (payload?: { message?: string } | string) => {
      const message =
        typeof payload === "string"
          ? payload
          : payload?.message || "Realtime dictation error";
      this.emitError(message);
    });

    socket.on("closed", () => {
      this.closedFlag = true;
      this.handlers.onClosed();
    });

    // Low-level transport failure (handshake/auth/proxy). Surface a concrete cause.
    socket.on("connect_error", (err: Error) => {
      const message = err?.message
        ? `Realtime connection failed: ${err.message}`
        : "Realtime connection failed";
      this.emitError(message);
    });

    // The connection went away without a local disconnect(). Local disconnects
    // never reach this listener (disconnect() removes all listeners first); the
    // reason check is a second line of defence.
    socket.on("disconnect", (reason: string) => {
      if (reason === "io client disconnect") return;
      // Already reported through `closed` or an error: nothing more to say.
      if (this.closedFlag || this.erroredFlag) return;
      if (reason === "io server disconnect") {
        // The server ended the session on purpose; treat it as a close.
        this.closedFlag = true;
        this.handlers.onClosed();
        return;
      }
      // Transport dropped mid-session (network loss, ping timeout, ...).
      this.emitError(`Realtime connection lost: ${reason}`);
    });

    socket.connect();
  }

  /** Ask the server to resolve config and open the upstream STT session. */
  start(opts: StartOptions): void {
    this.socket?.emit("start", { language: opts.language });
  }

  /** Forward a raw PCM16 chunk; socket.io serializes the ArrayBuffer as binary. */
  sendAudio(buf: ArrayBuffer): void {
    this.socket?.emit("audio", buf);
  }

  /** Request a graceful flush/close of the upstream session. */
  stop(): void {
    this.socket?.emit("stop");
  }

  /** Tear down: drop every listener and close the connection. Idempotent. */
  disconnect(): void {
    const socket = this.socket;
    if (!socket) return;
    this.socket = null;
    // Reset so a subsequent connect() on a reused instance can report again.
    this.erroredFlag = false;
    this.closedFlag = false;
    socket.removeAllListeners();
    socket.disconnect();
  }
}

View File

@@ -1,12 +1,21 @@
import { FC, useRef } from "react"; import { FC, useRef } from "react";
import type { Editor } from "@tiptap/react"; import type { Editor } from "@tiptap/react";
import { useAtomValue } from "jotai";
import { MicButton } from "@/features/dictation/components/mic-button"; import { MicButton } from "@/features/dictation/components/mic-button";
import { RealtimeMicButton } from "@/features/dictation/components/realtime-mic-button";
import { workspaceAtom } from "@/features/user/atoms/current-user-atom";
import {
setDictationInterim,
clearDictationInterim,
} from "@/features/editor/extensions/dictation-interim/dictation-interim.ts";
interface Props { interface Props {
editor: Editor; editor: Editor;
} }
export const DictationGroup: FC<Props> = ({ editor }) => { export const DictationGroup: FC<Props> = ({ editor }) => {
const workspace = useAtomValue(workspaceAtom);
const isRealtime = workspace?.settings?.ai?.dictationRealtime === true;
const rangeRef = useRef<{ from: number; to: number } | null>(null); const rangeRef = useRef<{ from: number; to: number } | null>(null);
const handleStart = () => { const handleStart = () => {
@@ -50,6 +59,33 @@ export const DictationGroup: FC<Props> = ({ editor }) => {
} }
}; };
// Realtime path: commit each final segment at the LIVE caret (inserts happen
// during recording, so no fixed snapshot is needed); interim is shown via the
// ghost decoration only.
if (isRealtime) {
return (
<RealtimeMicButton
size="md"
disabled={!editor.isEditable}
onStart={() => {
if (editor && !editor.isDestroyed) clearDictationInterim(editor);
}}
onInterim={(text) => {
if (editor && !editor.isDestroyed) setDictationInterim(editor, text);
}}
onFinal={(text) => {
if (!editor || editor.isDestroyed) return;
clearDictationInterim(editor);
try {
editor.chain().focus().insertContent(`${text} `).run();
} catch {
// The editor may have been destroyed mid-stream; ignore.
}
}}
/>
);
}
return ( return (
<MicButton <MicButton
size="md" size="md"

View File

@@ -0,0 +1,97 @@
import { Extension } from "@tiptap/core";
import type { Editor } from "@tiptap/core";
import { Plugin, PluginKey } from "@tiptap/pm/state";
import { Decoration, DecorationSet } from "@tiptap/pm/view";
// Plugin key shared by the extension and the imperative helpers below so they
// dispatch/read the same plugin state. It doubles as the transaction-meta key:
// the helpers attach `{ text }` under it and the plugin's `apply` reads it back.
const dictationInterimKey = new PluginKey<DictationInterimState>(
  "dictationInterim",
);
// Plugin state for the interim ghost. The same shape is used as the
// transaction-meta payload that replaces it.
interface DictationInterimState {
  // The current interim (partial) transcript. Empty string means "no ghost".
  text: string;
}
/**
 * B2 editor decoration: shows the realtime interim (partial) transcript as a
 * ghost widget at the caret. The interim is held ONLY in plugin meta state and
 * rendered as a widget Decoration — it is NEVER written into the document, so
 * it produces no Yjs update and no history entry. Only final segments are
 * committed (by the dictation-group / chat consumers).
 */
export const DictationInterim = Extension.create({
  name: "dictationInterim",

  addProseMirrorPlugins() {
    return [
      new Plugin<DictationInterimState>({
        key: dictationInterimKey,

        state: {
          // Start with no ghost.
          init: (): DictationInterimState => ({ text: "" }),

          apply: (tr, value): DictationInterimState => {
            const meta = tr.getMeta(dictationInterimKey) as
              | DictationInterimState
              | undefined;
            // Meta-only updates replace the interim text; everything else keeps
            // the existing value (it follows the caret on its own since the
            // decoration is recomputed against the live selection).
            if (meta) {
              return { text: meta.text };
            }
            return value;
          },
        },

        props: {
          decorations(state) {
            const pluginState = dictationInterimKey.getState(state);
            const text = pluginState?.text ?? "";
            if (!text) {
              return null;
            }
            // Render the interim as an inline ghost at the caret. Inline styles
            // keep this self-contained — no global CSS is required.
            const widget = Decoration.widget(
              state.selection.head,
              () => {
                const span = document.createElement("span");
                span.textContent = text;
                span.setAttribute("contenteditable", "false");
                span.style.opacity = "0.5";
                span.style.fontStyle = "italic";
                return span;
              },
              {
                side: 1,
                ignoreSelection: true,
                // The render closure above is a new function on every call, so
                // without a key ProseMirror cannot tell that two successive
                // widgets are the same and rebuilds the ghost DOM on every
                // state update. Keying by the text lets it reuse the existing
                // node until the interim actually changes.
                key: `dictation-interim:${text}`,
              },
            );
            return DecorationSet.create(state.doc, [widget]);
          },
        },
      }),
    ];
  },
});
/**
 * Show `text` as the interim ghost. The transaction carries only plugin meta
 * (no document steps), so it yields no Yjs update and no history entry.
 */
export function setDictationInterim(editor: Editor, text: string): void {
  const { view, state } = editor;
  const metaOnly = state.tr.setMeta(dictationInterimKey, { text });
  view.dispatch(metaOnly);
}
/**
 * Remove the interim ghost by setting its text to the empty string. Like
 * setDictationInterim this is a META-ONLY transaction and leaves the document
 * untouched.
 */
export function clearDictationInterim(editor: Editor): void {
  setDictationInterim(editor, "");
}
export default DictationInterim;

View File

@@ -123,6 +123,7 @@ import { countWords } from "alfaaz";
import AutoJoiner from "@/features/editor/extensions/autojoiner.ts"; import AutoJoiner from "@/features/editor/extensions/autojoiner.ts";
import GlobalDragHandle from "@/features/editor/extensions/drag-handle.ts"; import GlobalDragHandle from "@/features/editor/extensions/drag-handle.ts";
import { CleanStyles } from "@/features/editor/extensions/clean-styles.ts"; import { CleanStyles } from "@/features/editor/extensions/clean-styles.ts";
import { DictationInterim } from "@/features/editor/extensions/dictation-interim/dictation-interim.ts";
const lowlight = createLowlight(common); const lowlight = createLowlight(common);
lowlight.register("mermaid", plaintext); lowlight.register("mermaid", plaintext);
@@ -343,6 +344,7 @@ export const mainExtensions = [
}, },
}), }),
Selection, Selection,
DictationInterim,
Attachment.configure({ Attachment.configure({
view: AttachmentView, view: AttachmentView,
}), }),

View File

@@ -32,6 +32,7 @@ import {
useAiSettingsQuery, useAiSettingsQuery,
useReindexAiEmbeddingsMutation, useReindexAiEmbeddingsMutation,
useTestAiConnectionMutation, useTestAiConnectionMutation,
useTestRealtimeConnectionMutation,
useUpdateAiSettingsMutation, useUpdateAiSettingsMutation,
} from "@/features/workspace/queries/ai-settings-query.ts"; } from "@/features/workspace/queries/ai-settings-query.ts";
import { import {
@@ -62,6 +63,10 @@ const formSchema = z.object({
// STT-specific fields. Empty base URL / key fall back to the chat ones. // STT-specific fields. Empty base URL / key fall back to the chat ones.
sttModel: z.string(), sttModel: z.string(),
sttBaseUrl: z.string(), sttBaseUrl: z.string(),
// Realtime (streaming) STT fields. Empty model falls back to sttModel and
// empty base URL falls back to the STT base URL server-side.
sttRealtimeModel: z.string(),
sttRealtimeBaseUrl: z.string(),
sttApiStyle: z.enum(["multipart", "json"]), sttApiStyle: z.enum(["multipart", "json"]),
sttApiKey: z.string(), sttApiKey: z.string(),
}); });
@@ -176,6 +181,8 @@ export default function AiProviderSettings() {
const chatTest = useTestAiConnectionMutation(); const chatTest = useTestAiConnectionMutation();
const embedTest = useTestAiConnectionMutation(); const embedTest = useTestAiConnectionMutation();
const sttTest = useTestAiConnectionMutation(); const sttTest = useTestAiConnectionMutation();
// Realtime probe hits a separate /ai-chat/realtime/test route (admin-gated).
const realtimeTest = useTestRealtimeConnectionMutation();
// Agent roles drive the public-share assistant identity picker. Admin-gated // Agent roles drive the public-share assistant identity picker. Admin-gated
// (the component returns early for non-admins), same as the AI settings query. // (the component returns early for non-admins), same as the AI settings query.
@@ -192,6 +199,8 @@ export default function AiProviderSettings() {
const [dictationEnabled, setDictationEnabled] = useState<boolean>( const [dictationEnabled, setDictationEnabled] = useState<boolean>(
workspace?.settings?.ai?.dictation ?? false, workspace?.settings?.ai?.dictation ?? false,
); );
const [realtimeDictationEnabled, setRealtimeDictationEnabled] =
useState<boolean>(workspace?.settings?.ai?.dictationRealtime ?? false);
const [publicShareAssistantEnabled, setPublicShareAssistantEnabled] = const [publicShareAssistantEnabled, setPublicShareAssistantEnabled] =
useState<boolean>( useState<boolean>(
workspace?.settings?.ai?.publicShareAssistant ?? false, workspace?.settings?.ai?.publicShareAssistant ?? false,
@@ -199,6 +208,10 @@ export default function AiProviderSettings() {
const [chatToggleLoading, setChatToggleLoading] = useState(false); const [chatToggleLoading, setChatToggleLoading] = useState(false);
const [searchToggleLoading, setSearchToggleLoading] = useState(false); const [searchToggleLoading, setSearchToggleLoading] = useState(false);
const [dictationToggleLoading, setDictationToggleLoading] = useState(false); const [dictationToggleLoading, setDictationToggleLoading] = useState(false);
const [
realtimeDictationToggleLoading,
setRealtimeDictationToggleLoading,
] = useState(false);
const [ const [
publicShareAssistantToggleLoading, publicShareAssistantToggleLoading,
setPublicShareAssistantToggleLoading, setPublicShareAssistantToggleLoading,
@@ -232,6 +245,8 @@ export default function AiProviderSettings() {
embeddingApiKey: "", embeddingApiKey: "",
sttModel: "", sttModel: "",
sttBaseUrl: "", sttBaseUrl: "",
sttRealtimeModel: "",
sttRealtimeBaseUrl: "",
sttApiStyle: "multipart" as SttApiStyle, sttApiStyle: "multipart" as SttApiStyle,
sttApiKey: "", sttApiKey: "",
}, },
@@ -253,6 +268,8 @@ export default function AiProviderSettings() {
embeddingApiKey: "", embeddingApiKey: "",
sttModel: settings.sttModel ?? "", sttModel: settings.sttModel ?? "",
sttBaseUrl: settings.sttBaseUrl ?? "", sttBaseUrl: settings.sttBaseUrl ?? "",
sttRealtimeModel: settings.sttRealtimeModel ?? "",
sttRealtimeBaseUrl: settings.sttRealtimeBaseUrl ?? "",
sttApiStyle: settings.sttApiStyle ?? "multipart", sttApiStyle: settings.sttApiStyle ?? "multipart",
sttApiKey: "", sttApiKey: "",
}); });
@@ -287,6 +304,10 @@ export default function AiProviderSettings() {
// server-side. // server-side.
sttModel: values.sttModel, sttModel: values.sttModel,
sttBaseUrl: values.sttBaseUrl, sttBaseUrl: values.sttBaseUrl,
// Realtime STT: empty model falls back to sttModel, empty base URL falls
// back to the STT base URL server-side.
sttRealtimeModel: values.sttRealtimeModel,
sttRealtimeBaseUrl: values.sttRealtimeBaseUrl,
sttApiStyle: values.sttApiStyle, sttApiStyle: values.sttApiStyle,
}; };
@@ -434,6 +455,35 @@ export default function AiProviderSettings() {
} }
} }
// Optimistic toggle for the "Realtime dictation" feature
// (settings.ai.dictationRealtime), which is layered on top of batch dictation.
// The switch flips immediately; if the save fails it is rolled back and the
// server's message (when present) is shown instead of the generic one.
async function handleToggleRealtimeDictation(value: boolean) {
  const rollbackValue = realtimeDictationEnabled;
  setRealtimeDictationToggleLoading(true);
  setRealtimeDictationEnabled(value);
  try {
    const saved = await updateWorkspace({ aiDictationRealtime: value });
    // Mirror the new flag into the cached workspace settings.
    const ai = { ...saved.settings?.ai, dictationRealtime: value };
    const settings = { ...saved.settings, ai };
    setWorkspace({ ...saved, settings });
    notifications.show({ message: t("Updated successfully") });
  } catch (err) {
    setRealtimeDictationEnabled(rollbackValue);
    const apiError = err as { response?: { data?: { message?: string } } };
    const serverMessage = apiError?.response?.data?.message;
    notifications.show({
      message: serverMessage ?? t("Failed to update data"),
      color: "red",
    });
  } finally {
    setRealtimeDictationToggleLoading(false);
  }
}
// Optimistic toggle for the anonymous public-share AI assistant // Optimistic toggle for the anonymous public-share AI assistant
// (settings.ai.publicShareAssistant). When off, the public endpoint 404s. // (settings.ai.publicShareAssistant). When off, the public endpoint 404s.
async function handleTogglePublicShareAssistant(value: boolean) { async function handleTogglePublicShareAssistant(value: boolean) {
@@ -853,13 +903,24 @@ export default function AiProviderSettings() {
<StatusDot status={sttStatus} label={cardStatusLabel(sttStatus, t)} /> <StatusDot status={sttStatus} label={cardStatusLabel(sttStatus, t)} />
<Text fw={600}>{t("Voice / STT")}</Text> <Text fw={600}>{t("Voice / STT")}</Text>
</Group> </Group>
<Switch <Group gap="md" align="center" wrap="nowrap">
label={t("Voice dictation")} <Switch
labelPosition="left" label={t("Voice dictation")}
checked={dictationEnabled} labelPosition="left"
disabled={dictationToggleLoading} checked={dictationEnabled}
onChange={(e) => handleToggleDictation(e.currentTarget.checked)} disabled={dictationToggleLoading}
/> onChange={(e) => handleToggleDictation(e.currentTarget.checked)}
/>
<Switch
label={t("Realtime dictation")}
labelPosition="left"
checked={realtimeDictationEnabled}
disabled={realtimeDictationToggleLoading}
onChange={(e) =>
handleToggleRealtimeDictation(e.currentTarget.checked)
}
/>
</Group>
</Group> </Group>
<Text size="xs" c="dimmed" mt={4} mb="md"> <Text size="xs" c="dimmed" mt={4} mb="md">
{t( {t(
@@ -954,6 +1015,58 @@ export default function AiProviderSettings() {
</Text> </Text>
))} ))}
</Group> </Group>
{/* Realtime (streaming) dictation: layered on top of batch STT and only
shown when the workspace toggle is on. Model falls back to the STT
model and the endpoint falls back to the STT base URL server-side. */}
{realtimeDictationEnabled && (
<>
<Text size="xs" c="dimmed" mt="md" mb="xs">
{t(
"Streams audio live and inserts text as you speak (requires an OpenAI-compatible Realtime endpoint)",
)}
</Text>
<TextInput
label={t("Realtime model")}
placeholder="gpt-4o-mini-transcribe"
disabled={isLoading}
{...form.getInputProps("sttRealtimeModel")}
/>
<TextInput
mt="sm"
label={t("Realtime endpoint")}
description={t(
"Leave empty to use the STT base URL",
)}
placeholder={t("Leave empty to use the STT base URL")}
disabled={isLoading}
{...form.getInputProps("sttRealtimeBaseUrl")}
/>
<Group mt="md" align="center">
<Button
variant="default"
size="sm"
loading={realtimeTest.isPending}
onClick={() => realtimeTest.mutate()}
>
{t("Test endpoint")}
</Button>
{realtimeTest.data &&
(realtimeTest.data.ok ? (
<Text size="sm" c="green">
{t("Connection successful")}
</Text>
) : (
<Text size="sm" c="red">
{realtimeTest.data.error || t("Connection failed")}
</Text>
))}
</Group>
</>
)}
</Paper> </Paper>
{/* Nested: external MCP tools the agent calls out to */} {/* Nested: external MCP tools the agent calls out to */}

View File

@@ -8,6 +8,7 @@ import {
getAiSettings, getAiSettings,
updateAiSettings, updateAiSettings,
testAiConnection, testAiConnection,
testRealtimeConnection,
reindexAiEmbeddings, reindexAiEmbeddings,
IAiSettings, IAiSettings,
IAiSettingsUpdate, IAiSettingsUpdate,
@@ -55,6 +56,12 @@ export function useTestAiConnectionMutation() {
}); });
} }
// Mutation hook for the realtime STT connectivity probe. It takes no variables
// and resolves to the `{ ok, error? }` result from testRealtimeConnection.
export function useTestRealtimeConnectionMutation() {
  const mutationFn = () => testRealtimeConnection();
  return useMutation<IAiTestResult, Error, void>({ mutationFn });
}
export function useReindexAiEmbeddingsMutation() { export function useReindexAiEmbeddingsMutation() {
const { t } = useTranslation(); const { t } = useTranslation();
const queryClient = useQueryClient(); const queryClient = useQueryClient();

View File

@@ -32,6 +32,8 @@ export interface IAiSettings {
// key is stored (empty means "uses the chat API key"). // key is stored (empty means "uses the chat API key").
sttModel?: string; sttModel?: string;
sttBaseUrl?: string; sttBaseUrl?: string;
sttRealtimeModel?: string;
sttRealtimeBaseUrl?: string;
sttApiStyle?: SttApiStyle; sttApiStyle?: SttApiStyle;
hasSttApiKey: boolean; hasSttApiKey: boolean;
// RAG indexing coverage (pages indexed for semantic search). // RAG indexing coverage (pages indexed for semantic search).
@@ -59,6 +61,8 @@ export interface IAiSettingsUpdate {
embeddingApiKey?: string; embeddingApiKey?: string;
sttModel?: string; sttModel?: string;
sttBaseUrl?: string; sttBaseUrl?: string;
sttRealtimeModel?: string;
sttRealtimeBaseUrl?: string;
sttApiStyle?: SttApiStyle; sttApiStyle?: SttApiStyle;
// Write-only STT key (same semantics as `apiKey` / `embeddingApiKey`). // Write-only STT key (same semantics as `apiKey` / `embeddingApiKey`).
sttApiKey?: string; sttApiKey?: string;
@@ -95,6 +99,14 @@ export async function testAiConnection(
return req.data; return req.data;
} }
// Connectivity probe for the realtime (streaming STT) endpoint. The route is
// under /ai-chat rather than /workspace/ai-settings like the other tests; it
// is admin-gated server-side and answers with the usual { ok, error? }
// envelope, which is returned from the response's `data`.
export async function testRealtimeConnection(): Promise<IAiTestResult> {
  const { data } = await api.post<IAiTestResult>("/ai-chat/realtime/test");
  return data;
}
export async function reindexAiEmbeddings(): Promise<IAiSettings> { export async function reindexAiEmbeddings(): Promise<IAiSettings> {
const req = await api.post<IAiSettings>("/workspace/ai-settings/reindex"); const req = await api.post<IAiSettings>("/workspace/ai-settings/reindex");
return req.data; return req.data;

View File

@@ -25,6 +25,7 @@ export interface IWorkspace {
mcpEnabled?: boolean; mcpEnabled?: boolean;
aiChat?: boolean; aiChat?: boolean;
aiDictation?: boolean; aiDictation?: boolean;
aiDictationRealtime?: boolean;
aiPublicShareAssistant?: boolean; aiPublicShareAssistant?: boolean;
trashRetentionDays?: number; trashRetentionDays?: number;
restrictApiToAdmins?: boolean; restrictApiToAdmins?: boolean;
@@ -62,6 +63,7 @@ export interface IWorkspaceAiSettings {
mcp?: boolean; mcp?: boolean;
chat?: boolean; chat?: boolean;
dictation?: boolean; dictation?: boolean;
dictationRealtime?: boolean;
publicShareAssistant?: boolean; publicShareAssistant?: boolean;
} }

View File

@@ -27,8 +27,14 @@ import { AiChatMessageRepo } from '@docmost/db/repos/ai-chat/ai-chat-message.rep
import { UserThrottlerGuard } from '../../integrations/throttle/user-throttler.guard'; import { UserThrottlerGuard } from '../../integrations/throttle/user-throttler.guard';
import { AI_CHAT_THROTTLER } from '../../integrations/throttle/throttler-names'; import { AI_CHAT_THROTTLER } from '../../integrations/throttle/throttler-names';
import { FileInterceptor } from '../../common/interceptors/file.interceptor'; import { FileInterceptor } from '../../common/interceptors/file.interceptor';
import WorkspaceAbilityFactory from '../casl/abilities/workspace-ability.factory';
import {
WorkspaceCaslAction,
WorkspaceCaslSubject,
} from '../casl/interfaces/workspace-ability.type';
import { AiChatService, AiChatStreamBody } from './ai-chat.service'; import { AiChatService, AiChatStreamBody } from './ai-chat.service';
import { AiTranscriptionService } from './ai-transcription.service'; import { AiTranscriptionService } from './ai-transcription.service';
import { AiRealtimeService } from './realtime/ai-realtime.service';
import { import {
ChatIdDto, ChatIdDto,
GetChatMessagesDto, GetChatMessagesDto,
@@ -51,8 +57,23 @@ export class AiChatController {
private readonly aiChatRepo: AiChatRepo, private readonly aiChatRepo: AiChatRepo,
private readonly aiChatMessageRepo: AiChatMessageRepo, private readonly aiChatMessageRepo: AiChatMessageRepo,
private readonly aiTranscription: AiTranscriptionService, private readonly aiTranscription: AiTranscriptionService,
private readonly aiRealtimeService: AiRealtimeService,
private readonly workspaceAbility: WorkspaceAbilityFactory,
) {} ) {}
/**
 * Admin gate, identical to AiSettingsController.assertAdmin: the caller must
 * hold the workspace Manage/Settings ability (same gate as POST
 * /workspace/update).
 *
 * @throws ForbiddenException when the user lacks that ability.
 */
private assertAdmin(user: User, workspace: Workspace): void {
  const isForbidden = this.workspaceAbility
    .createForUser(user, workspace)
    .cannot(WorkspaceCaslAction.Manage, WorkspaceCaslSubject.Settings);
  if (isForbidden) {
    throw new ForbiddenException();
  }
}
/** List the requesting user's chats in this workspace (paginated). */ /** List the requesting user's chats in this workspace (paginated). */
@HttpCode(HttpStatus.OK) @HttpCode(HttpStatus.OK)
@Post('chats') @Post('chats')
@@ -287,6 +308,23 @@ export class AiChatController {
return { text }; return { text };
} }
/**
 * Admin-only "test connection" probe for the realtime STT upstream. Delegates
 * to AiRealtimeService.testConnection for this workspace after the admin gate
 * passes. The API key never leaves the server.
 * Response is the FROZEN contract { ok: true } | { ok: false, error: string }
 * (the global response transform wraps it; the client reads req.data).
 *
 * @throws ForbiddenException when the caller is not a workspace admin.
 */
@HttpCode(HttpStatus.OK)
@Post('realtime/test')
async testRealtime(
  @AuthUser() user: User,
  @AuthWorkspace() workspace: Workspace,
): Promise<{ ok: true } | { ok: false; error: string }> {
  this.assertAdmin(user, workspace);
  const { id: workspaceId } = workspace;
  return this.aiRealtimeService.testConnection(workspaceId);
}
/** /**
* Ensure the chat exists, belongs to this workspace, AND was created by the * Ensure the chat exists, belongs to this workspace, AND was created by the
* requesting user (per-user isolation). Throws ForbiddenException otherwise. * requesting user (per-user isolation). Throws ForbiddenException otherwise.

View File

@@ -13,6 +13,8 @@ import { SearchModule } from '../search/search.module';
import { PublicShareChatController } from './public-share-chat.controller'; import { PublicShareChatController } from './public-share-chat.controller';
import { PublicShareChatService } from './public-share-chat.service'; import { PublicShareChatService } from './public-share-chat.service';
import { PublicShareChatToolsService } from './tools/public-share-chat-tools.service'; import { PublicShareChatToolsService } from './tools/public-share-chat-tools.service';
import { AiRealtimeGateway } from './realtime/ai-realtime.gateway';
import { AiRealtimeService } from './realtime/ai-realtime.service';
/** /**
* Per-user AI chat module (§6.1). * Per-user AI chat module (§6.1).
@@ -46,6 +48,11 @@ import { PublicShareChatToolsService } from './tools/public-share-chat-tools.ser
AiChatToolsService, AiChatToolsService,
PublicShareChatService, PublicShareChatService,
PublicShareChatToolsService, PublicShareChatToolsService,
// Realtime dictation: the Socket.IO `/ai-realtime` gateway + its upstream
// proxy service. AiSettingsService comes from AiModule; WorkspaceRepo from
// the global DatabaseModule; TokenService from TokenModule (both imported).
AiRealtimeGateway,
AiRealtimeService,
], ],
}) })
export class AiChatModule {} export class AiChatModule {}

View File

@@ -0,0 +1,236 @@
import {
OnGatewayConnection,
OnGatewayDisconnect,
SubscribeMessage,
WebSocketGateway,
} from '@nestjs/websockets';
import { Logger } from '@nestjs/common';
import { Socket } from 'socket.io';
import * as cookie from 'cookie';
import { TokenService } from '../../auth/services/token.service';
import { JwtPayload, JwtType } from '../../auth/dto/jwt-payload';
import { WorkspaceRepo } from '@docmost/db/repos/workspace/workspace.repo';
import { AiSttNotConfiguredException } from '../../../integrations/ai/ai-stt-not-configured.exception';
import { describeProviderError } from '../../../integrations/ai/ai-error.util';
import {
AiRealtimeService,
RealtimeSessionHandle,
} from './ai-realtime.service';
/**
* Realtime dictation gateway — the server side of the FROZEN normalized
* Socket.IO `/ai-realtime` protocol. The browser talks ONLY to this namespace;
* the raw OpenAI GA schema and the provider key never reach the client.
*
* Client → server: connect (cookie-JWT auth), `start` { language? }, `audio`
* (PCM16 binary), `stop`. Server → client: `ready`, `interim`, `final`,
* `error`, `closed`.
*
* Gate (before opening upstream): the workspace must have BOTH
* `settings.ai.dictation === true` AND `settings.ai.dictationRealtime === true`.
* Hard concurrency caps (realtime is expensive) are enforced in-memory per user
* and per workspace.
*/
/** Realtime is expensive: one live session per user, a handful per workspace. */
// Maximum number of admitted sockets a single user may hold at once.
const MAX_SESSIONS_PER_USER = 1;
// Maximum number of admitted sockets across one workspace.
const MAX_SESSIONS_PER_WORKSPACE = 5;
// Module-level concurrency counters. A single Node process backs the gateway;
// these caps are best-effort within that process (a horizontally-scaled
// deployment would need a shared store, out of scope here).
// Keyed by user id → number of sockets currently counted for that user.
const sessionsPerUser = new Map<string, number>();
// Keyed by workspace id → number of sockets currently counted for it.
const sessionsPerWorkspace = new Map<string, number>();
/** Add one to the counter stored under `key` (absent counts as 0) and return the new value. */
function incr(map: Map<string, number>, key: string): number {
  const updated = 1 + (map.get(key) ?? 0);
  map.set(key, updated);
  return updated;
}
/**
 * Subtract one from the counter stored under `key` (absent counts as 0). The
 * entry is removed once it would reach zero or below, so the map never holds
 * non-positive counts.
 */
function decr(map: Map<string, number>, key: string): void {
  const remaining = (map.get(key) ?? 0) - 1;
  if (remaining > 0) {
    map.set(key, remaining);
    return;
  }
  map.delete(key);
}
/** Per-socket state we stash on client.data. */
interface RealtimeClientData {
  // Identity taken from the verified access-token payload at connect time.
  userId: string;
  workspaceId: string;
  // The open upstream session, if any. Set by `start`, cleared when the session
  // closes or the socket disconnects.
  handle?: RealtimeSessionHandle;
  // What we incremented at connect time, so disconnect decrements exactly that.
  countedUserId?: string;
  countedWorkspaceId?: string;
}
/**
 * Socket.IO gateway for the `/ai-realtime` namespace. Authenticates the socket
 * from the `authToken` cookie, enforces the workspace gate and the in-memory
 * concurrency caps, and bridges `start` / `audio` / `stop` to an upstream
 * session opened through AiRealtimeService.
 *
 * Message handlers are bound independently of the async `handleConnection`, so
 * admission is tracked as a per-socket promise that `start` awaits before
 * touching the upstream.
 */
@WebSocketGateway({
  namespace: '/ai-realtime',
  cors: { origin: '*' },
  transports: ['websocket'],
})
export class AiRealtimeGateway
  implements OnGatewayConnection, OnGatewayDisconnect
{
  private readonly logger = new Logger(AiRealtimeGateway.name);

  // Admission result per socket: resolves true once auth, the workspace gate
  // and the caps have all passed, false otherwise. Never rejects.
  private readonly admissions = new WeakMap<Socket, Promise<boolean>>();

  // Sockets with a `start` currently in flight (awaiting admission/openSession).
  private readonly starting = new WeakSet<Socket>();

  constructor(
    private readonly tokenService: TokenService,
    private readonly workspaceRepo: WorkspaceRepo,
    private readonly aiRealtimeService: AiRealtimeService,
  ) {}

  /**
   * Connection hook. Starts admission and records its promise synchronously,
   * before the first await, so a `start` that arrives while admission is still
   * running can wait for the outcome instead of racing it.
   */
  async handleConnection(client: Socket): Promise<void> {
    const admission = this.admit(client);
    this.admissions.set(client, admission);
    await admission;
  }

  /**
   * Authenticate the socket, apply the workspace gate and the concurrency caps,
   * and take the concurrency slots. On any rejection the client receives an
   * `error` event and is disconnected.
   *
   * @returns true when the socket was admitted and its slots were counted.
   */
  private async admit(client: Socket): Promise<boolean> {
    try {
      const cookies = cookie.parse(client.handshake.headers.cookie ?? '');
      const token: JwtPayload = await this.tokenService.verifyJwt(
        cookies['authToken'],
        JwtType.ACCESS,
      );
      const userId = token.sub;
      const workspaceId = token.workspaceId;
      const data = client.data as RealtimeClientData;
      data.userId = userId;
      data.workspaceId = workspaceId;

      // Gate: realtime dictation must be enabled at the workspace level.
      const workspace = await this.workspaceRepo.findById(workspaceId);
      const settings = (workspace?.settings ?? {}) as {
        ai?: { dictation?: boolean; dictationRealtime?: boolean };
      };
      if (
        settings.ai?.dictation !== true ||
        settings.ai?.dictationRealtime !== true
      ) {
        client.emit('error', {
          message: 'Realtime dictation is not enabled',
        });
        client.disconnect();
        return false;
      }

      // The client may have gone away while we were awaiting above. In that
      // case handleDisconnect has ALREADY run (and found nothing to release),
      // so taking a slot now would leak it for the life of the process.
      if (client.disconnected) {
        return false;
      }

      // Hard concurrency caps (realtime is expensive). Check both before
      // incrementing either, so a rejected connection leaves the counters clean.
      // No await sits between these reads and the increments below.
      const userCount = sessionsPerUser.get(userId) ?? 0;
      const workspaceCount = sessionsPerWorkspace.get(workspaceId) ?? 0;
      if (userCount >= MAX_SESSIONS_PER_USER) {
        client.emit('error', {
          message:
            'A realtime dictation session is already active for your account',
        });
        client.disconnect();
        return false;
      }
      if (workspaceCount >= MAX_SESSIONS_PER_WORKSPACE) {
        client.emit('error', {
          message:
            'The maximum number of concurrent realtime dictation sessions for this workspace has been reached',
        });
        client.disconnect();
        return false;
      }

      incr(sessionsPerUser, userId);
      incr(sessionsPerWorkspace, workspaceId);
      // Remember exactly what we counted so disconnect decrements symmetrically.
      data.countedUserId = userId;
      data.countedWorkspaceId = workspaceId;
      return true;
    } catch (err) {
      // Auth failure (or any unexpected connect error): never leak details.
      this.logger.error('Realtime dictation connection rejected', err as Error);
      client.emit('error', { message: 'Unauthorized' });
      client.disconnect();
      return false;
    }
  }

  /**
   * `start` handler: open the upstream session for this socket's workspace and
   * wire its normalized events back to the client.
   *
   * Rejects a second `start` while a session is open or still being opened,
   * waits for admission to finish, and closes the upstream again if the client
   * disconnected while it was being opened.
   */
  @SubscribeMessage('start')
  async handleStart(
    client: Socket,
    data?: { language?: string },
  ): Promise<void> {
    const state = client.data as RealtimeClientData;

    // Guard double-start: a session is already open, or one is being opened.
    if (state.handle || this.starting.has(client)) {
      client.emit('error', {
        message: 'A realtime dictation session is already in progress',
      });
      return;
    }

    // Fail closed for a socket that never went through handleConnection.
    const admission = this.admissions.get(client);
    if (!admission) {
      client.emit('error', { message: 'Unauthorized' });
      return;
    }

    this.starting.add(client);
    try {
      // A rejected socket has already been told why and disconnected.
      const admitted = await admission;
      if (!admitted || client.disconnected) {
        return;
      }

      const handle = await this.aiRealtimeService.openSession(
        state.workspaceId,
        {
          language: data?.language,
          onReady: () => client.emit('ready', {}),
          onInterim: (itemId, text) => client.emit('interim', { itemId, text }),
          onFinal: (itemId, text) => client.emit('final', { itemId, text }),
          onError: (message) => client.emit('error', { message }),
          onClosed: () => {
            // Session ended (graceful stop, idle/max-duration, or upstream close):
            // clear the handle so the double-start guard is released, then notify.
            state.handle = undefined;
            client.emit('closed', {});
          },
        },
      );

      // The client left while the upstream was opening: handleDisconnect ran
      // with no handle to close, so tear the session down here.
      if (client.disconnected) {
        handle.close();
        return;
      }
      state.handle = handle;
    } catch (err) {
      // Concrete reason to the client: a not-configured 503 vs a provider error.
      this.logger.error('Failed to open realtime dictation session', err as Error);
      const message =
        err instanceof AiSttNotConfiguredException
          ? err.message
          : describeProviderError(err, 'Failed to start realtime dictation');
      client.emit('error', { message });
    } finally {
      this.starting.delete(client);
    }
  }

  /** `audio` handler: forward one binary chunk to the open session, if any. */
  @SubscribeMessage('audio')
  handleAudio(client: Socket, payload: unknown): void {
    const state = client.data as RealtimeClientData;
    if (!state.handle) return;
    const chunk = AiRealtimeGateway.toBuffer(payload);
    if (!chunk) return;
    state.handle.appendAudio(chunk);
  }

  /** `stop` handler: ask the open session (if any) to stop. */
  @SubscribeMessage('stop')
  handleStop(client: Socket): void {
    const state = client.data as RealtimeClientData;
    state.handle?.stop();
  }

  /** Disconnect hook: close the upstream session and release counted slots. */
  handleDisconnect(client: Socket): void {
    const state = client.data as RealtimeClientData;
    // Tear down the upstream session, then release the concurrency slots we took.
    state.handle?.close();
    state.handle = undefined;
    if (state.countedUserId) {
      decr(sessionsPerUser, state.countedUserId);
      state.countedUserId = undefined;
    }
    if (state.countedWorkspaceId) {
      decr(sessionsPerWorkspace, state.countedWorkspaceId);
      state.countedWorkspaceId = undefined;
    }
  }

  /**
   * Normalize an incoming `audio` payload to a Buffer. Socket.IO delivers binary
   * as Buffer (Node) but may also surface Uint8Array / ArrayBuffer; accept all.
   * Returns null for anything we cannot interpret as binary audio.
   */
  private static toBuffer(payload: unknown): Buffer | null {
    if (Buffer.isBuffer(payload)) return payload;
    if (payload instanceof Uint8Array) return Buffer.from(payload);
    if (payload instanceof ArrayBuffer) return Buffer.from(payload);
    return null;
  }
}

View File

@@ -0,0 +1,186 @@
import { parseUpstreamEvent } from './ai-realtime.service';
/**
 * Unit tests for the PURE `parseUpstreamEvent` normalizer (no network). They
 * feed synthetic OpenAI GA frames through a shared per-item delta accumulator
 * and assert the normalized `/ai-realtime` outputs, including that two deltas
 * for the same item_id accumulate, that the accumulator is cleared once the
 * segment completes, and that unrelated frames leave an in-flight
 * accumulation alone.
 */
describe('parseUpstreamEvent (OpenAI GA → normalized realtime events)', () => {
  const DELTA = 'conversation.item.input_audio_transcription.delta';
  const COMPLETED = 'conversation.item.input_audio_transcription.completed';

  // Fresh accumulator per test so cases cannot leak state into each other.
  let acc: Map<string, string>;

  /** Serialize a synthetic upstream frame and run it through the normalizer. */
  const feed = (frame: Record<string, unknown>) =>
    parseUpstreamEvent(JSON.stringify(frame), acc);

  /** Feed a transcription delta for `itemId`. */
  const feedDelta = (itemId: string, delta: string) =>
    feed({ type: DELTA, item_id: itemId, delta });

  /**
   * Feed a completed frame for `itemId`. An undefined `transcript` is dropped
   * by JSON.stringify, i.e. the frame omits the field entirely.
   */
  const feedCompleted = (itemId: string, transcript?: string) =>
    feed({ type: COMPLETED, item_id: itemId, transcript });

  beforeEach(() => {
    acc = new Map<string, string>();
  });

  it('maps session.created / session.updated to { type: "ready" }', () => {
    expect(feed({ type: 'session.created' })).toEqual({ type: 'ready' });
    expect(feed({ type: 'session.updated' })).toEqual({ type: 'ready' });
    // No accumulator side effects from session frames.
    expect(acc.size).toBe(0);
  });

  it('accumulates two deltas for the same item_id into the interim text', () => {
    expect(feedDelta('item-1', 'Hello')).toEqual({
      type: 'interim',
      itemId: 'item-1',
      text: 'Hello',
    });
    // The second delta appends to the first: the interim is the full running text.
    expect(feedDelta('item-1', ' world')).toEqual({
      type: 'interim',
      itemId: 'item-1',
      text: 'Hello world',
    });
    expect(acc.get('item-1')).toBe('Hello world');
  });

  it('emits a trimmed final from the completed transcript and clears the accumulator', () => {
    // Seed an in-flight accumulation, then complete it.
    feedDelta('item-2', 'partial');
    expect(acc.has('item-2')).toBe(true);

    expect(feedCompleted('item-2', '  Final transcript.  ')).toEqual({
      type: 'final',
      itemId: 'item-2',
      text: 'Final transcript.',
    });
    // The accumulator entry for the completed segment is removed.
    expect(acc.has('item-2')).toBe(false);
  });

  it('falls back to the accumulated text when completed omits the transcript', () => {
    feedDelta('item-3', 'accumulated only');

    expect(feedCompleted('item-3')).toEqual({
      type: 'final',
      itemId: 'item-3',
      text: 'accumulated only',
    });
    expect(acc.has('item-3')).toBe(false);
  });

  it('maps an error frame to { type: "error" } with the provider message', () => {
    const out = feed({
      type: 'error',
      error: { message: 'invalid_api_key', code: 'invalid', type: 'auth' },
    });
    expect(out.type).toBe('error');
    expect(out.message).toBe('invalid_api_key');
  });

  it('maps an unknown frame to { type: "ignore" }', () => {
    // Start a segment so there IS a running accumulation to protect.
    feedDelta('item-4', 'in flight');

    expect(feed({ type: 'response.created' })).toEqual({ type: 'ignore' });
    // An unknown frame leaves a running accumulation untouched.
    expect(acc.size).toBe(1);
    expect(acc.get('item-4')).toBe('in flight');
  });

  it('maps an unparseable (non-JSON) frame to { type: "ignore" }', () => {
    expect(parseUpstreamEvent('not json', acc)).toEqual({ type: 'ignore' });
  });

  it('runs the full GA sequence end-to-end and ends with a clean accumulator', () => {
    // session.created → two deltas (same item) → completed → error → unknown.
    expect(feed({ type: 'session.created' })).toEqual({ type: 'ready' });
    expect(feedDelta('seg', 'one ')).toEqual({
      type: 'interim',
      itemId: 'seg',
      text: 'one ',
    });
    expect(feedDelta('seg', 'two')).toEqual({
      type: 'interim',
      itemId: 'seg',
      text: 'one two',
    });
    expect(feedCompleted('seg', 'one two')).toEqual({
      type: 'final',
      itemId: 'seg',
      text: 'one two',
    });
    expect(feed({ type: 'error', error: { message: 'boom' } })).toEqual({
      type: 'error',
      message: 'boom',
    });
    expect(feed({ type: 'whatever' })).toEqual({ type: 'ignore' });
    // Every started segment was completed → the accumulator is empty.
    expect(acc.size).toBe(0);
  });
});

View File

@@ -0,0 +1,485 @@
import { Injectable, Logger } from '@nestjs/common';
import WebSocket from 'ws';
import { AiSettingsService } from '../../../integrations/ai/ai-settings.service';
import { AiSttNotConfiguredException } from '../../../integrations/ai/ai-stt-not-configured.exception';
import { describeProviderError } from '../../../integrations/ai/ai-error.util';
import { isUrlAllowed } from '../external-mcp/ssrf-guard';
/**
* Realtime STT proxy (server side of the A2 transport: browser ↔ OUR server ↔
* OpenAI). The provider API key is resolved here and NEVER leaves the server /
* NEVER logged. The client only ever sees the normalized events emitted via the
* callbacks below — never the raw OpenAI GA schema.
*
* The upstream contract is the GA (2026) OpenAI Realtime transcription shape:
* wss://<host>/v1/realtime?intent=transcription
* header: Authorization: Bearer <sttApiKey> (NO OpenAI-Beta header in GA)
* one session.update after open, then input_audio_buffer.append frames.
*/
/**
 * Normalized result of parsing a single raw upstream (OpenAI GA) event.
 *
 * This is the return shape of `parseUpstreamEvent`. Which optional fields are
 * populated depends on `type`; see the per-field notes below.
 */
export interface ParsedUpstreamEvent {
/** Normalized event kind. `ignore` marks a frame the proxy should skip. */
type: 'ready' | 'interim' | 'final' | 'error' | 'ignore';
/** Upstream `item_id` of the segment; populated for `interim` and `final`. */
itemId?: string;
/**
 * Segment text; populated for `interim` (running accumulated text) and
 * `final` (trimmed transcript).
 */
text?: string;
/** Failure reason; populated for `error`. */
message?: string;
}
/**
 * Callbacks the gateway supplies to bridge upstream events to the client,
 * plus the optional per-session language hint.
 */
export interface OpenSessionOptions {
/** Optional transcription language hint (e.g. 'en'); omitted from session.update when absent. */
language?: string;
/** Upstream session is live → client may start sending audio. */
onReady: () => void;
/** Latest accumulated partial text for a not-yet-final segment. */
onInterim: (itemId: string, text: string) => void;
/** A completed segment's final (trimmed) transcript. */
onFinal: (itemId: string, text: string) => void;
/**
 * Concrete error reason for the client. The session is torn down right after
 * this is reported, so `onClosed` follows.
 */
onError: (message: string) => void;
/** Session ended (graceful stop or upstream close). Fired once per session. */
onClosed: () => void;
}
/**
 * Handle returned by openSession; the gateway drives audio/stop/close through it.
 * All three operations are safe to call after the session has already ended.
 */
export interface RealtimeSessionHandle {
/**
 * Base64-encode a PCM16 chunk and forward as input_audio_buffer.append (if upstream OPEN).
 * The chunk is dropped silently when the session is closed or the upstream
 * socket is not OPEN. A forwarded chunk pushes the idle deadline out.
 */
appendAudio: (chunk: Buffer | Uint8Array) => void;
/** Graceful stop: optionally commit, then close the upstream. */
stop: () => void;
/** Force-close the upstream and clear timers (idempotent). */
close: () => void;
}
/** Inactivity window: a session with no appended audio for this long is closed with a clear reason. */
const IDLE_TIMEOUT_MS = 15 * 1_000;
/** Upper bound on the lifetime of one realtime session (mirrors the client's 120s). */
const MAX_SESSION_DURATION_MS = 2 * 60 * 1_000;
/** Budget testConnection gives the upstream to become ready before reporting failure. */
const TEST_CONNECTION_TIMEOUT_MS = 8 * 1_000;
/**
 * Parse ONE raw upstream (OpenAI GA) event JSON and normalize it, updating the
 * per-item delta accumulator `acc` in place. PURE (aside from the supplied `acc`
 * mutation) so it can be unit-tested without any network. Unknown or unparseable
 * frames normalize to { type: 'ignore' } so the proxy silently skips them.
 *
 * The frame is untrusted wire data, so every field is type-checked before use:
 * a malformed frame (e.g. a non-string `transcript`) must never throw, because
 * this runs inside the upstream socket's message handler.
 *
 * - session.created / session.updated → { type: 'ready' }
 * - conversation.item.input_audio_transcription.delta → append delta to
 *   acc[item_id]; return { type: 'interim', itemId, text: <accumulated> }
 * - conversation.item.input_audio_transcription.completed → final transcript
 *   (trimmed), delete acc[item_id]; return { type: 'final', itemId, text }
 * - error → { type: 'error', message } (provider message, else describeProviderError)
 * - anything else / unparseable → { type: 'ignore' }
 *
 * @param raw The upstream frame as a JSON string.
 * @param acc Running text per in-flight `item_id`; mutated in place.
 * @returns The normalized event; never throws on malformed input.
 */
export function parseUpstreamEvent(
  raw: string,
  acc: Map<string, string>,
): ParsedUpstreamEvent {
  let frame: unknown;
  try {
    frame = JSON.parse(raw);
  } catch {
    // Non-JSON frame: ignore rather than crash the proxy.
    return { type: 'ignore' };
  }
  if (typeof frame !== 'object' || frame === null) {
    return { type: 'ignore' };
  }
  const evt = frame as Record<string, unknown>;

  // Wire fields are only used when they really are strings.
  const asString = (value: unknown): string | undefined =>
    typeof value === 'string' ? value : undefined;

  switch (evt.type) {
    case 'session.created':
    case 'session.updated':
      return { type: 'ready' };

    case 'conversation.item.input_audio_transcription.delta': {
      const itemId = asString(evt.item_id);
      if (!itemId) return { type: 'ignore' };
      // A missing or non-string delta contributes nothing to the running text.
      const text = (acc.get(itemId) ?? '') + (asString(evt.delta) ?? '');
      acc.set(itemId, text);
      return { type: 'interim', itemId, text };
    }

    case 'conversation.item.input_audio_transcription.completed': {
      const itemId = asString(evt.item_id);
      if (!itemId) return { type: 'ignore' };
      // Prefer the authoritative `transcript`; fall back to whatever we
      // accumulated from deltas if the completed frame omits it (or sends a
      // non-string value).
      const text = (asString(evt.transcript) ?? acc.get(itemId) ?? '').trim();
      acc.delete(itemId);
      return { type: 'final', itemId, text };
    }

    case 'error': {
      // Surface the provider's concrete cause; never a generic message.
      const detail =
        typeof evt.error === 'object' && evt.error !== null
          ? (evt.error as Record<string, unknown>)
          : undefined;
      const message =
        asString(detail?.message)?.trim() ||
        describeProviderError(evt.error, 'Realtime transcription error');
      return { type: 'error', message };
    }

    default:
      return { type: 'ignore' };
  }
}
/**
 * Server-side realtime STT proxy: opens the upstream realtime WebSocket for a
 * workspace, forwards audio to it and translates its events into the
 * normalized callbacks of `OpenSessionOptions`.
 */
@Injectable()
export class AiRealtimeService {
  /** Upstream used when no usable base URL is configured. */
  private static readonly DEFAULT_REALTIME_URL =
    'wss://api.openai.com/v1/realtime?intent=transcription';

  /**
   * After stop(): how long the session keeps listening for the committed
   * tail's final transcript before the upstream is closed regardless.
   */
  private static readonly STOP_DRAIN_TIMEOUT_MS = 3_000;

  private readonly logger = new Logger(AiRealtimeService.name);

  constructor(private readonly aiSettings: AiSettingsService) {}

  /**
   * Resolve the workspace STT config, SSRF-check the upstream, open the upstream
   * realtime WS and wire its events to the supplied callbacks. Returns a handle
   * the caller uses to push audio / stop / close.
   *
   * Lifecycle guarantees:
   * - `onReady` fires at most once (the upstream may announce readiness via both
   *   session.created and session.updated).
   * - `onClosed` fires exactly once, whatever ends the session.
   * - `stop()` commits the buffered tail and then DRAINS: the upstream stays
   *   open until the pending final transcript arrives (or
   *   STOP_DRAIN_TIMEOUT_MS elapses), so the last spoken segment is not lost.
   *   While draining, failures end the session without `onError`.
   * - `close()` always tears down immediately.
   *
   * @param workspaceId Workspace whose AI settings select model/base URL/key.
   * @param opts Callbacks (and optional language hint) for this session.
   * @returns Handle to push audio, stop gracefully, or force-close.
   * @throws AiSttNotConfiguredException when no driver/STT model is configured.
   * @throws Error (with a concrete reason) when the SSRF check fails.
   */
  async openSession(
    workspaceId: string,
    opts: OpenSessionOptions,
  ): Promise<RealtimeSessionHandle> {
    const cfg = await this.aiSettings.resolve(workspaceId);
    const model = cfg?.sttRealtimeModel || cfg?.sttModel;
    if (!cfg?.driver || !model) {
      throw new AiSttNotConfiguredException();
    }
    const baseUrl = cfg.sttRealtimeBaseUrl || cfg.sttBaseUrl || cfg.baseUrl;
    const wssUrl = AiRealtimeService.deriveRealtimeUrl(baseUrl);

    // SSRF check on the http(s) equivalent (ssrf-guard only allows http/https):
    // wss→https, ws→http. Re-checked here, right before connecting.
    // NOTE(review): the WebSocket below connects by hostname, so the name is
    // presumably resolved again after this check — confirm whether that leaves
    // a residual DNS-rebinding window.
    const httpEquivalent = wssUrl
      .replace(/^wss:/i, 'https:')
      .replace(/^ws:/i, 'http:');
    const check = await isUrlAllowed(httpEquivalent);
    if (!check.ok) {
      throw new Error(
        `Realtime endpoint blocked by SSRF guard: ${check.reason ?? 'not allowed'}`,
      );
    }

    const key = cfg.sttApiKey;
    // Never log the key.
    this.logger.log(`Opening realtime STT session for workspace ${workspaceId}`);
    const ws = new WebSocket(wssUrl, {
      headers: key ? { Authorization: `Bearer ${key}` } : {},
      // DO NOT send OpenAI-Beta: realtime=v1 — removed in GA.
    });

    let closed = false;
    // True between a successful stop() commit and the final teardown.
    let stopping = false;
    let readyNotified = false;
    let idleTimer: NodeJS.Timeout | undefined;
    let maxTimer: NodeJS.Timeout | undefined;
    let drainTimer: NodeJS.Timeout | undefined;

    const clearTimers = (): void => {
      for (const timer of [idleTimer, maxTimer, drainTimer]) {
        if (timer) clearTimeout(timer);
      }
      idleTimer = undefined;
      maxTimer = undefined;
      drainTimer = undefined;
    };

    // Idempotent teardown: clears timers, force-closes the upstream, fires
    // onClosed exactly once.
    const teardown = (): void => {
      if (closed) return;
      closed = true;
      clearTimers();
      try {
        if (
          ws.readyState === WebSocket.OPEN ||
          ws.readyState === WebSocket.CONNECTING
        ) {
          ws.close();
        }
      } catch {
        // Ignore close races; the socket is being discarded anyway.
      }
      opts.onClosed();
    };

    const failWith = (message: string): void => {
      if (closed) return;
      // Once the caller asked to stop, a failure merely ends the drain early;
      // it is not an error the client needs to hear about.
      if (!stopping) opts.onError(message);
      teardown();
    };

    const resetIdleTimer = (): void => {
      if (closed || stopping) return;
      if (idleTimer) clearTimeout(idleTimer);
      idleTimer = setTimeout(() => {
        failWith(
          `Realtime session idle for ${IDLE_TIMEOUT_MS}ms (no audio received); closing.`,
        );
      }, IDLE_TIMEOUT_MS);
    };

    // Hard lifetime cap, armed immediately so a never-opening or runaway session
    // is always reclaimed.
    maxTimer = setTimeout(() => {
      failWith(
        `Realtime session exceeded the maximum duration of ${MAX_SESSION_DURATION_MS}ms; closing.`,
      );
    }, MAX_SESSION_DURATION_MS);

    // Also guard the handshake itself: if the upstream never opens / never sends,
    // the idle timer reclaims it well before the max-duration cap.
    resetIdleTimer();

    const acc = new Map<string, string>();

    ws.on('open', () => {
      if (closed) return;
      // GA session.update: declare the transcription session, PCM16/24kHz mono
      // input, server VAD auto-segmentation, the effective model and (optional)
      // language. `language` is included only when the client supplied one.
      const transcription: { model: string; language?: string } = { model };
      if (opts.language) transcription.language = opts.language;
      const sessionUpdate = {
        type: 'session.update',
        session: {
          type: 'transcription',
          audio: {
            input: {
              format: { type: 'audio/pcm', rate: 24000 },
              turn_detection: { type: 'server_vad' },
              transcription,
            },
          },
        },
      };
      try {
        ws.send(JSON.stringify(sessionUpdate));
      } catch (err) {
        this.logger.error('Failed to send realtime session.update', err as Error);
        failWith(describeProviderError(err, 'Failed to start realtime session'));
        return;
      }
      // Start the idle clock once the upstream is live.
      resetIdleTimer();
    });

    ws.on('message', (data: WebSocket.RawData) => {
      if (closed) return;
      const raw = AiRealtimeService.rawDataToString(data);
      const parsed = parseUpstreamEvent(raw, acc);
      switch (parsed.type) {
        case 'ready':
          // session.created and session.updated both normalize to 'ready';
          // announce readiness only once, and never while winding down.
          if (!readyNotified && !stopping) {
            readyNotified = true;
            opts.onReady();
          }
          break;
        case 'interim':
          // Partials are pointless once the caller stopped; only finals matter.
          if (parsed.itemId && !stopping) {
            opts.onInterim(parsed.itemId, parsed.text ?? '');
          }
          break;
        case 'final':
          if (parsed.itemId) opts.onFinal(parsed.itemId, parsed.text ?? '');
          // Draining after stop(): close as soon as no segment is in flight.
          if (stopping && acc.size === 0) teardown();
          break;
        case 'error':
          // Log the upstream error then surface the concrete reason.
          this.logger.error(`Realtime upstream error: ${parsed.message}`);
          failWith(parsed.message ?? 'Realtime transcription error');
          break;
        case 'ignore':
        default:
          break;
      }
    });

    ws.on('error', (err: Error) => {
      // Errors emitted for a socket we already tore down are expected noise.
      if (closed) return;
      // Log the full error (name/message/stack); never the key/audio.
      this.logger.error('Realtime upstream socket error', err);
      failWith(describeProviderError(err, 'Realtime upstream connection error'));
    });

    ws.on('close', (code: number, reason: Buffer) => {
      if (closed) return;
      const why = reason?.toString?.() || '';
      const summary = `Realtime upstream closed (code ${code}${why ? `: ${why}` : ''})`;
      // An unexpected close (not via stop()/teardown) is reported as a concrete
      // reason; onClosed always fires via teardown.
      this.logger.log(summary);
      if (code !== 1000) {
        failWith(`${summary}.`);
        return;
      }
      teardown();
    });

    return {
      appendAudio: (chunk: Buffer | Uint8Array): void => {
        if (closed || stopping || ws.readyState !== WebSocket.OPEN) return;
        const audio = Buffer.from(chunk).toString('base64');
        try {
          ws.send(JSON.stringify({ type: 'input_audio_buffer.append', audio }));
        } catch (err) {
          this.logger.error('Failed to forward realtime audio chunk', err as Error);
          failWith(describeProviderError(err, 'Failed to forward audio'));
          return;
        }
        // Audio flowing again → push the idle deadline out.
        resetIdleTimer();
      },
      stop: (): void => {
        if (closed || stopping) return;
        if (ws.readyState !== WebSocket.OPEN) {
          // Nothing was (or can be) buffered upstream: just close.
          teardown();
          return;
        }
        // Graceful stop: with server_vad no manual commit is required, but an
        // explicit commit flushes any buffered tail.
        try {
          ws.send(JSON.stringify({ type: 'input_audio_buffer.commit' }));
        } catch (err) {
          // A failed commit is non-fatal; there is nothing to wait for, so close.
          this.logger.error('Failed to commit realtime audio buffer', err as Error);
          teardown();
          return;
        }
        // Keep the upstream open until the tail's final arrives; closing right
        // after the commit would discard the transcript it was meant to flush.
        stopping = true;
        if (idleTimer) {
          clearTimeout(idleTimer);
          idleTimer = undefined;
        }
        drainTimer = setTimeout(teardown, AiRealtimeService.STOP_DRAIN_TIMEOUT_MS);
      },
      close: (): void => {
        teardown();
      },
    };
  }

  /**
   * Admin "test connection" probe for the realtime STT upstream. Reuses
   * openSession so the real config-resolution, SSRF check and handshake path are
   * exercised, then tears the upstream socket down immediately — no audio is ever
   * sent. Resolves to the FROZEN contract { ok: true } | { ok: false, error }.
   *
   * Resolution rules (settle exactly once, guarded by `settled`):
   * - first onReady → { ok: true }
   * - first onError(message) → { ok: false, error: message }
   * - ~8s timeout → { ok: false, error: 'Realtime connection timed out' }
   * - openSession(...) throws → { ok: false, error } (AiSttNotConfigured message,
   *   else describeProviderError)
   *
   * On any outcome the upstream handle is closed and the timer cleared exactly
   * once, so this never leaves a socket open. The API key is never logged.
   *
   * @param workspaceId Workspace whose realtime STT settings are probed.
   * @returns The probe outcome; this promise never rejects.
   */
  async testConnection(
    workspaceId: string,
  ): Promise<{ ok: true } | { ok: false; error: string }> {
    return new Promise<{ ok: true } | { ok: false; error: string }>(
      (resolve) => {
        let settled = false;
        let handle: RealtimeSessionHandle | undefined;
        let timer: NodeJS.Timeout | undefined;

        // Settle once: clear the timer, close the upstream handle, resolve.
        const finish = (
          result: { ok: true } | { ok: false; error: string },
        ): void => {
          if (settled) return;
          settled = true;
          if (timer) {
            clearTimeout(timer);
            timer = undefined;
          }
          try {
            handle?.close();
          } catch {
            // Ignore close races; the socket is being discarded anyway.
          }
          resolve(result);
        };

        // Arm the timeout before opening so a never-readying upstream is reclaimed.
        timer = setTimeout(() => {
          finish({ ok: false, error: 'Realtime connection timed out' });
        }, TEST_CONNECTION_TIMEOUT_MS);

        this.openSession(workspaceId, {
          onReady: () => finish({ ok: true }),
          onError: (message) => finish({ ok: false, error: message }),
          // No audio is ever sent; these are no-ops for the probe.
          onInterim: () => {},
          onFinal: () => {},
          onClosed: () => {},
        })
          .then((opened) => {
            handle = opened;
            // The probe may already have settled (e.g. the timeout fired while
            // openSession was still resolving); if so, close the handle now.
            if (settled) {
              try {
                handle.close();
              } catch {
                // Ignore close races.
              }
            }
          })
          .catch((err: unknown) => {
            // openSession threw (AiSttNotConfiguredException or SSRF/Error)
            // before any socket was returned: surface a concrete reason.
            const error =
              err instanceof AiSttNotConfiguredException
                ? err.message
                : describeProviderError(err, 'Realtime connection failed');
            finish({ ok: false, error });
          });
      },
    );
  }

  /**
   * Derive the upstream realtime WSS URL from the (optional) effective base URL.
   *
   * - No base URL → OpenAI default
   *   `wss://api.openai.com/v1/realtime?intent=transcription`.
   * - A base without a scheme (`api.example.com/v1`, `localhost:8080`) is read
   *   as https. Without this, `new URL` either throws or mistakes the host for
   *   a scheme and yields a host-less URL.
   * - A base that still has no host after that → OpenAI default.
   * - Otherwise: take the base host, ensure exactly one
   *   `/v1/realtime?intent=transcription` path, and map the scheme to ws/wss
   *   (http/ws → ws; anything else → wss). A base that already ends in `/v1`
   *   (or `/v1/realtime`) does not get a duplicated `/v1`. Any query string on
   *   the base is dropped.
   *
   * @param baseUrl Effective base URL from settings, if any.
   * @returns The realtime WebSocket URL to connect to.
   */
  static deriveRealtimeUrl(baseUrl?: string): string {
    const trimmed = baseUrl?.trim();
    if (!trimmed) {
      return AiRealtimeService.DEFAULT_REALTIME_URL;
    }

    const tryParse = (candidate: string): URL | undefined => {
      try {
        return new URL(candidate);
      } catch {
        return undefined;
      }
    };

    let parsed = tryParse(trimmed);
    if (!parsed?.host && !trimmed.includes('://')) {
      // Scheme-less input: interpret it as an https origin.
      parsed = tryParse(`https://${trimmed}`);
    }
    if (!parsed?.host) {
      // Unusable base: fall back to the OpenAI default rather than throwing
      // here; the SSRF check on the default still applies downstream.
      return AiRealtimeService.DEFAULT_REALTIME_URL;
    }

    // Normalize the path: strip trailing slashes, drop an existing
    // `/realtime` suffix, ensure a single `/v1`, then append the realtime path.
    let path = parsed.pathname.replace(/\/+$/, '');
    path = path.replace(/\/realtime$/i, '');
    if (!/\/v1$/i.test(path)) {
      path = `${path}/v1`;
    }
    path = `${path}/realtime`;

    // Scheme → wss (secure) / ws (insecure). The SSRF guard runs on the
    // http(s) equivalent before connecting.
    const insecure = parsed.protocol === 'http:' || parsed.protocol === 'ws:';
    const scheme = insecure ? 'ws' : 'wss';
    return `${scheme}://${parsed.host}${path}?intent=transcription`;
  }

  /** Normalize a ws RawData payload (Buffer | ArrayBuffer | Buffer[]) to a string. */
  private static rawDataToString(data: WebSocket.RawData): string {
    if (typeof data === 'string') return data;
    if (Buffer.isBuffer(data)) return data.toString('utf8');
    if (Array.isArray(data)) return Buffer.concat(data).toString('utf8');
    // ArrayBuffer
    return Buffer.from(data as ArrayBuffer).toString('utf8');
  }
}

View File

@@ -55,6 +55,10 @@ export class UpdateWorkspaceDto extends PartialType(CreateWorkspaceDto) {
@IsBoolean() @IsBoolean()
aiDictation: boolean; aiDictation: boolean;
@IsOptional()
@IsBoolean()
aiDictationRealtime: boolean;
// Workspace master toggle that enables/disables the HTML embed block type. // Workspace master toggle that enables/disables the HTML embed block type.
// Persisted at settings.htmlEmbed. ABSENT/false => OFF (default). The block // Persisted at settings.htmlEmbed. ABSENT/false => OFF (default). The block
// itself renders in a sandboxed iframe, so this is a feature switch, not a // itself renders in a sandboxed iframe, so this is a feature switch, not a

View File

@@ -511,6 +511,20 @@ export class WorkspaceService {
); );
} }
if (typeof updateWorkspaceDto.aiDictationRealtime !== 'undefined') {
const prev = settingsBefore?.ai?.dictationRealtime ?? false;
if (prev !== updateWorkspaceDto.aiDictationRealtime) {
before.aiDictationRealtime = prev;
after.aiDictationRealtime = updateWorkspaceDto.aiDictationRealtime;
}
await this.workspaceRepo.updateAiSettings(
workspaceId,
'dictationRealtime',
updateWorkspaceDto.aiDictationRealtime,
trx,
);
}
if (typeof updateWorkspaceDto.htmlEmbed !== 'undefined') { if (typeof updateWorkspaceDto.htmlEmbed !== 'undefined') {
const prev = settingsBefore?.htmlEmbed ?? false; const prev = settingsBefore?.htmlEmbed ?? false;
if (prev !== updateWorkspaceDto.htmlEmbed) { if (prev !== updateWorkspaceDto.htmlEmbed) {
@@ -564,6 +578,7 @@ export class WorkspaceService {
delete updateWorkspaceDto.allowMemberTemplates; delete updateWorkspaceDto.allowMemberTemplates;
delete updateWorkspaceDto.aiChat; delete updateWorkspaceDto.aiChat;
delete updateWorkspaceDto.aiDictation; delete updateWorkspaceDto.aiDictation;
delete updateWorkspaceDto.aiDictationRealtime;
delete updateWorkspaceDto.htmlEmbed; delete updateWorkspaceDto.htmlEmbed;
delete updateWorkspaceDto.trackerHead; delete updateWorkspaceDto.trackerHead;
delete updateWorkspaceDto.aiPublicShareAssistant; delete updateWorkspaceDto.aiPublicShareAssistant;

View File

@@ -239,7 +239,7 @@ export class WorkspaceRepo {
// is a real jsonb object, never a double-encoded string. The CASE self-heals // is a real jsonb object, never a double-encoded string. The CASE self-heals
// workspaces whose settings.ai.provider was previously corrupted into an // workspaces whose settings.ai.provider was previously corrupted into an
// array/string. // array/string.
const ALLOWED = ['driver', 'chatModel', 'embeddingModel', 'baseUrl', 'embeddingBaseUrl', 'sttModel', 'sttBaseUrl', 'sttApiStyle', 'systemPrompt', 'publicShareChatModel', 'publicShareAssistantRoleId']; const ALLOWED = ['driver', 'chatModel', 'embeddingModel', 'baseUrl', 'embeddingBaseUrl', 'sttModel', 'sttBaseUrl', 'sttRealtimeModel', 'sttRealtimeBaseUrl', 'sttApiStyle', 'systemPrompt', 'publicShareChatModel', 'publicShareAssistantRoleId'];
const entries = Object.entries(provider).filter( const entries = Object.entries(provider).filter(
([k, v]) => v !== undefined && ALLOWED.includes(k), ([k, v]) => v !== undefined && ALLOWED.includes(k),
); );

View File

@@ -32,6 +32,8 @@ export interface UpdateAiSettingsInput {
embeddingApiKey?: string; embeddingApiKey?: string;
sttModel?: string; sttModel?: string;
sttBaseUrl?: string; sttBaseUrl?: string;
sttRealtimeModel?: string;
sttRealtimeBaseUrl?: string;
sttApiStyle?: SttApiStyle; sttApiStyle?: SttApiStyle;
sttApiKey?: string; sttApiKey?: string;
publicShareChatModel?: string; publicShareChatModel?: string;
@@ -163,6 +165,10 @@ export class AiSettingsService {
publicShareAssistantRoleId: provider.publicShareAssistantRoleId, publicShareAssistantRoleId: provider.publicShareAssistantRoleId,
embeddingModel: provider.embeddingModel, embeddingModel: provider.embeddingModel,
sttModel: provider.sttModel, sttModel: provider.sttModel,
// Raw passthrough, NO fallback; the realtime consumer falls back to
// `sttModel` / (`sttBaseUrl` || `baseUrl`) at use time.
sttRealtimeModel: provider.sttRealtimeModel,
sttRealtimeBaseUrl: provider.sttRealtimeBaseUrl,
// Plain passthrough, no fallback; the transcribe path defaults unset to // Plain passthrough, no fallback; the transcribe path defaults unset to
// 'multipart' (current behavior). // 'multipart' (current behavior).
sttApiStyle: provider.sttApiStyle, sttApiStyle: provider.sttApiStyle,
@@ -239,6 +245,8 @@ export class AiSettingsService {
embeddingBaseUrl: provider.embeddingBaseUrl, embeddingBaseUrl: provider.embeddingBaseUrl,
sttModel: provider.sttModel, sttModel: provider.sttModel,
sttBaseUrl: provider.sttBaseUrl, sttBaseUrl: provider.sttBaseUrl,
sttRealtimeModel: provider.sttRealtimeModel,
sttRealtimeBaseUrl: provider.sttRealtimeBaseUrl,
sttApiStyle: provider.sttApiStyle, sttApiStyle: provider.sttApiStyle,
systemPrompt: provider.systemPrompt, systemPrompt: provider.systemPrompt,
publicShareChatModel: provider.publicShareChatModel, publicShareChatModel: provider.publicShareChatModel,
@@ -278,6 +286,8 @@ export class AiSettingsService {
'embeddingBaseUrl', 'embeddingBaseUrl',
'sttModel', 'sttModel',
'sttBaseUrl', 'sttBaseUrl',
'sttRealtimeModel',
'sttRealtimeBaseUrl',
'sttApiStyle', 'sttApiStyle',
'systemPrompt', 'systemPrompt',
'publicShareChatModel', 'publicShareChatModel',

View File

@@ -30,6 +30,11 @@ export interface AiProviderSettings {
sttModel?: string; sttModel?: string;
// STT-specific base URL. Falls back to baseUrl when empty/unset. // STT-specific base URL. Falls back to baseUrl when empty/unset.
sttBaseUrl?: string; sttBaseUrl?: string;
// Realtime STT model id. Falls back to `sttModel` at use time when empty/unset.
sttRealtimeModel?: string;
// Realtime STT base URL. Falls back to `sttBaseUrl` || `baseUrl` at use time
// when empty/unset.
sttRealtimeBaseUrl?: string;
sttApiStyle?: SttApiStyle; sttApiStyle?: SttApiStyle;
systemPrompt?: string; systemPrompt?: string;
// Cheap chat model id used ONLY by the anonymous public-share assistant. The // Cheap chat model id used ONLY by the anonymous public-share assistant. The
@@ -79,6 +84,8 @@ export interface MaskedAiSettings {
embeddingBaseUrl?: string; embeddingBaseUrl?: string;
sttModel?: string; sttModel?: string;
sttBaseUrl?: string; sttBaseUrl?: string;
sttRealtimeModel?: string;
sttRealtimeBaseUrl?: string;
sttApiStyle?: SttApiStyle; sttApiStyle?: SttApiStyle;
systemPrompt?: string; systemPrompt?: string;
publicShareChatModel?: string; publicShareChatModel?: string;

View File

@@ -50,6 +50,14 @@ export class UpdateAiSettingsDto {
@IsString() @IsString()
sttBaseUrl?: string; sttBaseUrl?: string;
@IsOptional()
@IsString()
sttRealtimeModel?: string;
@IsOptional()
@IsString()
sttRealtimeBaseUrl?: string;
@IsOptional() @IsOptional()
@IsIn(STT_API_STYLES) @IsIn(STT_API_STYLES)
sttApiStyle?: SttApiStyle; sttApiStyle?: SttApiStyle;