From 77249d59c6065b12d65312a2dc96c07353059179 Mon Sep 17 00:00:00 2001 From: vvzvlad Date: Thu, 18 Jun 2026 19:26:35 +0300 Subject: [PATCH] feat(ai): OpenRouter STT support + real error surfacing + STT endpoint test - ai.service: route *.openrouter.ai STT to its JSON+base64 /audio/transcriptions API; keep the OpenAI multipart path (AI SDK) for OpenAI/self-hosted whisper. Unify transcription behind transcribe(). - /transcribe controller: surface the real provider/transport reason (describeProviderError) instead of an opaque 500; preserve HttpException. - testConnection: add an 'stt' capability (silent-WAV probe) + DTO; client gets a Test endpoint button and status dot on the Voice/STT card. - useDictation: log full errors to the console and show the real reason (mic start + transcription paths); handle NotReadable/Abort and missing mediaDevices. - docs(CLAUDE.md): require full error logging + specific user-facing messages. --- .../public/locales/en-US/translation.json | 4 +- .../features/dictation/hooks/use-dictation.ts | 57 +++++-- .../components/ai-provider-settings.tsx | 31 +++- .../workspace/services/ai-settings-service.ts | 2 +- .../src/core/ai-chat/ai-chat.controller.ts | 29 +++- .../core/ai-chat/ai-transcription.service.ts | 17 ++- apps/server/src/integrations/ai/ai.service.ts | 140 +++++++++++++++--- .../ai/dto/test-ai-connection.dto.ts | 4 +- 8 files changed, 237 insertions(+), 47 deletions(-) diff --git a/apps/client/public/locales/en-US/translation.json b/apps/client/public/locales/en-US/translation.json index 591b362a..8cfd742c 100644 --- a/apps/client/public/locales/en-US/translation.json +++ b/apps/client/public/locales/en-US/translation.json @@ -1189,5 +1189,7 @@ "No microphone found": "No microphone found", "Could not start recording": "Could not start recording", "Transcription failed": "Transcription failed", - "Voice dictation is not configured": "Voice dictation is not configured" + "Voice dictation is not configured": "Voice dictation is not 
configured", + "Microphone is unavailable or already in use": "Microphone is unavailable or already in use", + "Audio recording is not available in this browser/context": "Audio recording is not available in this browser/context" } diff --git a/apps/client/src/features/dictation/hooks/use-dictation.ts b/apps/client/src/features/dictation/hooks/use-dictation.ts index 059949f0..86af4c78 100644 --- a/apps/client/src/features/dictation/hooks/use-dictation.ts +++ b/apps/client/src/features/dictation/hooks/use-dictation.ts @@ -90,18 +90,37 @@ export function useDictation( if (status !== "idle") return; startingRef.current = true; + if (!navigator.mediaDevices?.getUserMedia) { + const reason = + "navigator.mediaDevices.getUserMedia is unavailable in this context"; + console.error("[dictation] " + reason); + notifications.show({ + color: "red", + message: t("Audio recording is not available in this browser/context"), + }); + setStatus("idle"); + startingRef.current = false; + return; + } + let stream: MediaStream; try { stream = await navigator.mediaDevices.getUserMedia({ audio: true }); } catch (err) { + // Always log the full error for diagnosis (name, message, stack). + console.error("[dictation] getUserMedia failed", err); const name = (err as { name?: string })?.name; + const detail = (err as { message?: string })?.message ?? String(err); let message: string; if (name === "NotAllowedError" || name === "SecurityError") { message = t("Microphone access denied"); } else if (name === "NotFoundError" || name === "OverconstrainedError") { message = t("No microphone found"); + } else if (name === "NotReadableError" || name === "AbortError") { + message = t("Microphone is unavailable or already in use"); } else { - message = t("Could not start recording"); + // Unknown failure: show the real reason instead of a generic string. + message = `${t("Could not start recording")}: ${name ? 
`${name}: ` : ""}${detail}`; } notifications.show({ color: "red", message }); setStatus("idle"); @@ -120,13 +139,14 @@ export function useDictation( stream, mimeType ? { mimeType } : undefined, ); - } catch { + } catch (err) { + console.error("[dictation] MediaRecorder failed", err); // The stream was acquired but the recorder failed to construct; stop the // tracks so the MediaStream does not leak before bailing out. stopTracks(); notifications.show({ color: "red", - message: t("Could not start recording"), + message: `${t("Could not start recording")}: ${(err as { message?: string })?.message ?? String(err)}`, }); setStatus("idle"); startingRef.current = false; @@ -165,17 +185,23 @@ export function useDictation( setStatus("idle"); }) .catch((err: unknown) => { - const httpStatus = (err as { response?: { status?: number } }) - ?.response?.status; - // The server returns 503 when dictation is unconfigured and 403 when - // it is disabled server-side; both map to the same "not configured". - const message = - httpStatus === 503 || httpStatus === 403 - ? t("Voice dictation is not configured") - : t("Transcription failed"); + // Log the full error for diagnosis (status + body + stack). + console.error("[dictation] transcription failed", err); + const resp = ( + err as { response?: { status?: number; data?: { message?: string } } } + )?.response; + const serverMsg = resp?.data?.message; + let message: string; + if (serverMsg && serverMsg.trim().length > 0) { + // The server already explains the cause (e.g. provider 404, bad + // format, STT not configured) — show it verbatim. + message = serverMsg; + } else if (resp?.status === 503 || resp?.status === 403) { + message = t("Voice dictation is not configured"); + } else { + message = `${t("Transcription failed")}: ${(err as { message?: string })?.message ?? String(err)}`; + } notifications.show({ color: "red", message }); - // Surface the error state briefly, then return to idle. 
Store the - // timer so it can be cleared on unmount. setStatus("error"); if (errorTimerRef.current !== null) { clearTimeout(errorTimerRef.current); @@ -192,7 +218,8 @@ export function useDictation( try { optionsRef.current.onStart?.(); recorder.start(); - } catch { + } catch (err) { + console.error("[dictation] MediaRecorder.start failed", err); // recorder.start() can synchronously throw (InvalidStateError / // NotSupportedError); clean up so the button is not left stuck and the // MediaStream does not leak. @@ -201,7 +228,7 @@ export function useDictation( startingRef.current = false; notifications.show({ color: "red", - message: t("Could not start recording"), + message: `${t("Could not start recording")}: ${(err as { message?: string })?.message ?? String(err)}`, }); setStatus("idle"); return; diff --git a/apps/client/src/features/workspace/components/settings/components/ai-provider-settings.tsx b/apps/client/src/features/workspace/components/settings/components/ai-provider-settings.tsx index e39176fd..b908fc03 100644 --- a/apps/client/src/features/workspace/components/settings/components/ai-provider-settings.tsx +++ b/apps/client/src/features/workspace/components/settings/components/ai-provider-settings.tsx @@ -93,9 +93,10 @@ export default function AiProviderSettings() { const updateMutation = useUpdateAiSettingsMutation(); const reindexMutation = useReindexAiEmbeddingsMutation(); - // Two independent test mutations so each card has its own loading + result. + // Independent test mutations so each card has its own loading + result. const chatTest = useTestAiConnectionMutation(); const embedTest = useTestAiConnectionMutation(); + const sttTest = useTestAiConnectionMutation(); // Workspace-level feature toggles live in the card headers. const [workspace, setWorkspace] = useAtom(workspaceAtom); @@ -354,6 +355,11 @@ export default function AiProviderSettings() { ? "ok" : "error" : "idle"; + const sttStatus: CardStatus = sttTest.data + ? sttTest.data.ok + ? 
"ok" + : "error" + : "idle"; const chatResolved = resolveUrl(form.values.baseUrl, "/chat/completions"); const embedResolved = resolveUrl( @@ -617,7 +623,7 @@ export default function AiProviderSettings() { - + {t("Voice / STT")} {t("Resolves to {{url}}", { url: sttResolved })} + + + + {sttTest.data && + (sttTest.data.ok ? ( + + {t("Connection successful")} + + ) : ( + + {sttTest.data.error || t("Connection failed")} + + ))} + {/* Nested: external MCP tools the agent calls out to */} diff --git a/apps/client/src/features/workspace/services/ai-settings-service.ts b/apps/client/src/features/workspace/services/ai-settings-service.ts index 53809ab9..99490189 100644 --- a/apps/client/src/features/workspace/services/ai-settings-service.ts +++ b/apps/client/src/features/workspace/services/ai-settings-service.ts @@ -55,7 +55,7 @@ export interface IAiTestResult { } // Which endpoint a connection test probes. -export type AiTestCapability = "chat" | "embeddings"; +export type AiTestCapability = "chat" | "embeddings" | "stt"; export async function getAiSettings(): Promise { const req = await api.post("/workspace/ai-settings"); diff --git a/apps/server/src/core/ai-chat/ai-chat.controller.ts b/apps/server/src/core/ai-chat/ai-chat.controller.ts index d1007a78..c32e8e3c 100644 --- a/apps/server/src/core/ai-chat/ai-chat.controller.ts +++ b/apps/server/src/core/ai-chat/ai-chat.controller.ts @@ -4,11 +4,13 @@ import { Controller, ForbiddenException, HttpCode, + HttpException, HttpStatus, Logger, Post, Req, Res, + ServiceUnavailableException, UseGuards, UseInterceptors, } from '@nestjs/common'; @@ -32,6 +34,7 @@ import { GetChatMessagesDto, RenameChatDto, } from './dto/ai-chat.dto'; +import { describeProviderError } from '../../integrations/ai/ai-error.util'; /** * Per-user AI chat API (§6.1). 
Routes are POST to match this codebase's @@ -249,7 +252,31 @@ export class AiChatController { } throw err; } - const text = await this.aiTranscription.transcribe(workspace.id, buf); + // Container hint for JSON-style STT providers (e.g. OpenRouter); multipart + // endpoints ignore it. + const formatMap: Record<string, string> = { + 'audio/webm': 'webm', + 'audio/ogg': 'ogg', + 'audio/mp4': 'mp4', + 'audio/mpeg': 'mp3', + 'audio/wav': 'wav', + 'audio/x-wav': 'wav', + 'audio/wave': 'wav', + 'audio/m4a': 'm4a', + 'audio/x-m4a': 'm4a', + }; + const format = formatMap[baseMime] ?? 'webm'; + let text: string; + try { + text = await this.aiTranscription.transcribe(workspace.id, buf, format); + } catch (err) { + // Preserve meaningful HTTP errors (e.g. AiSttNotConfiguredException -> 503). + if (err instanceof HttpException) throw err; + // Log the full error and surface the real provider/transport reason instead + // of an opaque 500 (e.g. "the STT endpoint returned 404 ..."). + this.logger.error('AI transcription failed', err as Error); + throw new ServiceUnavailableException(describeProviderError(err)); + } return { text }; } diff --git a/apps/server/src/core/ai-chat/ai-transcription.service.ts b/apps/server/src/core/ai-chat/ai-transcription.service.ts index 72d3ea9f..b95cbb69 100644 --- a/apps/server/src/core/ai-chat/ai-transcription.service.ts +++ b/apps/server/src/core/ai-chat/ai-transcription.service.ts @@ -1,20 +1,21 @@ import { Injectable } from '@nestjs/common'; -import { experimental_transcribe as transcribe } from 'ai'; import { AiService } from '../../integrations/ai/ai.service'; /** * Transcribes uploaded audio to text using the per-workspace STT model. - * Thin wrapper over the AI SDK's experimental_transcribe; never logs the - * audio or the key. + * Delegates to AiService, which picks the OpenAI-multipart or OpenRouter-JSON + * path. Never logs the audio or the key.
*/ @Injectable() export class AiTranscriptionService { constructor(private readonly ai: AiService) {} - // Transcribe an uploaded audio buffer using the workspace STT model. - async transcribe(workspaceId: string, audio: Uint8Array): Promise<string> { - const model = await this.ai.getTranscriptionModel(workspaceId); - const { text } = await transcribe({ model, audio }); - return text.trim(); + // Transcribe an uploaded audio buffer. `format` is the container hint. + async transcribe( + workspaceId: string, + audio: Uint8Array, + format: string, + ): Promise<string> { + return this.ai.transcribe(workspaceId, audio, format); } } diff --git a/apps/server/src/integrations/ai/ai.service.ts b/apps/server/src/integrations/ai/ai.service.ts index b93416d0..e894d703 100644 --- a/apps/server/src/integrations/ai/ai.service.ts +++ b/apps/server/src/integrations/ai/ai.service.ts @@ -1,10 +1,10 @@ import { Injectable, Logger } from '@nestjs/common'; import { embedMany, + experimental_transcribe as transcribe, generateText, type EmbeddingModel, type LanguageModel, - type TranscriptionModel, } from 'ai'; import { createOpenAI } from '@ai-sdk/openai'; import { createGoogleGenerativeAI } from '@ai-sdk/google'; @@ -108,24 +108,90 @@ export class AiService { } } + // Some OpenAI-compatible gateways diverge on the transcription API. OpenRouter + // does NOT accept OpenAI's multipart /audio/transcriptions; it wants JSON + // { model, input_audio: { data: <base64>, format } }. Detect it by host so the + // standard multipart path (OpenAI, speaches, faster-whisper-server, ...) is + // unaffected. + private static isOpenRouter(baseURL?: string): boolean { + if (!baseURL) return false; + try { + const host = new URL(baseURL).hostname.toLowerCase(); + // Exact host or a real subdomain — avoid matching e.g. "evil-openrouter.ai". + return host === 'openrouter.ai' || host.endsWith('.openrouter.ai'); + } catch { + return false; + } + } + /** - * Resolve the workspace config and build the transcription (STT) model.
- * STT always speaks the OpenAI-compatible /v1/audio/transcriptions API - * (only @ai-sdk/openai exposes .transcription()), regardless of the chat - * driver. sttBaseUrl falls back to the chat baseUrl; the API key falls back - * to the chat key (resolved by AiSettingsService.resolve). Built PER WORKSPACE - * on demand; the decrypted key is never logged. - * - * Throws AiSttNotConfiguredException (-> 503) when no STT model is set. + * Transcribe audio with the workspace STT model. Standard OpenAI-compatible + * endpoints use the AI SDK multipart path; OpenRouter uses its JSON+base64 + * audio/transcriptions API. `format` is the audio container hint (webm / mp4 / + * wav / mp3 / ogg / m4a). Built PER WORKSPACE; the key is never logged. Throws + * AiSttNotConfiguredException (-> 503) when no STT model is configured. */ - async getTranscriptionModel(workspaceId: string): Promise<TranscriptionModel> { + async transcribe( + workspaceId: string, + audio: Uint8Array, + format: string, + ): Promise<string> { const cfg = await this.aiSettings.resolve(workspaceId); if (!cfg?.sttModel) throw new AiSttNotConfiguredException(); - const baseURL = cfg.sttBaseUrl || cfg.baseUrl; // stt-specific, else chat - // apiKey may be unused for keyless self-hosted whisper; pass a placeholder. - return createOpenAI({ apiKey: cfg.sttApiKey ?? 'unused', baseURL }).transcription( - cfg.sttModel, - ); + const baseURL = cfg.sttBaseUrl || cfg.baseUrl; + + if (AiService.isOpenRouter(baseURL)) { + return this.transcribeViaOpenRouter( + baseURL as string, + cfg.sttApiKey, + cfg.sttModel, + audio, + format, + ); + } + + // Standard OpenAI-compatible multipart path (AI SDK). apiKey may be unused for + // keyless self-hosted whisper; pass a placeholder. + const model = createOpenAI({ + apiKey: cfg.sttApiKey ?? 'unused', + baseURL, + }).transcription(cfg.sttModel); + const { text } = await transcribe({ model, audio }); + return text.trim(); + } + + // OpenRouter transcription: JSON body with base64 audio; returns { text }.
+ private async transcribeViaOpenRouter( + baseURL: string, + apiKey: string | undefined, + model: string, + audio: Uint8Array, + format: string, + ): Promise<string> { + const url = `${baseURL.replace(/\/$/, '')}/audio/transcriptions`; + const res = await fetch(url, { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + ...(apiKey ? { Authorization: `Bearer ${apiKey}` } : {}), + }, + body: JSON.stringify({ + model, + input_audio: { + data: Buffer.from(audio).toString('base64'), + format, + }, + }), + }); + if (!res.ok) { + // Surface status + body so the real reason reaches the user; never log the key. + const body = await res.text().catch(() => ''); + throw new Error( + `OpenRouter transcription failed (${res.status}): ${body.slice(0, 500)}`, + ); + } + const json = (await res.json()) as { text?: string }; + return (json.text ?? '').trim(); } /** @@ -182,11 +248,36 @@ export class AiService { return Number.isFinite(raw) && raw > 0 ? raw : 120_000; } + // Build a tiny valid WAV (mono, 16-bit PCM, 16 kHz, ~1s of silence), used only + // as a connectivity probe for the STT endpoint in testConnection. + private static silentWavProbe(): Uint8Array { + const sampleRate = 16000; + const numSamples = sampleRate; // ~1 second + const dataSize = numSamples * 2; // 16-bit mono + const buf = Buffer.alloc(44 + dataSize); + buf.write('RIFF', 0); + buf.writeUInt32LE(36 + dataSize, 4); + buf.write('WAVE', 8); + buf.write('fmt ', 12); + buf.writeUInt32LE(16, 16); // PCM fmt chunk size + buf.writeUInt16LE(1, 20); // audio format = PCM + buf.writeUInt16LE(1, 22); // channels = 1 + buf.writeUInt32LE(sampleRate, 24); + buf.writeUInt32LE(sampleRate * 2, 28); // byte rate + buf.writeUInt16LE(2, 32); // block align + buf.writeUInt16LE(16, 34); // bits per sample + buf.write('data', 36); + buf.writeUInt32LE(dataSize, 40); + // The PCM samples stay zero (silence). + return buf; + } + /** * Cheap connectivity check for a single "Test endpoint" button.
Probes ONLY * the requested capability so each card in the UI surfaces its own result: * - `chat`: a one-word generation against the configured chat model; - * - `embeddings`: embedding a tiny string against the embedding model. + * - `embeddings`: embedding a tiny string against the embedding model; + * - `stt`: transcribing a tiny silent WAV against the transcription model. * * A capability that is not configured returns a plain "… is not configured" * message; any real failure returns ok:false with the provider's own cause @@ -201,7 +292,7 @@ export class AiService { */ async testConnection( workspaceId: string, - capability: 'chat' | 'embeddings' = 'chat', + capability: 'chat' | 'embeddings' | 'stt' = 'chat', ): Promise<{ ok: true } | { ok: false; error: string }> { if (capability === 'embeddings') { try { @@ -216,6 +307,21 @@ export class AiService { } } + if (capability === 'stt') { + try { + // Probe with a tiny silent WAV; a reachable, authorized endpoint returns + // (usually empty) text, any failure surfaces via describeProviderError. + await this.transcribe(workspaceId, AiService.silentWavProbe(), 'wav'); + return { ok: true }; + } catch (err) { + if (err instanceof AiSttNotConfiguredException) { + return { ok: false, error: 'STT is not configured' }; + } + this.logger.error('AI STT test connection failed', err as Error); + return { ok: false, error: describeProviderError(err) }; + } + } + // Default: chat probe. try { const model = await this.getChatModel(workspaceId); diff --git a/apps/server/src/integrations/ai/dto/test-ai-connection.dto.ts b/apps/server/src/integrations/ai/dto/test-ai-connection.dto.ts index 9fab83a0..f383f0f3 100644 --- a/apps/server/src/integrations/ai/dto/test-ai-connection.dto.ts +++ b/apps/server/src/integrations/ai/dto/test-ai-connection.dto.ts @@ -4,6 +4,6 @@ import { IsIn, IsOptional } from 'class-validator'; // defaults to the chat endpoint server-side when omitted. 
export class TestAiConnectionDto { @IsOptional() - @IsIn(['chat', 'embeddings']) - capability?: 'chat' | 'embeddings'; + @IsIn(['chat', 'embeddings', 'stt']) + capability?: 'chat' | 'embeddings' | 'stt'; }