diff --git a/apps/client/public/locales/en-US/translation.json b/apps/client/public/locales/en-US/translation.json
index 591b362a..8cfd742c 100644
--- a/apps/client/public/locales/en-US/translation.json
+++ b/apps/client/public/locales/en-US/translation.json
@@ -1189,5 +1189,7 @@
"No microphone found": "No microphone found",
"Could not start recording": "Could not start recording",
"Transcription failed": "Transcription failed",
- "Voice dictation is not configured": "Voice dictation is not configured"
+ "Voice dictation is not configured": "Voice dictation is not configured",
+ "Microphone is unavailable or already in use": "Microphone is unavailable or already in use",
+ "Audio recording is not available in this browser/context": "Audio recording is not available in this browser/context"
}
diff --git a/apps/client/src/features/dictation/hooks/use-dictation.ts b/apps/client/src/features/dictation/hooks/use-dictation.ts
index 059949f0..86af4c78 100644
--- a/apps/client/src/features/dictation/hooks/use-dictation.ts
+++ b/apps/client/src/features/dictation/hooks/use-dictation.ts
@@ -90,18 +90,37 @@ export function useDictation(
if (status !== "idle") return;
startingRef.current = true;
+ if (!navigator.mediaDevices?.getUserMedia) {
+ const reason =
+ "navigator.mediaDevices.getUserMedia is unavailable in this context";
+ console.error("[dictation] " + reason);
+ notifications.show({
+ color: "red",
+ message: t("Audio recording is not available in this browser/context"),
+ });
+ setStatus("idle");
+ startingRef.current = false;
+ return;
+ }
+
let stream: MediaStream;
try {
stream = await navigator.mediaDevices.getUserMedia({ audio: true });
} catch (err) {
+ // Always log the full error for diagnosis (name, message, stack).
+ console.error("[dictation] getUserMedia failed", err);
const name = (err as { name?: string })?.name;
+ const detail = (err as { message?: string })?.message ?? String(err);
let message: string;
if (name === "NotAllowedError" || name === "SecurityError") {
message = t("Microphone access denied");
} else if (name === "NotFoundError" || name === "OverconstrainedError") {
message = t("No microphone found");
+ } else if (name === "NotReadableError" || name === "AbortError") {
+ message = t("Microphone is unavailable or already in use");
} else {
- message = t("Could not start recording");
+ // Unknown failure: show the real reason instead of a generic string.
+ message = `${t("Could not start recording")}: ${name ? `${name}: ` : ""}${detail}`;
}
notifications.show({ color: "red", message });
setStatus("idle");
@@ -120,13 +139,14 @@ export function useDictation(
stream,
mimeType ? { mimeType } : undefined,
);
- } catch {
+ } catch (err) {
+ console.error("[dictation] MediaRecorder failed", err);
// The stream was acquired but the recorder failed to construct; stop the
// tracks so the MediaStream does not leak before bailing out.
stopTracks();
notifications.show({
color: "red",
- message: t("Could not start recording"),
+ message: `${t("Could not start recording")}: ${(err as { message?: string })?.message ?? String(err)}`,
});
setStatus("idle");
startingRef.current = false;
@@ -165,17 +185,23 @@ export function useDictation(
setStatus("idle");
})
.catch((err: unknown) => {
- const httpStatus = (err as { response?: { status?: number } })
- ?.response?.status;
- // The server returns 503 when dictation is unconfigured and 403 when
- // it is disabled server-side; both map to the same "not configured".
- const message =
- httpStatus === 503 || httpStatus === 403
- ? t("Voice dictation is not configured")
- : t("Transcription failed");
+ // Log the full error for diagnosis (status + body + stack).
+ console.error("[dictation] transcription failed", err);
+ const resp = (
+ err as { response?: { status?: number; data?: { message?: string } } }
+ )?.response;
+ const serverMsg = resp?.data?.message;
+ let message: string;
+ if (serverMsg && serverMsg.trim().length > 0) {
+ // The server already explains the cause (e.g. provider 404, bad
+ // format, STT not configured) — show it verbatim.
+ message = serverMsg;
+ } else if (resp?.status === 503 || resp?.status === 403) {
+ message = t("Voice dictation is not configured");
+ } else {
+ message = `${t("Transcription failed")}: ${(err as { message?: string })?.message ?? String(err)}`;
+ }
notifications.show({ color: "red", message });
- // Surface the error state briefly, then return to idle. Store the
- // timer so it can be cleared on unmount.
setStatus("error");
if (errorTimerRef.current !== null) {
clearTimeout(errorTimerRef.current);
@@ -192,7 +218,8 @@ export function useDictation(
try {
optionsRef.current.onStart?.();
recorder.start();
- } catch {
+ } catch (err) {
+ console.error("[dictation] MediaRecorder.start failed", err);
// recorder.start() can synchronously throw (InvalidStateError /
// NotSupportedError); clean up so the button is not left stuck and the
// MediaStream does not leak.
@@ -201,7 +228,7 @@ export function useDictation(
startingRef.current = false;
notifications.show({
color: "red",
- message: t("Could not start recording"),
+ message: `${t("Could not start recording")}: ${(err as { message?: string })?.message ?? String(err)}`,
});
setStatus("idle");
return;
diff --git a/apps/client/src/features/workspace/components/settings/components/ai-provider-settings.tsx b/apps/client/src/features/workspace/components/settings/components/ai-provider-settings.tsx
index e39176fd..b908fc03 100644
--- a/apps/client/src/features/workspace/components/settings/components/ai-provider-settings.tsx
+++ b/apps/client/src/features/workspace/components/settings/components/ai-provider-settings.tsx
@@ -93,9 +93,10 @@ export default function AiProviderSettings() {
const updateMutation = useUpdateAiSettingsMutation();
const reindexMutation = useReindexAiEmbeddingsMutation();
- // Two independent test mutations so each card has its own loading + result.
+ // Independent test mutations so each card has its own loading + result.
const chatTest = useTestAiConnectionMutation();
const embedTest = useTestAiConnectionMutation();
+ const sttTest = useTestAiConnectionMutation();
// Workspace-level feature toggles live in the card headers.
const [workspace, setWorkspace] = useAtom(workspaceAtom);
@@ -354,6 +355,11 @@ export default function AiProviderSettings() {
? "ok"
: "error"
: "idle";
+ const sttStatus: CardStatus = sttTest.data
+ ? sttTest.data.ok
+ ? "ok"
+ : "error"
+ : "idle";
const chatResolved = resolveUrl(form.values.baseUrl, "/chat/completions");
const embedResolved = resolveUrl(
@@ -617,7 +623,7 @@ export default function AiProviderSettings() {
-
+
{t("Voice / STT")}
{t("Resolves to {{url}}", { url: sttResolved })}
+
+
+
+ {sttTest.data &&
+ (sttTest.data.ok ? (
+
+ {t("Connection successful")}
+
+ ) : (
+
+ {sttTest.data.error || t("Connection failed")}
+
+ ))}
+
{/* Nested: external MCP tools the agent calls out to */}
diff --git a/apps/client/src/features/workspace/services/ai-settings-service.ts b/apps/client/src/features/workspace/services/ai-settings-service.ts
index 53809ab9..99490189 100644
--- a/apps/client/src/features/workspace/services/ai-settings-service.ts
+++ b/apps/client/src/features/workspace/services/ai-settings-service.ts
@@ -55,7 +55,7 @@ export interface IAiTestResult {
}
// Which endpoint a connection test probes.
-export type AiTestCapability = "chat" | "embeddings";
+export type AiTestCapability = "chat" | "embeddings" | "stt";
export async function getAiSettings(): Promise {
const req = await api.post("/workspace/ai-settings");
diff --git a/apps/server/src/core/ai-chat/ai-chat.controller.ts b/apps/server/src/core/ai-chat/ai-chat.controller.ts
index d1007a78..c32e8e3c 100644
--- a/apps/server/src/core/ai-chat/ai-chat.controller.ts
+++ b/apps/server/src/core/ai-chat/ai-chat.controller.ts
@@ -4,11 +4,13 @@ import {
Controller,
ForbiddenException,
HttpCode,
+ HttpException,
HttpStatus,
Logger,
Post,
Req,
Res,
+ ServiceUnavailableException,
UseGuards,
UseInterceptors,
} from '@nestjs/common';
@@ -32,6 +34,7 @@ import {
GetChatMessagesDto,
RenameChatDto,
} from './dto/ai-chat.dto';
+import { describeProviderError } from '../../integrations/ai/ai-error.util';
/**
* Per-user AI chat API (§6.1). Routes are POST to match this codebase's
@@ -249,7 +252,31 @@ export class AiChatController {
}
throw err;
}
- const text = await this.aiTranscription.transcribe(workspace.id, buf);
+ // Container hint for JSON-style STT providers (e.g. OpenRouter); multipart
+ // endpoints ignore it.
+ const formatMap: Record = {
+ 'audio/webm': 'webm',
+ 'audio/ogg': 'ogg',
+ 'audio/mp4': 'mp4',
+ 'audio/mpeg': 'mp3',
+ 'audio/wav': 'wav',
+ 'audio/x-wav': 'wav',
+ 'audio/wave': 'wav',
+ 'audio/m4a': 'm4a',
+ 'audio/x-m4a': 'm4a',
+ };
+ const format = formatMap[baseMime] ?? 'webm';
+ let text: string;
+ try {
+ text = await this.aiTranscription.transcribe(workspace.id, buf, format);
+ } catch (err) {
+ // Preserve meaningful HTTP errors (e.g. AiSttNotConfiguredException -> 503).
+ if (err instanceof HttpException) throw err;
+ // Log the full error and surface the real provider/transport reason instead
+ // of an opaque 500 (e.g. "the STT endpoint returned 404 ...").
+ this.logger.error('AI transcription failed', err as Error);
+ throw new ServiceUnavailableException(describeProviderError(err));
+ }
return { text };
}
diff --git a/apps/server/src/core/ai-chat/ai-transcription.service.ts b/apps/server/src/core/ai-chat/ai-transcription.service.ts
index 72d3ea9f..b95cbb69 100644
--- a/apps/server/src/core/ai-chat/ai-transcription.service.ts
+++ b/apps/server/src/core/ai-chat/ai-transcription.service.ts
@@ -1,20 +1,21 @@
import { Injectable } from '@nestjs/common';
-import { experimental_transcribe as transcribe } from 'ai';
import { AiService } from '../../integrations/ai/ai.service';
/**
* Transcribes uploaded audio to text using the per-workspace STT model.
- * Thin wrapper over the AI SDK's experimental_transcribe; never logs the
- * audio or the key.
+ * Delegates to AiService, which picks the OpenAI-multipart or OpenRouter-JSON
+ * path. Never logs the audio or the key.
*/
@Injectable()
export class AiTranscriptionService {
constructor(private readonly ai: AiService) {}
- // Transcribe an uploaded audio buffer using the workspace STT model.
- async transcribe(workspaceId: string, audio: Uint8Array): Promise {
- const model = await this.ai.getTranscriptionModel(workspaceId);
- const { text } = await transcribe({ model, audio });
- return text.trim();
+ // Transcribe an uploaded audio buffer. `format` is the container hint.
+ async transcribe(
+ workspaceId: string,
+ audio: Uint8Array,
+ format: string,
+ ): Promise {
+ return this.ai.transcribe(workspaceId, audio, format);
}
}
diff --git a/apps/server/src/integrations/ai/ai.service.ts b/apps/server/src/integrations/ai/ai.service.ts
index b93416d0..e894d703 100644
--- a/apps/server/src/integrations/ai/ai.service.ts
+++ b/apps/server/src/integrations/ai/ai.service.ts
@@ -1,10 +1,10 @@
import { Injectable, Logger } from '@nestjs/common';
import {
embedMany,
+ experimental_transcribe as transcribe,
generateText,
type EmbeddingModel,
type LanguageModel,
- type TranscriptionModel,
} from 'ai';
import { createOpenAI } from '@ai-sdk/openai';
import { createGoogleGenerativeAI } from '@ai-sdk/google';
@@ -108,24 +108,90 @@ export class AiService {
}
}
+ // Some OpenAI-compatible gateways diverge on the transcription API. OpenRouter
+ // does NOT accept OpenAI's multipart /audio/transcriptions; it wants JSON
+ // { model, input_audio: { data: <base64>, format } }. Detect it by host so the
+ // standard multipart path (OpenAI, speaches, faster-whisper-server, ...) is
+ // unaffected.
+ private static isOpenRouter(baseURL?: string): boolean {
+   // Apex host or a dot-delimited subdomain only; "evil-openrouter.ai" must not match.
+   const apex = 'openrouter.ai';
+   try {
+     const hostname = baseURL ? new URL(baseURL).hostname.toLowerCase() : '';
+     return hostname === apex || hostname.endsWith(`.${apex}`);
+   } catch {
+     return false; // not a parseable URL
+   }
+ }
+
/**
- * Resolve the workspace config and build the transcription (STT) model.
- * STT always speaks the OpenAI-compatible /v1/audio/transcriptions API
- * (only @ai-sdk/openai exposes .transcription()), regardless of the chat
- * driver. sttBaseUrl falls back to the chat baseUrl; the API key falls back
- * to the chat key (resolved by AiSettingsService.resolve). Built PER WORKSPACE
- * on demand; the decrypted key is never logged.
- *
- * Throws AiSttNotConfiguredException (-> 503) when no STT model is set.
+ * Transcribe audio with the workspace STT model. Standard OpenAI-compatible
+ * endpoints use the AI SDK multipart path; OpenRouter uses its JSON+base64
+ * audio/transcriptions API. `format` is the audio container hint (webm / mp4 /
+ * wav / mp3 / ogg / m4a). Built PER WORKSPACE; the key is never logged. Throws
+ * AiSttNotConfiguredException (-> 503) when no STT model is configured.
*/
- async getTranscriptionModel(workspaceId: string): Promise {
+ async transcribe(
+ workspaceId: string,
+ audio: Uint8Array,
+ format: string,
+ ): Promise {
const cfg = await this.aiSettings.resolve(workspaceId);
if (!cfg?.sttModel) throw new AiSttNotConfiguredException();
- const baseURL = cfg.sttBaseUrl || cfg.baseUrl; // stt-specific, else chat
- // apiKey may be unused for keyless self-hosted whisper; pass a placeholder.
- return createOpenAI({ apiKey: cfg.sttApiKey ?? 'unused', baseURL }).transcription(
- cfg.sttModel,
- );
+ const baseURL = cfg.sttBaseUrl || cfg.baseUrl;
+
+ if (AiService.isOpenRouter(baseURL)) {
+ return this.transcribeViaOpenRouter(
+ baseURL as string,
+ cfg.sttApiKey,
+ cfg.sttModel,
+ audio,
+ format,
+ );
+ }
+
+ // Standard OpenAI-compatible multipart path (AI SDK). apiKey may be unused for
+ // keyless self-hosted whisper; pass a placeholder.
+ const model = createOpenAI({
+ apiKey: cfg.sttApiKey ?? 'unused',
+ baseURL,
+ }).transcription(cfg.sttModel);
+ const { text } = await transcribe({ model, audio });
+ return text.trim();
+ }
+
+ // OpenRouter transcription: POST JSON with base64 audio and return the trimmed `text`
+ // ('' when absent). Throws on a non-2xx status or a non-JSON body; never logs the key.
+ private async transcribeViaOpenRouter(
+   baseURL: string,
+   apiKey: string | undefined,
+   model: string,
+   audio: Uint8Array,
+   format: string,
+ ): Promise<string> {
+   // Strip all trailing slashes so a base like ".../v1//" cannot yield "//audio".
+   const url = `${baseURL.replace(/\/+$/, '')}/audio/transcriptions`;
+   const res = await fetch(url, {
+     method: 'POST',
+     headers: {
+       'Content-Type': 'application/json',
+       ...(apiKey ? { Authorization: `Bearer ${apiKey}` } : {}),
+     },
+     body: JSON.stringify({
+       model,
+       input_audio: { data: Buffer.from(audio).toString('base64'), format },
+     }),
+   });
+   if (!res.ok) {
+     const body = await res.text().catch(() => '');
+     throw new Error(`OpenRouter transcription failed (${res.status}): ${body.slice(0, 500)}`);
+   }
+   // A 2xx can still carry a non-JSON body or a missing/non-string `text`; handle both.
+   const json: unknown = await res.json().catch(() => {
+     throw new Error(`OpenRouter transcription returned a non-JSON body (${res.status})`);
+   });
+   const text = (json as { text?: unknown } | null)?.text;
+   return typeof text === 'string' ? text.trim() : '';
+ }
/**
@@ -182,11 +248,36 @@ export class AiService {
return Number.isFinite(raw) && raw > 0 ? raw : 120_000;
}
+ // One second of silence as a canonical WAV (mono, 16-bit PCM, 16 kHz), sent
+ // by testConnection purely as a connectivity probe for the STT endpoint.
+ private static silentWavProbe(): Uint8Array {
+   const rate = 16000; // samples per second
+   const frameBytes = 2; // one channel x 16-bit sample (block align)
+   const payloadBytes = rate * frameBytes; // exactly one second of audio
+   const wav = Buffer.alloc(44 + payloadBytes); // zero-filled, so samples are silent
+   wav.write('RIFF', 0);
+   wav.writeUInt32LE(36 + payloadBytes, 4); // bytes following this field
+   wav.write('WAVE', 8);
+   wav.write('fmt ', 12);
+   wav.writeUInt32LE(16, 16); // chunk size for plain PCM
+   wav.writeUInt16LE(1, 20); // format tag 1 = PCM
+   wav.writeUInt16LE(1, 22); // channel count
+   wav.writeUInt32LE(rate, 24);
+   wav.writeUInt32LE(rate * frameBytes, 28); // byte rate
+   wav.writeUInt16LE(frameBytes, 32);
+   wav.writeUInt16LE(16, 34); // bits per sample
+   // "data" chunk: only its length is written; the payload stays zeroed.
+   wav.write('data', 36);
+   wav.writeUInt32LE(payloadBytes, 40);
+   return wav;
+ }
+
/**
* Cheap connectivity check for a single "Test endpoint" button. Probes ONLY
* the requested capability so each card in the UI surfaces its own result:
* - `chat`: a one-word generation against the configured chat model;
- * - `embeddings`: embedding a tiny string against the embedding model.
+ * - `embeddings`: embedding a tiny string against the embedding model;
+ * - `stt`: transcribing a tiny silent WAV against the transcription model.
*
* A capability that is not configured returns a plain "… is not configured"
* message; any real failure returns ok:false with the provider's own cause
@@ -201,7 +292,7 @@ export class AiService {
*/
async testConnection(
workspaceId: string,
- capability: 'chat' | 'embeddings' = 'chat',
+ capability: 'chat' | 'embeddings' | 'stt' = 'chat',
): Promise<{ ok: true } | { ok: false; error: string }> {
if (capability === 'embeddings') {
try {
@@ -216,6 +307,21 @@ export class AiService {
}
}
+ if (capability === 'stt') {
+ try {
+ // Probe with a tiny silent WAV; a reachable, authorized endpoint returns
+ // (usually empty) text, any failure surfaces via describeProviderError.
+ await this.transcribe(workspaceId, AiService.silentWavProbe(), 'wav');
+ return { ok: true };
+ } catch (err) {
+ if (err instanceof AiSttNotConfiguredException) {
+ return { ok: false, error: 'STT is not configured' };
+ }
+ this.logger.error('AI STT test connection failed', err as Error);
+ return { ok: false, error: describeProviderError(err) };
+ }
+ }
+
// Default: chat probe.
try {
const model = await this.getChatModel(workspaceId);
diff --git a/apps/server/src/integrations/ai/dto/test-ai-connection.dto.ts b/apps/server/src/integrations/ai/dto/test-ai-connection.dto.ts
index 9fab83a0..f383f0f3 100644
--- a/apps/server/src/integrations/ai/dto/test-ai-connection.dto.ts
+++ b/apps/server/src/integrations/ai/dto/test-ai-connection.dto.ts
@@ -4,6 +4,6 @@ import { IsIn, IsOptional } from 'class-validator';
// defaults to the chat endpoint server-side when omitted.
export class TestAiConnectionDto {
@IsOptional()
- @IsIn(['chat', 'embeddings'])
- capability?: 'chat' | 'embeddings';
+ @IsIn(['chat', 'embeddings', 'stt'])
+ capability?: 'chat' | 'embeddings' | 'stt';
}