feat(ai): OpenRouter STT support + real error surfacing + STT endpoint test
- ai.service: route *.openrouter.ai STT to its JSON+base64 /audio/transcriptions API; keep the OpenAI multipart path (AI SDK) for OpenAI/self-hosted whisper. Unify transcription behind transcribe().
- /transcribe controller: surface the real provider/transport reason (describeProviderError) instead of an opaque 500; preserve HttpException.
- testConnection: add an 'stt' capability (silent-WAV probe) + DTO; client gets a Test endpoint button and status dot on the Voice/STT card.
- useDictation: log full errors to the console and show the real reason (mic start + transcription paths); handle NotReadable/Abort and missing mediaDevices.
- docs(CLAUDE.md): require full error logging + specific user-facing messages.
This commit is contained in:
@@ -1189,5 +1189,7 @@
|
||||
"No microphone found": "No microphone found",
|
||||
"Could not start recording": "Could not start recording",
|
||||
"Transcription failed": "Transcription failed",
|
||||
"Voice dictation is not configured": "Voice dictation is not configured"
|
||||
"Voice dictation is not configured": "Voice dictation is not configured",
|
||||
"Microphone is unavailable or already in use": "Microphone is unavailable or already in use",
|
||||
"Audio recording is not available in this browser/context": "Audio recording is not available in this browser/context"
|
||||
}
|
||||
|
||||
@@ -90,18 +90,37 @@ export function useDictation(
|
||||
if (status !== "idle") return;
|
||||
startingRef.current = true;
|
||||
|
||||
if (!navigator.mediaDevices?.getUserMedia) {
|
||||
const reason =
|
||||
"navigator.mediaDevices.getUserMedia is unavailable in this context";
|
||||
console.error("[dictation] " + reason);
|
||||
notifications.show({
|
||||
color: "red",
|
||||
message: t("Audio recording is not available in this browser/context"),
|
||||
});
|
||||
setStatus("idle");
|
||||
startingRef.current = false;
|
||||
return;
|
||||
}
|
||||
|
||||
let stream: MediaStream;
|
||||
try {
|
||||
stream = await navigator.mediaDevices.getUserMedia({ audio: true });
|
||||
} catch (err) {
|
||||
// Always log the full error for diagnosis (name, message, stack).
|
||||
console.error("[dictation] getUserMedia failed", err);
|
||||
const name = (err as { name?: string })?.name;
|
||||
const detail = (err as { message?: string })?.message ?? String(err);
|
||||
let message: string;
|
||||
if (name === "NotAllowedError" || name === "SecurityError") {
|
||||
message = t("Microphone access denied");
|
||||
} else if (name === "NotFoundError" || name === "OverconstrainedError") {
|
||||
message = t("No microphone found");
|
||||
} else if (name === "NotReadableError" || name === "AbortError") {
|
||||
message = t("Microphone is unavailable or already in use");
|
||||
} else {
|
||||
message = t("Could not start recording");
|
||||
// Unknown failure: show the real reason instead of a generic string.
|
||||
message = `${t("Could not start recording")}: ${name ? `${name}: ` : ""}${detail}`;
|
||||
}
|
||||
notifications.show({ color: "red", message });
|
||||
setStatus("idle");
|
||||
@@ -120,13 +139,14 @@ export function useDictation(
|
||||
stream,
|
||||
mimeType ? { mimeType } : undefined,
|
||||
);
|
||||
} catch {
|
||||
} catch (err) {
|
||||
console.error("[dictation] MediaRecorder failed", err);
|
||||
// The stream was acquired but the recorder failed to construct; stop the
|
||||
// tracks so the MediaStream does not leak before bailing out.
|
||||
stopTracks();
|
||||
notifications.show({
|
||||
color: "red",
|
||||
message: t("Could not start recording"),
|
||||
message: `${t("Could not start recording")}: ${(err as { message?: string })?.message ?? String(err)}`,
|
||||
});
|
||||
setStatus("idle");
|
||||
startingRef.current = false;
|
||||
@@ -165,17 +185,23 @@ export function useDictation(
|
||||
setStatus("idle");
|
||||
})
|
||||
.catch((err: unknown) => {
|
||||
const httpStatus = (err as { response?: { status?: number } })
|
||||
?.response?.status;
|
||||
// The server returns 503 when dictation is unconfigured and 403 when
|
||||
// it is disabled server-side; both map to the same "not configured".
|
||||
const message =
|
||||
httpStatus === 503 || httpStatus === 403
|
||||
? t("Voice dictation is not configured")
|
||||
: t("Transcription failed");
|
||||
// Log the full error for diagnosis (status + body + stack).
|
||||
console.error("[dictation] transcription failed", err);
|
||||
const resp = (
|
||||
err as { response?: { status?: number; data?: { message?: string } } }
|
||||
)?.response;
|
||||
const serverMsg = resp?.data?.message;
|
||||
let message: string;
|
||||
if (serverMsg && serverMsg.trim().length > 0) {
|
||||
// The server already explains the cause (e.g. provider 404, bad
|
||||
// format, STT not configured) — show it verbatim.
|
||||
message = serverMsg;
|
||||
} else if (resp?.status === 503 || resp?.status === 403) {
|
||||
message = t("Voice dictation is not configured");
|
||||
} else {
|
||||
message = `${t("Transcription failed")}: ${(err as { message?: string })?.message ?? String(err)}`;
|
||||
}
|
||||
notifications.show({ color: "red", message });
|
||||
// Surface the error state briefly, then return to idle. Store the
|
||||
// timer so it can be cleared on unmount.
|
||||
setStatus("error");
|
||||
if (errorTimerRef.current !== null) {
|
||||
clearTimeout(errorTimerRef.current);
|
||||
@@ -192,7 +218,8 @@ export function useDictation(
|
||||
try {
|
||||
optionsRef.current.onStart?.();
|
||||
recorder.start();
|
||||
} catch {
|
||||
} catch (err) {
|
||||
console.error("[dictation] MediaRecorder.start failed", err);
|
||||
// recorder.start() can synchronously throw (InvalidStateError /
|
||||
// NotSupportedError); clean up so the button is not left stuck and the
|
||||
// MediaStream does not leak.
|
||||
@@ -201,7 +228,7 @@ export function useDictation(
|
||||
startingRef.current = false;
|
||||
notifications.show({
|
||||
color: "red",
|
||||
message: t("Could not start recording"),
|
||||
message: `${t("Could not start recording")}: ${(err as { message?: string })?.message ?? String(err)}`,
|
||||
});
|
||||
setStatus("idle");
|
||||
return;
|
||||
|
||||
@@ -93,9 +93,10 @@ export default function AiProviderSettings() {
|
||||
const updateMutation = useUpdateAiSettingsMutation();
|
||||
const reindexMutation = useReindexAiEmbeddingsMutation();
|
||||
|
||||
// Two independent test mutations so each card has its own loading + result.
|
||||
// Independent test mutations so each card has its own loading + result.
|
||||
const chatTest = useTestAiConnectionMutation();
|
||||
const embedTest = useTestAiConnectionMutation();
|
||||
const sttTest = useTestAiConnectionMutation();
|
||||
|
||||
// Workspace-level feature toggles live in the card headers.
|
||||
const [workspace, setWorkspace] = useAtom(workspaceAtom);
|
||||
@@ -354,6 +355,11 @@ export default function AiProviderSettings() {
|
||||
? "ok"
|
||||
: "error"
|
||||
: "idle";
|
||||
const sttStatus: CardStatus = sttTest.data
|
||||
? sttTest.data.ok
|
||||
? "ok"
|
||||
: "error"
|
||||
: "idle";
|
||||
|
||||
const chatResolved = resolveUrl(form.values.baseUrl, "/chat/completions");
|
||||
const embedResolved = resolveUrl(
|
||||
@@ -617,7 +623,7 @@ export default function AiProviderSettings() {
|
||||
<Paper withBorder radius="md" p="lg">
|
||||
<Group justify="space-between" align="center" wrap="nowrap">
|
||||
<Group gap="xs" align="center" wrap="nowrap">
|
||||
<StatusDot status="idle" />
|
||||
<StatusDot status={sttStatus} />
|
||||
<Text fw={600}>{t("Voice / STT")}</Text>
|
||||
</Group>
|
||||
<Switch
|
||||
@@ -675,6 +681,27 @@ export default function AiProviderSettings() {
|
||||
<Text size="xs" c="dimmed" mt={4} style={{ fontFamily: monoFont }} truncate>
|
||||
{t("Resolves to {{url}}", { url: sttResolved })}
|
||||
</Text>
|
||||
|
||||
<Group mt="md" align="center">
|
||||
<Button
|
||||
variant="default"
|
||||
size="sm"
|
||||
loading={sttTest.isPending}
|
||||
onClick={() => sttTest.mutate("stt")}
|
||||
>
|
||||
{t("Test endpoint")}
|
||||
</Button>
|
||||
{sttTest.data &&
|
||||
(sttTest.data.ok ? (
|
||||
<Text size="sm" c="green">
|
||||
{t("Connection successful")}
|
||||
</Text>
|
||||
) : (
|
||||
<Text size="sm" c="red">
|
||||
{sttTest.data.error || t("Connection failed")}
|
||||
</Text>
|
||||
))}
|
||||
</Group>
|
||||
</Paper>
|
||||
|
||||
{/* Nested: external MCP tools the agent calls out to */}
|
||||
|
||||
@@ -55,7 +55,7 @@ export interface IAiTestResult {
|
||||
}
|
||||
|
||||
// Which endpoint a connection test probes.
|
||||
export type AiTestCapability = "chat" | "embeddings";
|
||||
export type AiTestCapability = "chat" | "embeddings" | "stt";
|
||||
|
||||
export async function getAiSettings(): Promise<IAiSettings> {
|
||||
const req = await api.post<IAiSettings>("/workspace/ai-settings");
|
||||
|
||||
@@ -4,11 +4,13 @@ import {
|
||||
Controller,
|
||||
ForbiddenException,
|
||||
HttpCode,
|
||||
HttpException,
|
||||
HttpStatus,
|
||||
Logger,
|
||||
Post,
|
||||
Req,
|
||||
Res,
|
||||
ServiceUnavailableException,
|
||||
UseGuards,
|
||||
UseInterceptors,
|
||||
} from '@nestjs/common';
|
||||
@@ -32,6 +34,7 @@ import {
|
||||
GetChatMessagesDto,
|
||||
RenameChatDto,
|
||||
} from './dto/ai-chat.dto';
|
||||
import { describeProviderError } from '../../integrations/ai/ai-error.util';
|
||||
|
||||
/**
|
||||
* Per-user AI chat API (§6.1). Routes are POST to match this codebase's
|
||||
@@ -249,7 +252,31 @@ export class AiChatController {
|
||||
}
|
||||
throw err;
|
||||
}
|
||||
const text = await this.aiTranscription.transcribe(workspace.id, buf);
|
||||
// Container hint for JSON-style STT providers (e.g. OpenRouter); multipart
|
||||
// endpoints ignore it.
|
||||
const formatMap: Record<string, string> = {
|
||||
'audio/webm': 'webm',
|
||||
'audio/ogg': 'ogg',
|
||||
'audio/mp4': 'mp4',
|
||||
'audio/mpeg': 'mp3',
|
||||
'audio/wav': 'wav',
|
||||
'audio/x-wav': 'wav',
|
||||
'audio/wave': 'wav',
|
||||
'audio/m4a': 'm4a',
|
||||
'audio/x-m4a': 'm4a',
|
||||
};
|
||||
const format = formatMap[baseMime] ?? 'webm';
|
||||
let text: string;
|
||||
try {
|
||||
text = await this.aiTranscription.transcribe(workspace.id, buf, format);
|
||||
} catch (err) {
|
||||
// Preserve meaningful HTTP errors (e.g. AiSttNotConfiguredException -> 503).
|
||||
if (err instanceof HttpException) throw err;
|
||||
// Log the full error and surface the real provider/transport reason instead
|
||||
// of an opaque 500 (e.g. "the STT endpoint returned 404 ...").
|
||||
this.logger.error('AI transcription failed', err as Error);
|
||||
throw new ServiceUnavailableException(describeProviderError(err));
|
||||
}
|
||||
return { text };
|
||||
}
|
||||
|
||||
|
||||
@@ -1,20 +1,21 @@
|
||||
import { Injectable } from '@nestjs/common';
|
||||
import { experimental_transcribe as transcribe } from 'ai';
|
||||
import { AiService } from '../../integrations/ai/ai.service';
|
||||
|
||||
/**
|
||||
* Transcribes uploaded audio to text using the per-workspace STT model.
|
||||
* Thin wrapper over the AI SDK's experimental_transcribe; never logs the
|
||||
* audio or the key.
|
||||
* Delegates to AiService, which picks the OpenAI-multipart or OpenRouter-JSON
|
||||
* path. Never logs the audio or the key.
|
||||
*/
|
||||
@Injectable()
|
||||
export class AiTranscriptionService {
|
||||
constructor(private readonly ai: AiService) {}
|
||||
|
||||
// Transcribe an uploaded audio buffer using the workspace STT model.
|
||||
async transcribe(workspaceId: string, audio: Uint8Array): Promise<string> {
|
||||
const model = await this.ai.getTranscriptionModel(workspaceId);
|
||||
const { text } = await transcribe({ model, audio });
|
||||
return text.trim();
|
||||
// Transcribe an uploaded audio buffer. `format` is the container hint.
|
||||
async transcribe(
|
||||
workspaceId: string,
|
||||
audio: Uint8Array,
|
||||
format: string,
|
||||
): Promise<string> {
|
||||
return this.ai.transcribe(workspaceId, audio, format);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
import { Injectable, Logger } from '@nestjs/common';
|
||||
import {
|
||||
embedMany,
|
||||
experimental_transcribe as transcribe,
|
||||
generateText,
|
||||
type EmbeddingModel,
|
||||
type LanguageModel,
|
||||
type TranscriptionModel,
|
||||
} from 'ai';
|
||||
import { createOpenAI } from '@ai-sdk/openai';
|
||||
import { createGoogleGenerativeAI } from '@ai-sdk/google';
|
||||
@@ -108,24 +108,90 @@ export class AiService {
|
||||
}
|
||||
}
|
||||
|
||||
// Some OpenAI-compatible gateways diverge on the transcription API. OpenRouter
|
||||
// does NOT accept OpenAI's multipart /audio/transcriptions; it wants JSON
|
||||
// { model, input_audio: { data: <base64>, format } }. Detect it by host so the
|
||||
// standard multipart path (OpenAI, speaches, faster-whisper-server, ...) is
|
||||
// unaffected.
|
||||
/**
 * Reports whether `baseURL` points at OpenRouter (the apex host or any of its
 * subdomains).
 *
 * @param baseURL - Configured base URL; may be undefined, empty or malformed.
 * @returns `true` only for `openrouter.ai` or `*.openrouter.ai`; `false` for a
 *   missing or unparseable URL.
 */
private static isOpenRouter(baseURL?: string): boolean {
  if (!baseURL) return false;

  let host: string;
  try {
    host = new URL(baseURL).hostname.toLowerCase();
  } catch {
    // Not a parseable absolute URL, so it cannot be an OpenRouter endpoint.
    return false;
  }

  // A fully-qualified host may carry a trailing root dot ("openrouter.ai.").
  // URL keeps that dot verbatim, so strip it before comparing; otherwise such
  // a URL would be routed down the multipart path.
  host = host.replace(/\.$/, '');

  // Exact host or a real subdomain — the leading dot in the suffix check keeps
  // look-alikes such as "evil-openrouter.ai" from matching.
  return host === 'openrouter.ai' || host.endsWith('.openrouter.ai');
}
|
||||
|
||||
/**
|
||||
* Resolve the workspace config and build the transcription (STT) model.
|
||||
* STT always speaks the OpenAI-compatible /v1/audio/transcriptions API
|
||||
* (only @ai-sdk/openai exposes .transcription()), regardless of the chat
|
||||
* driver. sttBaseUrl falls back to the chat baseUrl; the API key falls back
|
||||
* to the chat key (resolved by AiSettingsService.resolve). Built PER WORKSPACE
|
||||
* on demand; the decrypted key is never logged.
|
||||
*
|
||||
* Throws AiSttNotConfiguredException (-> 503) when no STT model is set.
|
||||
* Transcribe audio with the workspace STT model. Standard OpenAI-compatible
|
||||
* endpoints use the AI SDK multipart path; OpenRouter uses its JSON+base64
|
||||
* audio/transcriptions API. `format` is the audio container hint (webm / mp4 /
|
||||
* wav / mp3 / ogg / m4a). Built PER WORKSPACE; the key is never logged. Throws
|
||||
* AiSttNotConfiguredException (-> 503) when no STT model is configured.
|
||||
*/
|
||||
async getTranscriptionModel(workspaceId: string): Promise<TranscriptionModel> {
|
||||
async transcribe(
|
||||
workspaceId: string,
|
||||
audio: Uint8Array,
|
||||
format: string,
|
||||
): Promise<string> {
|
||||
const cfg = await this.aiSettings.resolve(workspaceId);
|
||||
if (!cfg?.sttModel) throw new AiSttNotConfiguredException();
|
||||
const baseURL = cfg.sttBaseUrl || cfg.baseUrl; // stt-specific, else chat
|
||||
// apiKey may be unused for keyless self-hosted whisper; pass a placeholder.
|
||||
return createOpenAI({ apiKey: cfg.sttApiKey ?? 'unused', baseURL }).transcription(
|
||||
cfg.sttModel,
|
||||
);
|
||||
const baseURL = cfg.sttBaseUrl || cfg.baseUrl;
|
||||
|
||||
if (AiService.isOpenRouter(baseURL)) {
|
||||
return this.transcribeViaOpenRouter(
|
||||
baseURL as string,
|
||||
cfg.sttApiKey,
|
||||
cfg.sttModel,
|
||||
audio,
|
||||
format,
|
||||
);
|
||||
}
|
||||
|
||||
// Standard OpenAI-compatible multipart path (AI SDK). apiKey may be unused for
|
||||
// keyless self-hosted whisper; pass a placeholder.
|
||||
const model = createOpenAI({
|
||||
apiKey: cfg.sttApiKey ?? 'unused',
|
||||
baseURL,
|
||||
}).transcription(cfg.sttModel);
|
||||
const { text } = await transcribe({ model, audio });
|
||||
return text.trim();
|
||||
}
|
||||
|
||||
// OpenRouter transcription: JSON body with base64 audio; returns { text }.
|
||||
/**
 * Sends audio to OpenRouter's JSON transcription endpoint and returns the
 * trimmed transcript.
 *
 * @param baseURL - OpenRouter API base URL; trailing slashes are ignored.
 * @param apiKey - Bearer token; the Authorization header is omitted when unset.
 * @param model - STT model identifier forwarded as-is.
 * @param audio - Raw audio bytes, sent base64-encoded.
 * @param format - Audio container hint forwarded as `input_audio.format`.
 * @returns The `text` field of the response, trimmed; an empty string when the
 *   response carries no string `text`.
 * @throws Error with the HTTP status and up to 500 characters of the body when
 *   the response is not OK or is not valid JSON. The key is never included.
 */
private async transcribeViaOpenRouter(
  baseURL: string,
  apiKey: string | undefined,
  model: string,
  audio: Uint8Array,
  format: string,
): Promise<string> {
  // Strip every trailing slash so "…/v1//" does not yield a double-slash path.
  const url = `${baseURL.replace(/\/+$/, '')}/audio/transcriptions`;

  const headers: Record<string, string> = {
    'Content-Type': 'application/json',
  };
  if (apiKey) headers.Authorization = `Bearer ${apiKey}`;

  const res = await fetch(url, {
    method: 'POST',
    headers,
    body: JSON.stringify({
      model,
      input_audio: {
        data: Buffer.from(audio).toString('base64'),
        format,
      },
    }),
  });

  if (!res.ok) {
    // Surface status + body so the real reason reaches the user; never log the key.
    const errorBody = await res.text().catch(() => '');
    throw new Error(
      `OpenRouter transcription failed (${res.status}): ${errorBody.slice(0, 500)}`,
    );
  }

  // Read as text and parse explicitly: a 2xx with a non-JSON body (e.g. a proxy
  // HTML page) would otherwise surface as an opaque JSON SyntaxError.
  const raw = await res.text();
  let parsed: unknown;
  try {
    parsed = JSON.parse(raw);
  } catch {
    throw new Error(
      `OpenRouter transcription returned a non-JSON response (${res.status}): ${raw.slice(0, 500)}`,
    );
  }

  // Only a string `text` is a transcript; anything else is treated as empty
  // rather than crashing on `.trim()`.
  const text = (parsed as { text?: unknown } | null)?.text;
  return typeof text === 'string' ? text.trim() : '';
}
|
||||
|
||||
/**
|
||||
@@ -182,11 +248,36 @@ export class AiService {
|
||||
return Number.isFinite(raw) && raw > 0 ? raw : 120_000;
|
||||
}
|
||||
|
||||
// Build a tiny valid WAV (mono, 16-bit PCM, 16 kHz, ~1s of silence), used only
|
||||
// as a connectivity probe for the STT endpoint in testConnection.
|
||||
/**
 * Produces a minimal, well-formed WAV file holding about one second of
 * silence: mono, 16-bit PCM, 16 kHz.
 *
 * @returns A 44-byte RIFF/WAVE header followed by zeroed PCM samples.
 */
private static silentWavProbe(): Uint8Array {
  const headerBytes = 44;
  const channelCount = 1;
  const bitsPerSample = 16;
  const sampleRate = 16000;
  const bytesPerFrame = (channelCount * bitsPerSample) / 8; // 2 bytes per mono frame
  const frameCount = sampleRate; // ~1 second
  const pcmBytes = frameCount * bytesPerFrame;

  // Buffer.alloc zero-fills, so the sample region is already silence.
  const wav = Buffer.alloc(headerBytes + pcmBytes);

  // Walk the header with a cursor: write() returns the byte count written,
  // the writeUInt*LE() calls return the next offset.
  let at = 0;

  // RIFF chunk descriptor.
  at += wav.write('RIFF', at);
  at = wav.writeUInt32LE(headerBytes - 8 + pcmBytes, at); // size of everything after this field
  at += wav.write('WAVE', at);

  // "fmt " sub-chunk.
  at += wav.write('fmt ', at);
  at = wav.writeUInt32LE(16, at); // PCM fmt chunk size
  at = wav.writeUInt16LE(1, at); // audio format = PCM
  at = wav.writeUInt16LE(channelCount, at);
  at = wav.writeUInt32LE(sampleRate, at);
  at = wav.writeUInt32LE(sampleRate * bytesPerFrame, at); // byte rate
  at = wav.writeUInt16LE(bytesPerFrame, at); // block align
  at = wav.writeUInt16LE(bitsPerSample, at);

  // "data" sub-chunk header; the payload that follows stays zeroed.
  at += wav.write('data', at);
  wav.writeUInt32LE(pcmBytes, at);

  return wav;
}
|
||||
|
||||
/**
|
||||
* Cheap connectivity check for a single "Test endpoint" button. Probes ONLY
|
||||
* the requested capability so each card in the UI surfaces its own result:
|
||||
* - `chat`: a one-word generation against the configured chat model;
|
||||
* - `embeddings`: embedding a tiny string against the embedding model.
|
||||
* - `embeddings`: embedding a tiny string against the embedding model;
|
||||
* - `stt`: transcribing a tiny silent WAV against the transcription model.
|
||||
*
|
||||
* A capability that is not configured returns a plain "… is not configured"
|
||||
* message; any real failure returns ok:false with the provider's own cause
|
||||
@@ -201,7 +292,7 @@ export class AiService {
|
||||
*/
|
||||
async testConnection(
|
||||
workspaceId: string,
|
||||
capability: 'chat' | 'embeddings' = 'chat',
|
||||
capability: 'chat' | 'embeddings' | 'stt' = 'chat',
|
||||
): Promise<{ ok: true } | { ok: false; error: string }> {
|
||||
if (capability === 'embeddings') {
|
||||
try {
|
||||
@@ -216,6 +307,21 @@ export class AiService {
|
||||
}
|
||||
}
|
||||
|
||||
if (capability === 'stt') {
|
||||
try {
|
||||
// Probe with a tiny silent WAV; a reachable, authorized endpoint returns
|
||||
// (usually empty) text, any failure surfaces via describeProviderError.
|
||||
await this.transcribe(workspaceId, AiService.silentWavProbe(), 'wav');
|
||||
return { ok: true };
|
||||
} catch (err) {
|
||||
if (err instanceof AiSttNotConfiguredException) {
|
||||
return { ok: false, error: 'STT is not configured' };
|
||||
}
|
||||
this.logger.error('AI STT test connection failed', err as Error);
|
||||
return { ok: false, error: describeProviderError(err) };
|
||||
}
|
||||
}
|
||||
|
||||
// Default: chat probe.
|
||||
try {
|
||||
const model = await this.getChatModel(workspaceId);
|
||||
|
||||
@@ -4,6 +4,6 @@ import { IsIn, IsOptional } from 'class-validator';
|
||||
// defaults to the chat endpoint server-side when omitted.
|
||||
export class TestAiConnectionDto {
|
||||
@IsOptional()
|
||||
@IsIn(['chat', 'embeddings'])
|
||||
capability?: 'chat' | 'embeddings';
|
||||
@IsIn(['chat', 'embeddings', 'stt'])
|
||||
capability?: 'chat' | 'embeddings' | 'stt';
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user