feat(ai): OpenRouter STT support + real error surfacing + STT endpoint test

- ai.service: route STT for openrouter.ai (and its subdomains) to OpenRouter's
  JSON+base64 /audio/transcriptions API; keep the OpenAI multipart path (AI SDK)
  for OpenAI/self-hosted whisper. Unify transcription behind transcribe().
- /transcribe controller: surface the real provider/transport reason
  (describeProviderError) instead of an opaque 500; preserve HttpException.
- testConnection: add an 'stt' capability (silent-WAV probe) + DTO; client
  gets a Test endpoint button and status dot on the Voice/STT card.
- useDictation: log full errors to the console and show the real reason
  (mic start + transcription paths); handle NotReadableError/AbortError and a
  missing navigator.mediaDevices.getUserMedia.
- docs(CLAUDE.md): require full error logging + specific user-facing messages.
This commit is contained in:
vvzvlad
2026-06-18 19:26:35 +03:00
parent ef90655657
commit 77249d59c6
8 changed files with 237 additions and 47 deletions

View File

@@ -1189,5 +1189,7 @@
"No microphone found": "No microphone found",
"Could not start recording": "Could not start recording",
"Transcription failed": "Transcription failed",
"Voice dictation is not configured": "Voice dictation is not configured"
"Voice dictation is not configured": "Voice dictation is not configured",
"Microphone is unavailable or already in use": "Microphone is unavailable or already in use",
"Audio recording is not available in this browser/context": "Audio recording is not available in this browser/context"
}

View File

@@ -90,18 +90,37 @@ export function useDictation(
if (status !== "idle") return;
startingRef.current = true;
if (!navigator.mediaDevices?.getUserMedia) {
const reason =
"navigator.mediaDevices.getUserMedia is unavailable in this context";
console.error("[dictation] " + reason);
notifications.show({
color: "red",
message: t("Audio recording is not available in this browser/context"),
});
setStatus("idle");
startingRef.current = false;
return;
}
let stream: MediaStream;
try {
stream = await navigator.mediaDevices.getUserMedia({ audio: true });
} catch (err) {
// Always log the full error for diagnosis (name, message, stack).
console.error("[dictation] getUserMedia failed", err);
const name = (err as { name?: string })?.name;
const detail = (err as { message?: string })?.message ?? String(err);
let message: string;
if (name === "NotAllowedError" || name === "SecurityError") {
message = t("Microphone access denied");
} else if (name === "NotFoundError" || name === "OverconstrainedError") {
message = t("No microphone found");
} else if (name === "NotReadableError" || name === "AbortError") {
message = t("Microphone is unavailable or already in use");
} else {
message = t("Could not start recording");
// Unknown failure: show the real reason instead of a generic string.
message = `${t("Could not start recording")}: ${name ? `${name}: ` : ""}${detail}`;
}
notifications.show({ color: "red", message });
setStatus("idle");
@@ -120,13 +139,14 @@ export function useDictation(
stream,
mimeType ? { mimeType } : undefined,
);
} catch {
} catch (err) {
console.error("[dictation] MediaRecorder failed", err);
// The stream was acquired but the recorder failed to construct; stop the
// tracks so the MediaStream does not leak before bailing out.
stopTracks();
notifications.show({
color: "red",
message: t("Could not start recording"),
message: `${t("Could not start recording")}: ${(err as { message?: string })?.message ?? String(err)}`,
});
setStatus("idle");
startingRef.current = false;
@@ -165,17 +185,23 @@ export function useDictation(
setStatus("idle");
})
.catch((err: unknown) => {
const httpStatus = (err as { response?: { status?: number } })
?.response?.status;
// The server returns 503 when dictation is unconfigured and 403 when
// it is disabled server-side; both map to the same "not configured".
const message =
httpStatus === 503 || httpStatus === 403
? t("Voice dictation is not configured")
: t("Transcription failed");
// Log the full error for diagnosis (status + body + stack).
console.error("[dictation] transcription failed", err);
const resp = (
err as { response?: { status?: number; data?: { message?: string } } }
)?.response;
const serverMsg = resp?.data?.message;
let message: string;
if (serverMsg && serverMsg.trim().length > 0) {
// The server already explains the cause (e.g. provider 404, bad
// format, STT not configured) — show it verbatim.
message = serverMsg;
} else if (resp?.status === 503 || resp?.status === 403) {
message = t("Voice dictation is not configured");
} else {
message = `${t("Transcription failed")}: ${(err as { message?: string })?.message ?? String(err)}`;
}
notifications.show({ color: "red", message });
// Surface the error state briefly, then return to idle. Store the
// timer so it can be cleared on unmount.
setStatus("error");
if (errorTimerRef.current !== null) {
clearTimeout(errorTimerRef.current);
@@ -192,7 +218,8 @@ export function useDictation(
try {
optionsRef.current.onStart?.();
recorder.start();
} catch {
} catch (err) {
console.error("[dictation] MediaRecorder.start failed", err);
// recorder.start() can synchronously throw (InvalidStateError /
// NotSupportedError); clean up so the button is not left stuck and the
// MediaStream does not leak.
@@ -201,7 +228,7 @@ export function useDictation(
startingRef.current = false;
notifications.show({
color: "red",
message: t("Could not start recording"),
message: `${t("Could not start recording")}: ${(err as { message?: string })?.message ?? String(err)}`,
});
setStatus("idle");
return;

View File

@@ -93,9 +93,10 @@ export default function AiProviderSettings() {
const updateMutation = useUpdateAiSettingsMutation();
const reindexMutation = useReindexAiEmbeddingsMutation();
// Two independent test mutations so each card has its own loading + result.
// Independent test mutations so each card has its own loading + result.
const chatTest = useTestAiConnectionMutation();
const embedTest = useTestAiConnectionMutation();
const sttTest = useTestAiConnectionMutation();
// Workspace-level feature toggles live in the card headers.
const [workspace, setWorkspace] = useAtom(workspaceAtom);
@@ -354,6 +355,11 @@ export default function AiProviderSettings() {
? "ok"
: "error"
: "idle";
const sttStatus: CardStatus = sttTest.data
? sttTest.data.ok
? "ok"
: "error"
: "idle";
const chatResolved = resolveUrl(form.values.baseUrl, "/chat/completions");
const embedResolved = resolveUrl(
@@ -617,7 +623,7 @@ export default function AiProviderSettings() {
<Paper withBorder radius="md" p="lg">
<Group justify="space-between" align="center" wrap="nowrap">
<Group gap="xs" align="center" wrap="nowrap">
<StatusDot status="idle" />
<StatusDot status={sttStatus} />
<Text fw={600}>{t("Voice / STT")}</Text>
</Group>
<Switch
@@ -675,6 +681,27 @@ export default function AiProviderSettings() {
<Text size="xs" c="dimmed" mt={4} style={{ fontFamily: monoFont }} truncate>
{t("Resolves to {{url}}", { url: sttResolved })}
</Text>
<Group mt="md" align="center">
<Button
variant="default"
size="sm"
loading={sttTest.isPending}
onClick={() => sttTest.mutate("stt")}
>
{t("Test endpoint")}
</Button>
{sttTest.data &&
(sttTest.data.ok ? (
<Text size="sm" c="green">
{t("Connection successful")}
</Text>
) : (
<Text size="sm" c="red">
{sttTest.data.error || t("Connection failed")}
</Text>
))}
</Group>
</Paper>
{/* Nested: external MCP tools the agent calls out to */}

View File

@@ -55,7 +55,7 @@ export interface IAiTestResult {
}
// Which endpoint a connection test probes.
export type AiTestCapability = "chat" | "embeddings";
export type AiTestCapability = "chat" | "embeddings" | "stt";
export async function getAiSettings(): Promise<IAiSettings> {
const req = await api.post<IAiSettings>("/workspace/ai-settings");

View File

@@ -4,11 +4,13 @@ import {
Controller,
ForbiddenException,
HttpCode,
HttpException,
HttpStatus,
Logger,
Post,
Req,
Res,
ServiceUnavailableException,
UseGuards,
UseInterceptors,
} from '@nestjs/common';
@@ -32,6 +34,7 @@ import {
GetChatMessagesDto,
RenameChatDto,
} from './dto/ai-chat.dto';
import { describeProviderError } from '../../integrations/ai/ai-error.util';
/**
* Per-user AI chat API (§6.1). Routes are POST to match this codebase's
@@ -249,7 +252,31 @@ export class AiChatController {
}
throw err;
}
const text = await this.aiTranscription.transcribe(workspace.id, buf);
// Container hint for JSON-style STT providers (e.g. OpenRouter); multipart
// endpoints ignore it.
const formatMap: Record<string, string> = {
'audio/webm': 'webm',
'audio/ogg': 'ogg',
'audio/mp4': 'mp4',
'audio/mpeg': 'mp3',
'audio/wav': 'wav',
'audio/x-wav': 'wav',
'audio/wave': 'wav',
'audio/m4a': 'm4a',
'audio/x-m4a': 'm4a',
};
const format = formatMap[baseMime] ?? 'webm';
let text: string;
try {
text = await this.aiTranscription.transcribe(workspace.id, buf, format);
} catch (err) {
// Preserve meaningful HTTP errors (e.g. AiSttNotConfiguredException -> 503).
if (err instanceof HttpException) throw err;
// Log the full error and surface the real provider/transport reason instead
// of an opaque 500 (e.g. "the STT endpoint returned 404 ...").
this.logger.error('AI transcription failed', err as Error);
throw new ServiceUnavailableException(describeProviderError(err));
}
return { text };
}

View File

@@ -1,20 +1,21 @@
import { Injectable } from '@nestjs/common';
import { experimental_transcribe as transcribe } from 'ai';
import { AiService } from '../../integrations/ai/ai.service';
/**
* Transcribes uploaded audio to text using the per-workspace STT model.
* Thin wrapper over the AI SDK's experimental_transcribe; never logs the
* audio or the key.
* Delegates to AiService, which picks the OpenAI-multipart or OpenRouter-JSON
* path. Never logs the audio or the key.
*/
@Injectable()
export class AiTranscriptionService {
constructor(private readonly ai: AiService) {}
// Transcribe an uploaded audio buffer using the workspace STT model.
async transcribe(workspaceId: string, audio: Uint8Array): Promise<string> {
const model = await this.ai.getTranscriptionModel(workspaceId);
const { text } = await transcribe({ model, audio });
return text.trim();
// Transcribe an uploaded audio buffer. `format` is the container hint.
async transcribe(
workspaceId: string,
audio: Uint8Array,
format: string,
): Promise<string> {
return this.ai.transcribe(workspaceId, audio, format);
}
}

View File

@@ -1,10 +1,10 @@
import { Injectable, Logger } from '@nestjs/common';
import {
embedMany,
experimental_transcribe as transcribe,
generateText,
type EmbeddingModel,
type LanguageModel,
type TranscriptionModel,
} from 'ai';
import { createOpenAI } from '@ai-sdk/openai';
import { createGoogleGenerativeAI } from '@ai-sdk/google';
@@ -108,24 +108,90 @@ export class AiService {
}
}
/**
 * Report whether `baseURL` points at OpenRouter.
 *
 * OpenAI-compatible gateways do not all agree on the transcription API:
 * OpenRouter rejects OpenAI's multipart /audio/transcriptions and expects JSON
 * { model, input_audio: { data: <base64>, format } } instead. Matching on the
 * host keeps the regular multipart path (OpenAI, speaches,
 * faster-whisper-server, ...) untouched.
 *
 * @param baseURL Configured API base URL; may be absent.
 * @returns true only for the host `openrouter.ai` or one of its subdomains;
 *   false for a missing/empty value or a string that is not a parseable URL.
 */
private static isOpenRouter(baseURL?: string): boolean {
  if (!baseURL) return false;

  let hostname: string;
  try {
    hostname = new URL(baseURL).hostname.toLowerCase();
  } catch {
    // Not a parseable URL, so it cannot be OpenRouter.
    return false;
  }

  // Require the exact apex or a dot-separated subdomain, so look-alike hosts
  // such as "evil-openrouter.ai" never match.
  const apex = 'openrouter.ai';
  return hostname === apex || hostname.endsWith(`.${apex}`);
}
/**
* Resolve the workspace config and build the transcription (STT) model.
* STT always speaks the OpenAI-compatible /v1/audio/transcriptions API
* (only @ai-sdk/openai exposes .transcription()), regardless of the chat
* driver. sttBaseUrl falls back to the chat baseUrl; the API key falls back
* to the chat key (resolved by AiSettingsService.resolve). Built PER WORKSPACE
* on demand; the decrypted key is never logged.
*
* Throws AiSttNotConfiguredException (-> 503) when no STT model is set.
* Transcribe audio with the workspace STT model. Standard OpenAI-compatible
* endpoints use the AI SDK multipart path; OpenRouter uses its JSON+base64
* audio/transcriptions API. `format` is the audio container hint (webm / mp4 /
* wav / mp3 / ogg / m4a). Built PER WORKSPACE; the key is never logged. Throws
* AiSttNotConfiguredException (-> 503) when no STT model is configured.
*/
async getTranscriptionModel(workspaceId: string): Promise<TranscriptionModel> {
async transcribe(
workspaceId: string,
audio: Uint8Array,
format: string,
): Promise<string> {
const cfg = await this.aiSettings.resolve(workspaceId);
if (!cfg?.sttModel) throw new AiSttNotConfiguredException();
const baseURL = cfg.sttBaseUrl || cfg.baseUrl; // stt-specific, else chat
// apiKey may be unused for keyless self-hosted whisper; pass a placeholder.
return createOpenAI({ apiKey: cfg.sttApiKey ?? 'unused', baseURL }).transcription(
cfg.sttModel,
);
const baseURL = cfg.sttBaseUrl || cfg.baseUrl;
if (AiService.isOpenRouter(baseURL)) {
return this.transcribeViaOpenRouter(
baseURL as string,
cfg.sttApiKey,
cfg.sttModel,
audio,
format,
);
}
// Standard OpenAI-compatible multipart path (AI SDK). apiKey may be unused for
// keyless self-hosted whisper; pass a placeholder.
const model = createOpenAI({
apiKey: cfg.sttApiKey ?? 'unused',
baseURL,
}).transcription(cfg.sttModel);
const { text } = await transcribe({ model, audio });
return text.trim();
}
/**
 * Transcribe audio through OpenRouter's JSON `/audio/transcriptions` endpoint.
 *
 * The request body is JSON `{ model, input_audio: { data: <base64>, format } }`
 * and the transcript is read from the `text` field of the JSON reply.
 *
 * @param baseURL API base URL; any trailing slashes are dropped before the
 *   `/audio/transcriptions` path is appended.
 * @param apiKey Bearer token. The Authorization header is omitted when it is
 *   empty or undefined.
 * @param model Transcription model identifier sent as `model`.
 * @param audio Raw audio bytes; base64-encoded into the request body.
 * @param format Audio container hint sent as `input_audio.format`.
 * @returns The trimmed transcript, or '' when the reply has no string `text`.
 * @throws Error when the endpoint answers with a non-2xx status (message holds
 *   the status and up to 500 characters of the body), or when a 2xx reply is
 *   not valid JSON. The API key is never included in any message.
 */
private async transcribeViaOpenRouter(
  baseURL: string,
  apiKey: string | undefined,
  model: string,
  audio: Uint8Array,
  format: string,
): Promise<string> {
  // Strip every trailing slash so "…/v1//" does not produce a "//" in the path.
  const url = `${baseURL.replace(/\/+$/, '')}/audio/transcriptions`;

  const headers: Record<string, string> = {
    'Content-Type': 'application/json',
  };
  if (apiKey) headers.Authorization = `Bearer ${apiKey}`;

  const res = await fetch(url, {
    method: 'POST',
    headers,
    body: JSON.stringify({
      model,
      input_audio: {
        data: Buffer.from(audio).toString('base64'),
        format,
      },
    }),
  });

  if (!res.ok) {
    // Surface status + body so the real reason reaches the user; never log the key.
    const body = await res.text().catch(() => '');
    throw new Error(
      `OpenRouter transcription failed (${res.status}): ${body.slice(0, 500)}`,
    );
  }

  // Read the body as text and parse it explicitly: a 2xx reply that is not
  // JSON (e.g. an HTML page from a proxy) then reports what was actually
  // received instead of a bare "Unexpected token" SyntaxError.
  const raw = await res.text();
  let parsed: unknown;
  try {
    parsed = JSON.parse(raw);
  } catch {
    throw new Error(
      `OpenRouter transcription returned a non-JSON response (${res.status}): ${raw.slice(0, 500)}`,
    );
  }

  // Guard the type so a reply without a string `text` yields '' rather than
  // throwing on `.trim()`.
  const text = (parsed as { text?: unknown } | null)?.text;
  return typeof text === 'string' ? text.trim() : '';
}
/**
@@ -182,11 +248,36 @@ export class AiService {
return Number.isFinite(raw) && raw > 0 ? raw : 120_000;
}
/**
 * Build a small WAV file containing only silence: mono, 16-bit PCM, 16 kHz,
 * one second's worth of samples. It serves purely as a connectivity probe for
 * the STT endpoint in testConnection.
 *
 * @returns A 44-byte RIFF/WAVE header followed by zero-valued PCM samples.
 */
private static silentWavProbe(): Uint8Array {
  const sampleRate = 16000;
  const channels = 1;
  const bitsPerSample = 16;
  const blockAlign = channels * (bitsPerSample / 8); // bytes per sample frame
  const byteRate = sampleRate * blockAlign;
  const pcmBytes = sampleRate * blockAlign; // sampleRate frames, ~1 second
  const headerBytes = 44;

  // Buffer.alloc zero-fills, so the PCM payload is already silence.
  const wav = Buffer.alloc(headerBytes + pcmBytes);

  // Write the header fields back to back, advancing a cursor instead of
  // spelling out each absolute offset.
  let cursor = 0;
  const tag = (fourCc: string): void => {
    cursor += wav.write(fourCc, cursor);
  };
  const u32 = (value: number): void => {
    cursor = wav.writeUInt32LE(value, cursor);
  };
  const u16 = (value: number): void => {
    cursor = wav.writeUInt16LE(value, cursor);
  };

  tag('RIFF');
  u32(headerBytes - 8 + pcmBytes); // RIFF chunk size = file size minus 8
  tag('WAVE');
  tag('fmt ');
  u32(16); // PCM fmt chunk size
  u16(1); // audio format = PCM
  u16(channels);
  u32(sampleRate);
  u32(byteRate);
  u16(blockAlign);
  u16(bitsPerSample);
  tag('data');
  u32(pcmBytes);

  return wav;
}
/**
* Cheap connectivity check for a single "Test endpoint" button. Probes ONLY
* the requested capability so each card in the UI surfaces its own result:
* - `chat`: a one-word generation against the configured chat model;
* - `embeddings`: embedding a tiny string against the embedding model.
* - `embeddings`: embedding a tiny string against the embedding model;
* - `stt`: transcribing a tiny silent WAV against the transcription model.
*
* A capability that is not configured returns a plain "… is not configured"
* message; any real failure returns ok:false with the provider's own cause
@@ -201,7 +292,7 @@ export class AiService {
*/
async testConnection(
workspaceId: string,
capability: 'chat' | 'embeddings' = 'chat',
capability: 'chat' | 'embeddings' | 'stt' = 'chat',
): Promise<{ ok: true } | { ok: false; error: string }> {
if (capability === 'embeddings') {
try {
@@ -216,6 +307,21 @@ export class AiService {
}
}
if (capability === 'stt') {
try {
// Probe with a tiny silent WAV; a reachable, authorized endpoint returns
// (usually empty) text, any failure surfaces via describeProviderError.
await this.transcribe(workspaceId, AiService.silentWavProbe(), 'wav');
return { ok: true };
} catch (err) {
if (err instanceof AiSttNotConfiguredException) {
return { ok: false, error: 'STT is not configured' };
}
this.logger.error('AI STT test connection failed', err as Error);
return { ok: false, error: describeProviderError(err) };
}
}
// Default: chat probe.
try {
const model = await this.getChatModel(workspaceId);

View File

@@ -4,6 +4,6 @@ import { IsIn, IsOptional } from 'class-validator';
// defaults to the chat endpoint server-side when omitted.
export class TestAiConnectionDto {
@IsOptional()
@IsIn(['chat', 'embeddings'])
capability?: 'chat' | 'embeddings';
@IsIn(['chat', 'embeddings', 'stt'])
capability?: 'chat' | 'embeddings' | 'stt';
}