feat(ai): OpenRouter STT support + real error surfacing + STT endpoint test

- ai.service: route STT for openrouter.ai (and its subdomains) to its JSON+base64 /audio/transcriptions API; keep the OpenAI multipart path (AI SDK) for OpenAI/self-hosted whisper. Unify transcription behind transcribe().
- /transcribe controller: surface the real provider/transport reason (describeProviderError) instead of an opaque 500; preserve HttpException.
- testConnection: add an 'stt' capability (silent-WAV probe) + DTO; client gets a Test endpoint button and status dot on the Voice/STT card.
- useDictation: log full errors to the console and show the real reason (mic start + transcription paths); handle NotReadableError/AbortError and missing mediaDevices.
- docs(CLAUDE.md): require full error logging + specific user-facing messages.
2026-06-18 19:26:35 +03:00
parent ef90655657
commit 77249d59c6
8 changed files with 237 additions and 47 deletions
--- a/apps/client/public/locales/en-US/translation.json
+++ b/apps/client/public/locales/en-US/translation.json
@@ -1189,5 +1189,7 @@
  "No microphone found": "No microphone found",
  "Could not start recording": "Could not start recording",
  "Transcription failed": "Transcription failed",
-  "Voice dictation is not configured": "Voice dictation is not configured"
+  "Voice dictation is not configured": "Voice dictation is not configured",
+  "Microphone is unavailable or already in use": "Microphone is unavailable or already in use",
+  "Audio recording is not available in this browser/context": "Audio recording is not available in this browser/context"
 }
--- a/apps/client/src/features/dictation/hooks/use-dictation.ts
+++ b/apps/client/src/features/dictation/hooks/use-dictation.ts
@@ -90,18 +90,37 @@ export function useDictation(
    if (status !== "idle") return;
    startingRef.current = true;

+    if (!navigator.mediaDevices?.getUserMedia) {
+      const reason =
+        "navigator.mediaDevices.getUserMedia is unavailable in this context";
+      console.error("[dictation] " + reason);
+      notifications.show({
+        color: "red",
+        message: t("Audio recording is not available in this browser/context"),
+      });
+      setStatus("idle");
+      startingRef.current = false;
+      return;
+    }
+
    let stream: MediaStream;
    try {
      stream = await navigator.mediaDevices.getUserMedia({ audio: true });
    } catch (err) {
+      // Always log the full error for diagnosis (name, message, stack).
+      console.error("[dictation] getUserMedia failed", err);
      const name = (err as { name?: string })?.name;
+      const detail = (err as { message?: string })?.message ?? String(err);
      let message: string;
      if (name === "NotAllowedError" || name === "SecurityError") {
        message = t("Microphone access denied");
      } else if (name === "NotFoundError" || name === "OverconstrainedError") {
        message = t("No microphone found");
+      } else if (name === "NotReadableError" || name === "AbortError") {
+        message = t("Microphone is unavailable or already in use");
      } else {
-        message = t("Could not start recording");
+        // Unknown failure: show the real reason instead of a generic string.
+        message = `${t("Could not start recording")}: ${name ? `${name}: ` : ""}${detail}`;
      }
      notifications.show({ color: "red", message });
      setStatus("idle");
@@ -120,13 +139,14 @@ export function useDictation(
        stream,
        mimeType ? { mimeType } : undefined,
      );
-    } catch {
+    } catch (err) {
+      console.error("[dictation] MediaRecorder failed", err);
      // The stream was acquired but the recorder failed to construct; stop the
      // tracks so the MediaStream does not leak before bailing out.
      stopTracks();
      notifications.show({
        color: "red",
-        message: t("Could not start recording"),
+        message: `${t("Could not start recording")}: ${(err as { message?: string })?.message ?? String(err)}`,
      });
      setStatus("idle");
      startingRef.current = false;
@@ -165,17 +185,23 @@ export function useDictation(
          setStatus("idle");
        })
        .catch((err: unknown) => {
-          const httpStatus = (err as { response?: { status?: number } })
-            ?.response?.status;
-          // The server returns 503 when dictation is unconfigured and 403 when
-          // it is disabled server-side; both map to the same "not configured".
-          const message =
-            httpStatus === 503 || httpStatus === 403
-              ? t("Voice dictation is not configured")
-              : t("Transcription failed");
+          // Log the full error for diagnosis (status + body + stack).
+          console.error("[dictation] transcription failed", err);
+          const resp = (
+            err as { response?: { status?: number; data?: { message?: string } } }
+          )?.response;
+          const serverMsg = resp?.data?.message;
+          let message: string;
+          if (serverMsg && serverMsg.trim().length > 0) {
+            // The server already explains the cause (e.g. provider 404, bad
+            // format, STT not configured) — show it verbatim.
+            message = serverMsg;
+          } else if (resp?.status === 503 || resp?.status === 403) {
+            message = t("Voice dictation is not configured");
+          } else {
+            message = `${t("Transcription failed")}: ${(err as { message?: string })?.message ?? String(err)}`;
+          }
          notifications.show({ color: "red", message });
-          // Surface the error state briefly, then return to idle. Store the
-          // timer so it can be cleared on unmount.
          setStatus("error");
          if (errorTimerRef.current !== null) {
            clearTimeout(errorTimerRef.current);
@@ -192,7 +218,8 @@ export function useDictation(
    try {
      optionsRef.current.onStart?.();
      recorder.start();
-    } catch {
+    } catch (err) {
+      console.error("[dictation] MediaRecorder.start failed", err);
      // recorder.start() can synchronously throw (InvalidStateError /
      // NotSupportedError); clean up so the button is not left stuck and the
      // MediaStream does not leak.
@@ -201,7 +228,7 @@ export function useDictation(
      startingRef.current = false;
      notifications.show({
        color: "red",
-        message: t("Could not start recording"),
+        message: `${t("Could not start recording")}: ${(err as { message?: string })?.message ?? String(err)}`,
      });
      setStatus("idle");
      return;
--- a/apps/client/src/features/workspace/components/settings/components/ai-provider-settings.tsx
+++ b/apps/client/src/features/workspace/components/settings/components/ai-provider-settings.tsx
@@ -93,9 +93,10 @@ export default function AiProviderSettings() {
  const updateMutation = useUpdateAiSettingsMutation();
  const reindexMutation = useReindexAiEmbeddingsMutation();

-  // Two independent test mutations so each card has its own loading + result.
+  // Independent test mutations so each card has its own loading + result.
  const chatTest = useTestAiConnectionMutation();
  const embedTest = useTestAiConnectionMutation();
+  const sttTest = useTestAiConnectionMutation();

  // Workspace-level feature toggles live in the card headers.
  const [workspace, setWorkspace] = useAtom(workspaceAtom);
@@ -354,6 +355,11 @@ export default function AiProviderSettings() {
      ? "ok"
      : "error"
    : "idle";
+  const sttStatus: CardStatus = sttTest.data
+    ? sttTest.data.ok
+      ? "ok"
+      : "error"
+    : "idle";

  const chatResolved = resolveUrl(form.values.baseUrl, "/chat/completions");
  const embedResolved = resolveUrl(
@@ -617,7 +623,7 @@ export default function AiProviderSettings() {
      <Paper withBorder radius="md" p="lg">
        <Group justify="space-between" align="center" wrap="nowrap">
          <Group gap="xs" align="center" wrap="nowrap">
-            <StatusDot status="idle" />
+            <StatusDot status={sttStatus} />
            <Text fw={600}>{t("Voice / STT")}</Text>
          </Group>
          <Switch
@@ -675,6 +681,27 @@ export default function AiProviderSettings() {
        <Text size="xs" c="dimmed" mt={4} style={{ fontFamily: monoFont }} truncate>
          {t("Resolves to {{url}}", { url: sttResolved })}
        </Text>
+
+        <Group mt="md" align="center">
+          <Button
+            variant="default"
+            size="sm"
+            loading={sttTest.isPending}
+            onClick={() => sttTest.mutate("stt")}
+          >
+            {t("Test endpoint")}
+          </Button>
+          {sttTest.data &&
+            (sttTest.data.ok ? (
+              <Text size="sm" c="green">
+                {t("Connection successful")}
+              </Text>
+            ) : (
+              <Text size="sm" c="red">
+                {sttTest.data.error || t("Connection failed")}
+              </Text>
+            ))}
+        </Group>
      </Paper>

      {/* Nested: external MCP tools the agent calls out to */}
--- a/apps/client/src/features/workspace/services/ai-settings-service.ts
+++ b/apps/client/src/features/workspace/services/ai-settings-service.ts
@@ -55,7 +55,7 @@ export interface IAiTestResult {
 }

 // Which endpoint a connection test probes.
-export type AiTestCapability = "chat" | "embeddings";
+export type AiTestCapability = "chat" | "embeddings" | "stt";

 export async function getAiSettings(): Promise<IAiSettings> {
  const req = await api.post<IAiSettings>("/workspace/ai-settings");
--- a/apps/server/src/core/ai-chat/ai-chat.controller.ts
+++ b/apps/server/src/core/ai-chat/ai-chat.controller.ts
@@ -4,11 +4,13 @@ import {
  Controller,
  ForbiddenException,
  HttpCode,
+  HttpException,
  HttpStatus,
  Logger,
  Post,
  Req,
  Res,
+  ServiceUnavailableException,
  UseGuards,
  UseInterceptors,
 } from '@nestjs/common';
@@ -32,6 +34,7 @@ import {
  GetChatMessagesDto,
  RenameChatDto,
 } from './dto/ai-chat.dto';
+import { describeProviderError } from '../../integrations/ai/ai-error.util';

 /**
 * Per-user AI chat API (§6.1). Routes are POST to match this codebase's
@@ -249,7 +252,31 @@ export class AiChatController {
      }
      throw err;
    }
-    const text = await this.aiTranscription.transcribe(workspace.id, buf);
+    // Container hint for JSON-style STT providers (e.g. OpenRouter); multipart
+    // endpoints ignore it.
+    const formatMap: Record<string, string> = {
+      'audio/webm': 'webm',
+      'audio/ogg': 'ogg',
+      'audio/mp4': 'mp4',
+      'audio/mpeg': 'mp3',
+      'audio/wav': 'wav',
+      'audio/x-wav': 'wav',
+      'audio/wave': 'wav',
+      'audio/m4a': 'm4a',
+      'audio/x-m4a': 'm4a',
+    };
+    const format = formatMap[baseMime] ?? 'webm';
+    let text: string;
+    try {
+      text = await this.aiTranscription.transcribe(workspace.id, buf, format);
+    } catch (err) {
+      // Preserve meaningful HTTP errors (e.g. AiSttNotConfiguredException -> 503).
+      if (err instanceof HttpException) throw err;
+      // Log the full error and surface the real provider/transport reason instead
+      // of an opaque 500 (e.g. "the STT endpoint returned 404 ...").
+      this.logger.error('AI transcription failed', err as Error);
+      throw new ServiceUnavailableException(describeProviderError(err));
+    }
    return { text };
  }

--- a/apps/server/src/core/ai-chat/ai-transcription.service.ts
+++ b/apps/server/src/core/ai-chat/ai-transcription.service.ts
@@ -1,20 +1,21 @@
 import { Injectable } from '@nestjs/common';
-import { experimental_transcribe as transcribe } from 'ai';
 import { AiService } from '../../integrations/ai/ai.service';

 /**
 * Transcribes uploaded audio to text using the per-workspace STT model.
- * Thin wrapper over the AI SDK's experimental_transcribe; never logs the
- * audio or the key.
+ * Delegates to AiService, which picks the OpenAI-multipart or OpenRouter-JSON
+ * path. Never logs the audio or the key.
 */
@Injectable()
 export class AiTranscriptionService {
  constructor(private readonly ai: AiService) {}

-  // Transcribe an uploaded audio buffer using the workspace STT model.
-  async transcribe(workspaceId: string, audio: Uint8Array): Promise<string> {
-    const model = await this.ai.getTranscriptionModel(workspaceId);
-    const { text } = await transcribe({ model, audio });
-    return text.trim();
+  // Transcribe an uploaded audio buffer. `format` is the container hint.
+  async transcribe(
+    workspaceId: string,
+    audio: Uint8Array,
+    format: string,
+  ): Promise<string> {
+    return this.ai.transcribe(workspaceId, audio, format);
  }
 }
--- a/apps/server/src/integrations/ai/ai.service.ts
+++ b/apps/server/src/integrations/ai/ai.service.ts
@@ -1,10 +1,10 @@
 import { Injectable, Logger } from '@nestjs/common';
 import {
  embedMany,
+  experimental_transcribe as transcribe,
  generateText,
  type EmbeddingModel,
  type LanguageModel,
-  type TranscriptionModel,
 } from 'ai';
 import { createOpenAI } from '@ai-sdk/openai';
 import { createGoogleGenerativeAI } from '@ai-sdk/google';
@@ -108,24 +108,90 @@ export class AiService {
    }
  }

+  // Some OpenAI-compatible gateways diverge on the transcription API. OpenRouter
+  // does NOT accept OpenAI's multipart /audio/transcriptions; it wants JSON
+  // { model, input_audio: { data: <base64>, format } }. Detect it by host so the
+  // standard multipart path (OpenAI, speaches, faster-whisper-server, ...) is
+  // unaffected.
+  private static isOpenRouter(baseURL?: string): boolean {
+    if (!baseURL) return false;
+    try {
+      const host = new URL(baseURL).hostname.toLowerCase();
+      // Exact host or a real subdomain — avoid matching e.g. "evil-openrouter.ai".
+      return host === 'openrouter.ai' || host.endsWith('.openrouter.ai');
+    } catch {
+      return false;
+    }
+  }
+
  /**
-   * Resolve the workspace config and build the transcription (STT) model.
-   * STT always speaks the OpenAI-compatible /v1/audio/transcriptions API
-   * (only @ai-sdk/openai exposes .transcription()), regardless of the chat
-   * driver. sttBaseUrl falls back to the chat baseUrl; the API key falls back
-   * to the chat key (resolved by AiSettingsService.resolve). Built PER WORKSPACE
-   * on demand; the decrypted key is never logged.
-   *
-   * Throws AiSttNotConfiguredException (-> 503) when no STT model is set.
+   * Transcribe audio with the workspace STT model. Standard OpenAI-compatible
+   * endpoints use the AI SDK multipart path; OpenRouter uses its JSON+base64
+   * audio/transcriptions API. `format` is the audio container hint (webm / mp4 /
+   * wav / mp3 / ogg / m4a). Built PER WORKSPACE; the key is never logged. Throws
+   * AiSttNotConfiguredException (-> 503) when no STT model is configured.
   */
-  async getTranscriptionModel(workspaceId: string): Promise<TranscriptionModel> {
+  async transcribe(
+    workspaceId: string,
+    audio: Uint8Array,
+    format: string,
+  ): Promise<string> {
    const cfg = await this.aiSettings.resolve(workspaceId);
    if (!cfg?.sttModel) throw new AiSttNotConfiguredException();
-    const baseURL = cfg.sttBaseUrl || cfg.baseUrl; // stt-specific, else chat
-    // apiKey may be unused for keyless self-hosted whisper; pass a placeholder.
-    return createOpenAI({ apiKey: cfg.sttApiKey ?? 'unused', baseURL }).transcription(
-      cfg.sttModel,
-    );
+    const baseURL = cfg.sttBaseUrl || cfg.baseUrl;
+
+    if (AiService.isOpenRouter(baseURL)) {
+      return this.transcribeViaOpenRouter(
+        baseURL as string,
+        cfg.sttApiKey,
+        cfg.sttModel,
+        audio,
+        format,
+      );
+    }
+
+    // Standard OpenAI-compatible multipart path (AI SDK). apiKey may be unused for
+    // keyless self-hosted whisper; pass a placeholder.
+    const model = createOpenAI({
+      apiKey: cfg.sttApiKey ?? 'unused',
+      baseURL,
+    }).transcription(cfg.sttModel);
+    const { text } = await transcribe({ model, audio });
+    return text.trim();
+  }
+
+  // OpenRouter transcription: JSON body with base64 audio; returns { text }.
+  private async transcribeViaOpenRouter(
+    baseURL: string,
+    apiKey: string | undefined,
+    model: string,
+    audio: Uint8Array,
+    format: string,
+  ): Promise<string> {
+    const url = `${baseURL.replace(/\/$/, '')}/audio/transcriptions`;
+    const res = await fetch(url, {
+      method: 'POST',
+      headers: {
+        'Content-Type': 'application/json',
+        ...(apiKey ? { Authorization: `Bearer ${apiKey}` } : {}),
+      },
+      body: JSON.stringify({
+        model,
+        input_audio: {
+          data: Buffer.from(audio).toString('base64'),
+          format,
+        },
+      }),
+    });
+    if (!res.ok) {
+      // Surface status + body so the real reason reaches the user; never log the key.
+      const body = await res.text().catch(() => '');
+      throw new Error(
+        `OpenRouter transcription failed (${res.status}): ${body.slice(0, 500)}`,
+      );
+    }
+    const json = (await res.json()) as { text?: string };
+    return (json.text ?? '').trim();
  }

  /**
@@ -182,11 +248,36 @@ export class AiService {
    return Number.isFinite(raw) && raw > 0 ? raw : 120_000;
  }

+  // Build a tiny valid WAV (mono, 16-bit PCM, 16 kHz, ~1s of silence), used only
+  // as a connectivity probe for the STT endpoint in testConnection.
+  private static silentWavProbe(): Uint8Array {
+    const sampleRate = 16000;
+    const numSamples = sampleRate; // ~1 second
+    const dataSize = numSamples * 2; // 16-bit mono
+    const buf = Buffer.alloc(44 + dataSize);
+    buf.write('RIFF', 0);
+    buf.writeUInt32LE(36 + dataSize, 4);
+    buf.write('WAVE', 8);
+    buf.write('fmt ', 12);
+    buf.writeUInt32LE(16, 16); // PCM fmt chunk size
+    buf.writeUInt16LE(1, 20); // audio format = PCM
+    buf.writeUInt16LE(1, 22); // channels = 1
+    buf.writeUInt32LE(sampleRate, 24);
+    buf.writeUInt32LE(sampleRate * 2, 28); // byte rate
+    buf.writeUInt16LE(2, 32); // block align
+    buf.writeUInt16LE(16, 34); // bits per sample
+    buf.write('data', 36);
+    buf.writeUInt32LE(dataSize, 40);
+    // The PCM samples stay zero (silence).
+    return buf;
+  }
+
  /**
   * Cheap connectivity check for a single "Test endpoint" button. Probes ONLY
   * the requested capability so each card in the UI surfaces its own result:
   *  - `chat`: a one-word generation against the configured chat model;
-   *  - `embeddings`: embedding a tiny string against the embedding model.
+   *  - `embeddings`: embedding a tiny string against the embedding model;
+   *  - `stt`: transcribing a tiny silent WAV against the transcription model.
   *
   * A capability that is not configured returns a plain "… is not configured"
   * message; any real failure returns ok:false with the provider's own cause
@@ -201,7 +292,7 @@ export class AiService {
   */
  async testConnection(
    workspaceId: string,
-    capability: 'chat' | 'embeddings' = 'chat',
+    capability: 'chat' | 'embeddings' | 'stt' = 'chat',
  ): Promise<{ ok: true } | { ok: false; error: string }> {
    if (capability === 'embeddings') {
      try {
@@ -216,6 +307,21 @@ export class AiService {
      }
    }

+    if (capability === 'stt') {
+      try {
+        // Probe with a tiny silent WAV; a reachable, authorized endpoint returns
+        // (usually empty) text, any failure surfaces via describeProviderError.
+        await this.transcribe(workspaceId, AiService.silentWavProbe(), 'wav');
+        return { ok: true };
+      } catch (err) {
+        if (err instanceof AiSttNotConfiguredException) {
+          return { ok: false, error: 'STT is not configured' };
+        }
+        this.logger.error('AI STT test connection failed', err as Error);
+        return { ok: false, error: describeProviderError(err) };
+      }
+    }
+
    // Default: chat probe.
    try {
      const model = await this.getChatModel(workspaceId);
--- a/apps/server/src/integrations/ai/dto/test-ai-connection.dto.ts
+++ b/apps/server/src/integrations/ai/dto/test-ai-connection.dto.ts
@@ -4,6 +4,6 @@ import { IsIn, IsOptional } from 'class-validator';
 // defaults to the chat endpoint server-side when omitted.
 export class TestAiConnectionDto {
  @IsOptional()
-  @IsIn(['chat', 'embeddings'])
-  capability?: 'chat' | 'embeddings';
+  @IsIn(['chat', 'embeddings', 'stt'])
+  capability?: 'chat' | 'embeddings' | 'stt';
 }