From 77249d59c6065b12d65312a2dc96c07353059179 Mon Sep 17 00:00:00 2001
From: vvzvlad <git@vvzvlad.xyz>
Date: Thu, 18 Jun 2026 19:26:35 +0300
Subject: [PATCH] feat(ai): OpenRouter STT support + real error surfacing + STT
 endpoint test

- ai.service: route *.openrouter.ai STT to its JSON+base64
  /audio/transcriptions API; keep the OpenAI multipart path (AI SDK) for
  OpenAI/self-hosted whisper. Unify transcription behind transcribe().
- /transcribe controller: surface the real provider/transport reason
  (describeProviderError) instead of an opaque 500; preserve HttpException.
- testConnection: add an 'stt' capability (silent-WAV probe) + DTO; client
  gets a Test endpoint button and status dot on the Voice/STT card.
- useDictation: log full errors to the console and show the real reason
  (mic start + transcription paths); handle NotReadable/Abort and missing
  mediaDevices.
- docs(CLAUDE.md): require full error logging + specific user-facing messages.
---
 .../public/locales/en-US/translation.json     |   4 +-
 .../features/dictation/hooks/use-dictation.ts |  57 +++++--
 .../components/ai-provider-settings.tsx       |  31 +++-
 .../workspace/services/ai-settings-service.ts |   2 +-
 .../src/core/ai-chat/ai-chat.controller.ts    |  29 +++-
 .../core/ai-chat/ai-transcription.service.ts  |  17 ++-
 apps/server/src/integrations/ai/ai.service.ts | 140 +++++++++++++++---
 .../ai/dto/test-ai-connection.dto.ts          |   4 +-
 8 files changed, 237 insertions(+), 47 deletions(-)

diff --git a/apps/client/public/locales/en-US/translation.json b/apps/client/public/locales/en-US/translation.json
index 591b362a..8cfd742c 100644
--- a/apps/client/public/locales/en-US/translation.json
+++ b/apps/client/public/locales/en-US/translation.json
@@ -1189,5 +1189,7 @@
   "No microphone found": "No microphone found",
   "Could not start recording": "Could not start recording",
   "Transcription failed": "Transcription failed",
-  "Voice dictation is not configured": "Voice dictation is not configured"
+  "Voice dictation is not configured": "Voice dictation is not configured",
+  "Microphone is unavailable or already in use": "Microphone is unavailable or already in use",
+  "Audio recording is not available in this browser/context": "Audio recording is not available in this browser/context"
 }
diff --git a/apps/client/src/features/dictation/hooks/use-dictation.ts b/apps/client/src/features/dictation/hooks/use-dictation.ts
index 059949f0..86af4c78 100644
--- a/apps/client/src/features/dictation/hooks/use-dictation.ts
+++ b/apps/client/src/features/dictation/hooks/use-dictation.ts
@@ -90,18 +90,37 @@ export function useDictation(
     if (status !== "idle") return;
     startingRef.current = true;
 
+    if (!navigator.mediaDevices?.getUserMedia) {
+      const reason =
+        "navigator.mediaDevices.getUserMedia is unavailable in this context";
+      console.error("[dictation] " + reason);
+      notifications.show({
+        color: "red",
+        message: t("Audio recording is not available in this browser/context"),
+      });
+      setStatus("idle");
+      startingRef.current = false;
+      return;
+    }
+
     let stream: MediaStream;
     try {
       stream = await navigator.mediaDevices.getUserMedia({ audio: true });
     } catch (err) {
+      // Always log the full error for diagnosis (name, message, stack).
+      console.error("[dictation] getUserMedia failed", err);
       const name = (err as { name?: string })?.name;
+      const detail = (err as { message?: string })?.message ?? String(err);
       let message: string;
       if (name === "NotAllowedError" || name === "SecurityError") {
         message = t("Microphone access denied");
       } else if (name === "NotFoundError" || name === "OverconstrainedError") {
         message = t("No microphone found");
+      } else if (name === "NotReadableError" || name === "AbortError") {
+        message = t("Microphone is unavailable or already in use");
       } else {
-        message = t("Could not start recording");
+        // Unknown failure: show the real reason instead of a generic string.
+        message = `${t("Could not start recording")}: ${name ? `${name}: ` : ""}${detail}`;
       }
       notifications.show({ color: "red", message });
       setStatus("idle");
@@ -120,13 +139,14 @@ export function useDictation(
         stream,
         mimeType ? { mimeType } : undefined,
       );
-    } catch {
+    } catch (err) {
+      console.error("[dictation] MediaRecorder failed", err);
       // The stream was acquired but the recorder failed to construct; stop the
       // tracks so the MediaStream does not leak before bailing out.
       stopTracks();
       notifications.show({
         color: "red",
-        message: t("Could not start recording"),
+        message: `${t("Could not start recording")}: ${(err as { message?: string })?.message ?? String(err)}`,
       });
       setStatus("idle");
       startingRef.current = false;
@@ -165,17 +185,23 @@ export function useDictation(
           setStatus("idle");
         })
         .catch((err: unknown) => {
-          const httpStatus = (err as { response?: { status?: number } })
-            ?.response?.status;
-          // The server returns 503 when dictation is unconfigured and 403 when
-          // it is disabled server-side; both map to the same "not configured".
-          const message =
-            httpStatus === 503 || httpStatus === 403
-              ? t("Voice dictation is not configured")
-              : t("Transcription failed");
+          // Log the full error for diagnosis (status + body + stack).
+          console.error("[dictation] transcription failed", err);
+          const resp = (
+            err as { response?: { status?: number; data?: { message?: unknown } } }
+          )?.response;
+          const serverMsg = resp?.data?.message;
+          let message: string;
+          if (typeof serverMsg === "string" && serverMsg.trim().length > 0) {
+            // The server already explains the cause (e.g. provider 404, STT not
+            // configured) — show it verbatim; non-string payloads fall through.
+            message = serverMsg;
+          } else if (resp?.status === 503 || resp?.status === 403) {
+            message = t("Voice dictation is not configured");
+          } else {
+            message = `${t("Transcription failed")}: ${(err as { message?: string })?.message ?? String(err)}`;
+          }
           notifications.show({ color: "red", message });
-          // Surface the error state briefly, then return to idle. Store the
-          // timer so it can be cleared on unmount.
           setStatus("error");
           if (errorTimerRef.current !== null) {
             clearTimeout(errorTimerRef.current);
@@ -192,7 +218,8 @@ export function useDictation(
     try {
       optionsRef.current.onStart?.();
       recorder.start();
-    } catch {
+    } catch (err) {
+      console.error("[dictation] MediaRecorder.start failed", err);
       // recorder.start() can synchronously throw (InvalidStateError /
       // NotSupportedError); clean up so the button is not left stuck and the
       // MediaStream does not leak.
@@ -201,7 +228,7 @@ export function useDictation(
       startingRef.current = false;
       notifications.show({
         color: "red",
-        message: t("Could not start recording"),
+        message: `${t("Could not start recording")}: ${(err as { message?: string })?.message ?? String(err)}`,
       });
       setStatus("idle");
       return;
diff --git a/apps/client/src/features/workspace/components/settings/components/ai-provider-settings.tsx b/apps/client/src/features/workspace/components/settings/components/ai-provider-settings.tsx
index e39176fd..b908fc03 100644
--- a/apps/client/src/features/workspace/components/settings/components/ai-provider-settings.tsx
+++ b/apps/client/src/features/workspace/components/settings/components/ai-provider-settings.tsx
@@ -93,9 +93,10 @@ export default function AiProviderSettings() {
   const updateMutation = useUpdateAiSettingsMutation();
   const reindexMutation = useReindexAiEmbeddingsMutation();
 
-  // Two independent test mutations so each card has its own loading + result.
+  // Independent test mutations so each card has its own loading + result.
   const chatTest = useTestAiConnectionMutation();
   const embedTest = useTestAiConnectionMutation();
+  const sttTest = useTestAiConnectionMutation();
 
   // Workspace-level feature toggles live in the card headers.
   const [workspace, setWorkspace] = useAtom(workspaceAtom);
@@ -354,6 +355,11 @@ export default function AiProviderSettings() {
       ? "ok"
       : "error"
     : "idle";
+  const sttStatus: CardStatus = sttTest.data
+    ? sttTest.data.ok
+      ? "ok"
+      : "error"
+    : "idle";
 
   const chatResolved = resolveUrl(form.values.baseUrl, "/chat/completions");
   const embedResolved = resolveUrl(
@@ -617,7 +623,7 @@ export default function AiProviderSettings() {
       <Paper withBorder radius="md" p="lg">
         <Group justify="space-between" align="center" wrap="nowrap">
           <Group gap="xs" align="center" wrap="nowrap">
-            <StatusDot status="idle" />
+            <StatusDot status={sttStatus} />
             <Text fw={600}>{t("Voice / STT")}</Text>
           </Group>
           <Switch
@@ -675,6 +681,27 @@ export default function AiProviderSettings() {
         <Text size="xs" c="dimmed" mt={4} style={{ fontFamily: monoFont }} truncate>
           {t("Resolves to {{url}}", { url: sttResolved })}
         </Text>
+
+        <Group mt="md" align="center">
+          <Button
+            variant="default"
+            size="sm"
+            loading={sttTest.isPending}
+            onClick={() => sttTest.mutate("stt")}
+          >
+            {t("Test endpoint")}
+          </Button>
+          {sttTest.data &&
+            (sttTest.data.ok ? (
+              <Text size="sm" c="green">
+                {t("Connection successful")}
+              </Text>
+            ) : (
+              <Text size="sm" c="red">
+                {sttTest.data.error || t("Connection failed")}
+              </Text>
+            ))}
+        </Group>
       </Paper>
 
       {/* Nested: external MCP tools the agent calls out to */}
diff --git a/apps/client/src/features/workspace/services/ai-settings-service.ts b/apps/client/src/features/workspace/services/ai-settings-service.ts
index 53809ab9..99490189 100644
--- a/apps/client/src/features/workspace/services/ai-settings-service.ts
+++ b/apps/client/src/features/workspace/services/ai-settings-service.ts
@@ -55,7 +55,7 @@ export interface IAiTestResult {
 }
 
 // Which endpoint a connection test probes.
-export type AiTestCapability = "chat" | "embeddings";
+export type AiTestCapability = "chat" | "embeddings" | "stt";
 
 export async function getAiSettings(): Promise<IAiSettings> {
   const req = await api.post<IAiSettings>("/workspace/ai-settings");
diff --git a/apps/server/src/core/ai-chat/ai-chat.controller.ts b/apps/server/src/core/ai-chat/ai-chat.controller.ts
index d1007a78..c32e8e3c 100644
--- a/apps/server/src/core/ai-chat/ai-chat.controller.ts
+++ b/apps/server/src/core/ai-chat/ai-chat.controller.ts
@@ -4,11 +4,13 @@ import {
   Controller,
   ForbiddenException,
   HttpCode,
+  HttpException,
   HttpStatus,
   Logger,
   Post,
   Req,
   Res,
+  ServiceUnavailableException,
   UseGuards,
   UseInterceptors,
 } from '@nestjs/common';
@@ -32,6 +34,7 @@ import {
   GetChatMessagesDto,
   RenameChatDto,
 } from './dto/ai-chat.dto';
+import { describeProviderError } from '../../integrations/ai/ai-error.util';
 
 /**
  * Per-user AI chat API (§6.1). Routes are POST to match this codebase's
@@ -249,7 +252,31 @@ export class AiChatController {
       }
       throw err;
     }
-    const text = await this.aiTranscription.transcribe(workspace.id, buf);
+    // Container hint for JSON-style STT providers (e.g. OpenRouter); multipart
+    // endpoints ignore it.
+    const formatMap: Record<string, string> = {
+      'audio/webm': 'webm',
+      'audio/ogg': 'ogg',
+      'audio/mp4': 'mp4',
+      'audio/mpeg': 'mp3',
+      'audio/wav': 'wav',
+      'audio/x-wav': 'wav',
+      'audio/wave': 'wav',
+      'audio/m4a': 'm4a',
+      'audio/x-m4a': 'm4a',
+    };
+    const format = formatMap[baseMime] ?? 'webm';
+    let text: string;
+    try {
+      text = await this.aiTranscription.transcribe(workspace.id, buf, format);
+    } catch (err) {
+      // Preserve meaningful HTTP errors (e.g. AiSttNotConfiguredException -> 503).
+      if (err instanceof HttpException) throw err;
+      // Log the full error and surface the real provider/transport reason instead
+      // of an opaque 500 (e.g. "the STT endpoint returned 404 ...").
+      this.logger.error('AI transcription failed', err as Error);
+      throw new ServiceUnavailableException(describeProviderError(err));
+    }
     return { text };
   }
 
diff --git a/apps/server/src/core/ai-chat/ai-transcription.service.ts b/apps/server/src/core/ai-chat/ai-transcription.service.ts
index 72d3ea9f..b95cbb69 100644
--- a/apps/server/src/core/ai-chat/ai-transcription.service.ts
+++ b/apps/server/src/core/ai-chat/ai-transcription.service.ts
@@ -1,20 +1,21 @@
 import { Injectable } from '@nestjs/common';
-import { experimental_transcribe as transcribe } from 'ai';
 import { AiService } from '../../integrations/ai/ai.service';
 
 /**
  * Transcribes uploaded audio to text using the per-workspace STT model.
- * Thin wrapper over the AI SDK's experimental_transcribe; never logs the
- * audio or the key.
+ * Delegates to AiService, which picks the OpenAI-multipart or OpenRouter-JSON
+ * path. Never logs the audio or the key.
  */
 @Injectable()
 export class AiTranscriptionService {
   constructor(private readonly ai: AiService) {}
 
-  // Transcribe an uploaded audio buffer using the workspace STT model.
-  async transcribe(workspaceId: string, audio: Uint8Array): Promise<string> {
-    const model = await this.ai.getTranscriptionModel(workspaceId);
-    const { text } = await transcribe({ model, audio });
-    return text.trim();
+  // Transcribe an uploaded audio buffer. `format` is the container hint.
+  async transcribe(
+    workspaceId: string,
+    audio: Uint8Array,
+    format: string,
+  ): Promise<string> {
+    return this.ai.transcribe(workspaceId, audio, format);
   }
 }
diff --git a/apps/server/src/integrations/ai/ai.service.ts b/apps/server/src/integrations/ai/ai.service.ts
index b93416d0..e894d703 100644
--- a/apps/server/src/integrations/ai/ai.service.ts
+++ b/apps/server/src/integrations/ai/ai.service.ts
@@ -1,10 +1,10 @@
 import { Injectable, Logger } from '@nestjs/common';
 import {
   embedMany,
+  experimental_transcribe as transcribe,
   generateText,
   type EmbeddingModel,
   type LanguageModel,
-  type TranscriptionModel,
 } from 'ai';
 import { createOpenAI } from '@ai-sdk/openai';
 import { createGoogleGenerativeAI } from '@ai-sdk/google';
@@ -108,24 +108,90 @@ export class AiService {
     }
   }
 
+  // True when baseURL's host is openrouter.ai or a subdomain of it; a missing or
+  // unparsable URL yields false. Needed because OpenRouter does NOT accept
+  // OpenAI's multipart /audio/transcriptions; it wants JSON
+  // { model, input_audio: { data: <base64>, format } }. Detecting it by host leaves
+  // the standard multipart path (OpenAI, speaches, faster-whisper-server, ...) as is.
+  private static isOpenRouter(baseURL?: string): boolean {
+    if (!baseURL) return false;
+    try {
+      const host = new URL(baseURL).hostname.toLowerCase();
+      // Exact host or a real subdomain — avoid matching e.g. "evil-openrouter.ai".
+      return host === 'openrouter.ai' || host.endsWith('.openrouter.ai');
+    } catch {
+      return false;
+    }
+  }
+
   /**
-   * Resolve the workspace config and build the transcription (STT) model.
-   * STT always speaks the OpenAI-compatible /v1/audio/transcriptions API
-   * (only @ai-sdk/openai exposes .transcription()), regardless of the chat
-   * driver. sttBaseUrl falls back to the chat baseUrl; the API key falls back
-   * to the chat key (resolved by AiSettingsService.resolve). Built PER WORKSPACE
-   * on demand; the decrypted key is never logged.
-   *
-   * Throws AiSttNotConfiguredException (-> 503) when no STT model is set.
+   * Transcribe audio with the workspace STT model. Standard OpenAI-compatible
+   * endpoints use the AI SDK multipart path; OpenRouter uses its JSON+base64
+   * audio/transcriptions API. `format` is the audio container hint (webm / mp4 /
+   * wav / mp3 / ogg / m4a). Built PER WORKSPACE; the key is never logged. Throws
+   * AiSttNotConfiguredException (-> 503) when no STT model is configured.
    */
-  async getTranscriptionModel(workspaceId: string): Promise<TranscriptionModel> {
+  async transcribe(
+    workspaceId: string,
+    audio: Uint8Array,
+    format: string,
+  ): Promise<string> {
     const cfg = await this.aiSettings.resolve(workspaceId);
     if (!cfg?.sttModel) throw new AiSttNotConfiguredException();
-    const baseURL = cfg.sttBaseUrl || cfg.baseUrl; // stt-specific, else chat
-    // apiKey may be unused for keyless self-hosted whisper; pass a placeholder.
-    return createOpenAI({ apiKey: cfg.sttApiKey ?? 'unused', baseURL }).transcription(
-      cfg.sttModel,
-    );
+    const baseURL = cfg.sttBaseUrl || cfg.baseUrl;
+
+    if (AiService.isOpenRouter(baseURL)) {
+      return this.transcribeViaOpenRouter(
+        baseURL as string,
+        cfg.sttApiKey,
+        cfg.sttModel,
+        audio,
+        format,
+      );
+    }
+
+    // Standard OpenAI-compatible multipart path (AI SDK). apiKey may be unused for
+    // keyless self-hosted whisper; pass a placeholder.
+    const model = createOpenAI({
+      apiKey: cfg.sttApiKey ?? 'unused',
+      baseURL,
+    }).transcription(cfg.sttModel);
+    const { text } = await transcribe({ model, audio });
+    return text.trim();
+  }
+
+  // OpenRouter STT: JSON body with base64 audio; returns trimmed `text` or ''.
+  private async transcribeViaOpenRouter(
+    baseURL: string,
+    apiKey: string | undefined,
+    model: string,
+    audio: Uint8Array,
+    format: string,
+  ): Promise<string> {
+    const url = `${baseURL.replace(/\/+$/, '')}/audio/transcriptions`;
+    const res = await fetch(url, {
+      method: 'POST',
+      headers: {
+        'Content-Type': 'application/json',
+        ...(apiKey ? { Authorization: `Bearer ${apiKey}` } : {}),
+      },
+      body: JSON.stringify({
+        model,
+        input_audio: {
+          data: Buffer.from(audio).toString('base64'),
+          format,
+        },
+      }),
+    });
+    if (!res.ok) {
+      // Surface status + body so the real reason reaches the user; never log the key.
+      const body = await res.text().catch(() => '');
+      throw new Error(
+        `OpenRouter transcription failed (${res.status}): ${body.slice(0, 500)}`,
+      );
+    }
+    const json = (await res.json()) as { text?: unknown } | null;
+    return typeof json?.text === 'string' ? json.text.trim() : '';
   }
 
   /**
@@ -182,11 +248,36 @@ export class AiService {
     return Number.isFinite(raw) && raw > 0 ? raw : 120_000;
   }
 
+  // Build a valid WAV (44-byte header + 1 s of mono 16-bit 16 kHz PCM silence),
+  // used only as a connectivity probe for the STT endpoint in testConnection.
+  private static silentWavProbe(): Uint8Array {
+    const sampleRate = 16000;
+    const numSamples = sampleRate; // one second's worth of samples
+    const dataSize = numSamples * 2; // 2 bytes per sample (16-bit mono)
+    const buf = Buffer.alloc(44 + dataSize);
+    buf.write('RIFF', 0);
+    buf.writeUInt32LE(36 + dataSize, 4); // RIFF chunk size = total size - 8
+    buf.write('WAVE', 8);
+    buf.write('fmt ', 12);
+    buf.writeUInt32LE(16, 16); // PCM fmt chunk size
+    buf.writeUInt16LE(1, 20); // audio format = PCM
+    buf.writeUInt16LE(1, 22); // channels = 1
+    buf.writeUInt32LE(sampleRate, 24);
+    buf.writeUInt32LE(sampleRate * 2, 28); // byte rate
+    buf.writeUInt16LE(2, 32); // block align
+    buf.writeUInt16LE(16, 34); // bits per sample
+    buf.write('data', 36);
+    buf.writeUInt32LE(dataSize, 40);
+    // Buffer.alloc zero-fills, so the PCM samples stay zero (silence).
+    return buf;
+  }
+
   /**
    * Cheap connectivity check for a single "Test endpoint" button. Probes ONLY
    * the requested capability so each card in the UI surfaces its own result:
    *  - `chat`: a one-word generation against the configured chat model;
-   *  - `embeddings`: embedding a tiny string against the embedding model.
+   *  - `embeddings`: embedding a tiny string against the embedding model;
+   *  - `stt`: transcribing a tiny silent WAV against the transcription model.
    *
    * A capability that is not configured returns a plain "… is not configured"
    * message; any real failure returns ok:false with the provider's own cause
@@ -201,7 +292,7 @@ export class AiService {
    */
   async testConnection(
     workspaceId: string,
-    capability: 'chat' | 'embeddings' = 'chat',
+    capability: 'chat' | 'embeddings' | 'stt' = 'chat',
   ): Promise<{ ok: true } | { ok: false; error: string }> {
     if (capability === 'embeddings') {
       try {
@@ -216,6 +307,21 @@ export class AiService {
       }
     }
 
+    if (capability === 'stt') {
+      try {
+        // Probe with a tiny silent WAV; a reachable, authorized endpoint returns
+        // (usually empty) text, any failure surfaces via describeProviderError.
+        await this.transcribe(workspaceId, AiService.silentWavProbe(), 'wav');
+        return { ok: true };
+      } catch (err) {
+        if (err instanceof AiSttNotConfiguredException) {
+          return { ok: false, error: 'STT is not configured' };
+        }
+        this.logger.error('AI STT test connection failed', err as Error);
+        return { ok: false, error: describeProviderError(err) };
+      }
+    }
+
     // Default: chat probe.
     try {
       const model = await this.getChatModel(workspaceId);
diff --git a/apps/server/src/integrations/ai/dto/test-ai-connection.dto.ts b/apps/server/src/integrations/ai/dto/test-ai-connection.dto.ts
index 9fab83a0..f383f0f3 100644
--- a/apps/server/src/integrations/ai/dto/test-ai-connection.dto.ts
+++ b/apps/server/src/integrations/ai/dto/test-ai-connection.dto.ts
@@ -4,6 +4,6 @@ import { IsIn, IsOptional } from 'class-validator';
 // defaults to the chat endpoint server-side when omitted.
 export class TestAiConnectionDto {
   @IsOptional()
-  @IsIn(['chat', 'embeddings'])
-  capability?: 'chat' | 'embeddings';
+  @IsIn(['chat', 'embeddings', 'stt'])
+  capability?: 'chat' | 'embeddings' | 'stt';
 }