refactor(ai): explicit STT request format instead of OpenRouter host-sniffing

Replace the implicit `hostname endsWith openrouter.ai` detection with an explicit, admin-chosen provider field `sttApiStyle` ('multipart' = OpenAI-compatible multipart /audio/transcriptions; 'json' = OpenRouter-style JSON + base64 input_audio). The transcription path now branches on the stored field, not on the URL — nothing is hidden from the admin.

- ai.types: add SttApiStyle + STT_API_STYLES; add the field to AiProviderSettings and MaskedAiSettings (resolved via ResolvedAiConfig).
- update-ai-settings.dto: validate sttApiStyle with @IsIn(STT_API_STYLES).
- ai-settings.service: plumb sttApiStyle through resolve()/getMasked() and the non-secret update whitelist.
- workspace.repo: add sttApiStyle to the ALLOWED array so it persists.
- ai.service: drop isOpenRouter(); transcribe() branches on cfg.sttApiStyle; rename the helper to transcribeJsonBase64 with provider-neutral error text, and throw a BadRequestException (400) when the base URL is missing for the JSON style.
- client: SttApiStyle type on IAiSettings/IAiSettingsUpdate; "Request format" Select on the Voice/STT settings card; i18n.
2026-06-18 19:40:05 +03:00
parent 77249d59c6
commit 01a5a4b5d2
8 changed files with 85 additions and 38 deletions
--- a/apps/client/public/locales/en-US/translation.json
+++ b/apps/client/public/locales/en-US/translation.json
@@ -1191,5 +1191,9 @@
  "Transcription failed": "Transcription failed",
  "Voice dictation is not configured": "Voice dictation is not configured",
  "Microphone is unavailable or already in use": "Microphone is unavailable or already in use",
-  "Audio recording is not available in this browser/context": "Audio recording is not available in this browser/context"
+  "Audio recording is not available in this browser/context": "Audio recording is not available in this browser/context",
+  "Request format": "Request format",
+  "How transcription requests are sent to the endpoint": "How transcription requests are sent to the endpoint",
+  "OpenAI-compatible (multipart/form-data)": "OpenAI-compatible (multipart/form-data)",
+  "OpenRouter (JSON, base64 audio)": "OpenRouter (JSON, base64 audio)"
 }
--- a/apps/client/src/features/workspace/components/settings/components/ai-provider-settings.tsx
+++ b/apps/client/src/features/workspace/components/settings/components/ai-provider-settings.tsx
@@ -9,6 +9,7 @@ import {
  Modal,
  Paper,
  PasswordInput,
+  Select,
  Stack,
  Switch,
  Text,
@@ -32,7 +33,10 @@ import {
  useTestAiConnectionMutation,
  useUpdateAiSettingsMutation,
 } from "@/features/workspace/queries/ai-settings-query.ts";
-import { IAiSettingsUpdate } from "@/features/workspace/services/ai-settings-service.ts";
+import {
+  IAiSettingsUpdate,
+  SttApiStyle,
+} from "@/features/workspace/services/ai-settings-service.ts";
 import AiMcpServers from "./ai-mcp-servers.tsx";

 // No driver field: every endpoint is OpenAI-compatible, so the form carries only
@@ -50,6 +54,7 @@ const formSchema = z.object({
  // STT-specific fields. Empty base URL / key fall back to the chat ones.
  sttModel: z.string(),
  sttBaseUrl: z.string(),
+  sttApiStyle: z.enum(["multipart", "json"]),
  sttApiKey: z.string(),
 });

@@ -139,6 +144,7 @@ export default function AiProviderSettings() {
      embeddingApiKey: "",
      sttModel: "",
      sttBaseUrl: "",
+      sttApiStyle: "multipart" as SttApiStyle,
      sttApiKey: "",
    },
  });
@@ -157,6 +163,7 @@ export default function AiProviderSettings() {
      embeddingApiKey: "",
      sttModel: settings.sttModel ?? "",
      sttBaseUrl: settings.sttBaseUrl ?? "",
+      sttApiStyle: settings.sttApiStyle ?? "multipart",
      sttApiKey: "",
    });
    form.resetDirty();
@@ -184,6 +191,7 @@ export default function AiProviderSettings() {
      // server-side.
      sttModel: values.sttModel,
      sttBaseUrl: values.sttBaseUrl,
+      sttApiStyle: values.sttApiStyle,
    };

    // Key semantics (never send the stored key back):
@@ -671,6 +679,22 @@ export default function AiProviderSettings() {
          </Stack>
        </Group>

+        <Select
+          mt="sm"
+          label={t("Request format")}
+          description={t("How transcription requests are sent to the endpoint")}
+          data={[
+            {
+              value: "multipart",
+              label: t("OpenAI-compatible (multipart/form-data)"),
+            },
+            { value: "json", label: t("OpenRouter (JSON, base64 audio)") },
+          ]}
+          allowDeselect={false}
+          disabled={isLoading}
+          {...form.getInputProps("sttApiStyle")}
+        />
+
        <TextInput
          mt="sm"
          label={t("Base URL")}
--- a/apps/client/src/features/workspace/services/ai-settings-service.ts
+++ b/apps/client/src/features/workspace/services/ai-settings-service.ts
@@ -3,6 +3,12 @@ import api from "@/lib/api-client";
 // Supported LLM providers/drivers.
 export type AiDriver = "openai" | "gemini" | "ollama";

+// How STT (speech-to-text) requests are encoded for the transcription endpoint.
+//   - 'multipart' -> OpenAI-compatible multipart/form-data (OpenAI, speaches,
+//     faster-whisper-server)
+//   - 'json'      -> JSON body with base64-encoded audio (OpenRouter)
+export type SttApiStyle = "multipart" | "json";
+
 // Masked AI provider settings returned by the server.
 // No API key is ever returned; only `hasApiKey` / `hasEmbeddingApiKey` indicate
 // whether one is stored. `embeddingBaseUrl` is the RAW stored value (empty means
@@ -21,6 +27,7 @@ export interface IAiSettings {
  // key is stored (empty means "uses the chat API key").
  sttModel?: string;
  sttBaseUrl?: string;
+  sttApiStyle?: SttApiStyle;
  hasSttApiKey: boolean;
  // RAG indexing coverage (pages indexed for semantic search).
  indexedPages: number;
@@ -43,6 +50,7 @@ export interface IAiSettingsUpdate {
  embeddingApiKey?: string;
  sttModel?: string;
  sttBaseUrl?: string;
+  sttApiStyle?: SttApiStyle;
  // Write-only STT key (same semantics as `apiKey` / `embeddingApiKey`).
  sttApiKey?: string;
 }
--- a/apps/server/src/database/repos/workspace/workspace.repo.ts
+++ b/apps/server/src/database/repos/workspace/workspace.repo.ts
@@ -239,7 +239,7 @@ export class WorkspaceRepo {
    // is a real jsonb object, never a double-encoded string. The CASE self-heals
    // workspaces whose settings.ai.provider was previously corrupted into an
    // array/string.
-    const ALLOWED = ['driver', 'chatModel', 'embeddingModel', 'baseUrl', 'embeddingBaseUrl', 'sttModel', 'sttBaseUrl', 'systemPrompt'];
+    const ALLOWED = ['driver', 'chatModel', 'embeddingModel', 'baseUrl', 'embeddingBaseUrl', 'sttModel', 'sttBaseUrl', 'sttApiStyle', 'systemPrompt'];
    const entries = Object.entries(provider).filter(
      ([k, v]) => v !== undefined && ALLOWED.includes(k),
    );
--- a/apps/server/src/integrations/ai/ai-settings.service.ts
+++ b/apps/server/src/integrations/ai/ai-settings.service.ts
@@ -12,6 +12,7 @@ import {
  AiProviderSettings,
  MaskedAiSettings,
  ResolvedAiConfig,
+  SttApiStyle,
 } from './ai.types';

 /**
@@ -30,6 +31,7 @@ export interface UpdateAiSettingsInput {
  embeddingApiKey?: string;
  sttModel?: string;
  sttBaseUrl?: string;
+  sttApiStyle?: SttApiStyle;
  sttApiKey?: string;
 }

@@ -117,6 +119,9 @@ export class AiSettingsService {
      chatModel: provider.chatModel,
      embeddingModel: provider.embeddingModel,
      sttModel: provider.sttModel,
+      // Plain passthrough, no fallback; the transcribe path defaults unset to
+      // 'multipart' (current behavior).
+      sttApiStyle: provider.sttApiStyle,
      baseUrl: provider.baseUrl,
      systemPrompt: provider.systemPrompt,
    };
@@ -190,6 +195,7 @@ export class AiSettingsService {
      embeddingBaseUrl: provider.embeddingBaseUrl,
      sttModel: provider.sttModel,
      sttBaseUrl: provider.sttBaseUrl,
+      sttApiStyle: provider.sttApiStyle,
      systemPrompt: provider.systemPrompt,
      hasApiKey,
      hasEmbeddingApiKey,
@@ -226,6 +232,7 @@ export class AiSettingsService {
      'embeddingBaseUrl',
      'sttModel',
      'sttBaseUrl',
+      'sttApiStyle',
      'systemPrompt',
    ] as const) {
      if (nonSecret[key] !== undefined) {
--- a/apps/server/src/integrations/ai/ai.service.ts
+++ b/apps/server/src/integrations/ai/ai.service.ts
@@ -1,4 +1,4 @@
-import { Injectable, Logger } from '@nestjs/common';
+import { BadRequestException, Injectable, Logger } from '@nestjs/common';
 import {
  embedMany,
  experimental_transcribe as transcribe,
@@ -108,28 +108,14 @@ export class AiService {
    }
  }

-  // Some OpenAI-compatible gateways diverge on the transcription API. OpenRouter
-  // does NOT accept OpenAI's multipart /audio/transcriptions; it wants JSON
-  // { model, input_audio: { data: <base64>, format } }. Detect it by host so the
-  // standard multipart path (OpenAI, speaches, faster-whisper-server, ...) is
-  // unaffected.
-  private static isOpenRouter(baseURL?: string): boolean {
-    if (!baseURL) return false;
-    try {
-      const host = new URL(baseURL).hostname.toLowerCase();
-      // Exact host or a real subdomain — avoid matching e.g. "evil-openrouter.ai".
-      return host === 'openrouter.ai' || host.endsWith('.openrouter.ai');
-    } catch {
-      return false;
-    }
-  }
-
  /**
-   * Transcribe audio with the workspace STT model. Standard OpenAI-compatible
-   * endpoints use the AI SDK multipart path; OpenRouter uses its JSON+base64
-   * audio/transcriptions API. `format` is the audio container hint (webm / mp4 /
-   * wav / mp3 / ogg / m4a). Built PER WORKSPACE; the key is never logged. Throws
-   * AiSttNotConfiguredException (-> 503) when no STT model is configured.
+   * Transcribe audio with the workspace STT model. The request encoding is the
+   * admin-chosen `sttApiStyle`: 'json' uses the JSON+base64 audio/transcriptions
+   * API (OpenRouter); anything else (default 'multipart') uses the AI SDK
+   * multipart path (OpenAI, speaches, faster-whisper-server, ...). `format` is
+   * the audio container hint (webm / mp4 / wav / mp3 / ogg / m4a). Built PER
+   * WORKSPACE; the key is never logged. Throws AiSttNotConfiguredException
+   * (-> 503) when no STT model is configured.
   */
  async transcribe(
    workspaceId: string,
@@ -140,14 +126,11 @@ export class AiService {
    if (!cfg?.sttModel) throw new AiSttNotConfiguredException();
    const baseURL = cfg.sttBaseUrl || cfg.baseUrl;

-    if (AiService.isOpenRouter(baseURL)) {
-      return this.transcribeViaOpenRouter(
-        baseURL as string,
-        cfg.sttApiKey,
-        cfg.sttModel,
-        audio,
-        format,
-      );
+    // Explicit, admin-chosen request encoding (no URL guessing). 'json' is the
+    // OpenRouter style (JSON + base64 input_audio); everything else uses the
+    // OpenAI-compatible multipart path via the AI SDK.
+    if (cfg.sttApiStyle === 'json') {
+      return this.transcribeJsonBase64(baseURL, cfg.sttApiKey, cfg.sttModel, audio, format);
    }

    // Standard OpenAI-compatible multipart path (AI SDK). apiKey may be unused for
@@ -160,14 +143,23 @@ export class AiService {
    return text.trim();
  }

-  // OpenRouter transcription: JSON body with base64 audio; returns { text }.
-  private async transcribeViaOpenRouter(
-    baseURL: string,
+  /**
+   * JSON + base64 transcription body (OpenRouter-style). POSTs
+   * { model, input_audio: { data, format } } to {baseURL}/audio/transcriptions
+   * and returns { text }.
+   */
+  private async transcribeJsonBase64(
+    baseURL: string | undefined,
    apiKey: string | undefined,
    model: string,
    audio: Uint8Array,
    format: string,
  ): Promise<string> {
+    if (!baseURL) {
+      throw new BadRequestException(
+        'STT base URL is not set (required for the JSON request format)',
+      );
+    }
    const url = `${baseURL.replace(/\/$/, '')}/audio/transcriptions`;
    const res = await fetch(url, {
      method: 'POST',
@@ -187,7 +179,7 @@ export class AiService {
      // Surface status + body so the real reason reaches the user; never log the key.
      const body = await res.text().catch(() => '');
      throw new Error(
-        `OpenRouter transcription failed (${res.status}): ${body.slice(0, 500)}`,
+        `JSON transcription request failed (${res.status}): ${body.slice(0, 500)}`,
      );
    }
    const json = (await res.json()) as { text?: string };
--- a/apps/server/src/integrations/ai/ai.types.ts
+++ b/apps/server/src/integrations/ai/ai.types.ts
@@ -10,6 +10,12 @@ export type AiDriver = 'openai' | 'gemini' | 'ollama';

 export const AI_DRIVERS: AiDriver[] = ['openai', 'gemini', 'ollama'];

+// STT request encoding. 'multipart' = OpenAI-compatible /audio/transcriptions
+// form-data (OpenAI, speaches, faster-whisper-server). 'json' = JSON body with
+// base64 input_audio (OpenRouter). Chosen explicitly by the admin.
+export type SttApiStyle = 'multipart' | 'json';
+export const STT_API_STYLES: SttApiStyle[] = ['multipart', 'json'];
+
 /**
 * Non-secret provider settings persisted under `settings.ai.provider`.
 * The API key is intentionally absent here.
@@ -24,6 +30,7 @@ export interface AiProviderSettings {
  sttModel?: string;
  // STT-specific base URL. Falls back to baseUrl when empty/unset.
  sttBaseUrl?: string;
+  sttApiStyle?: SttApiStyle;
  systemPrompt?: string;
 }

@@ -58,6 +65,7 @@ export interface MaskedAiSettings {
  embeddingBaseUrl?: string;
  sttModel?: string;
  sttBaseUrl?: string;
+  sttApiStyle?: SttApiStyle;
  systemPrompt?: string;
  hasApiKey: boolean;
  hasEmbeddingApiKey: boolean;
--- a/apps/server/src/integrations/ai/dto/update-ai-settings.dto.ts
+++ b/apps/server/src/integrations/ai/dto/update-ai-settings.dto.ts
@@ -1,5 +1,5 @@
 import { IsIn, IsOptional, IsString } from 'class-validator';
-import { AI_DRIVERS, AiDriver } from '../ai.types';
+import { AI_DRIVERS, AiDriver, STT_API_STYLES, SttApiStyle } from '../ai.types';

 /**
 * Admin update payload for the workspace AI provider settings.
@@ -50,6 +50,10 @@ export class UpdateAiSettingsDto {
  @IsString()
  sttBaseUrl?: string;

+  @IsOptional()
+  @IsIn(STT_API_STYLES)
+  sttApiStyle?: SttApiStyle;
+
  @IsOptional()
  @IsString()
  sttApiKey?: string;