refactor(ai): explicit STT request format instead of OpenRouter host-sniffing

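Replace the implicit `hostname endsWith openrouter.ai` detection with an
explicit, admin-chosen provider field `sttApiStyle` ('multipart' = OpenAI-
compatible multipart /audio/transcriptions; 'json' = OpenRouter-style JSON +
base64 input_audio). The transcription path now branches on the stored field,
not on the URL — nothing is hidden from the admin.

Behavior change: an unset `sttApiStyle` is treated as 'multipart', so a
workspace that relied on the old OpenRouter host detection must select the
'json' request format explicitly.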

- ai.types: add SttApiStyle + STT_API_STYLES; field on AiProviderSettings and
  MaskedAiSettings (resolved via ResolvedAiConfig).
- update-ai-settings.dto: validate sttApiStyle with @IsIn(STT_API_STYLES).
- ai-settings.service: plumb sttApiStyle through resolve()/getMasked() and the
  non-secret update whitelist; workspace.repo: add it to the ALLOWED array so it
  persists.
- ai.service: drop isOpenRouter(); transcribe() branches on cfg.sttApiStyle;
  rename helper to transcribeJsonBase64 with provider-neutral error text and a
  BadRequestException (400) when the base URL is missing for the JSON style.
- client: SttApiStyle type on IAiSettings/IAiSettingsUpdate; "Request format"
  Select on the Voice/STT settings card; i18n.
This commit is contained in:
vvzvlad
2026-06-18 19:40:05 +03:00
parent 77249d59c6
commit 01a5a4b5d2
8 changed files with 85 additions and 38 deletions

View File

@@ -1191,5 +1191,9 @@
"Transcription failed": "Transcription failed",
"Voice dictation is not configured": "Voice dictation is not configured",
"Microphone is unavailable or already in use": "Microphone is unavailable or already in use",
"Audio recording is not available in this browser/context": "Audio recording is not available in this browser/context"
"Audio recording is not available in this browser/context": "Audio recording is not available in this browser/context",
"Request format": "Request format",
"How transcription requests are sent to the endpoint": "How transcription requests are sent to the endpoint",
"OpenAI-compatible (multipart/form-data)": "OpenAI-compatible (multipart/form-data)",
"OpenRouter (JSON, base64 audio)": "OpenRouter (JSON, base64 audio)"
}

View File

@@ -9,6 +9,7 @@ import {
Modal,
Paper,
PasswordInput,
Select,
Stack,
Switch,
Text,
@@ -32,7 +33,10 @@ import {
useTestAiConnectionMutation,
useUpdateAiSettingsMutation,
} from "@/features/workspace/queries/ai-settings-query.ts";
import { IAiSettingsUpdate } from "@/features/workspace/services/ai-settings-service.ts";
import {
IAiSettingsUpdate,
SttApiStyle,
} from "@/features/workspace/services/ai-settings-service.ts";
import AiMcpServers from "./ai-mcp-servers.tsx";
// No driver field: every endpoint is OpenAI-compatible, so the form carries only
@@ -50,6 +54,7 @@ const formSchema = z.object({
// STT-specific fields. Empty base URL / key fall back to the chat ones.
sttModel: z.string(),
sttBaseUrl: z.string(),
sttApiStyle: z.enum(["multipart", "json"]),
sttApiKey: z.string(),
});
@@ -139,6 +144,7 @@ export default function AiProviderSettings() {
embeddingApiKey: "",
sttModel: "",
sttBaseUrl: "",
sttApiStyle: "multipart" as SttApiStyle,
sttApiKey: "",
},
});
@@ -157,6 +163,7 @@ export default function AiProviderSettings() {
embeddingApiKey: "",
sttModel: settings.sttModel ?? "",
sttBaseUrl: settings.sttBaseUrl ?? "",
sttApiStyle: settings.sttApiStyle ?? "multipart",
sttApiKey: "",
});
form.resetDirty();
@@ -184,6 +191,7 @@ export default function AiProviderSettings() {
// server-side.
sttModel: values.sttModel,
sttBaseUrl: values.sttBaseUrl,
sttApiStyle: values.sttApiStyle,
};
// Key semantics (never send the stored key back):
@@ -671,6 +679,22 @@ export default function AiProviderSettings() {
</Stack>
</Group>
<Select
mt="sm"
label={t("Request format")}
description={t("How transcription requests are sent to the endpoint")}
data={[
{
value: "multipart",
label: t("OpenAI-compatible (multipart/form-data)"),
},
{ value: "json", label: t("OpenRouter (JSON, base64 audio)") },
]}
allowDeselect={false}
disabled={isLoading}
{...form.getInputProps("sttApiStyle")}
/>
<TextInput
mt="sm"
label={t("Base URL")}

View File

@@ -3,6 +3,12 @@ import api from "@/lib/api-client";
// Supported LLM providers/drivers.
export type AiDriver = "openai" | "gemini" | "ollama";
/**
 * Wire encoding used for STT (speech-to-text) requests sent to the
 * transcription endpoint.
 * - "multipart": OpenAI-compatible multipart/form-data (OpenAI, speaches,
 *   faster-whisper-server).
 * - "json": JSON body with base64-encoded audio (OpenRouter).
 */
export type SttApiStyle =
  | "multipart"
  | "json";
// Masked AI provider settings returned by the server.
// No API key is ever returned; only `hasApiKey` / `hasEmbeddingApiKey` indicate
// whether one is stored. `embeddingBaseUrl` is the RAW stored value (empty means
@@ -21,6 +27,7 @@ export interface IAiSettings {
// key is stored (empty means "uses the chat API key").
sttModel?: string;
sttBaseUrl?: string;
sttApiStyle?: SttApiStyle;
hasSttApiKey: boolean;
// RAG indexing coverage (pages indexed for semantic search).
indexedPages: number;
@@ -43,6 +50,7 @@ export interface IAiSettingsUpdate {
embeddingApiKey?: string;
sttModel?: string;
sttBaseUrl?: string;
sttApiStyle?: SttApiStyle;
// Write-only STT key (same semantics as `apiKey` / `embeddingApiKey`).
sttApiKey?: string;
}

View File

@@ -239,7 +239,7 @@ export class WorkspaceRepo {
// is a real jsonb object, never a double-encoded string. The CASE self-heals
// workspaces whose settings.ai.provider was previously corrupted into an
// array/string.
const ALLOWED = ['driver', 'chatModel', 'embeddingModel', 'baseUrl', 'embeddingBaseUrl', 'sttModel', 'sttBaseUrl', 'systemPrompt'];
const ALLOWED = ['driver', 'chatModel', 'embeddingModel', 'baseUrl', 'embeddingBaseUrl', 'sttModel', 'sttBaseUrl', 'sttApiStyle', 'systemPrompt'];
const entries = Object.entries(provider).filter(
([k, v]) => v !== undefined && ALLOWED.includes(k),
);

View File

@@ -12,6 +12,7 @@ import {
AiProviderSettings,
MaskedAiSettings,
ResolvedAiConfig,
SttApiStyle,
} from './ai.types';
/**
@@ -30,6 +31,7 @@ export interface UpdateAiSettingsInput {
embeddingApiKey?: string;
sttModel?: string;
sttBaseUrl?: string;
sttApiStyle?: SttApiStyle;
sttApiKey?: string;
}
@@ -117,6 +119,9 @@ export class AiSettingsService {
chatModel: provider.chatModel,
embeddingModel: provider.embeddingModel,
sttModel: provider.sttModel,
// Plain passthrough, no fallback here; the transcribe path treats an unset
// value as 'multipart'. Hosts that used to be auto-detected as OpenRouter
// therefore need an explicit 'json' value.
sttApiStyle: provider.sttApiStyle,
baseUrl: provider.baseUrl,
systemPrompt: provider.systemPrompt,
};
@@ -190,6 +195,7 @@ export class AiSettingsService {
embeddingBaseUrl: provider.embeddingBaseUrl,
sttModel: provider.sttModel,
sttBaseUrl: provider.sttBaseUrl,
sttApiStyle: provider.sttApiStyle,
systemPrompt: provider.systemPrompt,
hasApiKey,
hasEmbeddingApiKey,
@@ -226,6 +232,7 @@ export class AiSettingsService {
'embeddingBaseUrl',
'sttModel',
'sttBaseUrl',
'sttApiStyle',
'systemPrompt',
] as const) {
if (nonSecret[key] !== undefined) {

View File

@@ -1,4 +1,4 @@
import { Injectable, Logger } from '@nestjs/common';
import { BadRequestException, Injectable, Logger } from '@nestjs/common';
import {
embedMany,
experimental_transcribe as transcribe,
@@ -108,28 +108,14 @@ export class AiService {
}
}
// Some OpenAI-compatible gateways diverge on the transcription API. OpenRouter
// does NOT accept OpenAI's multipart /audio/transcriptions; it wants JSON
// { model, input_audio: { data: <base64>, format } }. Detect it by host so the
// standard multipart path (OpenAI, speaches, faster-whisper-server, ...) is
// unaffected.
/**
 * Reports whether `baseURL` points at OpenRouter, judged by hostname only.
 * A missing, empty, or unparseable URL yields `false`.
 */
private static isOpenRouter(baseURL?: string): boolean {
  if (baseURL) {
    try {
      const { hostname } = new URL(baseURL);
      const normalized = hostname.toLowerCase();
      // Accept the apex host or a dot-separated subdomain of it, so that a
      // look-alike such as "evil-openrouter.ai" is rejected.
      if (normalized === 'openrouter.ai' || normalized.endsWith('.openrouter.ai')) {
        return true;
      }
    } catch {
      // Not a valid absolute URL: fall through to the negative result.
    }
  }
  return false;
}
/**
* Transcribe audio with the workspace STT model. Standard OpenAI-compatible
* endpoints use the AI SDK multipart path; OpenRouter uses its JSON+base64
* audio/transcriptions API. `format` is the audio container hint (webm / mp4 /
* wav / mp3 / ogg / m4a). Built PER WORKSPACE; the key is never logged. Throws
* AiSttNotConfiguredException (-> 503) when no STT model is configured.
* Transcribe audio with the workspace STT model. The request encoding is the
* admin-chosen `sttApiStyle`: 'json' uses the JSON+base64 audio/transcriptions
* API (OpenRouter); anything else (default 'multipart') uses the AI SDK
* multipart path (OpenAI, speaches, faster-whisper-server, ...). `format` is
* the audio container hint (webm / mp4 / wav / mp3 / ogg / m4a). Built PER
* WORKSPACE; the key is never logged. Throws AiSttNotConfiguredException
* (-> 503) when no STT model is configured.
*/
async transcribe(
workspaceId: string,
@@ -140,14 +126,11 @@ export class AiService {
if (!cfg?.sttModel) throw new AiSttNotConfiguredException();
const baseURL = cfg.sttBaseUrl || cfg.baseUrl;
if (AiService.isOpenRouter(baseURL)) {
return this.transcribeViaOpenRouter(
baseURL as string,
cfg.sttApiKey,
cfg.sttModel,
audio,
format,
);
// Explicit, admin-chosen request encoding (no URL guessing). 'json' is the
// OpenRouter style (JSON + base64 input_audio); everything else uses the
// OpenAI-compatible multipart path via the AI SDK.
if (cfg.sttApiStyle === 'json') {
return this.transcribeJsonBase64(baseURL, cfg.sttApiKey, cfg.sttModel, audio, format);
}
// Standard OpenAI-compatible multipart path (AI SDK). apiKey may be unused for
@@ -160,14 +143,23 @@ export class AiService {
return text.trim();
}
// OpenRouter transcription: JSON body with base64 audio; returns { text }.
private async transcribeViaOpenRouter(
baseURL: string,
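/**
* JSON + base64 transcription request (OpenRouter-style). POSTs
* { model, input_audio: { data, format } } to {baseURL}/audio/transcriptions
* and reads the transcript from the `{ text }` JSON response. Throws
* BadRequestException when `baseURL` is not set; a failed request raises an
* Error carrying the HTTP status and the first 500 characters of the body.
*/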
private async transcribeJsonBase64(
baseURL: string | undefined,
apiKey: string | undefined,
model: string,
audio: Uint8Array,
format: string,
): Promise<string> {
if (!baseURL) {
throw new BadRequestException(
'STT base URL is not set (required for the JSON request format)',
);
}
const url = `${baseURL.replace(/\/$/, '')}/audio/transcriptions`;
const res = await fetch(url, {
method: 'POST',
@@ -187,7 +179,7 @@ export class AiService {
// Surface status + body so the real reason reaches the user; never log the key.
const body = await res.text().catch(() => '');
throw new Error(
`OpenRouter transcription failed (${res.status}): ${body.slice(0, 500)}`,
`JSON transcription request failed (${res.status}): ${body.slice(0, 500)}`,
);
}
const json = (await res.json()) as { text?: string };

View File

@@ -10,6 +10,12 @@ export type AiDriver = 'openai' | 'gemini' | 'ollama';
export const AI_DRIVERS: AiDriver[] = ['openai', 'gemini', 'ollama'];
/**
 * STT request encoding, chosen explicitly by the admin.
 * - 'multipart': OpenAI-compatible /audio/transcriptions form-data (OpenAI,
 *   speaches, faster-whisper-server).
 * - 'json': JSON body with base64 input_audio (OpenRouter).
 */
export type SttApiStyle =
  | 'multipart'
  | 'json';

/** Every accepted `SttApiStyle` value, in declaration order. */
export const STT_API_STYLES: Array<SttApiStyle> = [
  'multipart',
  'json',
];
/**
* Non-secret provider settings persisted under `settings.ai.provider`.
* The API key is intentionally absent here.
@@ -24,6 +30,7 @@ export interface AiProviderSettings {
sttModel?: string;
// STT-specific base URL. Falls back to baseUrl when empty/unset.
sttBaseUrl?: string;
sttApiStyle?: SttApiStyle;
systemPrompt?: string;
}
@@ -58,6 +65,7 @@ export interface MaskedAiSettings {
embeddingBaseUrl?: string;
sttModel?: string;
sttBaseUrl?: string;
sttApiStyle?: SttApiStyle;
systemPrompt?: string;
hasApiKey: boolean;
hasEmbeddingApiKey: boolean;

View File

@@ -1,5 +1,5 @@
import { IsIn, IsOptional, IsString } from 'class-validator';
import { AI_DRIVERS, AiDriver } from '../ai.types';
import { AI_DRIVERS, AiDriver, STT_API_STYLES, SttApiStyle } from '../ai.types';
/**
* Admin update payload for the workspace AI provider settings.
@@ -50,6 +50,10 @@ export class UpdateAiSettingsDto {
@IsString()
sttBaseUrl?: string;
@IsOptional()
@IsIn(STT_API_STYLES)
sttApiStyle?: SttApiStyle;
@IsOptional()
@IsString()
sttApiKey?: string;