refactor(ai): explicit STT request format instead of OpenRouter host-sniffing
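Replace the implicit OpenRouter host detection (hostname equal to
`openrouter.ai` or ending in `.openrouter.ai`) with an explicit, admin-chosen
provider field `sttApiStyle` ('multipart' = OpenAI-compatible multipart
/audio/transcriptions; 'json' = OpenRouter-style JSON + base64 input_audio).
The transcription path now branches on the stored field, not on the URL —
nothing hidden from the admin. An unset value is treated as 'multipart', so a
workspace that previously relied on host detection for OpenRouter must select
the 'json' request format explicitly.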
- ai.types: add SttApiStyle + STT_API_STYLES; field on AiProviderSettings and
MaskedAiSettings (resolved via ResolvedAiConfig).
- update-ai-settings.dto: validate sttApiStyle with @IsIn(STT_API_STYLES).
- ai-settings.service: plumb sttApiStyle through resolve()/getMasked() and the
non-secret update whitelist; workspace.repo: add it to the ALLOWED array so it
persists.
- ai.service: drop isOpenRouter(); transcribe() branches on cfg.sttApiStyle;
rename helper to transcribeJsonBase64 with provider-neutral error text and a
BadRequestException (400) when the base URL is missing for the JSON style.
- client: SttApiStyle type on IAiSettings/IAiSettingsUpdate; "Request format"
Select on the Voice/STT settings card (form default 'multipart'); i18n strings
for the new label, description and the two options.
This commit is contained in:
@@ -1191,5 +1191,9 @@
|
||||
"Transcription failed": "Transcription failed",
|
||||
"Voice dictation is not configured": "Voice dictation is not configured",
|
||||
"Microphone is unavailable or already in use": "Microphone is unavailable or already in use",
|
||||
"Audio recording is not available in this browser/context": "Audio recording is not available in this browser/context"
|
||||
"Audio recording is not available in this browser/context": "Audio recording is not available in this browser/context",
|
||||
"Request format": "Request format",
|
||||
"How transcription requests are sent to the endpoint": "How transcription requests are sent to the endpoint",
|
||||
"OpenAI-compatible (multipart/form-data)": "OpenAI-compatible (multipart/form-data)",
|
||||
"OpenRouter (JSON, base64 audio)": "OpenRouter (JSON, base64 audio)"
|
||||
}
|
||||
|
||||
@@ -9,6 +9,7 @@ import {
|
||||
Modal,
|
||||
Paper,
|
||||
PasswordInput,
|
||||
Select,
|
||||
Stack,
|
||||
Switch,
|
||||
Text,
|
||||
@@ -32,7 +33,10 @@ import {
|
||||
useTestAiConnectionMutation,
|
||||
useUpdateAiSettingsMutation,
|
||||
} from "@/features/workspace/queries/ai-settings-query.ts";
|
||||
import { IAiSettingsUpdate } from "@/features/workspace/services/ai-settings-service.ts";
|
||||
import {
|
||||
IAiSettingsUpdate,
|
||||
SttApiStyle,
|
||||
} from "@/features/workspace/services/ai-settings-service.ts";
|
||||
import AiMcpServers from "./ai-mcp-servers.tsx";
|
||||
|
||||
// No driver field: every endpoint is OpenAI-compatible, so the form carries only
|
||||
@@ -50,6 +54,7 @@ const formSchema = z.object({
|
||||
// STT-specific fields. Empty base URL / key fall back to the chat ones.
|
||||
sttModel: z.string(),
|
||||
sttBaseUrl: z.string(),
|
||||
sttApiStyle: z.enum(["multipart", "json"]),
|
||||
sttApiKey: z.string(),
|
||||
});
|
||||
|
||||
@@ -139,6 +144,7 @@ export default function AiProviderSettings() {
|
||||
embeddingApiKey: "",
|
||||
sttModel: "",
|
||||
sttBaseUrl: "",
|
||||
sttApiStyle: "multipart" as SttApiStyle,
|
||||
sttApiKey: "",
|
||||
},
|
||||
});
|
||||
@@ -157,6 +163,7 @@ export default function AiProviderSettings() {
|
||||
embeddingApiKey: "",
|
||||
sttModel: settings.sttModel ?? "",
|
||||
sttBaseUrl: settings.sttBaseUrl ?? "",
|
||||
sttApiStyle: settings.sttApiStyle ?? "multipart",
|
||||
sttApiKey: "",
|
||||
});
|
||||
form.resetDirty();
|
||||
@@ -184,6 +191,7 @@ export default function AiProviderSettings() {
|
||||
// server-side.
|
||||
sttModel: values.sttModel,
|
||||
sttBaseUrl: values.sttBaseUrl,
|
||||
sttApiStyle: values.sttApiStyle,
|
||||
};
|
||||
|
||||
// Key semantics (never send the stored key back):
|
||||
@@ -671,6 +679,22 @@ export default function AiProviderSettings() {
|
||||
</Stack>
|
||||
</Group>
|
||||
|
||||
<Select
|
||||
mt="sm"
|
||||
label={t("Request format")}
|
||||
description={t("How transcription requests are sent to the endpoint")}
|
||||
data={[
|
||||
{
|
||||
value: "multipart",
|
||||
label: t("OpenAI-compatible (multipart/form-data)"),
|
||||
},
|
||||
{ value: "json", label: t("OpenRouter (JSON, base64 audio)") },
|
||||
]}
|
||||
allowDeselect={false}
|
||||
disabled={isLoading}
|
||||
{...form.getInputProps("sttApiStyle")}
|
||||
/>
|
||||
|
||||
<TextInput
|
||||
mt="sm"
|
||||
label={t("Base URL")}
|
||||
|
||||
@@ -3,6 +3,12 @@ import api from "@/lib/api-client";
|
||||
// Supported LLM providers/drivers.
|
||||
export type AiDriver = "openai" | "gemini" | "ollama";
|
||||
|
||||
// How STT (speech-to-text) requests are encoded for the transcription endpoint.
|
||||
// - 'multipart' -> OpenAI-compatible multipart/form-data (OpenAI, speaches,
|
||||
// faster-whisper-server)
|
||||
// - 'json' -> JSON body with base64-encoded audio (OpenRouter)
|
||||
export type SttApiStyle = "multipart" | "json";
|
||||
|
||||
// Masked AI provider settings returned by the server.
|
||||
// No API key is ever returned; only `hasApiKey` / `hasEmbeddingApiKey` indicate
|
||||
// whether one is stored. `embeddingBaseUrl` is the RAW stored value (empty means
|
||||
@@ -21,6 +27,7 @@ export interface IAiSettings {
|
||||
// key is stored (empty means "uses the chat API key").
|
||||
sttModel?: string;
|
||||
sttBaseUrl?: string;
|
||||
sttApiStyle?: SttApiStyle;
|
||||
hasSttApiKey: boolean;
|
||||
// RAG indexing coverage (pages indexed for semantic search).
|
||||
indexedPages: number;
|
||||
@@ -43,6 +50,7 @@ export interface IAiSettingsUpdate {
|
||||
embeddingApiKey?: string;
|
||||
sttModel?: string;
|
||||
sttBaseUrl?: string;
|
||||
sttApiStyle?: SttApiStyle;
|
||||
// Write-only STT key (same semantics as `apiKey` / `embeddingApiKey`).
|
||||
sttApiKey?: string;
|
||||
}
|
||||
|
||||
@@ -239,7 +239,7 @@ export class WorkspaceRepo {
|
||||
// is a real jsonb object, never a double-encoded string. The CASE self-heals
|
||||
// workspaces whose settings.ai.provider was previously corrupted into an
|
||||
// array/string.
|
||||
const ALLOWED = ['driver', 'chatModel', 'embeddingModel', 'baseUrl', 'embeddingBaseUrl', 'sttModel', 'sttBaseUrl', 'systemPrompt'];
|
||||
const ALLOWED = ['driver', 'chatModel', 'embeddingModel', 'baseUrl', 'embeddingBaseUrl', 'sttModel', 'sttBaseUrl', 'sttApiStyle', 'systemPrompt'];
|
||||
const entries = Object.entries(provider).filter(
|
||||
([k, v]) => v !== undefined && ALLOWED.includes(k),
|
||||
);
|
||||
|
||||
@@ -12,6 +12,7 @@ import {
|
||||
AiProviderSettings,
|
||||
MaskedAiSettings,
|
||||
ResolvedAiConfig,
|
||||
SttApiStyle,
|
||||
} from './ai.types';
|
||||
|
||||
/**
|
||||
@@ -30,6 +31,7 @@ export interface UpdateAiSettingsInput {
|
||||
embeddingApiKey?: string;
|
||||
sttModel?: string;
|
||||
sttBaseUrl?: string;
|
||||
sttApiStyle?: SttApiStyle;
|
||||
sttApiKey?: string;
|
||||
}
|
||||
|
||||
@@ -117,6 +119,9 @@ export class AiSettingsService {
|
||||
chatModel: provider.chatModel,
|
||||
embeddingModel: provider.embeddingModel,
|
||||
sttModel: provider.sttModel,
|
||||
// Plain passthrough, no fallback; the transcribe path defaults unset to
|
||||
// 'multipart' (current behavior).
|
||||
sttApiStyle: provider.sttApiStyle,
|
||||
baseUrl: provider.baseUrl,
|
||||
systemPrompt: provider.systemPrompt,
|
||||
};
|
||||
@@ -190,6 +195,7 @@ export class AiSettingsService {
|
||||
embeddingBaseUrl: provider.embeddingBaseUrl,
|
||||
sttModel: provider.sttModel,
|
||||
sttBaseUrl: provider.sttBaseUrl,
|
||||
sttApiStyle: provider.sttApiStyle,
|
||||
systemPrompt: provider.systemPrompt,
|
||||
hasApiKey,
|
||||
hasEmbeddingApiKey,
|
||||
@@ -226,6 +232,7 @@ export class AiSettingsService {
|
||||
'embeddingBaseUrl',
|
||||
'sttModel',
|
||||
'sttBaseUrl',
|
||||
'sttApiStyle',
|
||||
'systemPrompt',
|
||||
] as const) {
|
||||
if (nonSecret[key] !== undefined) {
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
import { Injectable, Logger } from '@nestjs/common';
|
||||
import { BadRequestException, Injectable, Logger } from '@nestjs/common';
|
||||
import {
|
||||
embedMany,
|
||||
experimental_transcribe as transcribe,
|
||||
@@ -108,28 +108,14 @@ export class AiService {
|
||||
}
|
||||
}
|
||||
|
||||
// Some OpenAI-compatible gateways diverge on the transcription API. OpenRouter
|
||||
// does NOT accept OpenAI's multipart /audio/transcriptions; it wants JSON
|
||||
// { model, input_audio: { data: <base64>, format } }. Detect it by host so the
|
||||
// standard multipart path (OpenAI, speaches, faster-whisper-server, ...) is
|
||||
// unaffected.
|
||||
private static isOpenRouter(baseURL?: string): boolean {
|
||||
if (!baseURL) return false;
|
||||
try {
|
||||
const host = new URL(baseURL).hostname.toLowerCase();
|
||||
// Exact host or a real subdomain — avoid matching e.g. "evil-openrouter.ai".
|
||||
return host === 'openrouter.ai' || host.endsWith('.openrouter.ai');
|
||||
} catch {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Transcribe audio with the workspace STT model. Standard OpenAI-compatible
|
||||
* endpoints use the AI SDK multipart path; OpenRouter uses its JSON+base64
|
||||
* audio/transcriptions API. `format` is the audio container hint (webm / mp4 /
|
||||
* wav / mp3 / ogg / m4a). Built PER WORKSPACE; the key is never logged. Throws
|
||||
* AiSttNotConfiguredException (-> 503) when no STT model is configured.
|
||||
* Transcribe audio with the workspace STT model. The request encoding is the
|
||||
* admin-chosen `sttApiStyle`: 'json' uses the JSON+base64 audio/transcriptions
|
||||
* API (OpenRouter); anything else (default 'multipart') uses the AI SDK
|
||||
* multipart path (OpenAI, speaches, faster-whisper-server, ...). `format` is
|
||||
* the audio container hint (webm / mp4 / wav / mp3 / ogg / m4a). Built PER
|
||||
* WORKSPACE; the key is never logged. Throws AiSttNotConfiguredException
|
||||
* (-> 503) when no STT model is configured.
|
||||
*/
|
||||
async transcribe(
|
||||
workspaceId: string,
|
||||
@@ -140,14 +126,11 @@ export class AiService {
|
||||
if (!cfg?.sttModel) throw new AiSttNotConfiguredException();
|
||||
const baseURL = cfg.sttBaseUrl || cfg.baseUrl;
|
||||
|
||||
if (AiService.isOpenRouter(baseURL)) {
|
||||
return this.transcribeViaOpenRouter(
|
||||
baseURL as string,
|
||||
cfg.sttApiKey,
|
||||
cfg.sttModel,
|
||||
audio,
|
||||
format,
|
||||
);
|
||||
// Explicit, admin-chosen request encoding (no URL guessing). 'json' is the
|
||||
// OpenRouter style (JSON + base64 input_audio); everything else uses the
|
||||
// OpenAI-compatible multipart path via the AI SDK.
|
||||
if (cfg.sttApiStyle === 'json') {
|
||||
return this.transcribeJsonBase64(baseURL, cfg.sttApiKey, cfg.sttModel, audio, format);
|
||||
}
|
||||
|
||||
// Standard OpenAI-compatible multipart path (AI SDK). apiKey may be unused for
|
||||
@@ -160,14 +143,23 @@ export class AiService {
|
||||
return text.trim();
|
||||
}
|
||||
|
||||
// OpenRouter transcription: JSON body with base64 audio; returns { text }.
|
||||
private async transcribeViaOpenRouter(
|
||||
baseURL: string,
|
||||
/**
|
||||
* JSON + base64 transcription body (OpenRouter-style). POSTs
|
||||
* { model, input_audio: { data, format } } to {baseURL}/audio/transcriptions
|
||||
* and returns { text }.
|
||||
*/
|
||||
private async transcribeJsonBase64(
|
||||
baseURL: string | undefined,
|
||||
apiKey: string | undefined,
|
||||
model: string,
|
||||
audio: Uint8Array,
|
||||
format: string,
|
||||
): Promise<string> {
|
||||
if (!baseURL) {
|
||||
throw new BadRequestException(
|
||||
'STT base URL is not set (required for the JSON request format)',
|
||||
);
|
||||
}
|
||||
const url = `${baseURL.replace(/\/$/, '')}/audio/transcriptions`;
|
||||
const res = await fetch(url, {
|
||||
method: 'POST',
|
||||
@@ -187,7 +179,7 @@ export class AiService {
|
||||
// Surface status + body so the real reason reaches the user; never log the key.
|
||||
const body = await res.text().catch(() => '');
|
||||
throw new Error(
|
||||
`OpenRouter transcription failed (${res.status}): ${body.slice(0, 500)}`,
|
||||
`JSON transcription request failed (${res.status}): ${body.slice(0, 500)}`,
|
||||
);
|
||||
}
|
||||
const json = (await res.json()) as { text?: string };
|
||||
|
||||
@@ -10,6 +10,12 @@ export type AiDriver = 'openai' | 'gemini' | 'ollama';
|
||||
|
||||
export const AI_DRIVERS: AiDriver[] = ['openai', 'gemini', 'ollama'];
|
||||
|
||||
// STT request encoding. 'multipart' = OpenAI-compatible /audio/transcriptions
|
||||
// form-data (OpenAI, speaches, faster-whisper-server). 'json' = JSON body with
|
||||
// base64 input_audio (OpenRouter). Chosen explicitly by the admin.
|
||||
export type SttApiStyle = 'multipart' | 'json';
|
||||
export const STT_API_STYLES: SttApiStyle[] = ['multipart', 'json'];
|
||||
|
||||
/**
|
||||
* Non-secret provider settings persisted under `settings.ai.provider`.
|
||||
* The API key is intentionally absent here.
|
||||
@@ -24,6 +30,7 @@ export interface AiProviderSettings {
|
||||
sttModel?: string;
|
||||
// STT-specific base URL. Falls back to baseUrl when empty/unset.
|
||||
sttBaseUrl?: string;
|
||||
sttApiStyle?: SttApiStyle;
|
||||
systemPrompt?: string;
|
||||
}
|
||||
|
||||
@@ -58,6 +65,7 @@ export interface MaskedAiSettings {
|
||||
embeddingBaseUrl?: string;
|
||||
sttModel?: string;
|
||||
sttBaseUrl?: string;
|
||||
sttApiStyle?: SttApiStyle;
|
||||
systemPrompt?: string;
|
||||
hasApiKey: boolean;
|
||||
hasEmbeddingApiKey: boolean;
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
import { IsIn, IsOptional, IsString } from 'class-validator';
|
||||
import { AI_DRIVERS, AiDriver } from '../ai.types';
|
||||
import { AI_DRIVERS, AiDriver, STT_API_STYLES, SttApiStyle } from '../ai.types';
|
||||
|
||||
/**
|
||||
* Admin update payload for the workspace AI provider settings.
|
||||
@@ -50,6 +50,10 @@ export class UpdateAiSettingsDto {
|
||||
@IsString()
|
||||
sttBaseUrl?: string;
|
||||
|
||||
@IsOptional()
|
||||
@IsIn(STT_API_STYLES)
|
||||
sttApiStyle?: SttApiStyle;
|
||||
|
||||
@IsOptional()
|
||||
@IsString()
|
||||
sttApiKey?: string;
|
||||
|
||||
Reference in New Issue
Block a user