diff --git a/apps/client/public/locales/en-US/translation.json b/apps/client/public/locales/en-US/translation.json
index 8cfd742c..87e8af42 100644
--- a/apps/client/public/locales/en-US/translation.json
+++ b/apps/client/public/locales/en-US/translation.json
@@ -1191,5 +1191,9 @@
"Transcription failed": "Transcription failed",
"Voice dictation is not configured": "Voice dictation is not configured",
"Microphone is unavailable or already in use": "Microphone is unavailable or already in use",
- "Audio recording is not available in this browser/context": "Audio recording is not available in this browser/context"
+ "Audio recording is not available in this browser/context": "Audio recording is not available in this browser/context",
+ "Request format": "Request format",
+ "How transcription requests are sent to the endpoint": "How transcription requests are sent to the endpoint",
+ "OpenAI-compatible (multipart/form-data)": "OpenAI-compatible (multipart/form-data)",
+ "OpenRouter (JSON, base64 audio)": "OpenRouter (JSON, base64 audio)"
}
diff --git a/apps/client/src/features/workspace/components/settings/components/ai-provider-settings.tsx b/apps/client/src/features/workspace/components/settings/components/ai-provider-settings.tsx
index b908fc03..78727bda 100644
--- a/apps/client/src/features/workspace/components/settings/components/ai-provider-settings.tsx
+++ b/apps/client/src/features/workspace/components/settings/components/ai-provider-settings.tsx
@@ -9,6 +9,7 @@ import {
Modal,
Paper,
PasswordInput,
+ Select,
Stack,
Switch,
Text,
@@ -32,7 +33,10 @@ import {
useTestAiConnectionMutation,
useUpdateAiSettingsMutation,
} from "@/features/workspace/queries/ai-settings-query.ts";
-import { IAiSettingsUpdate } from "@/features/workspace/services/ai-settings-service.ts";
+import {
+ IAiSettingsUpdate,
+ SttApiStyle,
+} from "@/features/workspace/services/ai-settings-service.ts";
import AiMcpServers from "./ai-mcp-servers.tsx";
// No driver field: every endpoint is OpenAI-compatible, so the form carries only
@@ -50,6 +54,7 @@ const formSchema = z.object({
// STT-specific fields. Empty base URL / key fall back to the chat ones.
sttModel: z.string(),
sttBaseUrl: z.string(),
+ sttApiStyle: z.enum(["multipart", "json"]),
sttApiKey: z.string(),
});
@@ -139,6 +144,7 @@ export default function AiProviderSettings() {
embeddingApiKey: "",
sttModel: "",
sttBaseUrl: "",
+ sttApiStyle: "multipart" as SttApiStyle,
sttApiKey: "",
},
});
@@ -157,6 +163,7 @@ export default function AiProviderSettings() {
embeddingApiKey: "",
sttModel: settings.sttModel ?? "",
sttBaseUrl: settings.sttBaseUrl ?? "",
+ sttApiStyle: settings.sttApiStyle ?? "multipart",
sttApiKey: "",
});
form.resetDirty();
@@ -184,6 +191,7 @@ export default function AiProviderSettings() {
// server-side.
sttModel: values.sttModel,
sttBaseUrl: values.sttBaseUrl,
+ sttApiStyle: values.sttApiStyle,
};
// Key semantics (never send the stored key back):
@@ -671,6 +679,22 @@ export default function AiProviderSettings() {
+
+
+// - 'multipart' -> OpenAI-compatible multipart/form-data (OpenAI, speaches,
+// faster-whisper-server)
+// - 'json' -> JSON body with base64-encoded audio (OpenRouter)
+export type SttApiStyle = "multipart" | "json";
+
// Masked AI provider settings returned by the server.
// No API key is ever returned; only `hasApiKey` / `hasEmbeddingApiKey` indicate
// whether one is stored. `embeddingBaseUrl` is the RAW stored value (empty means
@@ -21,6 +27,7 @@ export interface IAiSettings {
// key is stored (empty means "uses the chat API key").
sttModel?: string;
sttBaseUrl?: string;
+ sttApiStyle?: SttApiStyle;
hasSttApiKey: boolean;
// RAG indexing coverage (pages indexed for semantic search).
indexedPages: number;
@@ -43,6 +50,7 @@ export interface IAiSettingsUpdate {
embeddingApiKey?: string;
sttModel?: string;
sttBaseUrl?: string;
+ sttApiStyle?: SttApiStyle;
// Write-only STT key (same semantics as `apiKey` / `embeddingApiKey`).
sttApiKey?: string;
}
diff --git a/apps/server/src/database/repos/workspace/workspace.repo.ts b/apps/server/src/database/repos/workspace/workspace.repo.ts
index 2f8e1b08..b5d62f7a 100644
--- a/apps/server/src/database/repos/workspace/workspace.repo.ts
+++ b/apps/server/src/database/repos/workspace/workspace.repo.ts
@@ -239,7 +239,7 @@ export class WorkspaceRepo {
// is a real jsonb object, never a double-encoded string. The CASE self-heals
// workspaces whose settings.ai.provider was previously corrupted into an
// array/string.
- const ALLOWED = ['driver', 'chatModel', 'embeddingModel', 'baseUrl', 'embeddingBaseUrl', 'sttModel', 'sttBaseUrl', 'systemPrompt'];
+ const ALLOWED = ['driver', 'chatModel', 'embeddingModel', 'baseUrl', 'embeddingBaseUrl', 'sttModel', 'sttBaseUrl', 'sttApiStyle', 'systemPrompt'];
const entries = Object.entries(provider).filter(
([k, v]) => v !== undefined && ALLOWED.includes(k),
);
diff --git a/apps/server/src/integrations/ai/ai-settings.service.ts b/apps/server/src/integrations/ai/ai-settings.service.ts
index 315ff380..f8fb6996 100644
--- a/apps/server/src/integrations/ai/ai-settings.service.ts
+++ b/apps/server/src/integrations/ai/ai-settings.service.ts
@@ -12,6 +12,7 @@ import {
AiProviderSettings,
MaskedAiSettings,
ResolvedAiConfig,
+ SttApiStyle,
} from './ai.types';
/**
@@ -30,6 +31,7 @@ export interface UpdateAiSettingsInput {
embeddingApiKey?: string;
sttModel?: string;
sttBaseUrl?: string;
+ sttApiStyle?: SttApiStyle;
sttApiKey?: string;
}
@@ -117,6 +119,9 @@ export class AiSettingsService {
chatModel: provider.chatModel,
embeddingModel: provider.embeddingModel,
sttModel: provider.sttModel,
+ // Plain passthrough, no fallback; the transcribe path defaults unset to
+ // 'multipart' (current behavior).
+ sttApiStyle: provider.sttApiStyle,
baseUrl: provider.baseUrl,
systemPrompt: provider.systemPrompt,
};
@@ -190,6 +195,7 @@ export class AiSettingsService {
embeddingBaseUrl: provider.embeddingBaseUrl,
sttModel: provider.sttModel,
sttBaseUrl: provider.sttBaseUrl,
+ sttApiStyle: provider.sttApiStyle,
systemPrompt: provider.systemPrompt,
hasApiKey,
hasEmbeddingApiKey,
@@ -226,6 +232,7 @@ export class AiSettingsService {
'embeddingBaseUrl',
'sttModel',
'sttBaseUrl',
+ 'sttApiStyle',
'systemPrompt',
] as const) {
if (nonSecret[key] !== undefined) {
diff --git a/apps/server/src/integrations/ai/ai.service.ts b/apps/server/src/integrations/ai/ai.service.ts
index e894d703..fef30a5b 100644
--- a/apps/server/src/integrations/ai/ai.service.ts
+++ b/apps/server/src/integrations/ai/ai.service.ts
@@ -1,4 +1,4 @@
-import { Injectable, Logger } from '@nestjs/common';
+import { BadRequestException, Injectable, Logger } from '@nestjs/common';
import {
embedMany,
experimental_transcribe as transcribe,
@@ -108,28 +108,14 @@ export class AiService {
}
}
- // Some OpenAI-compatible gateways diverge on the transcription API. OpenRouter
- // does NOT accept OpenAI's multipart /audio/transcriptions; it wants JSON
- // { model, input_audio: { data: <base64>, format } }. Detect it by host so the
- // standard multipart path (OpenAI, speaches, faster-whisper-server, ...) is
- // unaffected.
- private static isOpenRouter(baseURL?: string): boolean {
- if (!baseURL) return false;
- try {
- const host = new URL(baseURL).hostname.toLowerCase();
- // Exact host or a real subdomain — avoid matching e.g. "evil-openrouter.ai".
- return host === 'openrouter.ai' || host.endsWith('.openrouter.ai');
- } catch {
- return false;
- }
- }
-
/**
- * Transcribe audio with the workspace STT model. Standard OpenAI-compatible
- * endpoints use the AI SDK multipart path; OpenRouter uses its JSON+base64
- * audio/transcriptions API. `format` is the audio container hint (webm / mp4 /
- * wav / mp3 / ogg / m4a). Built PER WORKSPACE; the key is never logged. Throws
- * AiSttNotConfiguredException (-> 503) when no STT model is configured.
+ * Transcribe audio with the workspace STT model. The request encoding is the
+ * admin-chosen `sttApiStyle`: 'json' uses the JSON+base64 audio/transcriptions
+ * API (OpenRouter); anything else (default 'multipart') uses the AI SDK
+ * multipart path (OpenAI, speaches, faster-whisper-server, ...). `format` is
+ * the audio container hint (webm / mp4 / wav / mp3 / ogg / m4a). Built PER
+ * WORKSPACE; the key is never logged. Throws AiSttNotConfiguredException
+ * (-> 503) when no STT model is configured.
*/
async transcribe(
workspaceId: string,
@@ -140,14 +126,11 @@ export class AiService {
if (!cfg?.sttModel) throw new AiSttNotConfiguredException();
const baseURL = cfg.sttBaseUrl || cfg.baseUrl;
- if (AiService.isOpenRouter(baseURL)) {
- return this.transcribeViaOpenRouter(
- baseURL as string,
- cfg.sttApiKey,
- cfg.sttModel,
- audio,
- format,
- );
+ // Explicit, admin-chosen request encoding (no URL guessing). 'json' is the
+ // OpenRouter style (JSON + base64 input_audio); everything else uses the
+ // OpenAI-compatible multipart path via the AI SDK.
+ if (cfg.sttApiStyle === 'json') {
+ return this.transcribeJsonBase64(baseURL, cfg.sttApiKey, cfg.sttModel, audio, format);
}
// Standard OpenAI-compatible multipart path (AI SDK). apiKey may be unused for
@@ -160,14 +143,23 @@ export class AiService {
return text.trim();
}
- // OpenRouter transcription: JSON body with base64 audio; returns { text }.
- private async transcribeViaOpenRouter(
- baseURL: string,
+ /**
+ * JSON + base64 transcription body (OpenRouter-style). POSTs
+ * { model, input_audio: { data, format } } to {baseURL}/audio/transcriptions
+ * and returns { text }.
+ */
+ private async transcribeJsonBase64(
+ baseURL: string | undefined,
apiKey: string | undefined,
model: string,
audio: Uint8Array,
format: string,
): Promise {
+ if (!baseURL) {
+ throw new BadRequestException(
+ 'STT base URL is not set (required for the JSON request format)',
+ );
+ }
const url = `${baseURL.replace(/\/$/, '')}/audio/transcriptions`;
const res = await fetch(url, {
method: 'POST',
@@ -187,7 +179,7 @@ export class AiService {
// Surface status + body so the real reason reaches the user; never log the key.
const body = await res.text().catch(() => '');
throw new Error(
- `OpenRouter transcription failed (${res.status}): ${body.slice(0, 500)}`,
+ `JSON transcription request failed (${res.status}): ${body.slice(0, 500)}`,
);
}
const json = (await res.json()) as { text?: string };
diff --git a/apps/server/src/integrations/ai/ai.types.ts b/apps/server/src/integrations/ai/ai.types.ts
index 32f043c8..4f4258ad 100644
--- a/apps/server/src/integrations/ai/ai.types.ts
+++ b/apps/server/src/integrations/ai/ai.types.ts
@@ -10,6 +10,12 @@ export type AiDriver = 'openai' | 'gemini' | 'ollama';
export const AI_DRIVERS: AiDriver[] = ['openai', 'gemini', 'ollama'];
+// STT request encoding. 'multipart' = OpenAI-compatible /audio/transcriptions
+// form-data (OpenAI, speaches, faster-whisper-server). 'json' = JSON body with
+// base64 input_audio (OpenRouter). Chosen explicitly by the admin.
+export type SttApiStyle = 'multipart' | 'json';
+export const STT_API_STYLES: SttApiStyle[] = ['multipart', 'json'];
+
/**
* Non-secret provider settings persisted under `settings.ai.provider`.
* The API key is intentionally absent here.
@@ -24,6 +30,7 @@ export interface AiProviderSettings {
sttModel?: string;
// STT-specific base URL. Falls back to baseUrl when empty/unset.
sttBaseUrl?: string;
+ sttApiStyle?: SttApiStyle;
systemPrompt?: string;
}
@@ -58,6 +65,7 @@ export interface MaskedAiSettings {
embeddingBaseUrl?: string;
sttModel?: string;
sttBaseUrl?: string;
+ sttApiStyle?: SttApiStyle;
systemPrompt?: string;
hasApiKey: boolean;
hasEmbeddingApiKey: boolean;
diff --git a/apps/server/src/integrations/ai/dto/update-ai-settings.dto.ts b/apps/server/src/integrations/ai/dto/update-ai-settings.dto.ts
index 49199bc0..77935352 100644
--- a/apps/server/src/integrations/ai/dto/update-ai-settings.dto.ts
+++ b/apps/server/src/integrations/ai/dto/update-ai-settings.dto.ts
@@ -1,5 +1,5 @@
import { IsIn, IsOptional, IsString } from 'class-validator';
-import { AI_DRIVERS, AiDriver } from '../ai.types';
+import { AI_DRIVERS, AiDriver, STT_API_STYLES, SttApiStyle } from '../ai.types';
/**
* Admin update payload for the workspace AI provider settings.
@@ -50,6 +50,10 @@ export class UpdateAiSettingsDto {
@IsString()
sttBaseUrl?: string;
+ @IsOptional()
+ @IsIn(STT_API_STYLES)
+ sttApiStyle?: SttApiStyle;
+
@IsOptional()
@IsString()
sttApiKey?: string;