feat(ai/stt): add dictation language selection to STT settings

Add a per-workspace `sttLanguage` setting (an ISO-639-1 language hint; empty = auto-detect) and a searchable language picker in the Voice / STT settings card.

The hint is forwarded to the transcription endpoint only when non-empty, so auto-detect behaves exactly as before:
- multipart path: via the AI SDK `providerOptions.openai.language`
- JSON (OpenRouter) path: via a top-level `language` body field

Threaded through the whole stack: ai.types, the update DTO, AiSettingsService (resolve/getMasked/update), the workspace.repo SQL allowlist, the client ai-settings service types, and the provider-settings form. Adds en-US source keys and ru-RU translations.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-22 02:29:07 +03:00
parent c83343d3a3
commit a16ef2346f
9 changed files with 106 additions and 5 deletions
--- a/apps/client/public/locales/en-US/translation.json
+++ b/apps/client/public/locales/en-US/translation.json
@@ -1221,6 +1221,9 @@
  "How transcription requests are sent to the endpoint": "How transcription requests are sent to the endpoint",
  "OpenAI-compatible (multipart/form-data)": "OpenAI-compatible (multipart/form-data)",
  "OpenRouter (JSON, base64 audio)": "OpenRouter (JSON, base64 audio)",
+  "Dictation language": "Dictation language",
+  "Auto-detect": "Auto-detect",
+  "Spoken language hint sent to the transcription model. Auto-detect lets the model decide.": "Spoken language hint sent to the transcription model. Auto-detect lets the model decide.",
  "Agent role": "Agent role",
  "Universal assistant": "Universal assistant",
  "Add role": "Add role",
--- a/apps/client/public/locales/ru-RU/translation.json
+++ b/apps/client/public/locales/ru-RU/translation.json
@@ -1123,5 +1123,8 @@
  "Added {{name}} to favorites": "{{name}} добавлено в избранное",
  "Removed {{name}} from favorites": "{{name}} удалено из избранного",
  "Page menu for {{name}}": "Меню страницы для {{name}}",
-  "Create subpage of {{name}}": "Создать подстраницу для {{name}}"
+  "Create subpage of {{name}}": "Создать подстраницу для {{name}}",
+  "Dictation language": "Язык диктовки",
+  "Auto-detect": "Автоопределение",
+  "Spoken language hint sent to the transcription model. Auto-detect lets the model decide.": "Подсказка языка речи для модели транскрипции. «Автоопределение» оставляет выбор за моделью."
 }
--- a/apps/client/src/features/workspace/components/settings/components/ai-provider-settings.tsx
+++ b/apps/client/src/features/workspace/components/settings/components/ai-provider-settings.tsx
@@ -42,6 +42,40 @@ import { useAiRolesQuery } from "@/features/ai-chat/queries/ai-chat-query.ts";
 import { IAiRole } from "@/features/ai-chat/types/ai-chat.types.ts";
 import AiMcpServers from "./ai-mcp-servers.tsx";

+// Curated ISO-639-1 dictation languages for the STT card; values are sent
+// verbatim to the transcription model as the hint. Labels are untranslated;
+// only the empty-value "Auto-detect" entry, prepended in render, goes via t().
+const STT_LANGUAGE_OPTIONS: { value: string; label: string }[] = [
+  { value: "en", label: "English" },
+  { value: "ru", label: "Russian — Русский" },
+  { value: "uk", label: "Ukrainian — Українська" },
+  { value: "de", label: "German — Deutsch" },
+  { value: "fr", label: "French — Français" },
+  { value: "es", label: "Spanish — Español" },
+  { value: "it", label: "Italian — Italiano" },
+  { value: "pt", label: "Portuguese — Português" },
+  { value: "nl", label: "Dutch — Nederlands" },
+  { value: "pl", label: "Polish — Polski" },
+  { value: "tr", label: "Turkish — Türkçe" },
+  { value: "cs", label: "Czech — Čeština" },
+  { value: "sv", label: "Swedish — Svenska" },
+  { value: "fi", label: "Finnish — Suomi" },
+  { value: "da", label: "Danish — Dansk" },
+  { value: "no", label: "Norwegian — Norsk" },
+  { value: "ro", label: "Romanian — Română" },
+  { value: "hu", label: "Hungarian — Magyar" },
+  { value: "el", label: "Greek — Ελληνικά" },
+  { value: "he", label: "Hebrew — עברית" },
+  { value: "ar", label: "Arabic — العربية" },
+  { value: "hi", label: "Hindi — हिन्दी" },
+  { value: "id", label: "Indonesian — Bahasa Indonesia" },
+  { value: "vi", label: "Vietnamese — Tiếng Việt" },
+  { value: "th", label: "Thai — ไทย" },
+  { value: "ja", label: "Japanese — 日本語" },
+  { value: "ko", label: "Korean — 한국어" },
+  { value: "zh", label: "Chinese — 中文" },
+];
+
 // No driver field: every endpoint is OpenAI-compatible, so the form carries only
 // the user-editable fields. `apiKey` / `embeddingApiKey` are write-only buffers
 // (empty means "leave unchanged" unless explicitly cleared).
@@ -63,6 +97,8 @@ const formSchema = z.object({
  sttModel: z.string(),
  sttBaseUrl: z.string(),
  sttApiStyle: z.enum(["multipart", "json"]),
+  // ISO-639-1 dictation language; empty = auto-detect.
+  sttLanguage: z.string(),
  sttApiKey: z.string(),
 });

@@ -233,6 +269,7 @@ export default function AiProviderSettings() {
      sttModel: "",
      sttBaseUrl: "",
      sttApiStyle: "multipart" as SttApiStyle,
+      sttLanguage: "",
      sttApiKey: "",
    },
  });
@@ -254,6 +291,7 @@ export default function AiProviderSettings() {
      sttModel: settings.sttModel ?? "",
      sttBaseUrl: settings.sttBaseUrl ?? "",
      sttApiStyle: settings.sttApiStyle ?? "multipart",
+      sttLanguage: settings.sttLanguage ?? "",
      sttApiKey: "",
    });
    form.resetDirty();
@@ -288,6 +326,7 @@ export default function AiProviderSettings() {
      sttModel: values.sttModel,
      sttBaseUrl: values.sttBaseUrl,
      sttApiStyle: values.sttApiStyle,
+      sttLanguage: values.sttLanguage,
    };

    // Key semantics (never send the stored key back) — see resolveKeyField:
@@ -923,6 +962,22 @@ export default function AiProviderSettings() {
          {...form.getInputProps("sttApiStyle")}
        />

+        <Select
+          mt="sm"
+          label={t("Dictation language")}
+          description={t(
+            "Spoken language hint sent to the transcription model. Auto-detect lets the model decide.",
+          )}
+          data={[
+            { value: "", label: t("Auto-detect") },
+            ...STT_LANGUAGE_OPTIONS,
+          ]}
+          searchable
+          allowDeselect={false}
+          disabled={isLoading}
+          {...form.getInputProps("sttLanguage")}
+        />
+
        <TextInput
          mt="sm"
          label={t("Base URL")}
--- a/apps/client/src/features/workspace/services/ai-settings-service.ts
+++ b/apps/client/src/features/workspace/services/ai-settings-service.ts
@@ -33,6 +33,8 @@ export interface IAiSettings {
  sttModel?: string;
  sttBaseUrl?: string;
  sttApiStyle?: SttApiStyle;
+  // ISO-639-1 dictation language; empty = auto-detect.
+  sttLanguage?: string;
  hasSttApiKey: boolean;
  // RAG indexing coverage (pages indexed for semantic search).
  indexedPages: number;
@@ -60,6 +62,8 @@ export interface IAiSettingsUpdate {
  sttModel?: string;
  sttBaseUrl?: string;
  sttApiStyle?: SttApiStyle;
+  // ISO-639-1 dictation language; empty = auto-detect.
+  sttLanguage?: string;
  // Write-only STT key (same semantics as `apiKey` / `embeddingApiKey`).
  sttApiKey?: string;
 }
--- a/apps/server/src/database/repos/workspace/workspace.repo.ts
+++ b/apps/server/src/database/repos/workspace/workspace.repo.ts
@@ -239,7 +239,7 @@ export class WorkspaceRepo {
    // is a real jsonb object, never a double-encoded string. The CASE self-heals
    // workspaces whose settings.ai.provider was previously corrupted into an
    // array/string.
-    const ALLOWED = ['driver', 'chatModel', 'embeddingModel', 'baseUrl', 'embeddingBaseUrl', 'sttModel', 'sttBaseUrl', 'sttApiStyle', 'systemPrompt', 'publicShareChatModel', 'publicShareAssistantRoleId'];
+    const ALLOWED = ['driver', 'chatModel', 'embeddingModel', 'baseUrl', 'embeddingBaseUrl', 'sttModel', 'sttBaseUrl', 'sttApiStyle', 'sttLanguage', 'systemPrompt', 'publicShareChatModel', 'publicShareAssistantRoleId'];
    const entries = Object.entries(provider).filter(
      ([k, v]) => v !== undefined && ALLOWED.includes(k),
    );
--- a/apps/server/src/integrations/ai/ai-settings.service.ts
+++ b/apps/server/src/integrations/ai/ai-settings.service.ts
@@ -33,6 +33,8 @@ export interface UpdateAiSettingsInput {
  sttModel?: string;
  sttBaseUrl?: string;
  sttApiStyle?: SttApiStyle;
+  // ISO-639-1 dictation language hint (e.g. 'en', 'ru'). Empty = auto-detect.
+  sttLanguage?: string;
  sttApiKey?: string;
  publicShareChatModel?: string;
  publicShareAssistantRoleId?: string;
@@ -166,6 +168,8 @@ export class AiSettingsService {
      // Plain passthrough, no fallback; the transcribe path defaults unset to
      // 'multipart' (current behavior).
      sttApiStyle: provider.sttApiStyle,
+      // Plain passthrough; empty/unset = auto-detect at the transcribe path.
+      sttLanguage: provider.sttLanguage,
      baseUrl: provider.baseUrl,
      systemPrompt: provider.systemPrompt,
    };
@@ -240,6 +244,7 @@ export class AiSettingsService {
      sttModel: provider.sttModel,
      sttBaseUrl: provider.sttBaseUrl,
      sttApiStyle: provider.sttApiStyle,
+      sttLanguage: provider.sttLanguage,
      systemPrompt: provider.systemPrompt,
      publicShareChatModel: provider.publicShareChatModel,
      publicShareAssistantRoleId: provider.publicShareAssistantRoleId,
@@ -279,6 +284,7 @@ export class AiSettingsService {
      'sttModel',
      'sttBaseUrl',
      'sttApiStyle',
+      'sttLanguage',
      'systemPrompt',
      'publicShareChatModel',
      'publicShareAssistantRoleId',
--- a/apps/server/src/integrations/ai/ai.service.ts
+++ b/apps/server/src/integrations/ai/ai.service.ts
@@ -212,12 +212,22 @@ export class AiService {
    const cfg = await this.aiSettings.resolve(workspaceId);
    if (!cfg?.sttModel) throw new AiSttNotConfiguredException();
    const baseURL = cfg.sttBaseUrl || cfg.baseUrl;
+    // Trimmed language hint; empty/unset = auto-detect (never forward an empty
+    // string to the provider, which would override auto-detect).
+    const sttLanguage = cfg.sttLanguage?.trim() || undefined;

    // Explicit, admin-chosen request encoding (no URL guessing). 'json' is the
    // OpenRouter style (JSON + base64 input_audio); everything else uses the
    // OpenAI-compatible multipart path via the AI SDK.
    if (cfg.sttApiStyle === 'json') {
-      return this.transcribeJsonBase64(baseURL, cfg.sttApiKey, cfg.sttModel, audio, format);
+      return this.transcribeJsonBase64(
+        baseURL,
+        cfg.sttApiKey,
+        cfg.sttModel,
+        audio,
+        format,
+        sttLanguage,
+      );
    }

    // Standard OpenAI-compatible multipart path (AI SDK). apiKey may be unused for
@@ -226,14 +236,23 @@ export class AiService {
      apiKey: cfg.sttApiKey ?? 'unused',
      baseURL,
    }).transcription(cfg.sttModel);
-    const { text } = await transcribe({ model, audio });
+    const { text } = await transcribe({
+      model,
+      audio,
+      // Forward the language hint only when set; the OpenAI transcription model
+      // reads it from providerOptions.openai.language.
+      ...(sttLanguage
+        ? { providerOptions: { openai: { language: sttLanguage } } }
+        : {}),
+    });
    return text.trim();
  }

  /**
   * JSON + base64 transcription body (OpenRouter-style). POSTs
   * { model, input_audio: { data, format } } to {baseURL}/audio/transcriptions
-   * and returns { text }.
+   * and returns { text }. The optional `language` ISO-639-1 hint is included as
+   * a top-level body field only when set (empty/unset = auto-detect).
   */
  private async transcribeJsonBase64(
    baseURL: string | undefined,
@@ -241,6 +260,7 @@ export class AiService {
    model: string,
    audio: Uint8Array,
    format: string,
+    language?: string,
  ): Promise<string> {
    if (!baseURL) {
      throw new BadRequestException(
@@ -256,6 +276,7 @@ export class AiService {
      },
      body: JSON.stringify({
        model,
+        ...(language ? { language } : {}),
        input_audio: {
          data: Buffer.from(audio).toString('base64'),
          format,
--- a/apps/server/src/integrations/ai/ai.types.ts
+++ b/apps/server/src/integrations/ai/ai.types.ts
@@ -31,6 +31,8 @@ export interface AiProviderSettings {
  // STT-specific base URL. Falls back to baseUrl when empty/unset.
  sttBaseUrl?: string;
  sttApiStyle?: SttApiStyle;
+  // ISO-639-1 dictation language hint (e.g. 'en', 'ru'). Empty/unset = auto-detect.
+  sttLanguage?: string;
  systemPrompt?: string;
  // Cheap chat model id used ONLY by the anonymous public-share assistant. The
  // driver / baseUrl / apiKey of the main chat provider are reused; this is the
@@ -80,6 +82,8 @@ export interface MaskedAiSettings {
  sttModel?: string;
  sttBaseUrl?: string;
  sttApiStyle?: SttApiStyle;
+  // ISO-639-1 dictation language hint (e.g. 'en', 'ru'). Empty/unset = auto-detect.
+  sttLanguage?: string;
  systemPrompt?: string;
  publicShareChatModel?: string;
  // Agent-role id whose persona the public-share assistant adopts; empty/unset
--- a/apps/server/src/integrations/ai/dto/update-ai-settings.dto.ts
+++ b/apps/server/src/integrations/ai/dto/update-ai-settings.dto.ts
@@ -54,6 +54,11 @@ export class UpdateAiSettingsDto {
  @IsIn(STT_API_STYLES)
  sttApiStyle?: SttApiStyle;

+  // ISO-639-1 dictation language hint (e.g. 'en', 'ru'). Empty = auto-detect.
+  @IsOptional()
+  @IsString()
+  sttLanguage?: string;
+
  @IsOptional()
  @IsString()
  sttApiKey?: string;