feat(ai): server-side voice dictation (STT) with mic in chat and editor

Add push-to-talk voice dictation that transcribes recorded audio on the server via the workspace's OpenAI-compatible AI provider (Whisper / gpt-4o-transcribe / self-hosted whisper), then inserts the text.

Backend:
- New `stt_api_key_enc` column + migration; STT creds parity with chat/embeddings (sttModel/sttBaseUrl/sttApiKey, write-only key, falling back to the chat baseUrl/key). Both provider whitelists updated (service + repo).
- AiService.getTranscriptionModel + AiTranscriptionService.
- Gated POST /ai-chat/transcribe (dictation flag off → 403, JWT + workspace scope + throttle, 25 MB cap, MIME whitelist, never logs audio/key).
- New `settings.ai.dictation` workspace flag (DTO + service + audit).

Frontend:
- Wire up the Voice/STT settings card (model/base URL/key) and the Voice-dictation toggle.
- New `features/dictation`: useDictation (MediaRecorder state machine), MicButton, transcribe service; integrated into the chat composer and a new editor-toolbar dictation group, both gated by ai.dictation.
2026-06-18 18:45:33 +03:00
parent 49eba22201
commit 874bdd021c
24 changed files with 845 additions and 39 deletions
--- a/apps/server/src/integrations/ai/ai-settings.service.ts
+++ b/apps/server/src/integrations/ai/ai-settings.service.ts
@@ -28,6 +28,9 @@ export interface UpdateAiSettingsInput {
  systemPrompt?: string;
  apiKey?: string;
  embeddingApiKey?: string;
+  sttModel?: string;
+  sttBaseUrl?: string;
+  sttApiKey?: string;
 }

 /**
@@ -113,6 +116,7 @@ export class AiSettingsService {
      driver: provider.driver,
      chatModel: provider.chatModel,
      embeddingModel: provider.embeddingModel,
+      sttModel: provider.sttModel,
      baseUrl: provider.baseUrl,
      systemPrompt: provider.systemPrompt,
    };
@@ -122,6 +126,10 @@ export class AiSettingsService {
    // unconditionally.
    config.embeddingBaseUrl = provider.embeddingBaseUrl || provider.baseUrl;

+    // Effective STT base URL: the STT-specific value, else the chat base URL.
+    // Set unconditionally, same rationale as embeddingBaseUrl.
+    config.sttBaseUrl = provider.sttBaseUrl || provider.baseUrl;
+
    if (provider.driver !== 'ollama') {
      const creds = await this.aiProviderCredentialsRepo.find(
        workspaceId,
@@ -134,6 +142,10 @@ export class AiSettingsService {
      config.embeddingApiKey = creds?.embeddingApiKeyEnc
        ? this.secretBox.decryptSecret(creds.embeddingApiKeyEnc)
        : config.apiKey;
+      // Effective STT key: the STT-specific key, else the chat key.
+      config.sttApiKey = creds?.sttApiKeyEnc
+        ? this.secretBox.decryptSecret(creds.sttApiKeyEnc)
+        : config.apiKey;
    }

    return config;
@@ -151,6 +163,7 @@ export class AiSettingsService {

    let hasApiKey = false;
    let hasEmbeddingApiKey = false;
+    let hasSttApiKey = false;
    if (provider.driver) {
      const creds = await this.aiProviderCredentialsRepo.find(
        workspaceId,
@@ -158,6 +171,7 @@ export class AiSettingsService {
      );
      hasApiKey = !!creds?.apiKeyEnc;
      hasEmbeddingApiKey = !!creds?.embeddingApiKeyEnc;
+      hasSttApiKey = !!creds?.sttApiKeyEnc;
    }

    // totalPages now counts only pages with embeddable content (non-empty text
@@ -174,9 +188,12 @@ export class AiSettingsService {
      embeddingModel: provider.embeddingModel,
      baseUrl: provider.baseUrl,
      embeddingBaseUrl: provider.embeddingBaseUrl,
+      sttModel: provider.sttModel,
+      sttBaseUrl: provider.sttBaseUrl,
      systemPrompt: provider.systemPrompt,
      hasApiKey,
      hasEmbeddingApiKey,
+      hasSttApiKey,
      indexedPages,
      totalPages,
    };
@@ -197,7 +214,7 @@ export class AiSettingsService {
    workspaceId: string,
    dto: UpdateAiSettingsInput,
  ): Promise<MaskedAiSettings> {
-    const { apiKey, embeddingApiKey, ...nonSecret } = dto;
+    const { apiKey, embeddingApiKey, sttApiKey, ...nonSecret } = dto;

    // Persist non-secret provider fields (only those present in the partial).
    const providerPatch: Partial<AiProviderSettings> = {};
@@ -207,6 +224,8 @@ export class AiSettingsService {
      'embeddingModel',
      'baseUrl',
      'embeddingBaseUrl',
+      'sttModel',
+      'sttBaseUrl',
      'systemPrompt',
    ] as const) {
      if (nonSecret[key] !== undefined) {
@@ -222,7 +241,11 @@ export class AiSettingsService {

    // Key handling (write-only). Both keys share the same target driver and the
    // same "driver required" guard, resolved once.
-    if (apiKey !== undefined || embeddingApiKey !== undefined) {
+    if (
+      apiKey !== undefined ||
+      embeddingApiKey !== undefined ||
+      sttApiKey !== undefined
+    ) {
      const stored = await this.readProvider(workspaceId);
      const targetDriver = dto.driver ?? stored.driver;
      if (!targetDriver) {
@@ -264,6 +287,23 @@ export class AiSettingsService {
          );
        }
      }
+
+      // STT key.
+      if (sttApiKey !== undefined) {
+        if (sttApiKey === '') {
+          await this.aiProviderCredentialsRepo.clearSttKey(
+            workspaceId,
+            targetDriver,
+          );
+        } else {
+          const enc = this.secretBox.encryptSecret(sttApiKey);
+          await this.aiProviderCredentialsRepo.upsertSttKey(
+            workspaceId,
+            targetDriver,
+            enc,
+          );
+        }
+      }
    }

    return this.getMasked(workspaceId);
--- a/apps/server/src/integrations/ai/ai-stt-not-configured.exception.ts
+++ b/apps/server/src/integrations/ai/ai-stt-not-configured.exception.ts
@@ -0,0 +1,13 @@
+import { ServiceUnavailableException } from '@nestjs/common';
+
+/**
+ * Thrown when the workspace has no usable STT (speech-to-text) config: the
+ * resolved config is missing or has no sttModel. Extends
+ * ServiceUnavailableException and is distinct from the chat & embedding
+ * variants, so transcription can 503 independently of chat/embeddings.
+ */
+export class AiSttNotConfiguredException extends ServiceUnavailableException {
+  constructor() {
+    super('AI STT model not configured');
+  }
+}
--- a/apps/server/src/integrations/ai/ai.service.ts
+++ b/apps/server/src/integrations/ai/ai.service.ts
@@ -4,6 +4,7 @@ import {
  generateText,
  type EmbeddingModel,
  type LanguageModel,
+  type TranscriptionModel,
 } from 'ai';
 import { createOpenAI } from '@ai-sdk/openai';
 import { createGoogleGenerativeAI } from '@ai-sdk/google';
@@ -11,6 +12,7 @@ import { createOllama } from 'ai-sdk-ollama';
 import { AiSettingsService } from './ai-settings.service';
 import { AiNotConfiguredException } from './ai-not-configured.exception';
 import { AiEmbeddingNotConfiguredException } from './ai-embedding-not-configured.exception';
+import { AiSttNotConfiguredException } from './ai-stt-not-configured.exception';
 import { describeProviderError } from './ai-error.util';

 /**
@@ -106,6 +108,26 @@ export class AiService {
    }
  }

+  /**
+   * Resolve the workspace config and build the transcription (STT) model.
+   * The model is always created via createOpenAI(...).transcription(),
+   * regardless of the chat driver. sttBaseUrl falls back to the chat baseUrl
+   * and sttApiKey to the chat key (both applied in AiSettingsService.resolve).
+   * NOTE(review): with a non-OpenAI chat driver that fallback would send the
+   * chat provider's key to the STT endpoint — confirm this is intended.
+   *
+   * @throws AiSttNotConfiguredException (-> 503) when no STT model is set.
+   */
+  async getTranscriptionModel(workspaceId: string): Promise<TranscriptionModel> {
+    const cfg = await this.aiSettings.resolve(workspaceId);
+    if (!cfg?.sttModel) throw new AiSttNotConfiguredException();
+    const baseURL = cfg.sttBaseUrl || cfg.baseUrl; // stt-specific, else chat
+    // apiKey may be unused for keyless self-hosted whisper; pass a placeholder.
+    return createOpenAI({ apiKey: cfg.sttApiKey ?? 'unused', baseURL }).transcription(
+      cfg.sttModel,
+    );
+  }
+
  /**
   * Embed a batch of texts with the workspace embedding model. Returns one
   * vector per input, in the same order. Thin wrapper over the AI SDK's
--- a/apps/server/src/integrations/ai/ai.types.ts
+++ b/apps/server/src/integrations/ai/ai.types.ts
@@ -21,6 +21,9 @@ export interface AiProviderSettings {
  baseUrl?: string;
  // Embedding-specific base URL. Falls back to `baseUrl` when empty/unset.
  embeddingBaseUrl?: string;
+  sttModel?: string;
+  // STT-specific base URL. Falls back to baseUrl when empty/unset.
+  sttBaseUrl?: string;
  systemPrompt?: string;
 }

@@ -31,12 +34,15 @@ export interface AiProviderSettings {
 *
 * `embeddingBaseUrl` / `embeddingApiKey` are the embedding-specific endpoint and
 * key, already resolved with the chat-value fallback applied by `resolve`.
+ * `sttBaseUrl` / `sttApiKey` are likewise the STT-specific endpoint and key,
+ * already resolved with the chat-value fallback applied by `resolve`.
 */
 export interface ResolvedAiConfig extends Partial<AiProviderSettings> {
  driver?: AiDriver;
  chatModel?: string;
  apiKey?: string;
  embeddingApiKey?: string;
+  sttApiKey?: string;
 }

 /**
@@ -50,9 +56,12 @@ export interface MaskedAiSettings {
  embeddingModel?: string;
  baseUrl?: string;
  embeddingBaseUrl?: string;
+  sttModel?: string;
+  sttBaseUrl?: string;
  systemPrompt?: string;
  hasApiKey: boolean;
  hasEmbeddingApiKey: boolean;
+  hasSttApiKey: boolean;
  // RAG indexing coverage for the settings UI.
  indexedPages: number;
  totalPages: number;
--- a/apps/server/src/integrations/ai/dto/update-ai-settings.dto.ts
+++ b/apps/server/src/integrations/ai/dto/update-ai-settings.dto.ts
@@ -4,10 +4,10 @@ import { AI_DRIVERS, AiDriver } from '../ai.types';
 /**
 * Admin update payload for the workspace AI provider settings.
 *
- * `apiKey` / `embeddingApiKey` are write-only (§8.2): provided → stored
- * encrypted, '' → cleared, absent → left untouched. They are NEVER returned by
- * any endpoint. The global ValidationPipe runs with `whitelist: true`, so
- * unknown fields are stripped.
+ * `apiKey` / `embeddingApiKey` / `sttApiKey` are write-only (§8.2): provided →
+ * stored encrypted, '' → cleared, absent → left untouched. They are NEVER
+ * returned by any endpoint. The global ValidationPipe runs with
+ * `whitelist: true`, so unknown fields are stripped.
 */
 export class UpdateAiSettingsDto {
  @IsOptional()
@@ -41,4 +41,16 @@ export class UpdateAiSettingsDto {
  @IsOptional()
  @IsString()
  embeddingApiKey?: string;
+
+  @IsOptional()
+  @IsString()
+  sttModel?: string;
+
+  @IsOptional()
+  @IsString()
+  sttBaseUrl?: string;
+
+  @IsOptional()
+  @IsString()
+  sttApiKey?: string;
 }