feat(ai): server-side voice dictation (STT) with mic in chat and editor

Add push-to-talk voice dictation that transcribes recorded audio on the server via the workspace's OpenAI-compatible AI provider (Whisper / gpt-4o-transcribe / self-hosted whisper), then inserts the text.

Backend:
- New `stt_api_key_enc` column + migration; STT creds parity with chat/embeddings (sttModel/sttBaseUrl/sttApiKey, write-only key, falls back to the chat baseUrl/key). Both provider whitelists updated (service + repo).
- AiService.getTranscriptionModel + AiTranscriptionService.
- Gated POST /ai-chat/transcribe (403 when the dictation flag is off, JWT + workspace scope + throttle, 25MB cap, MIME whitelist, never logs audio/key).
- New `settings.ai.dictation` workspace flag (DTO + service + audit).

Frontend:
- Wire up the Voice/STT settings card (model/base URL/key) and the Voice-dictation toggle.
- New `features/dictation`: useDictation (MediaRecorder state machine), MicButton, transcribe service; integrated into the chat composer and a new editor-toolbar dictation group, both gated by ai.dictation.
2026-06-18 18:45:33 +03:00
parent 49eba22201
commit 874bdd021c
24 changed files with 845 additions and 39 deletions
--- a/apps/server/src/core/ai-chat/ai-chat.controller.ts
+++ b/apps/server/src/core/ai-chat/ai-chat.controller.ts
@@ -1,4 +1,5 @@
 import {
+  BadRequestException,
  Body,
  Controller,
  ForbiddenException,
@@ -9,6 +10,7 @@ import {
  Req,
  Res,
  UseGuards,
+  UseInterceptors,
 } from '@nestjs/common';
 import { Throttle } from '@nestjs/throttler';
 import { FastifyReply, FastifyRequest } from 'fastify';
@@ -22,7 +24,9 @@ import { AiChatRepo } from '@docmost/db/repos/ai-chat/ai-chat.repo';
 import { AiChatMessageRepo } from '@docmost/db/repos/ai-chat/ai-chat-message.repo';
 import { UserThrottlerGuard } from '../../integrations/throttle/user-throttler.guard';
 import { AI_CHAT_THROTTLER } from '../../integrations/throttle/throttler-names';
+import { FileInterceptor } from '../../common/interceptors/file.interceptor';
 import { AiChatService, AiChatStreamBody } from './ai-chat.service';
+import { AiTranscriptionService } from './ai-transcription.service';
 import {
  ChatIdDto,
  GetChatMessagesDto,
@@ -43,6 +47,7 @@ export class AiChatController {
    private readonly aiChatService: AiChatService,
    private readonly aiChatRepo: AiChatRepo,
    private readonly aiChatMessageRepo: AiChatMessageRepo,
+    private readonly aiTranscription: AiTranscriptionService,
  ) {}

  /** List the requesting user's chats in this workspace (paginated). */
@@ -180,6 +185,74 @@ export class AiChatController {
    }
  }

+  /**
+   * Transcribe an uploaded audio clip to text using the workspace STT model.
+   * Gated by settings.ai.dictation (403 when disabled). Returns { text }.
+   */
+  @HttpCode(HttpStatus.OK)
+  @UseGuards(JwtAuthGuard, UserThrottlerGuard)
+  @Throttle({ [AI_CHAT_THROTTLER]: { limit: 20, ttl: 60000 } })
+  @Post('transcribe')
+  @UseInterceptors(FileInterceptor)
+  async transcribe(
+    @Req() req: any,
+    @AuthWorkspace() workspace: Workspace,
+  ): Promise<{ text: string }> {
+    // Gate: dictation must be explicitly enabled for the workspace.
+    const settings = (workspace.settings ?? {}) as {
+      ai?: { dictation?: boolean };
+    };
+    if (settings.ai?.dictation !== true) {
+      throw new ForbiddenException('Dictation is disabled');
+    }
+
+    let file = null;
+    try {
+      // Whisper hard-caps uploads at 25MB; allow a single file.
+      file = await req.file({ limits: { fileSize: 25 * 1024 * 1024, files: 1 } });
+    } catch (err: any) {
+      if (err?.statusCode === 413) {
+        throw new BadRequestException('Audio file too large (max 25MB)');
+      }
+      throw err;
+    }
+    if (!file) throw new BadRequestException('No audio uploaded');
+
+    // Whitelist audio container types produced by browser MediaRecorder
+    // (Chrome/FF: webm/opus, Safari: mp4) plus common STT-accepted formats.
+    const allowedMime = new Set([
+      'audio/webm',
+      'audio/ogg',
+      'audio/mp4',
+      'audio/mpeg',
+      'audio/wav',
+      'audio/x-wav',
+      'audio/wave',
+      'audio/m4a',
+      'audio/x-m4a',
+    ]);
+    // MediaRecorder mimetypes carry parameters (e.g. "audio/webm;codecs=opus");
+    // compare only the base type.
+    const baseMime = file.mimetype.split(';')[0].trim().toLowerCase();
+    if (!allowedMime.has(baseMime)) {
+      throw new BadRequestException('Unsupported audio format');
+    }
+
+    let buf: Buffer;
+    try {
+      buf = await file.toBuffer();
+    } catch (err: any) {
+      // With @fastify/multipart throwFileSizeLimit:true, the 25MB cap is enforced
+      // when the stream is consumed (here), not at req.file().
+      if (err?.statusCode === 413) {
+        throw new BadRequestException('Audio file too large (max 25MB)');
+      }
+      throw err;
+    }
+    const text = await this.aiTranscription.transcribe(workspace.id, buf);
+    return { text };
+  }
+
  /**
   * Ensure the chat exists, belongs to this workspace, AND was created by the
   * requesting user (per-user isolation). Throws ForbiddenException otherwise.
--- a/apps/server/src/core/ai-chat/ai-chat.module.ts
+++ b/apps/server/src/core/ai-chat/ai-chat.module.ts
@@ -3,6 +3,7 @@ import { AiModule } from '../../integrations/ai/ai.module';
 import { TokenModule } from '../auth/token.module';
 import { AiChatController } from './ai-chat.controller';
 import { AiChatService } from './ai-chat.service';
+import { AiTranscriptionService } from './ai-transcription.service';
 import { AiChatToolsService } from './tools/ai-chat-tools.service';
 import { EmbeddingModule } from './embedding/embedding.module';
 import { ExternalMcpModule } from './external-mcp/external-mcp.module';
@@ -21,6 +22,6 @@ import { ExternalMcpModule } from './external-mcp/external-mcp.module';
@Module({
  imports: [AiModule, TokenModule, EmbeddingModule, ExternalMcpModule],
  controllers: [AiChatController],
-  providers: [AiChatService, AiChatToolsService],
+  providers: [AiChatService, AiTranscriptionService, AiChatToolsService],
 })
 export class AiChatModule {}
--- a/apps/server/src/core/ai-chat/ai-transcription.service.ts
+++ b/apps/server/src/core/ai-chat/ai-transcription.service.ts
@@ -0,0 +1,20 @@
+import { Injectable } from '@nestjs/common';
+import { experimental_transcribe as transcribe } from 'ai';
+import { AiService } from '../../integrations/ai/ai.service';
+
+/**
+ * Transcribes uploaded audio to text using the per-workspace STT model.
+ * Thin wrapper over the AI SDK's experimental_transcribe; never logs the
+ * audio or the key.
+ */
+@Injectable()
+export class AiTranscriptionService {
+  constructor(private readonly ai: AiService) {}
+
+  // Transcribe an uploaded audio buffer using the workspace STT model.
+  async transcribe(workspaceId: string, audio: Uint8Array): Promise<string> {
+    const model = await this.ai.getTranscriptionModel(workspaceId);
+    const { text } = await transcribe({ model, audio });
+    return text.trim();
+  }
+}
--- a/apps/server/src/core/workspace/dto/update-workspace.dto.ts
+++ b/apps/server/src/core/workspace/dto/update-workspace.dto.ts
@@ -49,6 +49,10 @@ export class UpdateWorkspaceDto extends PartialType(CreateWorkspaceDto) {
  @IsBoolean()
  aiChat: boolean;

+  @IsOptional()
+  @IsBoolean()
+  aiDictation: boolean;
+
  @IsOptional()
  @IsInt()
  @Min(1)
--- a/apps/server/src/core/workspace/services/workspace.service.ts
+++ b/apps/server/src/core/workspace/services/workspace.service.ts
@@ -497,6 +497,20 @@ export class WorkspaceService {
        );
      }

+      if (typeof updateWorkspaceDto.aiDictation !== 'undefined') {
+        const prev = settingsBefore?.ai?.dictation ?? false;
+        if (prev !== updateWorkspaceDto.aiDictation) {
+          before.aiDictation = prev;
+          after.aiDictation = updateWorkspaceDto.aiDictation;
+        }
+        await this.workspaceRepo.updateAiSettings(
+          workspaceId,
+          'dictation',
+          updateWorkspaceDto.aiDictation,
+          trx,
+        );
+      }
+
      delete updateWorkspaceDto.restrictApiToAdmins;
      delete updateWorkspaceDto.aiSearch;
      delete updateWorkspaceDto.generativeAi;
@@ -504,6 +518,7 @@ export class WorkspaceService {
      delete updateWorkspaceDto.mcpEnabled;
      delete updateWorkspaceDto.allowMemberTemplates;
      delete updateWorkspaceDto.aiChat;
+      delete updateWorkspaceDto.aiDictation;

      await this.workspaceRepo.updateWorkspace(
        updateWorkspaceDto,
--- a/apps/server/src/database/migrations/20260618T130000-ai-stt-credentials.ts
+++ b/apps/server/src/database/migrations/20260618T130000-ai-stt-credentials.ts
@@ -0,0 +1,18 @@
+import { type Kysely } from 'kysely';
+
+export async function up(db: Kysely<any>): Promise<void> {
+  // Encrypted, STT-specific provider key. Separate from `api_key_enc`
+  // (the chat key) so the transcription model can use a different token.
+  // When NULL, the STT model falls back to `api_key_enc`.
+  await db.schema
+    .alterTable('ai_provider_credentials')
+    .addColumn('stt_api_key_enc', 'text', (col) => col)
+    .execute();
+}
+
+export async function down(db: Kysely<any>): Promise<void> {
+  await db.schema
+    .alterTable('ai_provider_credentials')
+    .dropColumn('stt_api_key_enc')
+    .execute();
+}
--- a/apps/server/src/database/repos/ai-chat/ai-provider-credentials.repo.ts
+++ b/apps/server/src/database/repos/ai-chat/ai-provider-credentials.repo.ts
@@ -98,4 +98,42 @@ export class AiProviderCredentialsRepo {
      .where('driver', '=', driver)
      .execute();
  }
+
+  // Upsert the STT-specific encrypted key. If no row exists yet this inserts one
+  // with `apiKeyEnc` left null (the column is nullable). On conflict only
+  // `sttApiKeyEnc` / `updatedAt` are touched, so the chat & embedding keys are kept.
+  async upsertSttKey(
+    workspaceId: string,
+    driver: string,
+    sttApiKeyEnc: string,
+    trx?: KyselyTransaction,
+  ): Promise<AiProviderCredentials> {
+    const db = dbOrTx(this.db, trx);
+    return db
+      .insertInto('aiProviderCredentials')
+      .values({ workspaceId, driver, sttApiKeyEnc })
+      .onConflict((oc) =>
+        oc.columns(['workspaceId', 'driver']).doUpdateSet({
+          sttApiKeyEnc,
+          updatedAt: new Date(),
+        }),
+      )
+      .returningAll()
+      .executeTakeFirst();
+  }
+
+  // Clear only the STT-specific key; the chat & embedding keys are kept.
+  async clearSttKey(
+    workspaceId: string,
+    driver: string,
+    trx?: KyselyTransaction,
+  ): Promise<void> {
+    const db = dbOrTx(this.db, trx);
+    await db
+      .updateTable('aiProviderCredentials')
+      .set({ sttApiKeyEnc: null, updatedAt: new Date() })
+      .where('workspaceId', '=', workspaceId)
+      .where('driver', '=', driver)
+      .execute();
+  }
 }
--- a/apps/server/src/database/repos/workspace/workspace.repo.ts
+++ b/apps/server/src/database/repos/workspace/workspace.repo.ts
@@ -239,7 +239,7 @@ export class WorkspaceRepo {
    // is a real jsonb object, never a double-encoded string. The CASE self-heals
    // workspaces whose settings.ai.provider was previously corrupted into an
    // array/string.
-    const ALLOWED = ['driver', 'chatModel', 'embeddingModel', 'baseUrl', 'embeddingBaseUrl', 'systemPrompt'];
+    const ALLOWED = ['driver', 'chatModel', 'embeddingModel', 'baseUrl', 'embeddingBaseUrl', 'sttModel', 'sttBaseUrl', 'systemPrompt'];
    const entries = Object.entries(provider).filter(
      ([k, v]) => v !== undefined && ALLOWED.includes(k),
    );
--- a/apps/server/src/database/types/ai-provider-credentials.types.ts
+++ b/apps/server/src/database/types/ai-provider-credentials.types.ts
@@ -14,6 +14,8 @@ export interface AiProviderCredentials {
  apiKeyEnc: string | null;
  // Encrypted, embedding-specific provider key. Falls back to apiKeyEnc when null.
  embeddingApiKeyEnc: string | null;
+  // Encrypted, STT-specific provider key. Falls back to apiKeyEnc when null.
+  sttApiKeyEnc: string | null;
  createdAt: Generated<Timestamp>;
  updatedAt: Generated<Timestamp>;
 }
--- a/apps/server/src/integrations/ai/ai-settings.service.ts
+++ b/apps/server/src/integrations/ai/ai-settings.service.ts
@@ -28,6 +28,9 @@ export interface UpdateAiSettingsInput {
  systemPrompt?: string;
  apiKey?: string;
  embeddingApiKey?: string;
+  sttModel?: string;
+  sttBaseUrl?: string;
+  sttApiKey?: string;
 }

 /**
@@ -113,6 +116,7 @@ export class AiSettingsService {
      driver: provider.driver,
      chatModel: provider.chatModel,
      embeddingModel: provider.embeddingModel,
+      sttModel: provider.sttModel,
      baseUrl: provider.baseUrl,
      systemPrompt: provider.systemPrompt,
    };
@@ -122,6 +126,10 @@ export class AiSettingsService {
    // unconditionally.
    config.embeddingBaseUrl = provider.embeddingBaseUrl || provider.baseUrl;

+    // Effective STT base URL: the STT-specific value, else the chat base URL.
+    // Set unconditionally, same rationale as embeddingBaseUrl.
+    config.sttBaseUrl = provider.sttBaseUrl || provider.baseUrl;
+
    if (provider.driver !== 'ollama') {
      const creds = await this.aiProviderCredentialsRepo.find(
        workspaceId,
@@ -134,6 +142,10 @@ export class AiSettingsService {
      config.embeddingApiKey = creds?.embeddingApiKeyEnc
        ? this.secretBox.decryptSecret(creds.embeddingApiKeyEnc)
        : config.apiKey;
+      // Effective STT key: the STT-specific key, else the chat key.
+      config.sttApiKey = creds?.sttApiKeyEnc
+        ? this.secretBox.decryptSecret(creds.sttApiKeyEnc)
+        : config.apiKey;
    }

    return config;
@@ -151,6 +163,7 @@ export class AiSettingsService {

    let hasApiKey = false;
    let hasEmbeddingApiKey = false;
+    let hasSttApiKey = false;
    if (provider.driver) {
      const creds = await this.aiProviderCredentialsRepo.find(
        workspaceId,
@@ -158,6 +171,7 @@ export class AiSettingsService {
      );
      hasApiKey = !!creds?.apiKeyEnc;
      hasEmbeddingApiKey = !!creds?.embeddingApiKeyEnc;
+      hasSttApiKey = !!creds?.sttApiKeyEnc;
    }

    // totalPages now counts only pages with embeddable content (non-empty text
@@ -174,9 +188,12 @@ export class AiSettingsService {
      embeddingModel: provider.embeddingModel,
      baseUrl: provider.baseUrl,
      embeddingBaseUrl: provider.embeddingBaseUrl,
+      sttModel: provider.sttModel,
+      sttBaseUrl: provider.sttBaseUrl,
      systemPrompt: provider.systemPrompt,
      hasApiKey,
      hasEmbeddingApiKey,
+      hasSttApiKey,
      indexedPages,
      totalPages,
    };
@@ -197,7 +214,7 @@ export class AiSettingsService {
    workspaceId: string,
    dto: UpdateAiSettingsInput,
  ): Promise<MaskedAiSettings> {
-    const { apiKey, embeddingApiKey, ...nonSecret } = dto;
+    const { apiKey, embeddingApiKey, sttApiKey, ...nonSecret } = dto;

    // Persist non-secret provider fields (only those present in the partial).
    const providerPatch: Partial<AiProviderSettings> = {};
@@ -207,6 +224,8 @@ export class AiSettingsService {
      'embeddingModel',
      'baseUrl',
      'embeddingBaseUrl',
+      'sttModel',
+      'sttBaseUrl',
      'systemPrompt',
    ] as const) {
      if (nonSecret[key] !== undefined) {
@@ -222,7 +241,11 @@ export class AiSettingsService {

    // Key handling (write-only). Both keys share the same target driver and the
    // same "driver required" guard, resolved once.
-    if (apiKey !== undefined || embeddingApiKey !== undefined) {
+    if (
+      apiKey !== undefined ||
+      embeddingApiKey !== undefined ||
+      sttApiKey !== undefined
+    ) {
      const stored = await this.readProvider(workspaceId);
      const targetDriver = dto.driver ?? stored.driver;
      if (!targetDriver) {
@@ -264,6 +287,23 @@ export class AiSettingsService {
          );
        }
      }
+
+      // STT key.
+      if (sttApiKey !== undefined) {
+        if (sttApiKey === '') {
+          await this.aiProviderCredentialsRepo.clearSttKey(
+            workspaceId,
+            targetDriver,
+          );
+        } else {
+          const enc = this.secretBox.encryptSecret(sttApiKey);
+          await this.aiProviderCredentialsRepo.upsertSttKey(
+            workspaceId,
+            targetDriver,
+            enc,
+          );
+        }
+      }
    }

    return this.getMasked(workspaceId);
--- a/apps/server/src/integrations/ai/ai-stt-not-configured.exception.ts
+++ b/apps/server/src/integrations/ai/ai-stt-not-configured.exception.ts
@@ -0,0 +1,13 @@
+import { ServiceUnavailableException } from '@nestjs/common';
+
+/**
+ * Thrown when no usable STT (speech-to-text) config exists for the workspace
+ * (missing driver / sttModel). Distinct from the chat & embedding variants so
+ * the transcription endpoint can 503 independently of chat/embeddings being
+ * configured.
+ */
+export class AiSttNotConfiguredException extends ServiceUnavailableException {
+  constructor() {
+    super('AI STT model not configured');
+  }
+}
--- a/apps/server/src/integrations/ai/ai.service.ts
+++ b/apps/server/src/integrations/ai/ai.service.ts
@@ -4,6 +4,7 @@ import {
  generateText,
  type EmbeddingModel,
  type LanguageModel,
+  type TranscriptionModel,
 } from 'ai';
 import { createOpenAI } from '@ai-sdk/openai';
 import { createGoogleGenerativeAI } from '@ai-sdk/google';
@@ -11,6 +12,7 @@ import { createOllama } from 'ai-sdk-ollama';
 import { AiSettingsService } from './ai-settings.service';
 import { AiNotConfiguredException } from './ai-not-configured.exception';
 import { AiEmbeddingNotConfiguredException } from './ai-embedding-not-configured.exception';
+import { AiSttNotConfiguredException } from './ai-stt-not-configured.exception';
 import { describeProviderError } from './ai-error.util';

 /**
@@ -106,6 +108,26 @@ export class AiService {
    }
  }

+  /**
+   * Resolve the workspace config and build the transcription (STT) model.
+   * STT always speaks the OpenAI-compatible /v1/audio/transcriptions API
+   * (only @ai-sdk/openai exposes .transcription()), regardless of the chat
+   * driver. sttBaseUrl falls back to the chat baseUrl; the API key falls back
+   * to the chat key (resolved by AiSettingsService.resolve). Built PER WORKSPACE
+   * on demand; the decrypted key is never logged.
+   *
+   * Throws AiSttNotConfiguredException (-> 503) when no STT model is set.
+   */
+  async getTranscriptionModel(workspaceId: string): Promise<TranscriptionModel> {
+    const cfg = await this.aiSettings.resolve(workspaceId);
+    if (!cfg?.sttModel) throw new AiSttNotConfiguredException();
+    const baseURL = cfg.sttBaseUrl || cfg.baseUrl; // stt-specific, else chat
+    // apiKey may be unused for keyless self-hosted whisper; pass a placeholder.
+    return createOpenAI({ apiKey: cfg.sttApiKey ?? 'unused', baseURL }).transcription(
+      cfg.sttModel,
+    );
+  }
+
  /**
   * Embed a batch of texts with the workspace embedding model. Returns one
   * vector per input, in the same order. Thin wrapper over the AI SDK's
--- a/apps/server/src/integrations/ai/ai.types.ts
+++ b/apps/server/src/integrations/ai/ai.types.ts
@@ -21,6 +21,9 @@ export interface AiProviderSettings {
  baseUrl?: string;
  // Embedding-specific base URL. Falls back to `baseUrl` when empty/unset.
  embeddingBaseUrl?: string;
+  sttModel?: string;
+  // STT-specific base URL. Falls back to baseUrl when empty/unset.
+  sttBaseUrl?: string;
  systemPrompt?: string;
 }

@@ -31,12 +34,15 @@ export interface AiProviderSettings {
 *
 * `embeddingBaseUrl` / `embeddingApiKey` are the embedding-specific endpoint and
 * key, already resolved with the chat-value fallback applied by `resolve`.
+ * `sttBaseUrl` / `sttApiKey` are likewise the STT-specific endpoint and key,
+ * already resolved with the chat-value fallback applied by `resolve`.
 */
 export interface ResolvedAiConfig extends Partial<AiProviderSettings> {
  driver?: AiDriver;
  chatModel?: string;
  apiKey?: string;
  embeddingApiKey?: string;
+  sttApiKey?: string;
 }

 /**
@@ -50,9 +56,12 @@ export interface MaskedAiSettings {
  embeddingModel?: string;
  baseUrl?: string;
  embeddingBaseUrl?: string;
+  sttModel?: string;
+  sttBaseUrl?: string;
  systemPrompt?: string;
  hasApiKey: boolean;
  hasEmbeddingApiKey: boolean;
+  hasSttApiKey: boolean;
  // RAG indexing coverage for the settings UI.
  indexedPages: number;
  totalPages: number;
--- a/apps/server/src/integrations/ai/dto/update-ai-settings.dto.ts
+++ b/apps/server/src/integrations/ai/dto/update-ai-settings.dto.ts
@@ -4,10 +4,10 @@ import { AI_DRIVERS, AiDriver } from '../ai.types';
 /**
 * Admin update payload for the workspace AI provider settings.
 *
- * `apiKey` / `embeddingApiKey` are write-only (§8.2): provided → stored
- * encrypted, '' → cleared, absent → left untouched. They are NEVER returned by
- * any endpoint. The global ValidationPipe runs with `whitelist: true`, so
- * unknown fields are stripped.
+ * `apiKey` / `embeddingApiKey` / `sttApiKey` are write-only (§8.2): provided →
+ * stored encrypted, '' → cleared, absent → left untouched. They are NEVER
+ * returned by any endpoint. The global ValidationPipe runs with
+ * `whitelist: true`, so unknown fields are stripped.
 */
 export class UpdateAiSettingsDto {
  @IsOptional()
@@ -41,4 +41,16 @@ export class UpdateAiSettingsDto {
  @IsOptional()
  @IsString()
  embeddingApiKey?: string;
+
+  @IsOptional()
+  @IsString()
+  sttModel?: string;
+
+  @IsOptional()
+  @IsString()
+  sttBaseUrl?: string;
+
+  @IsOptional()
+  @IsString()
+  sttApiKey?: string;
 }