feat(ai): server-side voice dictation (STT) with mic in chat and editor
Add push-to-talk voice dictation that transcribes recorded audio on the server via the workspace's OpenAI-compatible AI provider (Whisper / gpt-4o-transcribe / self-hosted whisper), then inserts the text. Backend: - New `stt_api_key_enc` column + migration; STT creds parity with chat/embeddings (sttModel/sttBaseUrl/sttApiKey, write-only key, fallbacks to chat baseUrl/key). Both provider whitelists updated (service + repo). - AiService.getTranscriptionModel + AiTranscriptionService. - Gated POST /ai-chat/transcribe (dictation flag → 403, JWT + workspace scope + throttle, 25MB cap, MIME whitelist, never logs audio/key). - New `settings.ai.dictation` workspace flag (DTO + service + audit). Frontend: - Wire up the Voice/STT settings card (model/base URL/key) and the Voice-dictation toggle. - New `features/dictation`: useDictation (MediaRecorder state machine), MicButton, transcribe service; integrated into the chat composer and a new editor-toolbar dictation group, both gated by ai.dictation.
This commit is contained in:
@@ -28,6 +28,9 @@ export interface UpdateAiSettingsInput {
|
||||
systemPrompt?: string;
|
||||
apiKey?: string;
|
||||
embeddingApiKey?: string;
|
||||
sttModel?: string;
|
||||
sttBaseUrl?: string;
|
||||
sttApiKey?: string;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -113,6 +116,7 @@ export class AiSettingsService {
|
||||
driver: provider.driver,
|
||||
chatModel: provider.chatModel,
|
||||
embeddingModel: provider.embeddingModel,
|
||||
sttModel: provider.sttModel,
|
||||
baseUrl: provider.baseUrl,
|
||||
systemPrompt: provider.systemPrompt,
|
||||
};
|
||||
@@ -122,6 +126,10 @@ export class AiSettingsService {
|
||||
// unconditionally.
|
||||
config.embeddingBaseUrl = provider.embeddingBaseUrl || provider.baseUrl;
|
||||
|
||||
// Effective STT base URL: the STT-specific value, else the chat base URL.
|
||||
// Set unconditionally, same rationale as embeddingBaseUrl.
|
||||
config.sttBaseUrl = provider.sttBaseUrl || provider.baseUrl;
|
||||
|
||||
if (provider.driver !== 'ollama') {
|
||||
const creds = await this.aiProviderCredentialsRepo.find(
|
||||
workspaceId,
|
||||
@@ -134,6 +142,10 @@ export class AiSettingsService {
|
||||
config.embeddingApiKey = creds?.embeddingApiKeyEnc
|
||||
? this.secretBox.decryptSecret(creds.embeddingApiKeyEnc)
|
||||
: config.apiKey;
|
||||
// Effective STT key: the STT-specific key, else the chat key.
|
||||
config.sttApiKey = creds?.sttApiKeyEnc
|
||||
? this.secretBox.decryptSecret(creds.sttApiKeyEnc)
|
||||
: config.apiKey;
|
||||
}
|
||||
|
||||
return config;
|
||||
@@ -151,6 +163,7 @@ export class AiSettingsService {
|
||||
|
||||
let hasApiKey = false;
|
||||
let hasEmbeddingApiKey = false;
|
||||
let hasSttApiKey = false;
|
||||
if (provider.driver) {
|
||||
const creds = await this.aiProviderCredentialsRepo.find(
|
||||
workspaceId,
|
||||
@@ -158,6 +171,7 @@ export class AiSettingsService {
|
||||
);
|
||||
hasApiKey = !!creds?.apiKeyEnc;
|
||||
hasEmbeddingApiKey = !!creds?.embeddingApiKeyEnc;
|
||||
hasSttApiKey = !!creds?.sttApiKeyEnc;
|
||||
}
|
||||
|
||||
// totalPages now counts only pages with embeddable content (non-empty text
|
||||
@@ -174,9 +188,12 @@ export class AiSettingsService {
|
||||
embeddingModel: provider.embeddingModel,
|
||||
baseUrl: provider.baseUrl,
|
||||
embeddingBaseUrl: provider.embeddingBaseUrl,
|
||||
sttModel: provider.sttModel,
|
||||
sttBaseUrl: provider.sttBaseUrl,
|
||||
systemPrompt: provider.systemPrompt,
|
||||
hasApiKey,
|
||||
hasEmbeddingApiKey,
|
||||
hasSttApiKey,
|
||||
indexedPages,
|
||||
totalPages,
|
||||
};
|
||||
@@ -197,7 +214,7 @@ export class AiSettingsService {
|
||||
workspaceId: string,
|
||||
dto: UpdateAiSettingsInput,
|
||||
): Promise<MaskedAiSettings> {
|
||||
const { apiKey, embeddingApiKey, ...nonSecret } = dto;
|
||||
const { apiKey, embeddingApiKey, sttApiKey, ...nonSecret } = dto;
|
||||
|
||||
// Persist non-secret provider fields (only those present in the partial).
|
||||
const providerPatch: Partial<AiProviderSettings> = {};
|
||||
@@ -207,6 +224,8 @@ export class AiSettingsService {
|
||||
'embeddingModel',
|
||||
'baseUrl',
|
||||
'embeddingBaseUrl',
|
||||
'sttModel',
|
||||
'sttBaseUrl',
|
||||
'systemPrompt',
|
||||
] as const) {
|
||||
if (nonSecret[key] !== undefined) {
|
||||
@@ -222,7 +241,11 @@ export class AiSettingsService {
|
||||
|
||||
// Key handling (write-only). Both keys share the same target driver and the
|
||||
// same "driver required" guard, resolved once.
|
||||
if (apiKey !== undefined || embeddingApiKey !== undefined) {
|
||||
if (
|
||||
apiKey !== undefined ||
|
||||
embeddingApiKey !== undefined ||
|
||||
sttApiKey !== undefined
|
||||
) {
|
||||
const stored = await this.readProvider(workspaceId);
|
||||
const targetDriver = dto.driver ?? stored.driver;
|
||||
if (!targetDriver) {
|
||||
@@ -264,6 +287,23 @@ export class AiSettingsService {
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// STT key.
|
||||
if (sttApiKey !== undefined) {
|
||||
if (sttApiKey === '') {
|
||||
await this.aiProviderCredentialsRepo.clearSttKey(
|
||||
workspaceId,
|
||||
targetDriver,
|
||||
);
|
||||
} else {
|
||||
const enc = this.secretBox.encryptSecret(sttApiKey);
|
||||
await this.aiProviderCredentialsRepo.upsertSttKey(
|
||||
workspaceId,
|
||||
targetDriver,
|
||||
enc,
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return this.getMasked(workspaceId);
|
||||
|
||||
@@ -0,0 +1,13 @@
|
||||
import { ServiceUnavailableException } from '@nestjs/common';
|
||||
|
||||
/**
 * Signals that the workspace has no usable speech-to-text (STT) configuration.
 *
 * Kept separate from the chat and embedding "not configured" exceptions so the
 * transcription endpoint can report its own unavailability without depending
 * on whether chat or embeddings are set up. Because it extends
 * `ServiceUnavailableException`, it surfaces as an HTTP 503.
 */
export class AiSttNotConfiguredException extends ServiceUnavailableException {
  constructor() {
    // Fixed message; no workspace- or credential-specific detail is included.
    const message = 'AI STT model not configured';
    super(message);
  }
}
|
||||
@@ -4,6 +4,7 @@ import {
|
||||
generateText,
|
||||
type EmbeddingModel,
|
||||
type LanguageModel,
|
||||
type TranscriptionModel,
|
||||
} from 'ai';
|
||||
import { createOpenAI } from '@ai-sdk/openai';
|
||||
import { createGoogleGenerativeAI } from '@ai-sdk/google';
|
||||
@@ -11,6 +12,7 @@ import { createOllama } from 'ai-sdk-ollama';
|
||||
import { AiSettingsService } from './ai-settings.service';
|
||||
import { AiNotConfiguredException } from './ai-not-configured.exception';
|
||||
import { AiEmbeddingNotConfiguredException } from './ai-embedding-not-configured.exception';
|
||||
import { AiSttNotConfiguredException } from './ai-stt-not-configured.exception';
|
||||
import { describeProviderError } from './ai-error.util';
|
||||
|
||||
/**
|
||||
@@ -106,6 +108,26 @@ export class AiService {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Build the speech-to-text (transcription) model for a workspace.
 *
 * The workspace config is resolved through `AiSettingsService.resolve` on
 * every call, and the model is always created with the OpenAI-compatible
 * provider (`createOpenAI(...).transcription(...)`), whatever chat driver the
 * workspace uses.
 *
 * - Base URL: the STT-specific URL, otherwise the chat base URL.
 * - API key: the resolved STT key; when none is present a placeholder is
 *   passed so keyless self-hosted endpoints still work.
 *
 * This method does not log the key or any other part of the config.
 *
 * NOTE(review): when the chat driver is not OpenAI-compatible and no
 * STT-specific URL/key is stored, the chat key/base URL fallback would be
 * handed to the OpenAI client — confirm that this is intended.
 *
 * @param workspaceId Workspace whose STT settings are resolved.
 * @returns The transcription model for the configured `sttModel`.
 * @throws AiSttNotConfiguredException (-> 503) when no config is resolved or
 *   no STT model is set.
 */
async getTranscriptionModel(workspaceId: string): Promise<TranscriptionModel> {
  const settings = await this.aiSettings.resolve(workspaceId);
  if (!settings?.sttModel) {
    throw new AiSttNotConfiguredException();
  }

  // STT-specific endpoint wins; otherwise reuse the chat endpoint.
  const endpoint = settings.sttBaseUrl || settings.baseUrl;
  // The key may be unused by keyless self-hosted whisper; pass a placeholder.
  const key = settings.sttApiKey ?? 'unused';

  const provider = createOpenAI({ apiKey: key, baseURL: endpoint });
  return provider.transcription(settings.sttModel);
}
|
||||
|
||||
/**
|
||||
* Embed a batch of texts with the workspace embedding model. Returns one
|
||||
* vector per input, in the same order. Thin wrapper over the AI SDK's
|
||||
|
||||
@@ -21,6 +21,9 @@ export interface AiProviderSettings {
|
||||
baseUrl?: string;
|
||||
// Embedding-specific base URL. Falls back to `baseUrl` when empty/unset.
|
||||
embeddingBaseUrl?: string;
|
||||
sttModel?: string;
|
||||
// STT-specific base URL. Falls back to baseUrl when empty/unset.
|
||||
sttBaseUrl?: string;
|
||||
systemPrompt?: string;
|
||||
}
|
||||
|
||||
@@ -31,12 +34,15 @@ export interface AiProviderSettings {
|
||||
*
|
||||
* `embeddingBaseUrl` / `embeddingApiKey` are the embedding-specific endpoint and
|
||||
* key, already resolved with the chat-value fallback applied by `resolve`.
|
||||
* `sttBaseUrl` / `sttApiKey` are likewise the STT-specific endpoint and key,
|
||||
* already resolved with the chat-value fallback applied by `resolve`.
|
||||
*/
|
||||
export interface ResolvedAiConfig extends Partial<AiProviderSettings> {
  /** Provider driver selected for the workspace, if any. */
  driver?: AiDriver;

  /** Chat model identifier. */
  chatModel?: string;

  /** Chat API key. */
  apiKey?: string;

  /**
   * Embedding API key. `resolve` uses the embedding-specific key when one is
   * stored and otherwise falls back to `apiKey`; it is only populated when the
   * driver is not 'ollama'.
   */
  embeddingApiKey?: string;

  /**
   * STT API key. `resolve` uses the STT-specific key when one is stored and
   * otherwise falls back to `apiKey`; it is only populated when the driver is
   * not 'ollama'.
   */
  sttApiKey?: string;
}
|
||||
|
||||
/**
|
||||
@@ -50,9 +56,12 @@ export interface MaskedAiSettings {
|
||||
embeddingModel?: string;
|
||||
baseUrl?: string;
|
||||
embeddingBaseUrl?: string;
|
||||
sttModel?: string;
|
||||
sttBaseUrl?: string;
|
||||
systemPrompt?: string;
|
||||
hasApiKey: boolean;
|
||||
hasEmbeddingApiKey: boolean;
|
||||
hasSttApiKey: boolean;
|
||||
// RAG indexing coverage for the settings UI.
|
||||
indexedPages: number;
|
||||
totalPages: number;
|
||||
|
||||
@@ -4,10 +4,10 @@ import { AI_DRIVERS, AiDriver } from '../ai.types';
|
||||
/**
|
||||
* Admin update payload for the workspace AI provider settings.
|
||||
*
|
||||
* `apiKey` / `embeddingApiKey` are write-only (§8.2): provided → stored
|
||||
* encrypted, '' → cleared, absent → left untouched. They are NEVER returned by
|
||||
* any endpoint. The global ValidationPipe runs with `whitelist: true`, so
|
||||
* unknown fields are stripped.
|
||||
* `apiKey` / `embeddingApiKey` / `sttApiKey` are write-only (§8.2): provided →
|
||||
* stored encrypted, '' → cleared, absent → left untouched. They are NEVER
|
||||
* returned by any endpoint. The global ValidationPipe runs with
|
||||
* `whitelist: true`, so unknown fields are stripped.
|
||||
*/
|
||||
export class UpdateAiSettingsDto {
|
||||
@IsOptional()
|
||||
@@ -41,4 +41,16 @@ export class UpdateAiSettingsDto {
|
||||
@IsOptional()
|
||||
@IsString()
|
||||
embeddingApiKey?: string;
|
||||
|
||||
@IsOptional()
|
||||
@IsString()
|
||||
sttModel?: string;
|
||||
|
||||
@IsOptional()
|
||||
@IsString()
|
||||
sttBaseUrl?: string;
|
||||
|
||||
@IsOptional()
|
||||
@IsString()
|
||||
sttApiKey?: string;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user