diff --git a/apps/client/public/locales/en-US/translation.json b/apps/client/public/locales/en-US/translation.json index acceeb48..e1ac4abb 100644 --- a/apps/client/public/locales/en-US/translation.json +++ b/apps/client/public/locales/en-US/translation.json @@ -1237,6 +1237,8 @@ "No microphone found": "No microphone found", "Could not start recording": "Could not start recording", "Transcription failed": "Transcription failed", + "Transcribe": "Transcribe", + "No speech detected": "No speech detected", "Voice dictation is not configured": "Voice dictation is not configured", "Microphone is unavailable or already in use": "Microphone is unavailable or already in use", "Audio recording is not available in this browser/context": "Audio recording is not available in this browser/context", diff --git a/apps/client/public/locales/ru-RU/translation.json b/apps/client/public/locales/ru-RU/translation.json index b68f9b82..ca14b406 100644 --- a/apps/client/public/locales/ru-RU/translation.json +++ b/apps/client/public/locales/ru-RU/translation.json @@ -385,6 +385,11 @@ "Quote": "Цитата", "Image": "Изображение", "Audio": "Аудио", + "Transcribe": "Транскрибировать", + "Transcribing…": "Транскрибация…", + "No speech detected": "Речь не распознана", + "Transcription failed": "Не удалось распознать речь", + "Voice dictation is not configured": "Голосовой ввод не настроен", "Embed PDF": "Встроить PDF", "Upload and embed a PDF file.": "Загрузите и встроите PDF-файл.", "Embed as PDF": "Встроить как PDF", diff --git a/apps/client/src/features/editor/components/audio/audio-menu.tsx b/apps/client/src/features/editor/components/audio/audio-menu.tsx index eadc1afe..bd649482 100644 --- a/apps/client/src/features/editor/components/audio/audio-menu.tsx +++ b/apps/client/src/features/editor/components/audio/audio-menu.tsx @@ -1,23 +1,43 @@ import { BubbleMenu as BaseBubbleMenu } from "@tiptap/react/menus"; import { findParentNode, posToDOMRect, useEditorState } from "@tiptap/react"; -import 
{ useCallback } from "react"; +import { useCallback, useState } from "react"; import { Node as PMNode } from "@tiptap/pm/model"; import { isEditorReady } from "@docmost/editor-ext"; import { EditorMenuProps, ShouldShowProps, } from "@/features/editor/components/table/types/types.ts"; -import { ActionIcon, Tooltip } from "@mantine/core"; +import { ActionIcon, Loader, Tooltip } from "@mantine/core"; import { IconDownload, + IconFileText, IconTrash, } from "@tabler/icons-react"; +import { notifications } from "@mantine/notifications"; +import { useAtomValue } from "jotai"; import { useTranslation } from "react-i18next"; import { getFileUrl } from "@/lib/config.ts"; +import { workspaceAtom } from "@/features/user/atoms/current-user-atom.ts"; +import { transcribeAudio } from "@/features/dictation/services/dictation-service"; import classes from "../common/toolbar-menu.module.css"; +// STT-accepted audio MIME types (mirror of the server whitelist). If the +// fetched blob's type is not one of these, we infer it from the file +// extension so the upload's content-type is something the endpoint accepts. 
+const RECOGNIZED_AUDIO_MIME = new Set([
+  "audio/webm", "audio/ogg", "audio/mp4", "audio/mpeg",
+  "audio/wav", "audio/x-wav", "audio/wave", "audio/m4a", "audio/x-m4a",
+]);
+const AUDIO_MIME_BY_EXT: Record<string, string> = {
+  mp3: "audio/mpeg", m4a: "audio/mp4", mp4: "audio/mp4",
+  wav: "audio/wav", ogg: "audio/ogg", oga: "audio/ogg", webm: "audio/webm",
+};
+
 export function AudioMenu({ editor }: EditorMenuProps) {
   const { t } = useTranslation();
+  const workspace = useAtomValue(workspaceAtom);
+  const dictationEnabled = workspace?.settings?.ai?.dictation === true;
+  const [isTranscribing, setIsTranscribing] = useState(false);
 
   const editorState = useEditorState({
     editor,
@@ -68,6 +88,134 @@ export function AudioMenu({ editor }: EditorMenuProps) {
     };
   }, [editor]);
 
+  /**
+   * Transcribes the selected audio block and inserts the resulting text as a
+   * new paragraph right after it.
+   *
+   * Fetches the file behind the node's src, ensures the upload carries a
+   * content-type listed in RECOGNIZED_AUDIO_MIME (inferred from the file
+   * extension when the fetched blob's type is not listed), and passes it to
+   * transcribeAudio. Progress is tracked in isTranscribing; failures are
+   * surfaced through notifications.
+   */
+  const handleTranscribe = useCallback(async () => {
+    const src = editorState?.src;
+    if (!src || isTranscribing) return;
+
+    // The bubble menu shows for the selected audio node, so selection.from is
+    // that node's start position. Capture it now to disambiguate duplicate-src
+    // blocks after the async transcription completes.
+    const selectedPos = editor.state.selection.from;
+
+    setIsTranscribing(true);
+    try {
+      const fileUrl = getFileUrl(src);
+      // Derive a filename from the internal src for the multipart part name and
+      // for MIME inference when the fetched blob has no usable type.
+      const rawName = src.split("?")[0].split("/").pop() || "audio";
+      let filename = rawName;
+      try {
+        filename = decodeURIComponent(rawName);
+      } catch {
+        // Malformed percent-encoding: keep the raw segment instead of failing
+        // the whole transcription over the display name.
+      }
+
+      const res = await fetch(fileUrl, { credentials: "include" });
+      if (!res.ok) {
+        throw new Error(`Failed to fetch audio file (HTTP ${res.status})`);
+      }
+      const blob = await res.blob();
+
+      // Ensure the upload's content-type is one the STT endpoint accepts; the
+      // server keys off the blob's MIME type.
+      let uploadBlob = blob;
+      const baseType = (blob.type || "").split(";")[0].trim().toLowerCase();
+      if (!RECOGNIZED_AUDIO_MIME.has(baseType)) {
+        const ext = filename.split(".").pop()?.toLowerCase() ?? "";
+        // Own-property lookup only, so an extension such as "constructor"
+        // cannot resolve to a member inherited from Object.prototype.
+        const inferred = Object.prototype.hasOwnProperty.call(
+          AUDIO_MIME_BY_EXT,
+          ext,
+        )
+          ? AUDIO_MIME_BY_EXT[ext]
+          : undefined;
+        if (inferred) {
+          // Rebuild the blob with an accepted content-type; the server keys off it.
+          uploadBlob = new Blob([blob], { type: inferred });
+        }
+      }
+
+      const text = (await transcribeAudio(uploadBlob, filename)).trim();
+      if (text.length === 0) {
+        notifications.show({ message: t("No speech detected") });
+        return;
+      }
+
+      const paragraph = { type: "paragraph", content: [{ type: "text", text }] };
+      let inserted = false;
+      try {
+        // The editor may have been torn down while the request was in flight.
+        if (!editor.isDestroyed) {
+          // Re-scan the doc at insert time so a collaborative edit during the
+          // async transcription can't misplace the text. Among audio nodes with
+          // this src (the same file may be embedded more than once), pick the
+          // occurrence closest to the originally-selected block.
+          let insertPos = -1;
+          let bestDelta = Infinity;
+          editor.state.doc.descendants((node, pos) => {
+            if (node.type.name === "audio" && node.attrs.src === src) {
+              const delta = Math.abs(pos - selectedPos);
+              if (delta < bestDelta) {
+                bestDelta = delta;
+                insertPos = pos + node.nodeSize; // position just after the audio block
+              }
+            }
+            return true; // visit all nodes to find the closest match
+          });
+
+          // run() reports false when the command chain could not be applied.
+          inserted =
+            insertPos >= 0
+              ? editor.chain().focus().insertContentAt(insertPos, paragraph).run()
+              : editor.chain().focus().insertContent(paragraph).run();
+        }
+      } catch (insertErr) {
+        // A destroyed editor or out-of-bounds position must not be reported as
+        // a transcription failure; log it and fall through to the notice below.
+        console.error("[audio-transcribe] insert failed", insertErr);
+      }
+      if (!inserted) {
+        // Tell the user instead of silently dropping the transcript.
+        // NOTE(review): no locale entry exists for this key yet; presumably
+        // i18next falls back to the key text — add translations.
+        notifications.show({
+          color: "yellow",
+          message: t("Could not insert transcription"),
+        });
+      }
+    } catch (err) {
+      console.error("[audio-transcribe] failed", err);
+      const resp = (
+        err as { response?: { status?: number; data?: { message?: string } } }
+      )?.response;
+      const serverMsg = resp?.data?.message;
+      let message: string;
+      if (serverMsg && serverMsg.trim().length > 0) {
+        // The server already explains the cause (e.g. provider error, bad
+        // format, STT not configured) — show it verbatim.
+        message = serverMsg;
+      } else if (resp?.status === 503 || resp?.status === 403) {
+        message = t("Voice dictation is not configured");
+      } else {
+        message = `${t("Transcription failed")}: ${(err as { message?: string })?.message ?? String(err)}`;
+      }
+      notifications.show({ color: "red", message });
+    } finally {
+      setIsTranscribing(false);
+    }
+  }, [editor, editorState?.src, isTranscribing, t]);
+
   const handleDownload = useCallback(() => {
     if (!editorState?.src) return;
     const url = getFileUrl(editorState.src);
@@ -95,6 +243,20 @@ export function AudioMenu({ editor }: EditorMenuProps) {
       shouldShow={shouldShow}
     >
+ {dictationEnabled && ( + + + {isTranscribing ? : } + + + )} + +