diff --git a/apps/client/public/locales/en-US/translation.json b/apps/client/public/locales/en-US/translation.json
index acceeb48..e1ac4abb 100644
--- a/apps/client/public/locales/en-US/translation.json
+++ b/apps/client/public/locales/en-US/translation.json
@@ -1237,6 +1237,8 @@
   "No microphone found": "No microphone found",
   "Could not start recording": "Could not start recording",
   "Transcription failed": "Transcription failed",
+  "Transcribe": "Transcribe",
+  "No speech detected": "No speech detected",
   "Voice dictation is not configured": "Voice dictation is not configured",
   "Microphone is unavailable or already in use": "Microphone is unavailable or already in use",
   "Audio recording is not available in this browser/context": "Audio recording is not available in this browser/context",
diff --git a/apps/client/public/locales/ru-RU/translation.json b/apps/client/public/locales/ru-RU/translation.json
index b68f9b82..ca14b406 100644
--- a/apps/client/public/locales/ru-RU/translation.json
+++ b/apps/client/public/locales/ru-RU/translation.json
@@ -385,6 +385,11 @@
   "Quote": "Цитата",
   "Image": "Изображение",
   "Audio": "Аудио",
+  "Transcribe": "Транскрибировать",
+  "Transcribing…": "Транскрибация…",
+  "No speech detected": "Речь не распознана",
+  "Transcription failed": "Не удалось распознать речь",
+  "Voice dictation is not configured": "Голосовой ввод не настроен",
   "Embed PDF": "Встроить PDF",
   "Upload and embed a PDF file.": "Загрузите и встроите PDF-файл.",
   "Embed as PDF": "Встроить как PDF",
diff --git a/apps/client/src/features/editor/components/audio/audio-menu.tsx b/apps/client/src/features/editor/components/audio/audio-menu.tsx
index eadc1afe..bd649482 100644
--- a/apps/client/src/features/editor/components/audio/audio-menu.tsx
+++ b/apps/client/src/features/editor/components/audio/audio-menu.tsx
@@ -1,23 +1,43 @@
 import { BubbleMenu as BaseBubbleMenu } from "@tiptap/react/menus";
 import { findParentNode, posToDOMRect, useEditorState } from "@tiptap/react";
-import { useCallback } from "react";
+import { useCallback, useState } from "react";
 import { Node as PMNode } from "@tiptap/pm/model";
 import { isEditorReady } from "@docmost/editor-ext";
 import {
   EditorMenuProps,
   ShouldShowProps,
 } from "@/features/editor/components/table/types/types.ts";
-import { ActionIcon, Tooltip } from "@mantine/core";
+import { ActionIcon, Loader, Tooltip } from "@mantine/core";
 import {
   IconDownload,
+  IconFileText,
   IconTrash,
 } from "@tabler/icons-react";
+import { notifications } from "@mantine/notifications";
+import { useAtomValue } from "jotai";
 import { useTranslation } from "react-i18next";
 import { getFileUrl } from "@/lib/config.ts";
+import { workspaceAtom } from "@/features/user/atoms/current-user-atom.ts";
+import { transcribeAudio } from "@/features/dictation/services/dictation-service";
 import classes from "../common/toolbar-menu.module.css";
 
+// STT-accepted audio MIME types (mirror of the server whitelist). When a
+// fetched blob's base type is not in this set, AUDIO_MIME_BY_EXT (keyed by
+// lower-case file extension, no dot) supplies an accepted content-type.
+const RECOGNIZED_AUDIO_MIME = new Set([
+  "audio/webm", "audio/ogg", "audio/mp4", "audio/mpeg",
+  "audio/wav", "audio/x-wav", "audio/wave", "audio/m4a", "audio/x-m4a",
+]);
+const AUDIO_MIME_BY_EXT: Record<string, string> = {
+  mp3: "audio/mpeg", m4a: "audio/mp4", mp4: "audio/mp4",
+  wav: "audio/wav", ogg: "audio/ogg", oga: "audio/ogg", webm: "audio/webm",
+};
+
 export function AudioMenu({ editor }: EditorMenuProps) {
   const { t } = useTranslation();
+  const workspace = useAtomValue(workspaceAtom);
+  const dictationEnabled = workspace?.settings?.ai?.dictation === true;
+  const [isTranscribing, setIsTranscribing] = useState(false);
 
   const editorState = useEditorState({
     editor,
@@ -68,6 +88,100 @@ export function AudioMenu({ editor }: EditorMenuProps) {
     };
   }, [editor]);
 
+  // Transcribes the selected audio block and inserts the text right after it.
+  const handleTranscribe = useCallback(async () => {
+    const src = editorState?.src;
+    if (!src || isTranscribing) return;
+
+    // The bubble menu shows for the selected audio node, so selection.from is
+    // that node's start position. Capture it now to disambiguate duplicate-src
+    // blocks after the async transcription completes.
+    const selectedPos = editor.state.selection.from;
+
+    setIsTranscribing(true);
+    try {
+      const fileUrl = getFileUrl(src);
+      // Filename for the multipart part name and extension-based MIME inference.
+      const filename = decodeURIComponent(
+        src.split("?")[0].split("/").pop() || "audio",
+      );
+
+      const res = await fetch(fileUrl, { credentials: "include" });
+      if (!res.ok) {
+        throw new Error(`Failed to fetch audio file (HTTP ${res.status})`);
+      }
+      const blob = await res.blob();
+
+      // Ensure the upload's content-type is one the STT endpoint accepts; the
+      // server keys off the blob's MIME type.
+      let uploadBlob = blob;
+      const baseType = (blob.type || "").split(";")[0].trim().toLowerCase();
+      if (!RECOGNIZED_AUDIO_MIME.has(baseType)) {
+        const ext = filename.split(".").pop()?.toLowerCase() ?? "";
+        // Own-property check: a plain-object lookup would otherwise resolve
+        // inherited keys (an extension like "constructor") to non-string values.
+        if (Object.prototype.hasOwnProperty.call(AUDIO_MIME_BY_EXT, ext)) {
+          uploadBlob = new Blob([blob], { type: AUDIO_MIME_BY_EXT[ext] });
+        }
+      }
+
+      const text = (await transcribeAudio(uploadBlob, filename)).trim();
+      if (text.length === 0) {
+        notifications.show({ message: t("No speech detected") });
+        return;
+      }
+
+      // Re-scan the doc at insert time so a collaborative edit during the async
+      // transcription can't misplace the text. Among audio nodes with this src
+      // (the same file may be embedded more than once), pick the occurrence
+      // closest to the originally-selected block.
+      let insertPos = -1; // -1 = no matching audio node found
+      let bestDelta = Infinity;
+      editor.state.doc.descendants((node, pos) => {
+        if (node.type.name === "audio" && node.attrs.src === src) {
+          const delta = Math.abs(pos - selectedPos);
+          if (delta < bestDelta) {
+            bestDelta = delta;
+            insertPos = pos + node.nodeSize; // position just after the audio block
+          }
+        }
+        return true; // visit all nodes to find the closest match
+      });
+
+      const paragraph = { type: "paragraph", content: [{ type: "text", text }] };
+      try {
+        if (insertPos >= 0) {
+          editor.chain().focus().insertContentAt(insertPos, paragraph).run();
+        } else {
+          editor.chain().focus().insertContent(paragraph).run();
+        }
+      } catch (insertErr) {
+        // A destroyed editor or out-of-bounds position must not throw; log and
+        // ignore so the transcription itself is not reported as a failure.
+        console.error("[audio-transcribe] insert failed", insertErr);
+      }
+    } catch (err) {
+      console.error("[audio-transcribe] failed", err);
+      const resp = (
+        err as { response?: { status?: number; data?: { message?: unknown } } }
+      )?.response;
+      // `message` is untyped server data and may be a string[]; normalise first.
+      const raw = resp?.data?.message;
+      const serverMsg = Array.isArray(raw) ? raw.join("; ") : raw;
+      let message: string;
+      if (typeof serverMsg === "string" && serverMsg.trim().length > 0) {
+        message = serverMsg; // server already explains the cause — show verbatim
+      } else if (resp?.status === 503 || resp?.status === 403) {
+        message = t("Voice dictation is not configured");
+      } else {
+        message = `${t("Transcription failed")}: ${(err as { message?: string })?.message ?? String(err)}`;
+      }
+      notifications.show({ color: "red", message });
+    } finally {
+      setIsTranscribing(false);
+    }
+  }, [editor, editorState?.src, isTranscribing, t]);
+
   const handleDownload = useCallback(() => {
     if (!editorState?.src) return;
     const url = getFileUrl(editorState.src);
@@ -95,6 +209,20 @@ export function AudioMenu({ editor }: EditorMenuProps) {
       shouldShow={shouldShow}
     >
       <div className={classes.toolbar}>
+        {dictationEnabled && (
+          <Tooltip position="top" label={isTranscribing ? t("Transcribing…") : t("Transcribe")} withinPortal={false}>
+            <ActionIcon
+              onClick={handleTranscribe}
+              size="lg"
+              aria-label={t("Transcribe")}
+              variant="subtle"
+              disabled={isTranscribing}
+            >
+              {isTranscribing ? <Loader size={18} /> : <IconFileText size={18} />}
+            </ActionIcon>
+          </Tooltip>
+        )}
+
         <Tooltip position="top" label={t("Download")} withinPortal={false}>
           <ActionIcon
             onClick={handleDownload}
diff --git a/apps/client/src/features/editor/page-editor.tsx b/apps/client/src/features/editor/page-editor.tsx
index 5aeea3d4..94a1b21e 100644
--- a/apps/client/src/features/editor/page-editor.tsx
+++ b/apps/client/src/features/editor/page-editor.tsx
@@ -49,6 +49,7 @@ import { TableHandlesLayer } from "@/features/editor/components/table/handle/tab
 import ImageMenu from "@/features/editor/components/image/image-menu.tsx";
 import CalloutMenu from "@/features/editor/components/callout/callout-menu.tsx";
 import VideoMenu from "@/features/editor/components/video/video-menu.tsx";
+import AudioMenu from "@/features/editor/components/audio/audio-menu.tsx";
 import PdfMenu from "@/features/editor/components/pdf/pdf-menu.tsx";
 import SubpagesMenu from "@/features/editor/components/subpages/subpages-menu.tsx";
 import {
@@ -461,6 +462,7 @@ export default function PageEditor({
                 <TableHandlesLayer editor={editor} />
                 <ImageMenu editor={editor} />
                 <VideoMenu editor={editor} />
+                <AudioMenu editor={editor} />
                 <PdfMenu editor={editor} />
                 <CalloutMenu editor={editor} />
                 <SubpagesMenu editor={editor} />