feat(editor): add editable image captions (#221)

Add a visible caption (<figcaption>) under images, editable from the image bubble-menu and persisted across all formats: native Yjs/JSON, HTML export, and Markdown.

- image node: new plain-text `caption` attribute (parse/render `data-caption` on <img>, emitted only when set) + `setImageCaption` command. The node stays an atom; the schema shape is unchanged, so the server's generateHTML/generateJSON path round-trips it for free.
- resize node-view: re-parent the resizable wrapper into a <figure> and render the caption in a <figcaption> BELOW it, outside nodeView.wrapper (so onCommit's offsetHeight measurement and the left/right resize handles still cover the image only). This path also drives read-only / share rendering. React placeholder view renders the caption too.
- bubble-menu: new useCaptionControl panel modeled on useAltTextControl (own icon, Caption strings, softer sanitizer, ~500 char limit).
- markdown lossless round-trip: a captioned image is emitted as a raw <img data-caption> wrapped in a block <div> (same trick as <video>) in both the editor-ext turndown rule and the MCP converter; caption-less images stay clean ![alt](src). Import restores the caption via the shared markdownToHtml + parseHTML.
- styles + i18n keys; tests for the schema attr round-trip, markdown round-trip (editor-ext) and the MCP converter.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-27 06:25:51 +03:00
parent 904f7b4303
commit 5fbd655441
13 changed files with 427 additions and 8 deletions
--- a/apps/client/public/locales/en-US/translation.json
+++ b/apps/client/public/locales/en-US/translation.json
@@ -286,6 +286,9 @@
  "Alt text": "Alt text",
  "Describe this for accessibility.": "Describe this for accessibility.",
  "Add a description": "Add a description",
+  "Caption": "Caption",
+  "Add a caption": "Add a caption",
+  "Shown below the image.": "Shown below the image.",
  "Justify": "Justify",
  "Merge cells": "Merge cells",
  "Split cell": "Split cell",
--- a/apps/client/src/features/editor/components/common/use-caption-control.tsx
+++ b/apps/client/src/features/editor/components/common/use-caption-control.tsx
@@ -0,0 +1,141 @@
+import React, { useCallback, useEffect, useState } from "react";
+import { Editor } from "@tiptap/react";
+import {
+  ActionIcon,
+  Button,
+  Group,
+  Paper,
+  Text,
+  Textarea,
+  Tooltip,
+} from "@mantine/core";
+import { IconTextCaption } from "@tabler/icons-react";
+import { useTranslation } from "react-i18next";
+
+const CAPTION_MAX_LENGTH = 500;
+
+// Caption is plain visible text (unlike alt, it is never used as markdown link
+// text), so it is sanitized more softly than alt: collapse each run of
+// whitespace/newlines into one space, trim, and cap at CAPTION_MAX_LENGTH.
+function sanitizeCaption(value: string): string {
+  return value.replace(/\s+/g, " ").trim().slice(0, CAPTION_MAX_LENGTH);
+}
+
+type UseCaptionControlArgs = {
+  editor: Editor;
+  nodeName: string;
+  currentCaption: string;
+};
+
+export function useCaptionControl({
+  editor,
+  nodeName,
+  currentCaption,
+}: UseCaptionControlArgs) {
+  const { t } = useTranslation();
+  const [showInput, setShowInput] = useState(false);
+  const [draft, setDraft] = useState("");
+
+  const open = useCallback(() => {
+    setDraft(currentCaption || "");
+    setShowInput(true);
+  }, [currentCaption]);
+
+  useEffect(() => {
+    const handler = () => {
+      if (!editor.isActive(nodeName)) {
+        setShowInput(false);
+      }
+    };
+    editor.on("selectionUpdate", handler);
+    return () => {
+      editor.off("selectionUpdate", handler);
+    };
+  }, [editor, nodeName]);
+
+  const cancel = useCallback(() => {
+    setShowInput(false);
+  }, []);
+
+  const save = useCallback(() => {
+    editor
+      .chain()
+      .focus(undefined, { scrollIntoView: false })
+      .updateAttributes(nodeName, {
+        caption: sanitizeCaption(draft) || undefined,
+      })
+      .run();
+    setShowInput(false);
+  }, [editor, nodeName, draft]);
+
+  const onKeyDown = useCallback(
+    (e: React.KeyboardEvent) => {
+      if (e.key === "Enter" && (e.metaKey || e.ctrlKey)) {
+        e.preventDefault();
+        save();
+      } else if (e.key === "Escape") {
+        e.preventDefault();
+        cancel();
+      }
+    },
+    [save, cancel],
+  );
+
+  const button = (
+    <Tooltip position="top" label={t("Caption")} withinPortal={false}>
+      <ActionIcon
+        onClick={open}
+        size="lg"
+        aria-label={t("Caption")}
+        variant="subtle"
+      >
+        <IconTextCaption size={18} />
+      </ActionIcon>
+    </Tooltip>
+  );
+
+  const panel = showInput ? (
+    <Paper
+      withBorder
+      shadow="md"
+      radius={6}
+      p="sm"
+      w={320}
+      style={{ position: "relative", zIndex: 100 }}
+    >
+      <Text size="sm" fw={600} mb={2}>
+        {t("Caption")}
+      </Text>
+      <Text size="xs" c="dimmed" mb="xs">
+        {t("Shown below the image.")}
+      </Text>
+      <Textarea
+        size="xs"
+        placeholder={t("Add a caption")}
+        value={draft}
+        onChange={(e) => setDraft(e.currentTarget.value)}
+        onKeyDown={onKeyDown}
+        autoFocus
+        autosize
+        minRows={2}
+        maxRows={5}
+        maxLength={CAPTION_MAX_LENGTH}
+      />
+      <Group justify="space-between" align="center" mt="xs" wrap="nowrap">
+        <Text size="xs" c="dimmed">
+          {draft.length}/{CAPTION_MAX_LENGTH}
+        </Text>
+        <Group gap="xs">
+          <Button size="compact-xs" variant="default" onClick={cancel}>
+            {t("Cancel")}
+          </Button>
+          <Button size="compact-xs" onClick={save}>
+            {t("Save")}
+          </Button>
+        </Group>
+      </Group>
+    </Paper>
+  ) : null;
+
+  return { button, panel, isEditing: showInput };
+}
--- a/apps/client/src/features/editor/components/image/image-menu.tsx
+++ b/apps/client/src/features/editor/components/image/image-menu.tsx
@@ -23,6 +23,7 @@ import { useTranslation } from "react-i18next";
 import { getFileUrl } from "@/lib/config.ts";
 import { uploadImageAction } from "@/features/editor/components/image/upload-image-action.tsx";
 import { useAltTextControl } from "@/features/editor/components/common/use-alt-text-control.tsx";
+import { useCaptionControl } from "@/features/editor/components/common/use-caption-control.tsx";
 import classes from "../common/toolbar-menu.module.css";

 export function ImageMenu({ editor }: EditorMenuProps) {
@@ -47,6 +48,7 @@ export function ImageMenu({ editor }: EditorMenuProps) {
        isFloatRight: ctx.editor.isActive("image", { align: "floatRight" }),
        src: imageAttrs?.src || null,
        alt: imageAttrs?.alt || "",
+        caption: imageAttrs?.caption || "",
      };
    },
  });
@@ -168,6 +170,16 @@ export function ImageMenu({ editor }: EditorMenuProps) {
    currentAlt: editorState?.alt || "",
  });

+  const {
+    button: captionButton,
+    panel: captionPanel,
+    isEditing: isEditingCaption,
+  } = useCaptionControl({
+    editor,
+    nodeName: "image",
+    currentCaption: editorState?.caption || "",
+  });
+
  return (
    <BaseBubbleMenu
      editor={editor}
@@ -183,6 +195,8 @@ export function ImageMenu({ editor }: EditorMenuProps) {
    >
      {isEditingAlt ? (
        altTextPanel
+      ) : isEditingCaption ? (
+        captionPanel
      ) : (
        <div className={classes.toolbar}>
        <Tooltip position="top" label={t("Align left")} withinPortal={false}>
@@ -249,6 +263,8 @@ export function ImageMenu({ editor }: EditorMenuProps) {

        {altTextButton}

+        {captionButton}
+
        <div className={classes.divider} />

        <Tooltip position="top" label={t("Download")} withinPortal={false}>
--- a/apps/client/src/features/editor/components/image/image-view.module.css
+++ b/apps/client/src/features/editor/components/image/image-view.module.css
@@ -7,6 +7,16 @@
  overflow: hidden;
 }

+.imageCaption {
+  display: block;
+  text-align: center;
+  font-size: 0.875em;
+  color: var(--mantine-color-dimmed);
+  margin-top: 0.4em;
+  line-height: 1.35;
+  word-break: break-word;
+}
+
 .skeleton {
  animation: pulse 1.2s ease-in-out infinite;

--- a/apps/client/src/features/editor/components/image/image-view.tsx
+++ b/apps/client/src/features/editor/components/image/image-view.tsx
@@ -9,7 +9,9 @@ import { useTranslation } from "react-i18next";
 export default function ImageView(props: NodeViewProps) {
  const { t } = useTranslation();
  const { editor, node, selected } = props;
-  const { src, width, align, alt, aspectRatio, placeholder } = node.attrs;
+  const { src, width, align, alt, caption, aspectRatio, placeholder } =
+    node.attrs;
+  const captionText = (caption || "").trim();
  const alignClass = useMemo(() => {
    if (align === "left") return "alignLeft";
    if (align === "right") return "alignRight";
@@ -29,6 +31,7 @@ export default function ImageView(props: NodeViewProps) {

  return (
    <NodeViewWrapper data-drag-handle>
+      <figure style={{ margin: 0 }}>
      <div
        className={clsx(
          selected && "ProseMirror-selectednode",
@@ -66,6 +69,15 @@ export default function ImageView(props: NodeViewProps) {
          </Group>
        )}
      </div>
+      {captionText && (
+        <Text
+          component="figcaption"
+          className={clsx(classes.imageCaption, "image-caption")}
+        >
+          {captionText}
+        </Text>
+      )}
+      </figure>
    </NodeViewWrapper>
  );
 }
--- a/apps/client/src/features/editor/styles/media.css
+++ b/apps/client/src/features/editor/styles/media.css
@@ -33,6 +33,15 @@
    }
  }

+  .image-caption {
+    text-align: center;
+    font-size: 0.875em;
+    color: var(--mantine-color-dimmed);
+    margin-top: 0.4em;
+    line-height: 1.35;
+    word-break: break-word;
+  }
+
  .uploading-text {
    font-size: var(--mantine-font-size-md);
    line-height: var(--mantine-line-height-md);
--- a/packages/editor-ext/src/lib/image/image-markdown.test.ts
+++ b/packages/editor-ext/src/lib/image/image-markdown.test.ts
@@ -0,0 +1,46 @@
+import { describe, it, expect } from "vitest";
+import { htmlToMarkdown } from "../markdown/utils/turndown.utils";
+import { markdownToHtml } from "../markdown/utils/marked.utils";
+
+// Lossless markdown round-trip for image captions (issue #221). An image WITH a
+// caption can't be expressed as `![alt](src)`, so it is emitted as a raw <img>
+// (carrying data-caption) wrapped in a block <div>, the same trick the <video>
+// rule uses. marked passes the raw HTML through, so markdownToHtml keeps the
+// data-caption, and the image extension's parseHTML restores the attribute.
+describe("image caption markdown round-trip", () => {
+  it("HTML -> Markdown emits a raw <img data-caption> for captioned images", () => {
+    const html = `<p><img src="/files/a.png" alt="cat" data-caption="A grey cat"></p>`;
+    const md = htmlToMarkdown(html);
+    expect(md).toContain("data-caption=\"A grey cat\"");
+    expect(md).toContain('src="/files/a.png"');
+    expect(md).toContain('alt="cat"');
+    // It must NOT degrade to the lossy ![]() form.
+    expect(md).not.toContain("![cat]");
+  });
+
+  it("Markdown -> HTML restores data-caption on the <img>", async () => {
+    const html = `<p><img src="/files/a.png" alt="cat" data-caption="A grey cat"></p>`;
+    const md = htmlToMarkdown(html);
+    const back = await markdownToHtml(md);
+    expect(back).toContain('data-caption="A grey cat"');
+    expect(back).toContain('src="/files/a.png"');
+  });
+
+  it("special characters in the caption survive the round-trip (escaped)", async () => {
+    const html = `<p><img src="/files/a.png" data-caption='Tom &amp; &quot;Jerry&quot;'></p>`;
+    const md = htmlToMarkdown(html);
+    const back = await markdownToHtml(md);
+    // parse5 keeps the entity-encoded form inside the attribute value.
+    expect(back).toContain("data-caption=");
+    expect(back).toContain("Jerry");
+    expect(back).toContain("Tom");
+  });
+
+  it("caption-less images stay a clean ![alt](src) with no raw HTML", () => {
+    const html = `<p><img src="/files/a.png" alt="cat"></p>`;
+    const md = htmlToMarkdown(html);
+    expect(md).toContain("![cat](/files/a.png)");
+    expect(md).not.toContain("data-caption");
+    expect(md).not.toContain("<img");
+  });
+});
--- a/packages/editor-ext/src/lib/image/image.spec.ts
+++ b/packages/editor-ext/src/lib/image/image.spec.ts
@@ -1,5 +1,16 @@
 import { describe, it, expect, beforeEach } from "vitest";
-import { applyAlignment } from "./image";
+import { getSchema } from "@tiptap/core";
+import { generateHTML, generateJSON } from "@tiptap/html";
+import { Document } from "@tiptap/extension-document";
+import { Paragraph } from "@tiptap/extension-paragraph";
+import { Text } from "@tiptap/extension-text";
+import { applyAlignment, TiptapImage } from "./image";
+
+// CONTRACT tests for the image node's `caption` attribute (issue #221). The
+// caption is a plain-text string stored on the image atom and serialized as
+// `data-caption` on the <img>. If this mapping drifts, captions saved to HTML
+// (and thus to native storage / search / markdown) are silently lost.
+const extensions = [Document, Paragraph, Text, TiptapImage];

 // applyAlignment is a pure DOM mutation: it sets the float / padding /
 // justify-content / data-image-align on an image node-view container per the
@@ -65,3 +76,56 @@ describe("applyAlignment", () => {
    expect(el.style.justifyContent).toBe("flex-start");
  });
 });
+
+describe("image schema", () => {
+  it("registers the image node and keeps it an atom", () => {
+    const schema = getSchema(extensions);
+    expect(schema.nodes.image).toBeTruthy();
+    expect(schema.nodes.image.spec.atom).toBe(true);
+  });
+});
+
+describe("image caption parse/render round-trip", () => {
+  it("recovers caption from data-caption on parse (HTML -> JSON)", () => {
+    const html = `<img src="/files/a.png" alt="cat" data-caption="A grey cat">`;
+    const json = generateJSON(html, extensions);
+
+    const node = json.content?.[0];
+    expect(node?.type).toBe("image");
+    expect(node?.attrs?.caption).toBe("A grey cat");
+    expect(node?.attrs?.alt).toBe("cat");
+  });
+
+  it("emits data-caption on render when set (JSON -> HTML)", () => {
+    const json = {
+      type: "doc",
+      content: [
+        {
+          type: "image",
+          attrs: { src: "/files/a.png", alt: "cat", caption: "A grey cat" },
+        },
+      ],
+    };
+    const html = generateHTML(json, extensions);
+    expect(html).toContain('data-caption="A grey cat"');
+  });
+
+  it("omits data-caption when there is no caption (caption-less images stay clean)", () => {
+    const json = {
+      type: "doc",
+      content: [{ type: "image", attrs: { src: "/files/a.png", alt: "cat" } }],
+    };
+    const html = generateHTML(json, extensions);
+    expect(html).not.toContain("data-caption");
+  });
+
+  it("full HTML -> JSON -> HTML round-trip preserves the caption", () => {
+    const html = `<img src="/files/a.png" alt="cat" data-caption="Caption with &amp; &quot;quotes&quot;">`;
+    const json = generateJSON(html, extensions);
+    expect(json.content?.[0]?.attrs?.caption).toBe('Caption with & "quotes"');
+
+    const out = generateHTML(json, extensions);
+    const back = generateJSON(out, extensions);
+    expect(back.content?.[0]?.attrs?.caption).toBe('Caption with & "quotes"');
+  });
+});
--- a/packages/editor-ext/src/lib/image/image.ts
+++ b/packages/editor-ext/src/lib/image/image.ts
@@ -32,6 +32,7 @@ export interface ImageOptions extends DefaultImageOptions {
 export interface ImageAttributes {
  src?: string;
  alt?: string;
+  caption?: string;
  align?: string;
  attachmentId?: string;
  size?: number;
@@ -54,6 +55,7 @@ declare module "@tiptap/core" {
      setImageAlign: (
        align: "left" | "center" | "right" | "floatLeft" | "floatRight",
      ) => ReturnType;
+      setImageCaption: (caption: string | undefined) => ReturnType;
      setImageWidth: (width: number) => ReturnType;
      setImageSize: (width: number, height: number) => ReturnType;
    };
@@ -125,6 +127,13 @@ export const TiptapImage = Image.extend<ImageOptions>({
          alt: attributes.alt,
        }),
      },
+      caption: {
+        default: undefined,
+        parseHTML: (element) => element.getAttribute("data-caption") || undefined,
+        // Emit data-caption only when set, so caption-less images stay clean.
+        renderHTML: (attributes: ImageAttributes) =>
+          attributes.caption ? { "data-caption": attributes.caption } : {},
+      },
      attachmentId: {
        default: undefined,
        parseHTML: (element) => element.getAttribute("data-attachment-id"),
@@ -185,6 +194,11 @@ export const TiptapImage = Image.extend<ImageOptions>({
        ({ commands }) =>
          commands.updateAttributes("image", { align }),

+      setImageCaption:
+        (caption) =>
+        ({ commands }) =>
+          commands.updateAttributes("image", { caption }),
+
      setImageWidth:
        (width) =>
        ({ commands }) =>
@@ -304,6 +318,10 @@ export const TiptapImage = Image.extend<ImageOptions>({
            el.alt = updatedNode.attrs.alt || "";
          }

+          if (updatedNode.attrs.caption !== currentNode.attrs.caption) {
+            applyCaption(updatedNode.attrs.caption);
+          }
+
          const w = updatedNode.attrs.width;
          const h = updatedNode.attrs.height;
          if (w != null) {
@@ -335,6 +353,28 @@ export const TiptapImage = Image.extend<ImageOptions>({

      const dom = nodeView.dom as HTMLElement;

+      // Re-parent the resizable wrapper into a <figure> so the caption sits BELOW
+      // the image, OUTSIDE nodeView.wrapper. onCommit measures the img's
+      // offsetHeight for the persisted height/aspectRatio, and the left/right
+      // resize handles span the wrapper — both must cover the image only. The
+      // <figure> stays the single flex child of the container, so applyAlignment
+      // and the float modes keep working. This path also drives read-only/share.
+      const figure = document.createElement("figure");
+      figure.style.margin = "0";
+      figure.style.display = "inline-block"; // shrink-to-fit to image width
+      figure.appendChild(nodeView.wrapper);
+      dom.appendChild(figure);
+
+      const figcaption = document.createElement("figcaption");
+      figcaption.className = "image-caption";
+      const applyCaption = (text?: string) => {
+        const value = (text || "").trim();
+        figcaption.textContent = value;
+        figcaption.style.display = value ? "block" : "none";
+      };
+      applyCaption(node.attrs.caption);
+      figure.appendChild(figcaption);
+
      // Apply initial alignment
      applyAlignment(dom, node.attrs.align || "center");

--- a/packages/editor-ext/src/lib/markdown/utils/turndown.utils.ts
+++ b/packages/editor-ext/src/lib/markdown/utils/turndown.utils.ts
@@ -12,6 +12,14 @@ function sanitizeMdLinkText(value: string): string {
    .replace(/[\r\n]+/g, ' ');
 }

+// Escape a value placed inside a double-quoted HTML attribute (img src/alt/
+// data-caption in the raw-HTML image fallback). Only & and " are special in
+// that context; the escaping is lossless because parse5/marked decode the
+// entities back on re-import (it is not idempotent, so escape exactly once).
+function escapeHtmlAttr(value: string): string {
+  return value.replace(/&/g, '&amp;').replace(/"/g, '&quot;');
+}
+
 // Tags turndown treats as void (self-closing). Footnote references render as an
 // empty <sup data-footnote-ref> whose meaning lives entirely in its data-id;
 // without marking it void, turndown's blank-node removal drops it before our
@@ -258,6 +266,17 @@ function image(turndownService: _TurndownService) {
    replacement: function (_content: string, node: HTMLInputElement) {
      const src = node.getAttribute('src') || '';
      if (!src) return '';
+      const caption = node.getAttribute('data-caption') || '';
+      if (caption) {
+        // ![]() can't carry a caption, so emit a raw <img> wrapped in a block
+        // <div> (like the video rule). marked passes it through and the image
+        // extension's parseHTML restores the caption from data-caption.
+        const parts = [`src="${escapeHtmlAttr(src)}"`];
+        const alt = node.getAttribute('alt') || '';
+        if (alt) parts.push(`alt="${escapeHtmlAttr(alt)}"`);
+        parts.push(`data-caption="${escapeHtmlAttr(caption)}"`);
+        return `<div><img ${parts.join(' ')}></div>`;
+      }
      const alt = sanitizeMdLinkText(node.getAttribute('alt') || '');
      const title = node.getAttribute('title') || '';
      const titlePart = title ? ' "' + title.replace(/"/g, '\\"') + '"' : '';
--- a/packages/mcp/build/lib/markdown-converter.js
+++ b/packages/mcp/build/lib/markdown-converter.js
@@ -207,16 +207,27 @@ export function convertProseMirrorToMarkdown(content) {
                // Two trailing spaces before the newline encode a markdown hard break;
                // a bare "\n" would be reimported as a soft break and lost.
                return "  \n";
-            case "image":
+            case "image": {
                const imgAlt = node.attrs?.alt || "";
+                const imgCaption = node.attrs?.caption || "";
+                if (imgCaption) {
+                    // ![]() can't carry a caption, so (symmetric to video) emit a raw
+                    // <img> wrapped in a block <div>. markdownToHtml passes it through and
+                    // the image extension's parseHTML restores the caption from
+                    // data-caption on import.
+                    const parts = [`src="${escapeAttr(node.attrs?.src ?? "")}"`];
+                    if (imgAlt)
+                        parts.push(`alt="${escapeAttr(imgAlt)}"`);
+                    parts.push(`data-caption="${escapeAttr(imgCaption)}"`);
+                    return `<div><img ${parts.join(" ")}></div>`;
+                }
                // Neutralize characters that could break out of the markdown image
                // URL: spaces/newlines and parentheses would terminate the (...) target
                // and let a stored src inject following markdown/HTML. Percent-encode
                // them so the URL stays a single inert token.
                const imgSrc = encodeMdUrl(node.attrs?.src);
-                // No "caption" attribute exists in the Docmost image schema, so we do
-                // not emit one (the previous caption branch was dead).
                return `![${imgAlt}](${imgSrc})`;
+            }
            case "video": {
                // Emit the schema-matching <video> element so generateJSON rebuilds the
                // node with its attrs intact. The schema's parseHTML reads src/aria-label
@@ -618,6 +629,8 @@ export function convertProseMirrorToMarkdown(content) {
        const parts = [`src="${escapeAttr(attrs.src ?? "")}"`];
        if (attrs.alt)
            parts.push(`alt="${escapeAttr(attrs.alt)}"`);
+        if (attrs.caption)
+            parts.push(`data-caption="${escapeAttr(attrs.caption)}"`);
        if (attrs.title)
            parts.push(`title="${escapeAttr(attrs.title)}"`);
        if (attrs.width != null)
--- a/packages/mcp/src/lib/markdown-converter.ts
+++ b/packages/mcp/src/lib/markdown-converter.ts
@@ -228,16 +228,26 @@ export function convertProseMirrorToMarkdown(content: any): string {
        // a bare "\n" would be reimported as a soft break and lost.
        return "  \n";

-      case "image":
+      case "image": {
        const imgAlt = node.attrs?.alt || "";
+        const imgCaption = node.attrs?.caption || "";
+        if (imgCaption) {
+          // ![]() can't carry a caption, so (symmetric to video) emit a raw
+          // <img> wrapped in a block <div>. markdownToHtml passes it through and
+          // the image extension's parseHTML restores the caption from
+          // data-caption on import.
+          const parts: string[] = [`src="${escapeAttr(node.attrs?.src ?? "")}"`];
+          if (imgAlt) parts.push(`alt="${escapeAttr(imgAlt)}"`);
+          parts.push(`data-caption="${escapeAttr(imgCaption)}"`);
+          return `<div><img ${parts.join(" ")}></div>`;
+        }
        // Neutralize characters that could break out of the markdown image
        // URL: spaces/newlines and parentheses would terminate the (...) target
        // and let a stored src inject following markdown/HTML. Percent-encode
        // them so the URL stays a single inert token.
        const imgSrc = encodeMdUrl(node.attrs?.src);
-        // No "caption" attribute exists in the Docmost image schema, so we do
-        // not emit one (the previous caption branch was dead).
        return `![${imgAlt}](${imgSrc})`;
+      }

      case "video": {
        // Emit the schema-matching <video> element so generateJSON rebuilds the
@@ -678,6 +688,8 @@ export function convertProseMirrorToMarkdown(content: any): string {
    const attrs = node.attrs || {};
    const parts: string[] = [`src="${escapeAttr(attrs.src ?? "")}"`];
    if (attrs.alt) parts.push(`alt="${escapeAttr(attrs.alt)}"`);
+    if (attrs.caption)
+      parts.push(`data-caption="${escapeAttr(attrs.caption)}"`);
    if (attrs.title) parts.push(`title="${escapeAttr(attrs.title)}"`);
    if (attrs.width != null) parts.push(`width="${escapeAttr(attrs.width)}"`);
    if (attrs.height != null) parts.push(`height="${escapeAttr(attrs.height)}"`);
--- a/packages/mcp/test/unit/markdown-converter.test.mjs
+++ b/packages/mcp/test/unit/markdown-converter.test.mjs
@@ -149,3 +149,37 @@ test("empty task item still emits its marker", () => {

  assert.equal(convertProseMirrorToMarkdown(input), "- [ ]\n- [x]");
 });
+
+// Image captions (issue #221). An image WITHOUT a caption stays the plain
+// `![alt](src)`; WITH a caption it is emitted as a raw <img data-caption>
+// wrapped in a block <div> (symmetric to video) so the round-trip md -> html ->
+// json restores the caption via the image extension's parseHTML.
+test("image without a caption emits plain ![alt](src)", () => {
+  const input = doc({
+    type: "image",
+    attrs: { src: "/files/a.png", alt: "cat" },
+  });
+  assert.equal(convertProseMirrorToMarkdown(input), "![cat](/files/a.png)");
+});
+
+test("image with a caption emits a raw <img data-caption> in a block div", () => {
+  const input = doc({
+    type: "image",
+    attrs: { src: "/files/a.png", alt: "cat", caption: "A grey cat" },
+  });
+  assert.equal(
+    convertProseMirrorToMarkdown(input),
+    '<div><img src="/files/a.png" alt="cat" data-caption="A grey cat"></div>',
+  );
+});
+
+test("image caption escapes & and \" in the data-caption attribute", () => {
+  const input = doc({
+    type: "image",
+    attrs: { src: "/files/a.png", caption: 'Tom & "Jerry"' },
+  });
+  assert.equal(
+    convertProseMirrorToMarkdown(input),
+    '<div><img src="/files/a.png" data-caption="Tom &amp; &quot;Jerry&quot;"></div>',
+  );
+});