feat(sync): scaffold monorepo, extract docmost-client, add Phase-0 harness + read-only pull

Lock the access-layer decision (REST only) and start implementation per SPEC. - monorepo (npm workspaces): packages/docmost-client = DocmostClient + lib/* copied 1:1 from docmost-mcp/src (backport target), plus bannered sync methods (listTrash, restorePage, listAllSpacePages, exportPageBody, listRecentSince / collectRecentSince cursor scan) - engine stays the root app per AGENTS.md (src/, test/, build/, data/, settings.ts); add roundtrip.ts (SPEC §11 idempotency harness), pull.ts (SPEC §6 read-only Docmost->FS mirror), sanitize.ts (SPEC §12 filenames, path-traversal-safe) - Dockerfile builds the workspace lib before the app; vitest gates CI - exportPageBody never touches /comments (SPEC §3); serializeDocmostMarkdownBody emits meta + body only - SPEC: resolve access-layer (REST), reflect root-engine layout + REST pagination - tests: sanitize (incl. dot-traversal), collectRecentSince (cutoff/dedup/cap), stripBlockIds, markdown round-trip byte-stability Note: raw ProseMirror round-trip is byte-stable in Markdown but not yet attribute-idempotent (SPEC §11 Задача №0, before Phase 2).
2026-06-16 20:20:20 +03:00
parent 2f92dc4c1f
commit 447d2508ae
33 changed files with 10502 additions and 174 deletions
--- a/packages/docmost-client/src/lib/markdown-converter.ts
+++ b/packages/docmost-client/src/lib/markdown-converter.ts
@@ -0,0 +1,861 @@
+/**
+ * Convert ProseMirror/TipTap JSON content to Markdown
+ * Supports all Docmost-specific node types and extensions
+ */
+export function convertProseMirrorToMarkdown(content: any): string {
+  if (!content || !content.content) return "";
+
+  // Escape a value interpolated into an HTML double-quoted attribute value
+  // (textAlign, colors, image src, math `text`, all data-* attrs, etc.). In the
+  // ATTRIBUTE context only the quote that delimits the value and the ampersand
+  // that starts an entity are special, so we escape ONLY & and " (never ':
+  // values are always emitted double-quoted, see the NOTE below). We
+  // deliberately do NOT escape < or >: the HTML re-parser (parse5/jsdom via
+  // @tiptap/html) does NOT decode
+  // &lt;/&gt; back inside attribute values, so escaping them would corrupt the
+  // stored data (e.g. a math node's LaTeX `a < b`) and ACCUMULATE escapes on
+  // every round-trip (`a < b` -> `a &lt; b` -> `a &amp;lt; b`). Escaping & "
+  // keeps the value inert against attribute-injection while staying idempotent.
+  // NOTE: escape ONLY & and " here. The value is always wrapped in double
+  // quotes, so " is the only delimiter; ' is NOT special in a double-quoted
+  // value, and parse5 does not decode &#39; back inside attribute values, so
+  // escaping ' would (like < >) corrupt the value and accumulate &amp; on every
+  // round-trip. Escaping & and " is idempotent (parse5 decodes them back).
+  const escapeAttr = (value: unknown): string => {
+    // Ampersands go first so the "&" of the "&quot;" produced by the second
+    // pass is not escaped again.
+    const ampersandsEscaped = String(value).split("&").join("&amp;");
+    return ampersandsEscaped.split('"').join("&quot;");
+  };
+
+  // Escape a value placed as HTML element TEXT content (between tags), where
+  // <, >, and & are all significant. Used for text rendered inside raw-HTML
+  // blocks (table cells / columns) so stored characters cannot inject markup.
+  const escapeHtmlText = (value: unknown): string =>
+    // One pass over the three significant characters; each maps to its entity.
+    String(value).replace(/[&<>]/g, (ch: string) =>
+      ch === "&" ? "&amp;" : ch === "<" ? "&lt;" : "&gt;",
+    );
+
+  // Percent-encode characters that would break out of a markdown URL target
+  // (...) — whitespace/newlines and parentheses — so a stored src stays a
+  // single inert token (used for image/video/youtube srcs).
+  const encodeMdUrl = (value: unknown): string => {
+    const raw = String(value || "");
+    return raw.replace(/[\s()]/g, (ch: string) => {
+      if (ch === "(") return "%28";
+      if (ch === ")") return "%29";
+      // Everything else matched here is whitespace: a plain space becomes %20,
+      // any other whitespace character is percent-encoded as UTF-8.
+      return ch === " " ? "%20" : encodeURIComponent(ch);
+    });
+  };
+
+  /**
+   * Recursively render one ProseMirror node to a string: Markdown where the
+   * syntax can express the node, schema-shaped raw HTML otherwise.
+   *
+   * Dispatches on `node.type`. Types without a dedicated case fall through to
+   * `default`, which concatenates the rendered children with no separator.
+   * List items, task items and children of raw-HTML containers are delegated
+   * to `processListItem`, `processTaskItem` and `blockToHtml`, which are
+   * defined outside this part of the file.
+   *
+   * @param node A ProseMirror/TipTap JSON node (`type`, plus optional `attrs`,
+   *   `content`, `text` and `marks`).
+   * @returns The rendered string (possibly empty).
+   */
+  const processNode = (node: any): string => {
+    const type = node.type;
+    const nodeContent = node.content || [];
+
+    switch (type) {
+      case "doc":
+        return nodeContent.map(processNode).join("\n\n");
+
+      case "paragraph":
+        const text = nodeContent.map(processNode).join("");
+        const align = node.attrs?.textAlign;
+        if (align && align !== "left") {
+          return `<div align="${escapeAttr(align)}">${text}</div>`;
+        }
+        return text || "";
+
+      case "heading":
+        const level = node.attrs?.level || 1;
+        const headingText = nodeContent.map(processNode).join("");
+        return "#".repeat(level) + " " + headingText;
+
+      case "text":
+        let textContent = node.text || "";
+        // Apply marks (bold, italic, code, etc.)
+        if (node.marks) {
+          // Markdown code spans (`...`) cannot carry inner formatting, so when a
+          // run has the `code` mark alongside ANY other mark, backtick syntax
+          // would leak literal ** / []() into the code text. In that case emit
+          // nested HTML (<code> innermost, the other marks wrapping it as HTML)
+          // so the output is at least well-formed and re-parseable.
+          //
+          // NOTE: this does NOT round-trip both marks. The schema's `code` mark
+          // has `excludes: "_"` (it excludes every other mark), so on import the
+          // co-occurring mark is always dropped — the run comes back as `code`
+          // only. We keep the emission simple and accept that the other mark is
+          // lost; preserving both is impossible while `code` excludes them.
+          // Only use the backtick form when `code` is the sole mark.
+          const markTypes = node.marks.map((m: any) => m.type);
+          const hasCode = markTypes.includes("code");
+          const codeCombined = hasCode && markTypes.length > 1;
+          // Marks are applied in array order, each one wrapping the result of
+          // the previous one, so the FIRST entry of `node.marks` ends up
+          // innermost. NOTE(review): <code> is therefore innermost only when
+          // `code` comes first in the array — confirm the mark ordering the
+          // caller guarantees.
+          for (const mark of node.marks) {
+            switch (mark.type) {
+              case "bold":
+                textContent = codeCombined
+                  ? `<strong>${textContent}</strong>`
+                  : `**${textContent}**`;
+                break;
+              case "italic":
+                textContent = codeCombined
+                  ? `<em>${textContent}</em>`
+                  : `*${textContent}*`;
+                break;
+              case "code":
+                // When combined with another mark, wrap as <code> so the
+                // surrounding HTML marks can nest around it; otherwise use the
+                // plain backtick span.
+                textContent = codeCombined
+                  ? `<code>${textContent}</code>`
+                  : `\`${textContent}\``;
+                break;
+              case "link": {
+                // NOTE(review): in the two markdown branches below `href` is
+                // interpolated verbatim; unlike the image src it is not passed
+                // through encodeMdUrl, so whitespace or an unbalanced ")" in a
+                // stored href would end the link target early — confirm
+                // whether that is intended.
+                const href = mark.attrs?.href || "";
+                const title = mark.attrs?.title;
+                if (codeCombined) {
+                  // Emit an HTML anchor so it can wrap the nested <code>.
+                  const safeHref = escapeAttr(href);
+                  if (title) {
+                    textContent = `<a href="${safeHref}" title="${escapeAttr(String(title))}">${textContent}</a>`;
+                  } else {
+                    textContent = `<a href="${safeHref}">${textContent}</a>`;
+                  }
+                } else if (title) {
+                  // Emit the optional markdown link title; escape an embedded
+                  // double-quote so it cannot terminate the title string early.
+                  const safeTitle = String(title).replace(/"/g, '\\"');
+                  textContent = `[${textContent}](${href} "${safeTitle}")`;
+                } else {
+                  textContent = `[${textContent}](${href})`;
+                }
+                break;
+              }
+              case "strike":
+                textContent = codeCombined
+                  ? `<s>${textContent}</s>`
+                  : `~~${textContent}~~`;
+                break;
+              case "underline":
+                textContent = `<u>${textContent}</u>`;
+                break;
+              case "subscript":
+                textContent = `<sub>${textContent}</sub>`;
+                break;
+              case "superscript":
+                textContent = `<sup>${textContent}</sup>`;
+                break;
+              case "highlight": {
+                // Preserve a null/empty color as a plain highlight (a bare
+                // <mark> with no background-color); only emit the style when a
+                // color is actually set, so a plain highlight is not forced to
+                // yellow on export.
+                const color = mark.attrs?.color;
+                textContent = color
+                  ? `<mark style="background-color: ${escapeAttr(color)}">${textContent}</mark>`
+                  : `<mark>${textContent}</mark>`;
+                break;
+              }
+              case "textStyle":
+                if (mark.attrs?.color) {
+                  textContent = `<span style="color: ${escapeAttr(mark.attrs.color)}">${textContent}</span>`;
+                }
+                break;
+              case "comment": {
+                // Emit the inline comment anchor so highlights round-trip. The
+                // schema's Comment mark parses span[data-comment-id] (attrs
+                // commentId/resolved).
+                const cid = mark.attrs?.commentId;
+                if (cid) {
+                  const resolvedAttr = mark.attrs?.resolved
+                    ? ` data-resolved="true"`
+                    : "";
+                  textContent = `<span data-comment-id="${escapeAttr(cid)}"${resolvedAttr}>${textContent}</span>`;
+                }
+                break;
+              }
+            }
+          }
+        }
+        return textContent;
+
+      case "codeBlock":
+        const language = node.attrs?.language || "";
+        // Strip ALL trailing newlines so the export is idempotent: marked
+        // re-adds exactly one trailing "\n" on import, so trimming only one
+        // here would let the text grow by "\n" on each round-trip. Removing
+        // every trailing newline makes repeated cycles stable.
+        const code = nodeContent
+          .map(processNode)
+          .join("")
+          .replace(/\n+$/, "");
+        return "```" + language + "\n" + code + "\n```";
+
+      case "bulletList":
+        return nodeContent
+          .map((item: any) => processListItem(item, "-"))
+          .join("\n");
+
+      case "orderedList":
+        // Items are numbered from 1 by position; no attribute of the list node
+        // is consulted for the starting number.
+        return nodeContent
+          .map((item: any, index: number) =>
+            processListItem(item, `${index + 1}.`),
+          )
+          .join("\n");
+
+      case "taskList":
+        return nodeContent.map((item: any) => processTaskItem(item)).join("\n");
+
+      case "taskItem":
+        // Delegate to the same helper used by taskList so multi-block and
+        // nested task items render and indent consistently.
+        return processTaskItem(node);
+
+      case "listItem":
+        return nodeContent.map(processNode).join("\n");
+
+      case "blockquote":
+        // Prefix EVERY line of EVERY child with "> " and separate block-level
+        // children with a blank ">" line so code blocks / multi-paragraph
+        // quotes round-trip correctly.
+        return nodeContent
+          .map((n: any) =>
+            processNode(n)
+              .split("\n")
+              .map((line: string) => (line.length ? `> ${line}` : ">"))
+              .join("\n"),
+          )
+          .join("\n>\n");
+
+      case "horizontalRule":
+        return "---";
+
+      case "hardBreak":
+        // Two trailing spaces before the newline encode a markdown hard break;
+        // a bare "\n" would be reimported as a soft break and lost.
+        return "  \n";
+
+      case "image":
+        const imgAlt = node.attrs?.alt || "";
+        // Neutralize characters that could break out of the markdown image
+        // URL: spaces/newlines and parentheses would terminate the (...) target
+        // and let a stored src inject following markdown/HTML. Percent-encode
+        // them so the URL stays a single inert token.
+        const imgSrc = encodeMdUrl(node.attrs?.src);
+        // No "caption" attribute exists in the Docmost image schema, so we do
+        // not emit one (the previous caption branch was dead).
+        return `![${imgAlt}](${imgSrc})`;
+
+      case "video": {
+        // Emit the schema-matching <video> element so generateJSON rebuilds the
+        // node with its attrs intact. The schema's parseHTML reads src/aria-label
+        // from the standard attributes and the remaining attrs from data-*.
+        const attrs = node.attrs || {};
+        const parts: string[] = [`src="${escapeAttr(attrs.src ?? "")}"`];
+        if (attrs.alt) parts.push(`aria-label="${escapeAttr(attrs.alt)}"`);
+        if (attrs.attachmentId)
+          parts.push(
+            `data-attachment-id="${escapeAttr(attrs.attachmentId)}"`,
+          );
+        if (attrs.width != null)
+          parts.push(`width="${escapeAttr(attrs.width)}"`);
+        if (attrs.height != null)
+          parts.push(`height="${escapeAttr(attrs.height)}"`);
+        if (attrs.size != null)
+          parts.push(`data-size="${escapeAttr(attrs.size)}"`);
+        if (attrs.align)
+          parts.push(`data-align="${escapeAttr(attrs.align)}"`);
+        if (attrs.aspectRatio != null)
+          parts.push(`data-aspect-ratio="${escapeAttr(attrs.aspectRatio)}"`);
+        // Wrap in a block <div> so marked treats it as a block (a bare <video>
+        // is inline-level HTML and marked wraps it in <p>, leaving a spurious
+        // empty paragraph beside the hoisted block atom). The wrapper has no
+        // data-type, so the schema parser ignores it and just hoists the video.
+        return `<div><video ${parts.join(" ")}></video></div>`;
+      }
+
+      case "youtube": {
+        // Emit the schema-matching div[data-type="youtube"]; the schema reads
+        // src from data-src and width/height/align from data-* attributes.
+        const attrs = node.attrs || {};
+        const parts: string[] = [
+          `data-type="youtube"`,
+          `data-src="${escapeAttr(attrs.src ?? "")}"`,
+        ];
+        if (attrs.width != null)
+          parts.push(`data-width="${escapeAttr(attrs.width)}"`);
+        if (attrs.height != null)
+          parts.push(`data-height="${escapeAttr(attrs.height)}"`);
+        if (attrs.align)
+          parts.push(`data-align="${escapeAttr(attrs.align)}"`);
+        return `<div ${parts.join(" ")}></div>`;
+      }
+
+      case "table": {
+        // A GFM pipe table cannot represent merged cells. If ANY cell carries
+        // colspan>1 or rowspan>1, a pipe table would corrupt the grid on
+        // re-import, so emit the WHOLE table as raw HTML <table> instead: the
+        // schema's table family parseHTML (tag table/tr/td/th, with colspan/
+        // rowspan read from the same-named HTML attrs and align via parseHTML)
+        // round-trips it faithfully. Otherwise keep the lighter GFM pipe table.
+        const tableRows: any[] = nodeContent;
+        if (tableRows.length === 0) return "";
+        const hasSpan = tableRows.some((row: any) =>
+          (row.content || []).some(
+            (cell: any) =>
+              (cell.attrs?.colspan ?? 1) > 1 || (cell.attrs?.rowspan ?? 1) > 1,
+          ),
+        );
+
+        if (hasSpan) {
+          // Render each cell's block children to HTML (marked does NOT parse
+          // markdown inside a raw HTML block, so emitting markdown here would
+          // leak literal ** / `` into the cell). blockToHtml mirrors the schema
+          // HTML so inner formatting re-parses into the right marks/nodes.
+          const renderHtmlCell = (cell: any): string => {
+            const tag = cell.type === "tableHeader" ? "th" : "td";
+            const a = cell.attrs || {};
+            const cellParts: string[] = [];
+            if ((a.colspan ?? 1) > 1)
+              cellParts.push(`colspan="${escapeAttr(a.colspan)}"`);
+            if ((a.rowspan ?? 1) > 1)
+              cellParts.push(`rowspan="${escapeAttr(a.rowspan)}"`);
+            if (a.align) cellParts.push(`align="${escapeAttr(a.align)}"`);
+            const open = cellParts.length
+              ? `<${tag} ${cellParts.join(" ")}>`
+              : `<${tag}>`;
+            const inner = (cell.content || [])
+              .map((block: any) => blockToHtml(block))
+              .join("");
+            return `${open}${inner}</${tag}>`;
+          };
+          const htmlRows = tableRows
+            .map(
+              (row: any) =>
+                `<tr>${(row.content || []).map(renderHtmlCell).join("")}</tr>`,
+            )
+            .join("");
+          return `<table><tbody>${htmlRows}</tbody></table>`;
+        }
+
+        // No merged cells: emit a GFM table (header row + separator) so the
+        // markdown can be parsed back into a table on re-import.
+        const rows = tableRows.map(processNode);
+        // The first row is always used as the header row here, whatever the
+        // type of its cells; its cell count fixes the number of columns.
+        const headerCells = tableRows[0]?.content || [];
+        const columns = headerCells.length || 1;
+        // Derive alignment markers (:--, :-:, --:) from each header cell.
+        const markers = Array.from({ length: columns }, (_, i) => {
+          const align = headerCells[i]?.attrs?.align;
+          switch (align) {
+            case "left":
+              return ":--";
+            case "center":
+              return ":-:";
+            case "right":
+              return "--:";
+            default:
+              return "---";
+          }
+        });
+        const separator = "| " + markers.join(" | ") + " |";
+        return [rows[0], separator, ...rows.slice(1)].join("\n");
+      }
+
+      case "tableRow":
+        return "| " + nodeContent.map(processNode).join(" | ") + " |";
+
+      case "tableCell":
+      case "tableHeader": {
+        // Join multiple block children with a space (not "") so adjacent blocks
+        // like a paragraph followed by a list don't collide into "line1- a".
+        // Then collapse newlines and escape pipes so a cell containing "|" or a
+        // line break cannot corrupt the surrounding GFM row.
+        return nodeContent
+          .map(processNode)
+          .join(" ")
+          .replace(/\r?\n/g, " ")
+          .replace(/\|/g, "\\|");
+      }
+
+      case "callout":
+        const calloutType = node.attrs?.type || "info";
+        const calloutContent = nodeContent.map(processNode).join("\n");
+        return `:::${calloutType.toLowerCase()}\n${calloutContent}\n:::`;
+
+      case "details":
+        // The opening <details> tag comes from the detailsSummary child and the
+        // closing tag from the detailsContent child (see the two cases below).
+        return nodeContent.map(processNode).join("\n");
+
+      case "detailsSummary":
+        const summaryText = nodeContent.map(processNode).join("");
+        return `<details>\n<summary>${summaryText}</summary>\n`;
+
+      case "detailsContent":
+        const detailsText = nodeContent.map(processNode).join("\n");
+        return `${detailsText}\n</details>`;
+
+      case "mathInline": {
+        // The schema's `text` attribute has no parseHTML, so TipTap's default
+        // parser reads it from the `text` HTML attribute (NOT the element's text
+        // content). Emit span[data-type="mathInline"] carrying the LaTeX in a
+        // `text="..."` attribute so it round-trips. marked cannot parse $...$
+        // back, so the previous form was lossy.
+        const inlineMath = node.attrs?.text || "";
+        return `<span data-type="mathInline" data-katex="true" text="${escapeAttr(inlineMath)}"></span>`;
+      }
+
+      case "mathBlock": {
+        // Same as mathInline: the LaTeX must ride in the `text` HTML attribute
+        // for the schema's default parser to recover it.
+        const blockMath = node.attrs?.text || "";
+        return `<div data-type="mathBlock" data-katex="true" text="${escapeAttr(blockMath)}"></div>`;
+      }
+
+      case "mention": {
+        // Emit span[data-type="mention"] with the schema's data-* attributes so
+        // generateJSON rebuilds the mention node instead of leaving "@label"
+        // plain text that cannot re-parse.
+        const attrs = node.attrs || {};
+        const parts: string[] = [`data-type="mention"`];
+        if (attrs.id) parts.push(`data-id="${escapeAttr(attrs.id)}"`);
+        if (attrs.label)
+          parts.push(`data-label="${escapeAttr(attrs.label)}"`);
+        if (attrs.entityType)
+          parts.push(`data-entity-type="${escapeAttr(attrs.entityType)}"`);
+        if (attrs.entityId)
+          parts.push(`data-entity-id="${escapeAttr(attrs.entityId)}"`);
+        if (attrs.slugId)
+          parts.push(`data-slug-id="${escapeAttr(attrs.slugId)}"`);
+        if (attrs.creatorId)
+          parts.push(`data-creator-id="${escapeAttr(attrs.creatorId)}"`);
+        if (attrs.anchorId)
+          parts.push(`data-anchor-id="${escapeAttr(attrs.anchorId)}"`);
+        // Keep the label as visible text content too; the schema reads attrs
+        // from data-*, so the inner text is purely cosmetic and harmless.
+        const mentionLabel = attrs.label || attrs.id || "";
+        // The label is visible element TEXT content here (the data-* attrs above
+        // carry the real values), so escape it for the text context, not attrs.
+        return `<span ${parts.join(" ")}>@${escapeHtmlText(mentionLabel)}</span>`;
+      }
+
+      case "attachment": {
+        // BUG FIX: the old code read node.attrs.fileName / node.attrs.src, but
+        // the schema stores name/url (plus mime/size/attachmentId). Emit the
+        // schema-matching div[data-type="attachment"] with data-attachment-*
+        // attrs so the node round-trips instead of degrading to a markdown link.
+        const attrs = node.attrs || {};
+        const parts: string[] = [
+          `data-type="attachment"`,
+          `data-attachment-url="${escapeAttr(attrs.url ?? "")}"`,
+        ];
+        if (attrs.name)
+          parts.push(`data-attachment-name="${escapeAttr(attrs.name)}"`);
+        if (attrs.mime)
+          parts.push(`data-attachment-mime="${escapeAttr(attrs.mime)}"`);
+        if (attrs.size != null)
+          parts.push(`data-attachment-size="${escapeAttr(attrs.size)}"`);
+        if (attrs.attachmentId)
+          parts.push(
+            `data-attachment-id="${escapeAttr(attrs.attachmentId)}"`,
+          );
+        return `<div ${parts.join(" ")}></div>`;
+      }
+
+      case "drawio":
+      case "excalidraw": {
+        // Emit the schema-matching div[data-type=...] carrying the diagram's
+        // attrs as data-* (the schema's diagramAttributes reads src/title/alt/
+        // width/height/size/aspectRatio/align/attachmentId from data-*), so the
+        // diagram round-trips instead of degrading to a lossy placeholder.
+        const attrs = node.attrs || {};
+        const parts: string[] = [
+          // `type` can only be "drawio" or "excalidraw" in this case, so it is
+          // interpolated without escaping.
+          `data-type="${type}"`,
+          `data-src="${escapeAttr(attrs.src ?? "")}"`,
+        ];
+        if (attrs.title != null)
+          parts.push(`data-title="${escapeAttr(attrs.title)}"`);
+        if (attrs.alt != null) parts.push(`data-alt="${escapeAttr(attrs.alt)}"`);
+        if (attrs.width != null)
+          parts.push(`data-width="${escapeAttr(attrs.width)}"`);
+        if (attrs.height != null)
+          parts.push(`data-height="${escapeAttr(attrs.height)}"`);
+        if (attrs.size != null)
+          parts.push(`data-size="${escapeAttr(attrs.size)}"`);
+        if (attrs.aspectRatio != null)
+          parts.push(`data-aspect-ratio="${escapeAttr(attrs.aspectRatio)}"`);
+        if (attrs.align)
+          parts.push(`data-align="${escapeAttr(attrs.align)}"`);
+        if (attrs.attachmentId)
+          parts.push(
+            `data-attachment-id="${escapeAttr(attrs.attachmentId)}"`,
+          );
+        return `<div ${parts.join(" ")}></div>`;
+      }
+
+      case "embed": {
+        // Emit the schema-matching div[data-type="embed"]; the schema reads
+        // src/provider/align/width/height from data-* attributes so the node
+        // (and its provider iframe info) survives the round-trip.
+        const attrs = node.attrs || {};
+        const parts: string[] = [
+          `data-type="embed"`,
+          `data-src="${escapeAttr(attrs.src ?? "")}"`,
+          `data-provider="${escapeAttr(attrs.provider ?? "")}"`,
+        ];
+        if (attrs.align)
+          parts.push(`data-align="${escapeAttr(attrs.align)}"`);
+        if (attrs.width != null)
+          parts.push(`data-width="${escapeAttr(attrs.width)}"`);
+        if (attrs.height != null)
+          parts.push(`data-height="${escapeAttr(attrs.height)}"`);
+        return `<div ${parts.join(" ")}></div>`;
+      }
+
+      case "audio": {
+        // Emit the schema-matching <audio> element (was emitting nothing). The
+        // schema reads src from src and attachmentId/size from data-*.
+        const attrs = node.attrs || {};
+        const parts: string[] = [`src="${escapeAttr(attrs.src ?? "")}"`];
+        if (attrs.attachmentId)
+          parts.push(
+            `data-attachment-id="${escapeAttr(attrs.attachmentId)}"`,
+          );
+        if (attrs.size != null)
+          parts.push(`data-size="${escapeAttr(attrs.size)}"`);
+        // Wrap in a block <div> for the same reason as video: a bare <audio> is
+        // inline-level HTML that marked would wrap in <p>.
+        return `<div><audio ${parts.join(" ")}></audio></div>`;
+      }
+
+      case "pdf": {
+        // Emit the schema-matching div[data-type="pdf"] (was emitting nothing).
+        // The schema reads src/width/height from standard attrs and name/
+        // attachmentId/size from data-*.
+        const attrs = node.attrs || {};
+        const parts: string[] = [
+          `data-type="pdf"`,
+          `src="${escapeAttr(attrs.src ?? "")}"`,
+        ];
+        if (attrs.name) parts.push(`data-name="${escapeAttr(attrs.name)}"`);
+        if (attrs.attachmentId)
+          parts.push(
+            `data-attachment-id="${escapeAttr(attrs.attachmentId)}"`,
+          );
+        if (attrs.size != null)
+          parts.push(`data-size="${escapeAttr(attrs.size)}"`);
+        if (attrs.width != null)
+          parts.push(`width="${escapeAttr(attrs.width)}"`);
+        if (attrs.height != null)
+          parts.push(`height="${escapeAttr(attrs.height)}"`);
+        return `<div ${parts.join(" ")}></div>`;
+      }
+
+      case "columns": {
+        // Emit the schema-matching div[data-type="columns"] wrapper so the
+        // multi-column layout survives. Without a case the children were
+        // concatenated with no separator and the text merged. The schema reads
+        // layout from data-layout and widthMode from data-width-mode. The whole
+        // block is raw HTML, so render children via blockToHtml (NOT markdown,
+        // which marked would not re-parse inside a raw HTML block).
+        const attrs = node.attrs || {};
+        const parts: string[] = [`data-type="columns"`];
+        if (attrs.layout)
+          parts.push(`data-layout="${escapeAttr(attrs.layout)}"`);
+        if (attrs.widthMode && attrs.widthMode !== "normal")
+          parts.push(`data-width-mode="${escapeAttr(attrs.widthMode)}"`);
+        const inner = nodeContent.map((n: any) => blockToHtml(n)).join("");
+        return `<div ${parts.join(" ")}>${inner}</div>`;
+      }
+
+      case "column": {
+        // Emit the schema-matching div[data-type="column"]; the schema reads the
+        // column width from data-width. Children are rendered as HTML so their
+        // formatting survives inside this raw HTML block.
+        const attrs = node.attrs || {};
+        const parts: string[] = [`data-type="column"`];
+        if (attrs.width)
+          parts.push(`data-width="${escapeAttr(attrs.width)}"`);
+        const inner = nodeContent.map((n: any) => blockToHtml(n)).join("");
+        return `<div ${parts.join(" ")}>${inner}</div>`;
+      }
+
+      case "subpages":
+        return "{{SUBPAGES}}";
+
+      default:
+        // Fallback: process children
+        return nodeContent.map(processNode).join("");
+    }
+  };
+
+  // Render inline content (text runs + their marks) to HTML. Used by the raw
+  // HTML fallbacks (spanned tables, columns) where marked will NOT re-parse
+  // markdown, so backtick/asterisk/bracket syntax would otherwise leak as
+  // literal characters. Each mark is mirrored to the HTML the schema's parseHTML
+  // accepts so it re-imports as the matching ProseMirror mark.
+  const inlineToHtml = (inlineNodes: any[]): string =>
+    // A missing/null list renders as the empty string.
+    (inlineNodes || [])
+      .map((n: any) => {
+        if (n.type === "hardBreak") return "<br>";
+        if (n.type !== "text") {
+          // Inline atoms (mention, mathInline) already emit schema HTML.
+          return processNode(n);
+        }
+        // Escape the raw text once, BEFORE wrapping: the tags added below are
+        // markup and must not be escaped themselves.
+        let t = escapeHtmlText(n.text || "");
+        // Marks are applied in array order, each wrapping the previous result,
+        // so the first mark ends up innermost. Unknown mark types are ignored.
+        for (const mark of n.marks || []) {
+          switch (mark.type) {
+            case "bold":
+              t = `<strong>${t}</strong>`;
+              break;
+            case "italic":
+              t = `<em>${t}</em>`;
+              break;
+            case "code":
+              t = `<code>${t}</code>`;
+              break;
+            case "strike":
+              t = `<s>${t}</s>`;
+              break;
+            case "underline":
+              t = `<u>${t}</u>`;
+              break;
+            case "subscript":
+              t = `<sub>${t}</sub>`;
+              break;
+            case "superscript":
+              t = `<sup>${t}</sup>`;
+              break;
+            case "link": {
+              // Carry the optional link title too. The HTML anchor emitted by
+              // processNode keeps it, so dropping it here would lose the title
+              // only for links that happen to sit inside a raw-HTML container.
+              const title = mark.attrs?.title;
+              const titleAttr = title ? ` title="${escapeAttr(title)}"` : "";
+              t = `<a href="${escapeAttr(mark.attrs?.href || "")}"${titleAttr}>${t}</a>`;
+              break;
+            }
+            case "highlight":
+              // A highlight without a color stays a bare <mark>.
+              t = mark.attrs?.color
+                ? `<mark style="background-color: ${escapeAttr(mark.attrs.color)}">${t}</mark>`
+                : `<mark>${t}</mark>`;
+              break;
+            case "textStyle":
+              if (mark.attrs?.color)
+                t = `<span style="color: ${escapeAttr(mark.attrs.color)}">${t}</span>`;
+              break;
+            case "comment":
+              // Inline comment anchor inside a raw-HTML container (columns /
+              // spanned table cells), so commented text there also round-trips.
+              if (mark.attrs?.commentId) {
+                const r = mark.attrs?.resolved ? ` data-resolved="true"` : "";
+                t = `<span data-comment-id="${escapeAttr(mark.attrs.commentId)}"${r}>${t}</span>`;
+              }
+              break;
+          }
+        }
+        return t;
+      })
+      .join("");
+
+  // Emit the schema-matching <img> for an image node. Shared so the image is
+  // emitted as real HTML wherever a raw-HTML container needs it (inside a column
+  // or a spanned table cell), where markdown `![](...)` would NOT be re-parsed
+  // and would survive as literal text. The Image extension reads src/alt from
+  // the standard attributes; the Docmost extra attrs (width/height/align/size/
+  // attachmentId/aspectRatio) are global attributes read from same-named DOM
+  // attributes, so emit them by name.
+  const imageToHtml = (node: any): string => {
+    const a = node.attrs || {};
+    // [HTML attribute name, value, emit?] listed in output order. `src` is
+    // always written; the truthiness-gated entries skip empty strings, the
+    // `!= null` ones keep 0 / "" and skip only null/undefined.
+    const candidates: Array<[string, unknown, boolean]> = [
+      ["src", a.src ?? "", true],
+      ["alt", a.alt, Boolean(a.alt)],
+      ["title", a.title, Boolean(a.title)],
+      ["width", a.width, a.width != null],
+      ["height", a.height, a.height != null],
+      ["align", a.align, Boolean(a.align)],
+      ["data-size", a.size, a.size != null],
+      ["data-attachment-id", a.attachmentId, Boolean(a.attachmentId)],
+      ["data-aspect-ratio", a.aspectRatio, a.aspectRatio != null],
+    ];
+    const rendered: string[] = [];
+    for (const [attrName, attrValue, emit] of candidates) {
+      if (emit) rendered.push(`${attrName}="${escapeAttr(attrValue)}"`);
+    }
+    return `<img ${rendered.join(" ")}>`;
+  };
+
+  // Render a callout node as div[data-type="callout"]. The banner kind is
+  // carried in data-callout-type (lower-cased, defaulting to "info" when the
+  // node has no type). The callout's children are rendered through blockToHtml
+  // so they stay HTML and survive inside a raw-HTML container.
+  const calloutToHtml = (node: any): string => {
+    const kind = (node.attrs?.type || "info").toLowerCase();
+    const body: string = (node.content || [])
+      .map((child: any) => blockToHtml(child))
+      .join("");
+    const open = `<div data-type="callout" data-callout-type="${escapeAttr(kind)}">`;
+    return `${open}${body}</div>`;
+  };
+
+  // Render a details node as a <details> element whose children (expected to
+  // be detailsSummary / detailsContent nodes, which blockToHtml maps to
+  // summary[data-type="detailsSummary"] and div[data-type="detailsContent"])
+  // are rendered as HTML and concatenated without separators.
+  const detailsToHtml = (node: any): string => {
+    const body: string = (node.content || [])
+      .map((child: any) => blockToHtml(child))
+      .join("");
+    return "<details>" + body + "</details>";
+  };
+  // Render a detailsSummary node: its inline content goes inside a
+  // <summary data-type="detailsSummary"> element.
+  const detailsSummaryToHtml = (node: any): string => {
+    const label = inlineToHtml(node.content || []);
+    return `<summary data-type="detailsSummary">${label}</summary>`;
+  };
+  // Render a detailsContent node: its block children are rendered as HTML and
+  // wrapped in <div data-type="detailsContent">.
+  const detailsContentToHtml = (node: any): string => {
+    const body: string = (node.content || [])
+      .map((child: any) => blockToHtml(child))
+      .join("");
+    return "<div data-type=\"detailsContent\">" + body + "</div>";
+  };
+
+  // Render a taskList node as ul[data-type="taskList"] containing one
+  // li[data-type="taskItem"][data-checked="true|false"] per child item. Emitting
+  // this HTML shape directly (the one bridgeTaskLists in collaboration.ts is
+  // said to recognize) keeps task lists inside columns/cells from degrading to
+  // literal "- [ ]" text. Each item's block children are rendered via
+  // blockChildrenToHtml.
+  const taskListToHtml = (node: any): string => {
+    const renderItem = (entry: any): string => {
+      const state = entry.attrs?.checked ? "true" : "false";
+      const body = blockChildrenToHtml(entry);
+      return `<li data-type="taskItem" data-checked="${state}">${body}</li>`;
+    };
+    const rows: string[] = (node.content || []).map((entry: any) =>
+      renderItem(entry),
+    );
+    return `<ul data-type="taskList">${rows.join("")}</ul>`;
+  };
+
+  // Render one block node as HTML for use inside a raw-HTML container (a
+  // column or a spanned table cell). marked does NOT re-parse markdown inside a
+  // raw-HTML block, so EVERY block type that can appear there must be emitted
+  // as schema-matching HTML — never as markdown, or it would land as literal
+  // text on re-import. Block types whose processNode case already produces
+  // schema-matching HTML (math/media/embed/attachment/nested columns/spanned
+  // table) are delegated to processNode; the types processNode renders as
+  // markdown (image/blockquote/callout/details/hr/taskList) get explicit HTML
+  // here. Unknown block types are wrapped in a <div> so their content is kept.
+  //
+  // Takes a ProseMirror block node ({ type, attrs?, content? }) and returns
+  // its HTML string.
+  const blockToHtml = (block: any): string => {
+    const children = block.content || [];
+    switch (block.type) {
+      case "paragraph":
+        return `<p>${inlineToHtml(children)}</p>`;
+      case "heading": {
+        // The level is interpolated into the TAG NAME, where no escaping is
+        // possible, so coerce it to an integer in 1..6 first. A missing, zero,
+        // negative, fractional or non-numeric level falls back to / clamps to
+        // 1; anything above 6 clamps to 6 because HTML defines no <h7>+.
+        const rawLevel = Number(block.attrs?.level);
+        const level = Number.isInteger(rawLevel)
+          ? Math.min(6, Math.max(1, rawLevel))
+          : 1;
+        return `<h${level}>${inlineToHtml(children)}</h${level}>`;
+      }
+      case "bulletList":
+        return `<ul>${children
+          .map((li: any) => `<li>${blockChildrenToHtml(li)}</li>`)
+          .join("")}</ul>`;
+      case "orderedList":
+        return `<ol>${children
+          .map((li: any) => `<li>${blockChildrenToHtml(li)}</li>`)
+          .join("")}</ol>`;
+      case "codeBlock": {
+        const lang = block.attrs?.language || "";
+        // The code itself is element TEXT content (between <code> tags), so it
+        // must escape < > & — NOT the attribute escaper. The language rides in
+        // a class ATTRIBUTE, so it uses escapeAttr. Trailing newlines are
+        // stripped before escaping.
+        const code = escapeHtmlText(
+          children
+            .map(processNode)
+            .join("")
+            .replace(/\n+$/, ""),
+        );
+        const cls = lang ? ` class="language-${escapeAttr(lang)}"` : "";
+        return `<pre><code${cls}>${code}</code></pre>`;
+      }
+      case "image":
+        return imageToHtml(block);
+      case "blockquote":
+        return `<blockquote>${children.map(blockToHtml).join("")}</blockquote>`;
+      case "horizontalRule":
+        return "<hr>";
+      case "callout":
+        return calloutToHtml(block);
+      case "details":
+        return detailsToHtml(block);
+      case "detailsSummary":
+        return detailsSummaryToHtml(block);
+      case "detailsContent":
+        return detailsContentToHtml(block);
+      case "taskList":
+        return taskListToHtml(block);
+      case "taskItem":
+        // A bare taskItem (outside a taskList) still needs a wrapping list so
+        // the schema parses it; wrap it in a single-item taskList.
+        return taskListToHtml({ content: [block] });
+      // table (incl. spanned), columns/column, math, media, embed, attachment,
+      // etc. already emit schema-matching HTML from processNode.
+      case "table":
+      case "columns":
+      case "column":
+      case "mathBlock":
+      case "video":
+      case "audio":
+      case "pdf":
+      case "youtube":
+      case "embed":
+      case "attachment":
+      case "drawio":
+      case "excalidraw":
+        return processNode(block);
+      default:
+        // Any still-unhandled block type: NEVER fall back to markdown inside a
+        // raw-HTML block (it would become literal text). If any child is not a
+        // text node, treat the children as blocks and wrap their rendered HTML
+        // in a <div>; otherwise render the children as inline content.
+        if (children.length && children.some((c: any) => c.type !== "text")) {
+          return `<div>${children.map(blockToHtml).join("")}</div>`;
+        }
+        return `<div>${inlineToHtml(children)}</div>`;
+    }
+  };
+
+  // HTML counterpart of processListItem: renders each block child of a list
+  // item through blockToHtml and concatenates the results with no separator.
+  // An item without content yields the empty string.
+  const blockChildrenToHtml = (item: any): string => {
+    const blocks: any[] = item.content || [];
+    const rendered = blocks.map((child: any) => blockToHtml(child));
+    return rendered.join("");
+  };
+
+  // Lay out the already-rendered children of one list item beneath its marker.
+  //
+  // Each entry of `childStrings` is one rendered child block and may span
+  // several lines. Only the very first physical line (first line of the first
+  // child) receives the marker, written as `${prefix} ` followed by the line.
+  // Every other non-empty line — the rest of the first child and all lines of
+  // later children (nested lists, code blocks, extra paragraphs) — is indented
+  // by `indentWidth` spaces so it stays inside the item instead of collapsing
+  // to column 0. Empty lines stay empty so no trailing whitespace is emitted.
+  //
+  // `indentWidth` is supplied by the caller because it must match the LIST
+  // marker width, which can differ from the visible prefix width:
+  //   - bullet "- "          -> 2 columns
+  //   - task   "- [ ] "      -> marker is still "- " (the "[ ] " is content), 2
+  //   - ordered "1. "/"10. " -> 3/4 columns, scaling with the number's digits
+  // CommonMark anchors nested content to the marker column, so a too-small
+  // indent would make nested content re-parse as a sibling or loose content.
+  //
+  // Returns the lines joined with "\n" ("" when `childStrings` is empty).
+  const indentItemChildren = (
+    childStrings: string[],
+    prefix: string,
+    indentWidth: number,
+  ): string => {
+    const pad = " ".repeat(indentWidth);
+    const rendered = childStrings.reduce<string[]>((acc, chunk, chunkIdx) => {
+      const rows = chunk.split("\n").map((row, rowIdx) => {
+        // The marker goes on the first line of the first child only.
+        if (chunkIdx === 0 && rowIdx === 0) return `${prefix} ${row}`;
+        // Continuation line: indent unless blank.
+        return row.length ? `${pad}${row}` : "";
+      });
+      return acc.concat(rows);
+    }, []);
+    return rendered.join("\n");
+  };
+
+  // Render one bullet/ordered list item as markdown. `prefix` is the visible
+  // marker without its trailing space ("-", "1.", "10.", ...). An item with no
+  // content renders as the bare prefix. Otherwise each child is rendered with
+  // processNode and laid out by indentItemChildren.
+  const processListItem = (item: any, prefix: string): string => {
+    const rendered: string[] = (item.content || []).map(processNode);
+    if (!rendered.length) return prefix;
+    // The emitted marker is `${prefix} `, so continuation lines are indented by
+    // prefix.length + 1 columns: "-" -> 2, "1." -> 3, "10." -> 4. For these
+    // list kinds the visible prefix IS the list marker.
+    const markerWidth = prefix.length + 1;
+    return indentItemChildren(rendered, prefix, markerWidth);
+  };
+
+  // Render one task-list item as markdown: "- [x]" when attrs.checked is
+  // truthy, "- [ ]" otherwise, followed by the item's children rendered with
+  // processNode.
+  const processTaskItem = (item: any): string => {
+    const marker = item.attrs?.checked ? "- [x]" : "- [ ]";
+    const rendered: string[] = (item.content || []).map(processNode);
+    // An empty task item must still emit its checkbox row; laying out zero
+    // children would produce "" and the row would vanish.
+    // For non-empty items the continuation indent is a fixed 2: the list
+    // marker is just "- ", and the "[ ] "/"[x] " checkbox counts as item
+    // content, so the width must NOT be derived from the wider marker string.
+    return rendered.length > 0
+      ? indentItemChildren(rendered, marker, 2)
+      : marker;
+  };
+
+  return processNode(content).trim();
+}