import { encodeHtmlEmbedSource } from "./docmost-schema.js"; /** * Hard cap on processNode recursion depth (see the depth guard below). * * Chosen well above any realistic document (the deepest legitimate nesting the * editor can produce is far shallower) yet far below the point where the * converter's own call stack overflows. The heaviest shape (deeply nested * lists) costs ~5 JS frames per level and the runtime stack holds ~10k frames, * so the measured overflow is around level ~650 (deeply nested lists); 400 * leaves a comfortable margin while still rendering pathological-but-bounded * docs in full (the 200-level stress fixture reaches depth ~204). */ const MAX_NODE_DEPTH = 400; /** * Convert ProseMirror/TipTap JSON content to Markdown * Supports all Docmost-specific node types and extensions */ export function convertProseMirrorToMarkdown(content: any): string { if (!content || !content.content) return ""; // Escape a value interpolated into an HTML double-quoted attribute value // (textAlign, colors, image src, math `text`, all data-* attrs, etc.). In the // ATTRIBUTE context only the quote that delimits the value and the ampersand // that starts an entity are special, so we escape ONLY & " (and ' for safety // when single-quoted delimiters are used). We deliberately do NOT escape < or // >: the HTML re-parser (parse5/jsdom via @tiptap/html) does NOT decode // </> back inside attribute values, so escaping them would corrupt the // stored data (e.g. a math node's LaTeX `a < b`) and ACCUMULATE escapes on // every round-trip (`a < b` -> `a < b` -> `a < b`). Escaping & " // keeps the value inert against attribute-injection while staying idempotent. // NOTE: escape ONLY & and " here. The value is always wrapped in double // quotes, so " is the only delimiter; ' is NOT special in a double-quoted // value, and parse5 does not decode ' back inside attribute values, so // escaping ' would (like < >) corrupt the value and accumulate & on every // round-trip. 
Escaping & and " is idempotent (parse5 decodes them back). const escapeAttr = (value: unknown): string => String(value) .replace(/&/g, "&") .replace(/"/g, """); // Escape a value placed as HTML element TEXT content (between tags), where // <, >, and & are all significant. Used for text rendered inside raw-HTML // blocks (table cells / columns) so stored characters cannot inject markup. const escapeHtmlText = (value: unknown): string => String(value) .replace(/&/g, "&") .replace(//g, ">"); // Percent-encode characters that would break out of a markdown URL target // (...) — whitespace/newlines and parentheses — so a stored src stays a // single inert token (used for image/video/youtube srcs). const encodeMdUrl = (value: unknown): string => String(value || "") .replace(/\s/g, (c: string) => (c === " " ? "%20" : encodeURIComponent(c))) .replace(/\(/g, "%28") .replace(/\)/g, "%29"); // Recursion depth guard. processNode is mutually recursive (directly and via // processListItem/processTaskItem/blockToHtml), and a pathologically nested // document (e.g. tens of thousands of nested blockquotes) would otherwise // overflow the call stack and throw a RangeError, which would abort the sync // and prevent the page from ever being written. We track the live nesting // depth in a closure counter (the wrapper below) so we NEVER throw: past the // limit we stop recursing and emit the node's own text (or nothing) instead. // Normal documents never approach MAX_NODE_DEPTH, so their output is byte- // identical. NOTE: the wrapper signature is (node) only — several callers use // `.map(processNode)`, which would otherwise pass the array index as a second // argument; the wrapper ignores extra arguments so that is harmless. let nodeDepth = 0; const processNode = (node: any): string => { if (nodeDepth >= MAX_NODE_DEPTH) { // Bail out of deeper recursion without throwing. 
A text node still has // its own content worth keeping; a container at the limit collapses to // "" (its already-too-deep subtree is dropped) rather than overflowing. return typeof node?.text === "string" ? node.text : ""; } nodeDepth++; try { return processNodeInner(node); } finally { nodeDepth--; } }; const processNodeInner = (node: any): string => { const type = node.type; const nodeContent = node.content || []; switch (type) { case "doc": return nodeContent.map(processNode).join("\n\n"); case "paragraph": const text = nodeContent.map(processNode).join(""); const align = node.attrs?.textAlign; if (align && align !== "left") { return `
// When codeCombined is set, emit an HTML <code> element so the
// surrounding HTML marks can nest around it; otherwise use the
// plain backtick span.
textContent = codeCombined
? `${textContent}`
: `\`${textContent}\``;
break;
case "link": {
const href = mark.attrs?.href || "";
const title = mark.attrs?.title;
if (codeCombined) {
// Emit an HTML anchor so it can wrap the nested <code> element.
const safeHref = escapeAttr(href);
if (title) {
textContent = `${textContent}`;
} else {
textContent = `${textContent}`;
}
} else if (title) {
// Emit the optional markdown link title; escape an embedded
// double-quote so it cannot terminate the title string early.
const safeTitle = String(title).replace(/"/g, '\\"');
textContent = `[${textContent}](${href} "${safeTitle}")`;
} else {
textContent = `[${textContent}](${href})`;
}
break;
}
case "strike":
textContent = codeCombined
? `${textContent}`
: `~~${textContent}~~`;
break;
case "underline":
textContent = `${textContent}`;
break;
case "subscript":
textContent = `${textContent}`;
break;
case "superscript":
textContent = `${textContent}`;
break;
case "highlight": {
// Preserve a null/empty color as a plain highlight (a bare <mark>
// with no background-color); only emit the style when a
// color is actually set, so a plain highlight is not forced to
// yellow on export.
const color = mark.attrs?.color;
textContent = color
? `${textContent}`
: `${textContent}`;
break;
}
case "textStyle":
if (mark.attrs?.color) {
textContent = `${textContent}`;
}
break;
case "comment": {
// Emit the inline comment anchor so highlights round-trip. The
// schema's Comment mark parses span[data-comment-id] (attrs
// commentId/resolved).
const cid = mark.attrs?.commentId;
if (cid) {
const resolvedAttr = mark.attrs?.resolved
? ` data-resolved="true"`
: "";
textContent = `${textContent}`;
}
break;
}
}
}
}
return textContent;
case "codeBlock":
const language = node.attrs?.language || "";
// Strip ALL trailing newlines so the export is idempotent: marked
// re-adds exactly one trailing "\n" on import, so trimming only one
// here would let the text grow by "\n" on each round-trip. Removing
// every trailing newline makes repeated cycles stable.
const code = nodeContent
.map(processNode)
.join("")
.replace(/\n+$/, "");
// CommonMark: an inner ``` run inside the code would prematurely close
// a 3-backtick fence (corrupting the block on re-import). Use an outer
// fence one backtick longer than the longest backtick run in the code
// (minimum 3) so the inner fence is always content.
const longestBacktickRun = (code.match(/`+/g) || []).reduce(
(max: number, run: string) => Math.max(max, run.length),
0,
);
const fence = "`".repeat(Math.max(3, longestBacktickRun + 1));
return fence + language + "\n" + code + "\n" + fence;
case "bulletList":
return nodeContent
.map((item: any) => processListItem(item, "-"))
.join("\n");
case "orderedList":
return nodeContent
.map((item: any, index: number) =>
processListItem(item, `${index + 1}.`),
)
.join("\n");
case "taskList":
return nodeContent.map((item: any) => processTaskItem(item)).join("\n");
case "taskItem":
// Delegate to the same helper used by taskList so multi-block and
// nested task items render and indent consistently.
return processTaskItem(node);
case "listItem":
return nodeContent.map(processNode).join("\n");
case "blockquote":
// Prefix EVERY line of EVERY child with "> " and separate block-level
// children with a blank ">" line so code blocks / multi-paragraph
// quotes round-trip correctly.
return nodeContent
.map((n: any) =>
processNode(n)
.split("\n")
.map((line: string) => (line.length ? `> ${line}` : ">"))
.join("\n"),
)
.join("\n>\n");
case "horizontalRule":
return "---";
case "hardBreak":
// Two trailing spaces before the newline encode a markdown hard break;
// a bare "\n" would be reimported as a soft break and lost.
return " \n";
case "image": {
const imgAttrs = node.attrs || {};
// A top-level image with layout/identity attrs beyond src/alt cannot be
// expressed by markdown `![alt](src)` — width/height/align/size/
// attachmentId/aspectRatio would be silently dropped on export and lost
// on re-import. Emit the SAME schema-matching HTML <img> used inside
// columns (imageToHtml) so those attrs survive the round-trip. A bare image
// (only src/alt, optionally a title — which has no schema attr) keeps
// the lighter markdown form so existing image round-trip tests hold.
const hasLayoutAttrs =
imgAttrs.width != null ||
imgAttrs.height != null ||
imgAttrs.align ||
imgAttrs.size != null ||
imgAttrs.attachmentId ||
imgAttrs.aspectRatio != null;
if (hasLayoutAttrs) {
return imageToHtml(node);
}
const imgAlt = imgAttrs.alt || "";
// Neutralize characters that could break out of the markdown image
// URL: spaces/newlines and parentheses would terminate the (...) target
// and let a stored src inject following markdown/HTML. Percent-encode
// them so the URL stays a single inert token.
const imgSrc = encodeMdUrl(imgAttrs.src);
// No "caption" attribute exists in the Docmost image schema, so we do
// not emit one (the previous caption branch was dead).
return ``;
}
case "video": {
// Emit the schema-matching