import { encodeHtmlEmbedSource } from "./docmost-schema.js"; /** * Hard cap on processNode recursion depth (see the depth guard below). * * Chosen well above any realistic document (the deepest legitimate nesting the * editor can produce is far shallower) yet far below the point where the * converter's own call stack overflows. The heaviest shape (deeply nested * lists) costs ~5 JS frames per level and the runtime stack holds ~10k frames, * so the measured overflow is around level ~650 (deeply nested lists); 400 * leaves a comfortable margin while still rendering pathological-but-bounded * docs in full (the 200-level stress fixture reaches depth ~204). */ const MAX_NODE_DEPTH = 400; /** * Convert ProseMirror/TipTap JSON content to Markdown * Supports all Docmost-specific node types and extensions */ export function convertProseMirrorToMarkdown(content: any): string { if (!content || !content.content) return ""; // Escape a value interpolated into an HTML double-quoted attribute value // (textAlign, colors, image src, math `text`, all data-* attrs, etc.). In the // ATTRIBUTE context only the quote that delimits the value and the ampersand // that starts an entity are special, so we escape ONLY & " (and ' for safety // when single-quoted delimiters are used). We deliberately do NOT escape < or // >: the HTML re-parser (parse5/jsdom via @tiptap/html) does NOT decode // </> back inside attribute values, so escaping them would corrupt the // stored data (e.g. a math node's LaTeX `a < b`) and ACCUMULATE escapes on // every round-trip (`a < b` -> `a < b` -> `a &lt; b`). Escaping & " // keeps the value inert against attribute-injection while staying idempotent. // NOTE: escape ONLY & and " here. The value is always wrapped in double // quotes, so " is the only delimiter; ' is NOT special in a double-quoted // value, and parse5 does not decode ' back inside attribute values, so // escaping ' would (like < >) corrupt the value and accumulate & on every // round-trip. Escaping & and " is idempotent (parse5 decodes them back). const escapeAttr = (value: unknown): string => String(value) .replace(/&/g, "&") .replace(/"/g, """); // Escape a value placed as HTML element TEXT content (between tags), where // <, >, and & are all significant. Used for text rendered inside raw-HTML // blocks (table cells / columns) so stored characters cannot inject markup. const escapeHtmlText = (value: unknown): string => String(value) .replace(/&/g, "&") .replace(//g, ">"); // Percent-encode characters that would break out of a markdown URL target // (...) — whitespace/newlines and parentheses — so a stored src stays a // single inert token (used for image/video/youtube srcs). const encodeMdUrl = (value: unknown): string => String(value || "") .replace(/\s/g, (c: string) => (c === " " ? "%20" : encodeURIComponent(c))) .replace(/\(/g, "%28") .replace(/\)/g, "%29"); // Recursion depth guard. processNode is mutually recursive (directly and via // processListItem/processTaskItem/blockToHtml), and a pathologically nested // document (e.g. tens of thousands of nested blockquotes) would otherwise // overflow the call stack and throw a RangeError, which would abort the sync // and prevent the page from ever being written. We track the live nesting // depth in a closure counter (the wrapper below) so we NEVER throw: past the // limit we stop recursing and emit the node's own text (or nothing) instead. // Normal documents never approach MAX_NODE_DEPTH, so their output is byte- // identical. NOTE: the wrapper signature is (node) only — several callers use // `.map(processNode)`, which would otherwise pass the array index as a second // argument; the wrapper ignores extra arguments so that is harmless. let nodeDepth = 0; const processNode = (node: any): string => { if (nodeDepth >= MAX_NODE_DEPTH) { // Bail out of deeper recursion without throwing. A text node still has // its own content worth keeping; a container at the limit collapses to // "" (its already-too-deep subtree is dropped) rather than overflowing. return typeof node?.text === "string" ? node.text : ""; } nodeDepth++; try { return processNodeInner(node); } finally { nodeDepth--; } }; const processNodeInner = (node: any): string => { const type = node.type; const nodeContent = node.content || []; switch (type) { case "doc": return nodeContent.map(processNode).join("\n\n"); case "paragraph": const text = nodeContent.map(processNode).join(""); const align = node.attrs?.textAlign; if (align && align !== "left") { return `
${text}
`; } return text || ""; case "heading": const level = node.attrs?.level || 1; const headingText = nodeContent.map(processNode).join(""); return "#".repeat(level) + " " + headingText; case "text": let textContent = node.text || ""; // Apply marks (bold, italic, code, etc.) if (node.marks) { // The schema's `code` mark declares `excludes: "_"` — it excludes every // other inline mark — so the editor can NEVER produce a text run that // carries `code` together with another mark, and on import any // co-occurring mark is always dropped (the run comes back as code-only). // The lossless, byte-stable behavior is therefore: when a run has the // `code` mark, emit ONLY the backtick code span and ignore every other // mark, so md1 is already code-only and md2 === md1. Runs WITHOUT a code // mark are rendered exactly as before. const markTypes = node.marks.map((m: any) => m.type); const hasCode = markTypes.includes("code"); if (hasCode) { textContent = `\`${textContent}\``; return textContent; } const codeCombined = false; for (const mark of node.marks) { switch (mark.type) { case "bold": textContent = codeCombined ? `${textContent}` : `**${textContent}**`; break; case "italic": textContent = codeCombined ? `${textContent}` : `*${textContent}*`; break; case "code": // When combined with another mark, wrap as so the // surrounding HTML marks can nest around it; otherwise use the // plain backtick span. textContent = codeCombined ? `${textContent}` : `\`${textContent}\``; break; case "link": { const href = mark.attrs?.href || ""; const title = mark.attrs?.title; if (codeCombined) { // Emit an HTML anchor so it can wrap the nested . const safeHref = escapeAttr(href); if (title) { textContent = `${textContent}`; } else { textContent = `${textContent}`; } } else if (title) { // Emit the optional markdown link title; escape an embedded // double-quote so it cannot terminate the title string early. const safeTitle = String(title).replace(/"/g, '\\"'); textContent = `[${textContent}](${href} "${safeTitle}")`; } else { textContent = `[${textContent}](${href})`; } break; } case "strike": textContent = codeCombined ? `${textContent}` : `~~${textContent}~~`; break; case "underline": textContent = `${textContent}`; break; case "subscript": textContent = `${textContent}`; break; case "superscript": textContent = `${textContent}`; break; case "highlight": { // Preserve a null/empty color as a plain highlight (a bare // with no background-color); only emit the style when a // color is actually set, so a plain highlight is not forced to // yellow on export. const color = mark.attrs?.color; textContent = color ? `${textContent}` : `${textContent}`; break; } case "textStyle": if (mark.attrs?.color) { textContent = `${textContent}`; } break; case "comment": { // Emit the inline comment anchor so highlights round-trip. The // schema's Comment mark parses span[data-comment-id] (attrs // commentId/resolved). const cid = mark.attrs?.commentId; if (cid) { const resolvedAttr = mark.attrs?.resolved ? ` data-resolved="true"` : ""; textContent = `${textContent}`; } break; } } } } return textContent; case "codeBlock": const language = node.attrs?.language || ""; // Strip ALL trailing newlines so the export is idempotent: marked // re-adds exactly one trailing "\n" on import, so trimming only one // here would let the text grow by "\n" on each round-trip. Removing // every trailing newline makes repeated cycles stable. const code = nodeContent .map(processNode) .join("") .replace(/\n+$/, ""); // CommonMark: an inner ``` run inside the code would prematurely close // a 3-backtick fence (corrupting the block on re-import). Use an outer // fence one backtick longer than the longest backtick run in the code // (minimum 3) so the inner fence is always content. const longestBacktickRun = (code.match(/`+/g) || []).reduce( (max: number, run: string) => Math.max(max, run.length), 0, ); const fence = "`".repeat(Math.max(3, longestBacktickRun + 1)); return fence + language + "\n" + code + "\n" + fence; case "bulletList": return nodeContent .map((item: any) => processListItem(item, "-")) .join("\n"); case "orderedList": return nodeContent .map((item: any, index: number) => processListItem(item, `${index + 1}.`), ) .join("\n"); case "taskList": return nodeContent.map((item: any) => processTaskItem(item)).join("\n"); case "taskItem": // Delegate to the same helper used by taskList so multi-block and // nested task items render and indent consistently. return processTaskItem(node); case "listItem": return nodeContent.map(processNode).join("\n"); case "blockquote": // Prefix EVERY line of EVERY child with "> " and separate block-level // children with a blank ">" line so code blocks / multi-paragraph // quotes round-trip correctly. return nodeContent .map((n: any) => processNode(n) .split("\n") .map((line: string) => (line.length ? `> ${line}` : ">")) .join("\n"), ) .join("\n>\n"); case "horizontalRule": return "---"; case "hardBreak": // Two trailing spaces before the newline encode a markdown hard break; // a bare "\n" would be reimported as a soft break and lost. return " \n"; case "image": { const imgAttrs = node.attrs || {}; // A top-level image with layout/identity attrs beyond src/alt cannot be // expressed by markdown `![](src)` — width/height/align/size/ // attachmentId/aspectRatio would be silently dropped on export and lost // on re-import. Emit the SAME schema-matching used inside columns // (imageToHtml) so those attrs survive the round-trip. A bare image // (only src/alt, optionally a title — which has no schema attr) keeps // the lighter markdown form so existing image round-trip tests hold. const hasLayoutAttrs = imgAttrs.width != null || imgAttrs.height != null || imgAttrs.align || imgAttrs.size != null || imgAttrs.attachmentId || imgAttrs.aspectRatio != null; if (hasLayoutAttrs) { return imageToHtml(node); } const imgAlt = imgAttrs.alt || ""; // Neutralize characters that could break out of the markdown image // URL: spaces/newlines and parentheses would terminate the (...) target // and let a stored src inject following markdown/HTML. Percent-encode // them so the URL stays a single inert token. const imgSrc = encodeMdUrl(imgAttrs.src); // No "caption" attribute exists in the Docmost image schema, so we do // not emit one (the previous caption branch was dead). return `![${imgAlt}](${imgSrc})`; } case "video": { // Emit the schema-matching