/**
 * Convert ProseMirror/TipTap JSON content to Markdown
 * Supports all Docmost-specific node types and extensions
 */
export function convertProseMirrorToMarkdown(content: any): string {
  if (!content || !content.content) return "";

  // Escape a value interpolated into an HTML double-quoted attribute value
  // (textAlign, colors, image src, math `text`, data-* attrs, etc.).
  //
  // Only `&` and `"` are escaped: the value is always wrapped in double
  // quotes, so `"` is the only delimiter, and `&` is the only character that
  // can start an entity. `<`, `>` and `'` are deliberately left untouched.
  // The stated rationale is that the HTML re-parser (parse5/jsdom via
  // @tiptap/html) does not decode `&lt;` / `&gt;` / `&#39;` back inside
  // attribute values, so escaping them would corrupt stored data (e.g. a math
  // node's LaTeX `a < b`) and accumulate escapes on every round-trip
  // (`a < b` -> `a &lt; b` -> `a &amp;lt; b`), whereas `&amp;` and `&quot;`
  // are decoded back and therefore round-trip cleanly.
  // NOTE(review): that parser behaviour is carried over from the existing
  // comment and cannot be verified from this file — confirm before relying
  // on it.
  //
  // `&` must be replaced first; otherwise the `&` introduced by `&quot;`
  // would itself be escaped again.
  const escapeAttr = (value: unknown): string =>
    String(value)
      .replace(/&/g, "&amp;")
      .replace(/"/g, "&quot;");

  // Escape a value placed as HTML element TEXT content (between tags), where
  // <, >, and & are all significant. Used for text rendered inside raw-HTML
  // blocks (table cells / columns) so stored characters cannot inject markup.
  // As above, `&` is replaced first so the entities produced for `<` and `>`
  // are not double-escaped. Quotes are not special in text content and are
  // left as-is.
  const escapeHtmlText = (value: unknown): string =>
    String(value)
      .replace(/&/g, "&amp;")
      .replace(/</g, "&lt;")
      .replace(/>/g, "&gt;");

  // Percent-encode characters that would break out of a markdown URL target
  // (...)
— whitespace/newlines and parentheses — so a stored src stays a // single inert token (used for image/video/youtube srcs). const encodeMdUrl = (value: unknown): string => String(value || "") .replace(/\s/g, (c: string) => (c === " " ? "%20" : encodeURIComponent(c))) .replace(/\(/g, "%28") .replace(/\)/g, "%29"); const processNode = (node: any): string => { const type = node.type; const nodeContent = node.content || []; switch (type) { case "doc": return nodeContent.map(processNode).join("\n\n"); case "paragraph": const text = nodeContent.map(processNode).join(""); const align = node.attrs?.textAlign; if (align && align !== "left") { return `
${text}
`; } return text || ""; case "heading": const level = node.attrs?.level || 1; const headingText = nodeContent.map(processNode).join(""); return "#".repeat(level) + " " + headingText; case "text": let textContent = node.text || ""; // Apply marks (bold, italic, code, etc.) if (node.marks) { // Markdown code spans (`...`) cannot carry inner formatting, so when a // run has the `code` mark alongside ANY other mark, backtick syntax // would leak literal ** / []() into the code text. In that case emit // nested HTML ( innermost, the other marks wrapping it as HTML) // so the output is at least well-formed and re-parseable. // // NOTE: this does NOT round-trip both marks. The schema's `code` mark // has `excludes: "_"` (it excludes every other mark), so on import the // co-occurring mark is always dropped — the run comes back as `code` // only. We keep the emission simple and accept that the other mark is // lost; preserving both is impossible while `code` excludes them. // Only use the backtick form when `code` is the sole mark. const markTypes = node.marks.map((m: any) => m.type); const hasCode = markTypes.includes("code"); const codeCombined = hasCode && markTypes.length > 1; for (const mark of node.marks) { switch (mark.type) { case "bold": textContent = codeCombined ? `${textContent}` : `**${textContent}**`; break; case "italic": textContent = codeCombined ? `${textContent}` : `*${textContent}*`; break; case "code": // When combined with another mark, wrap as so the // surrounding HTML marks can nest around it; otherwise use the // plain backtick span. textContent = codeCombined ? `${textContent}` : `\`${textContent}\``; break; case "link": { const href = mark.attrs?.href || ""; const title = mark.attrs?.title; if (codeCombined) { // Emit an HTML anchor so it can wrap the nested . 
const safeHref = escapeAttr(href); if (title) { textContent = `${textContent}`; } else { textContent = `${textContent}`; } } else if (title) { // Emit the optional markdown link title; escape an embedded // double-quote so it cannot terminate the title string early. const safeTitle = String(title).replace(/"/g, '\\"'); textContent = `[${textContent}](${href} "${safeTitle}")`; } else { textContent = `[${textContent}](${href})`; } break; } case "strike": textContent = codeCombined ? `${textContent}` : `~~${textContent}~~`; break; case "underline": textContent = `${textContent}`; break; case "subscript": textContent = `${textContent}`; break; case "superscript": textContent = `${textContent}`; break; case "highlight": { // Preserve a null/empty color as a plain highlight (a bare // with no background-color); only emit the style when a // color is actually set, so a plain highlight is not forced to // yellow on export. const color = mark.attrs?.color; textContent = color ? `${textContent}` : `${textContent}`; break; } case "textStyle": if (mark.attrs?.color) { textContent = `${textContent}`; } break; case "comment": { // Emit the inline comment anchor so highlights round-trip. The // schema's Comment mark parses span[data-comment-id] (attrs // commentId/resolved). const cid = mark.attrs?.commentId; if (cid) { const resolvedAttr = mark.attrs?.resolved ? ` data-resolved="true"` : ""; textContent = `${textContent}`; } break; } } } } return textContent; case "codeBlock": const language = node.attrs?.language || ""; // Strip ALL trailing newlines so the export is idempotent: marked // re-adds exactly one trailing "\n" on import, so trimming only one // here would let the text grow by "\n" on each round-trip. Removing // every trailing newline makes repeated cycles stable. 
const code = nodeContent .map(processNode) .join("") .replace(/\n+$/, ""); return "```" + language + "\n" + code + "\n```"; case "bulletList": return nodeContent .map((item: any) => processListItem(item, "-")) .join("\n"); case "orderedList": return nodeContent .map((item: any, index: number) => processListItem(item, `${index + 1}.`), ) .join("\n"); case "taskList": return nodeContent.map((item: any) => processTaskItem(item)).join("\n"); case "taskItem": // Delegate to the same helper used by taskList so multi-block and // nested task items render and indent consistently. return processTaskItem(node); case "listItem": return nodeContent.map(processNode).join("\n"); case "blockquote": // Prefix EVERY line of EVERY child with "> " and separate block-level // children with a blank ">" line so code blocks / multi-paragraph // quotes round-trip correctly. return nodeContent .map((n: any) => processNode(n) .split("\n") .map((line: string) => (line.length ? `> ${line}` : ">")) .join("\n"), ) .join("\n>\n"); case "horizontalRule": return "---"; case "hardBreak": // Two trailing spaces before the newline encode a markdown hard break; // a bare "\n" would be reimported as a soft break and lost. return " \n"; case "image": const imgAlt = node.attrs?.alt || ""; // Neutralize characters that could break out of the markdown image // URL: spaces/newlines and parentheses would terminate the (...) target // and let a stored src inject following markdown/HTML. Percent-encode // them so the URL stays a single inert token. const imgSrc = encodeMdUrl(node.attrs?.src); // No "caption" attribute exists in the Docmost image schema, so we do // not emit one (the previous caption branch was dead). return `![${imgAlt}](${imgSrc})`; case "video": { // Emit the schema-matching