/** * Convert ProseMirror/TipTap JSON content to Markdown * Supports all Docmost-specific node types and extensions */ export function convertProseMirrorToMarkdown(content) { if (!content || !content.content) return ""; // Escape a value interpolated into an HTML double-quoted attribute value // (textAlign, colors, image src, math `text`, all data-* attrs, etc.). In the // ATTRIBUTE context only the quote that delimits the value and the ampersand // that starts an entity are special, so we escape ONLY & " (and ' for safety // when single-quoted delimiters are used). We deliberately do NOT escape < or // >: the HTML re-parser (parse5/jsdom via @tiptap/html) does NOT decode // </> back inside attribute values, so escaping them would corrupt the // stored data (e.g. a math node's LaTeX `a < b`) and ACCUMULATE escapes on // every round-trip (`a < b` -> `a < b` -> `a < b`). Escaping & " // keeps the value inert against attribute-injection while staying idempotent. // NOTE: escape ONLY & and " here. The value is always wrapped in double // quotes, so " is the only delimiter; ' is NOT special in a double-quoted // value, and parse5 does not decode ' back inside attribute values, so // escaping ' would (like < >) corrupt the value and accumulate & on every // round-trip. Escaping & and " is idempotent (parse5 decodes them back). const escapeAttr = (value) => String(value) .replace(/&/g, "&") .replace(/"/g, """); // Escape a value placed as HTML element TEXT content (between tags), where // <, >, and & are all significant. Used for text rendered inside raw-HTML // blocks (table cells / columns) so stored characters cannot inject markup. const escapeHtmlText = (value) => String(value) .replace(/&/g, "&") .replace(//g, ">"); // Percent-encode characters that would break out of a markdown URL target // (...) — whitespace/newlines and parentheses — so a stored src stays a // single inert token (used for image/video/youtube srcs). 
const encodeMdUrl = (value) => String(value || "") .replace(/\s/g, (c) => (c === " " ? "%20" : encodeURIComponent(c))) .replace(/\(/g, "%28") .replace(/\)/g, "%29"); const processNode = (node) => { const type = node.type; const nodeContent = node.content || []; switch (type) { case "doc": return nodeContent.map(processNode).join("\n\n"); case "paragraph": const text = nodeContent.map(processNode).join(""); const align = node.attrs?.textAlign; if (align && align !== "left") { return `
so the
// surrounding HTML marks can nest around it; otherwise use the
// plain backtick span.
textContent = codeCombined
? `${textContent}`
: `\`${textContent}\``;
break;
case "link": {
const href = mark.attrs?.href || "";
const title = mark.attrs?.title;
if (codeCombined) {
// Emit an HTML anchor so it can wrap the nested <code>.
const safeHref = escapeAttr(href);
if (title) {
textContent = `${textContent}`;
}
else {
textContent = `${textContent}`;
}
}
else if (title) {
// Emit the optional markdown link title; escape an embedded
// double-quote so it cannot terminate the title string early.
const safeTitle = String(title).replace(/"/g, '\\"');
textContent = `[${textContent}](${href} "${safeTitle}")`;
}
else {
textContent = `[${textContent}](${href})`;
}
break;
}
case "strike":
textContent = codeCombined
? `${textContent}`
: `~~${textContent}~~`;
break;
case "underline":
textContent = `${textContent}`;
break;
case "subscript":
textContent = `${textContent}`;
break;
case "superscript":
textContent = `${textContent}`;
break;
case "highlight": {
// Preserve a null/empty color as a plain highlight (a bare <mark>
// with no background-color); only emit the style when a
// color is actually set, so a plain highlight is not forced to
// yellow on export.
const color = mark.attrs?.color;
textContent = color
? `${textContent}`
: `${textContent}`;
break;
}
case "textStyle":
if (mark.attrs?.color) {
textContent = `${textContent}`;
}
break;
case "comment": {
// Emit the inline comment anchor so highlights round-trip. The
// schema's Comment mark parses span[data-comment-id] (attrs
// commentId/resolved).
const cid = mark.attrs?.commentId;
if (cid) {
const resolvedAttr = mark.attrs?.resolved
? ` data-resolved="true"`
: "";
textContent = `${textContent}`;
}
break;
}
}
}
}
return textContent;
case "codeBlock":
const language = node.attrs?.language || "";
// Strip ALL trailing newlines so the export is idempotent: marked
// re-adds exactly one trailing "\n" on import, so trimming only one
// here would let the text grow by "\n" on each round-trip. Removing
// every trailing newline makes repeated cycles stable.
const code = nodeContent
.map(processNode)
.join("")
.replace(/\n+$/, "");
return "```" + language + "\n" + code + "\n```";
case "bulletList":
return nodeContent
.map((item) => processListItem(item, "-"))
.join("\n");
case "orderedList":
return nodeContent
.map((item, index) => processListItem(item, `${index + 1}.`))
.join("\n");
case "taskList":
return nodeContent.map((item) => processTaskItem(item)).join("\n");
case "taskItem":
// Delegate to the same helper used by taskList so multi-block and
// nested task items render and indent consistently.
return processTaskItem(node);
case "listItem":
return nodeContent.map(processNode).join("\n");
case "blockquote":
// Prefix EVERY line of EVERY child with "> " and separate block-level
// children with a blank ">" line so code blocks / multi-paragraph
// quotes round-trip correctly.
return nodeContent
.map((n) => processNode(n)
.split("\n")
.map((line) => (line.length ? `> ${line}` : ">"))
.join("\n"))
.join("\n>\n");
case "horizontalRule":
return "---";
case "hardBreak":
// Two trailing spaces before the newline encode a markdown hard break;
// a bare "\n" would be reimported as a soft break and lost.
return " \n";
case "image":
const imgAlt = node.attrs?.alt || "";
// Neutralize characters that could break out of the markdown image
// URL: spaces/newlines and parentheses would terminate the (...) target
// and let a stored src inject following markdown/HTML. Percent-encode
// them so the URL stays a single inert token.
const imgSrc = encodeMdUrl(node.attrs?.src);
// No "caption" attribute exists in the Docmost image schema, so we do
// not emit one (the previous caption branch was dead).
return ``;
case "video": {
// Emit the schema-matching