feat(sync): scaffold monorepo, extract docmost-client, add Phase-0 harness + read-only pull
Lock the access-layer decision (REST only) and start implementation per SPEC. - monorepo (npm workspaces): packages/docmost-client = DocmostClient + lib/* copied 1:1 from docmost-mcp/src (backport target), plus bannered sync methods (listTrash, restorePage, listAllSpacePages, exportPageBody, listRecentSince / collectRecentSince cursor scan) - engine stays the root app per AGENTS.md (src/, test/, build/, data/, settings.ts); add roundtrip.ts (SPEC §11 idempotency harness), pull.ts (SPEC §6 read-only Docmost->FS mirror), sanitize.ts (SPEC §12 filenames, path-traversal-safe) - Dockerfile builds the workspace lib before the app; vitest gates CI - exportPageBody never touches /comments (SPEC §3); serializeDocmostMarkdownBody emits meta + body only - SPEC: resolve access-layer (REST), reflect root-engine layout + REST pagination - tests: sanitize (incl. dot-traversal), collectRecentSince (cutoff/dedup/cap), stripBlockIds, markdown round-trip byte-stability Note: raw ProseMirror round-trip is byte-stable in Markdown but not yet attribute- idempotent (SPEC §11 Задача №0, before Phase 2).
This commit is contained in:
861
packages/docmost-client/src/lib/markdown-converter.ts
Normal file
861
packages/docmost-client/src/lib/markdown-converter.ts
Normal file
@@ -0,0 +1,861 @@
|
||||
/**
|
||||
* Convert ProseMirror/TipTap JSON content to Markdown
|
||||
* Supports all Docmost-specific node types and extensions
|
||||
*/
|
||||
export function convertProseMirrorToMarkdown(content: any): string {
|
||||
if (!content || !content.content) return "";
|
||||
|
||||
// Escape a value interpolated into an HTML double-quoted attribute value
|
||||
// (textAlign, colors, image src, math `text`, all data-* attrs, etc.). In the
|
||||
// ATTRIBUTE context only the quote that delimits the value and the ampersand
|
||||
// that starts an entity are special, so we escape ONLY & " (and ' for safety
|
||||
// when single-quoted delimiters are used). We deliberately do NOT escape < or
|
||||
// >: the HTML re-parser (parse5/jsdom via @tiptap/html) does NOT decode
|
||||
// </> back inside attribute values, so escaping them would corrupt the
|
||||
// stored data (e.g. a math node's LaTeX `a < b`) and ACCUMULATE escapes on
|
||||
// every round-trip (`a < b` -> `a < b` -> `a &lt; b`). Escaping & "
|
||||
// keeps the value inert against attribute-injection while staying idempotent.
|
||||
// NOTE: escape ONLY & and " here. The value is always wrapped in double
|
||||
// quotes, so " is the only delimiter; ' is NOT special in a double-quoted
|
||||
// value, and parse5 does not decode ' back inside attribute values, so
|
||||
// escaping ' would (like < >) corrupt the value and accumulate & on every
|
||||
// round-trip. Escaping & and " is idempotent (parse5 decodes them back).
|
||||
const escapeAttr = (value: unknown): string =>
|
||||
String(value)
|
||||
.replace(/&/g, "&")
|
||||
.replace(/"/g, """);
|
||||
|
||||
// Escape a value placed as HTML element TEXT content (between tags), where
|
||||
// <, >, and & are all significant. Used for text rendered inside raw-HTML
|
||||
// blocks (table cells / columns) so stored characters cannot inject markup.
|
||||
const escapeHtmlText = (value: unknown): string =>
|
||||
String(value)
|
||||
.replace(/&/g, "&")
|
||||
.replace(/</g, "<")
|
||||
.replace(/>/g, ">");
|
||||
|
||||
// Percent-encode characters that would break out of a markdown URL target
|
||||
// (...) — whitespace/newlines and parentheses — so a stored src stays a
|
||||
// single inert token (used for image/video/youtube srcs).
|
||||
const encodeMdUrl = (value: unknown): string =>
|
||||
String(value || "")
|
||||
.replace(/\s/g, (c: string) => (c === " " ? "%20" : encodeURIComponent(c)))
|
||||
.replace(/\(/g, "%28")
|
||||
.replace(/\)/g, "%29");
|
||||
|
||||
const processNode = (node: any): string => {
|
||||
const type = node.type;
|
||||
const nodeContent = node.content || [];
|
||||
|
||||
switch (type) {
|
||||
case "doc":
|
||||
return nodeContent.map(processNode).join("\n\n");
|
||||
|
||||
case "paragraph":
|
||||
const text = nodeContent.map(processNode).join("");
|
||||
const align = node.attrs?.textAlign;
|
||||
if (align && align !== "left") {
|
||||
return `<div align="${escapeAttr(align)}">${text}</div>`;
|
||||
}
|
||||
return text || "";
|
||||
|
||||
case "heading":
|
||||
const level = node.attrs?.level || 1;
|
||||
const headingText = nodeContent.map(processNode).join("");
|
||||
return "#".repeat(level) + " " + headingText;
|
||||
|
||||
case "text":
|
||||
let textContent = node.text || "";
|
||||
// Apply marks (bold, italic, code, etc.)
|
||||
if (node.marks) {
|
||||
// Markdown code spans (`...`) cannot carry inner formatting, so when a
|
||||
// run has the `code` mark alongside ANY other mark, backtick syntax
|
||||
// would leak literal ** / []() into the code text. In that case emit
|
||||
// nested HTML (<code> innermost, the other marks wrapping it as HTML)
|
||||
// so the output is at least well-formed and re-parseable.
|
||||
//
|
||||
// NOTE: this does NOT round-trip both marks. The schema's `code` mark
|
||||
// has `excludes: "_"` (it excludes every other mark), so on import the
|
||||
// co-occurring mark is always dropped — the run comes back as `code`
|
||||
// only. We keep the emission simple and accept that the other mark is
|
||||
// lost; preserving both is impossible while `code` excludes them.
|
||||
// Only use the backtick form when `code` is the sole mark.
|
||||
const markTypes = node.marks.map((m: any) => m.type);
|
||||
const hasCode = markTypes.includes("code");
|
||||
const codeCombined = hasCode && markTypes.length > 1;
|
||||
for (const mark of node.marks) {
|
||||
switch (mark.type) {
|
||||
case "bold":
|
||||
textContent = codeCombined
|
||||
? `<strong>${textContent}</strong>`
|
||||
: `**${textContent}**`;
|
||||
break;
|
||||
case "italic":
|
||||
textContent = codeCombined
|
||||
? `<em>${textContent}</em>`
|
||||
: `*${textContent}*`;
|
||||
break;
|
||||
case "code":
|
||||
// When combined with another mark, wrap as <code> so the
|
||||
// surrounding HTML marks can nest around it; otherwise use the
|
||||
// plain backtick span.
|
||||
textContent = codeCombined
|
||||
? `<code>${textContent}</code>`
|
||||
: `\`${textContent}\``;
|
||||
break;
|
||||
case "link": {
|
||||
const href = mark.attrs?.href || "";
|
||||
const title = mark.attrs?.title;
|
||||
if (codeCombined) {
|
||||
// Emit an HTML anchor so it can wrap the nested <code>.
|
||||
const safeHref = escapeAttr(href);
|
||||
if (title) {
|
||||
textContent = `<a href="${safeHref}" title="${escapeAttr(String(title))}">${textContent}</a>`;
|
||||
} else {
|
||||
textContent = `<a href="${safeHref}">${textContent}</a>`;
|
||||
}
|
||||
} else if (title) {
|
||||
// Emit the optional markdown link title; escape an embedded
|
||||
// double-quote so it cannot terminate the title string early.
|
||||
const safeTitle = String(title).replace(/"/g, '\\"');
|
||||
textContent = `[${textContent}](${href} "${safeTitle}")`;
|
||||
} else {
|
||||
textContent = `[${textContent}](${href})`;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case "strike":
|
||||
textContent = codeCombined
|
||||
? `<s>${textContent}</s>`
|
||||
: `~~${textContent}~~`;
|
||||
break;
|
||||
case "underline":
|
||||
textContent = `<u>${textContent}</u>`;
|
||||
break;
|
||||
case "subscript":
|
||||
textContent = `<sub>${textContent}</sub>`;
|
||||
break;
|
||||
case "superscript":
|
||||
textContent = `<sup>${textContent}</sup>`;
|
||||
break;
|
||||
case "highlight": {
|
||||
// Preserve a null/empty color as a plain highlight (a bare
|
||||
// <mark> with no background-color); only emit the style when a
|
||||
// color is actually set, so a plain highlight is not forced to
|
||||
// yellow on export.
|
||||
const color = mark.attrs?.color;
|
||||
textContent = color
|
||||
? `<mark style="background-color: ${escapeAttr(color)}">${textContent}</mark>`
|
||||
: `<mark>${textContent}</mark>`;
|
||||
break;
|
||||
}
|
||||
case "textStyle":
|
||||
if (mark.attrs?.color) {
|
||||
textContent = `<span style="color: ${escapeAttr(mark.attrs.color)}">${textContent}</span>`;
|
||||
}
|
||||
break;
|
||||
case "comment": {
|
||||
// Emit the inline comment anchor so highlights round-trip. The
|
||||
// schema's Comment mark parses span[data-comment-id] (attrs
|
||||
// commentId/resolved).
|
||||
const cid = mark.attrs?.commentId;
|
||||
if (cid) {
|
||||
const resolvedAttr = mark.attrs?.resolved
|
||||
? ` data-resolved="true"`
|
||||
: "";
|
||||
textContent = `<span data-comment-id="${escapeAttr(cid)}"${resolvedAttr}>${textContent}</span>`;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return textContent;
|
||||
|
||||
case "codeBlock":
|
||||
const language = node.attrs?.language || "";
|
||||
// Strip ALL trailing newlines so the export is idempotent: marked
|
||||
// re-adds exactly one trailing "\n" on import, so trimming only one
|
||||
// here would let the text grow by "\n" on each round-trip. Removing
|
||||
// every trailing newline makes repeated cycles stable.
|
||||
const code = nodeContent
|
||||
.map(processNode)
|
||||
.join("")
|
||||
.replace(/\n+$/, "");
|
||||
return "```" + language + "\n" + code + "\n```";
|
||||
|
||||
case "bulletList":
|
||||
return nodeContent
|
||||
.map((item: any) => processListItem(item, "-"))
|
||||
.join("\n");
|
||||
|
||||
case "orderedList":
|
||||
return nodeContent
|
||||
.map((item: any, index: number) =>
|
||||
processListItem(item, `${index + 1}.`),
|
||||
)
|
||||
.join("\n");
|
||||
|
||||
case "taskList":
|
||||
return nodeContent.map((item: any) => processTaskItem(item)).join("\n");
|
||||
|
||||
case "taskItem":
|
||||
// Delegate to the same helper used by taskList so multi-block and
|
||||
// nested task items render and indent consistently.
|
||||
return processTaskItem(node);
|
||||
|
||||
case "listItem":
|
||||
return nodeContent.map(processNode).join("\n");
|
||||
|
||||
case "blockquote":
|
||||
// Prefix EVERY line of EVERY child with "> " and separate block-level
|
||||
// children with a blank ">" line so code blocks / multi-paragraph
|
||||
// quotes round-trip correctly.
|
||||
return nodeContent
|
||||
.map((n: any) =>
|
||||
processNode(n)
|
||||
.split("\n")
|
||||
.map((line: string) => (line.length ? `> ${line}` : ">"))
|
||||
.join("\n"),
|
||||
)
|
||||
.join("\n>\n");
|
||||
|
||||
case "horizontalRule":
|
||||
return "---";
|
||||
|
||||
case "hardBreak":
|
||||
// Two trailing spaces before the newline encode a markdown hard break;
|
||||
// a bare "\n" would be reimported as a soft break and lost.
|
||||
return " \n";
|
||||
|
||||
case "image":
|
||||
const imgAlt = node.attrs?.alt || "";
|
||||
// Neutralize characters that could break out of the markdown image
|
||||
// URL: spaces/newlines and parentheses would terminate the (...) target
|
||||
// and let a stored src inject following markdown/HTML. Percent-encode
|
||||
// them so the URL stays a single inert token.
|
||||
const imgSrc = encodeMdUrl(node.attrs?.src);
|
||||
// No "caption" attribute exists in the Docmost image schema, so we do
|
||||
// not emit one (the previous caption branch was dead).
|
||||
return ``;
|
||||
|
||||
case "video": {
|
||||
// Emit the schema-matching <video> element so generateJSON rebuilds the
|
||||
// node with its attrs intact. The schema's parseHTML reads src/aria-label
|
||||
// from the standard attributes and the remaining attrs from data-*.
|
||||
const attrs = node.attrs || {};
|
||||
const parts: string[] = [`src="${escapeAttr(attrs.src ?? "")}"`];
|
||||
if (attrs.alt) parts.push(`aria-label="${escapeAttr(attrs.alt)}"`);
|
||||
if (attrs.attachmentId)
|
||||
parts.push(
|
||||
`data-attachment-id="${escapeAttr(attrs.attachmentId)}"`,
|
||||
);
|
||||
if (attrs.width != null)
|
||||
parts.push(`width="${escapeAttr(attrs.width)}"`);
|
||||
if (attrs.height != null)
|
||||
parts.push(`height="${escapeAttr(attrs.height)}"`);
|
||||
if (attrs.size != null)
|
||||
parts.push(`data-size="${escapeAttr(attrs.size)}"`);
|
||||
if (attrs.align)
|
||||
parts.push(`data-align="${escapeAttr(attrs.align)}"`);
|
||||
if (attrs.aspectRatio != null)
|
||||
parts.push(`data-aspect-ratio="${escapeAttr(attrs.aspectRatio)}"`);
|
||||
// Wrap in a block <div> so marked treats it as a block (a bare <video>
|
||||
// is inline-level HTML and marked wraps it in <p>, leaving a spurious
|
||||
// empty paragraph beside the hoisted block atom). The wrapper has no
|
||||
// data-type, so the schema parser ignores it and just hoists the video.
|
||||
return `<div><video ${parts.join(" ")}></video></div>`;
|
||||
}
|
||||
|
||||
case "youtube": {
|
||||
// Emit the schema-matching div[data-type="youtube"]; the schema reads
|
||||
// src from data-src and width/height/align from data-* attributes.
|
||||
const attrs = node.attrs || {};
|
||||
const parts: string[] = [
|
||||
`data-type="youtube"`,
|
||||
`data-src="${escapeAttr(attrs.src ?? "")}"`,
|
||||
];
|
||||
if (attrs.width != null)
|
||||
parts.push(`data-width="${escapeAttr(attrs.width)}"`);
|
||||
if (attrs.height != null)
|
||||
parts.push(`data-height="${escapeAttr(attrs.height)}"`);
|
||||
if (attrs.align)
|
||||
parts.push(`data-align="${escapeAttr(attrs.align)}"`);
|
||||
return `<div ${parts.join(" ")}></div>`;
|
||||
}
|
||||
|
||||
case "table": {
|
||||
// A GFM pipe table cannot represent merged cells. If ANY cell carries
|
||||
// colspan>1 or rowspan>1, a pipe table would corrupt the grid on
|
||||
// re-import, so emit the WHOLE table as raw HTML <table> instead: the
|
||||
// schema's table family parseHTML (tag table/tr/td/th, with colspan/
|
||||
// rowspan read from the same-named HTML attrs and align via parseHTML)
|
||||
// round-trips it faithfully. Otherwise keep the lighter GFM pipe table.
|
||||
const tableRows: any[] = nodeContent;
|
||||
if (tableRows.length === 0) return "";
|
||||
const hasSpan = tableRows.some((row: any) =>
|
||||
(row.content || []).some(
|
||||
(cell: any) =>
|
||||
(cell.attrs?.colspan ?? 1) > 1 || (cell.attrs?.rowspan ?? 1) > 1,
|
||||
),
|
||||
);
|
||||
|
||||
if (hasSpan) {
|
||||
// Render each cell's block children to HTML (marked does NOT parse
|
||||
// markdown inside a raw HTML block, so emitting markdown here would
|
||||
// leak literal ** / `` into the cell). blockToHtml mirrors the schema
|
||||
// HTML so inner formatting re-parses into the right marks/nodes.
|
||||
const renderHtmlCell = (cell: any): string => {
|
||||
const tag = cell.type === "tableHeader" ? "th" : "td";
|
||||
const a = cell.attrs || {};
|
||||
const cellParts: string[] = [];
|
||||
if ((a.colspan ?? 1) > 1)
|
||||
cellParts.push(`colspan="${escapeAttr(a.colspan)}"`);
|
||||
if ((a.rowspan ?? 1) > 1)
|
||||
cellParts.push(`rowspan="${escapeAttr(a.rowspan)}"`);
|
||||
if (a.align) cellParts.push(`align="${escapeAttr(a.align)}"`);
|
||||
const open = cellParts.length
|
||||
? `<${tag} ${cellParts.join(" ")}>`
|
||||
: `<${tag}>`;
|
||||
const inner = (cell.content || [])
|
||||
.map((block: any) => blockToHtml(block))
|
||||
.join("");
|
||||
return `${open}${inner}</${tag}>`;
|
||||
};
|
||||
const htmlRows = tableRows
|
||||
.map(
|
||||
(row: any) =>
|
||||
`<tr>${(row.content || []).map(renderHtmlCell).join("")}</tr>`,
|
||||
)
|
||||
.join("");
|
||||
return `<table><tbody>${htmlRows}</tbody></table>`;
|
||||
}
|
||||
|
||||
// No merged cells: emit a GFM table (header row + separator) so the
|
||||
// markdown can be parsed back into a table on re-import.
|
||||
const rows = tableRows.map(processNode);
|
||||
const headerCells = tableRows[0]?.content || [];
|
||||
const columns = headerCells.length || 1;
|
||||
// Derive alignment markers (:--, :-:, --:) from each header cell.
|
||||
const markers = Array.from({ length: columns }, (_, i) => {
|
||||
const align = headerCells[i]?.attrs?.align;
|
||||
switch (align) {
|
||||
case "left":
|
||||
return ":--";
|
||||
case "center":
|
||||
return ":-:";
|
||||
case "right":
|
||||
return "--:";
|
||||
default:
|
||||
return "---";
|
||||
}
|
||||
});
|
||||
const separator = "| " + markers.join(" | ") + " |";
|
||||
return [rows[0], separator, ...rows.slice(1)].join("\n");
|
||||
}
|
||||
|
||||
case "tableRow":
|
||||
return "| " + nodeContent.map(processNode).join(" | ") + " |";
|
||||
|
||||
case "tableCell":
|
||||
case "tableHeader": {
|
||||
// Join multiple block children with a space (not "") so adjacent blocks
|
||||
// like a paragraph followed by a list don't collide into "line1- a".
|
||||
// Then collapse newlines and escape pipes so a cell containing "|" or a
|
||||
// line break cannot corrupt the surrounding GFM row.
|
||||
return nodeContent
|
||||
.map(processNode)
|
||||
.join(" ")
|
||||
.replace(/\r?\n/g, " ")
|
||||
.replace(/\|/g, "\\|");
|
||||
}
|
||||
|
||||
case "callout":
|
||||
const calloutType = node.attrs?.type || "info";
|
||||
const calloutContent = nodeContent.map(processNode).join("\n");
|
||||
return `:::${calloutType.toLowerCase()}\n${calloutContent}\n:::`;
|
||||
|
||||
case "details":
|
||||
return nodeContent.map(processNode).join("\n");
|
||||
|
||||
case "detailsSummary":
|
||||
const summaryText = nodeContent.map(processNode).join("");
|
||||
return `<details>\n<summary>${summaryText}</summary>\n`;
|
||||
|
||||
case "detailsContent":
|
||||
const detailsText = nodeContent.map(processNode).join("\n");
|
||||
return `${detailsText}\n</details>`;
|
||||
|
||||
case "mathInline": {
|
||||
// The schema's `text` attribute has no parseHTML, so TipTap's default
|
||||
// parser reads it from the `text` HTML attribute (NOT the element's text
|
||||
// content). Emit span[data-type="mathInline"] carrying the LaTeX in a
|
||||
// `text="..."` attribute so it round-trips. marked cannot parse $...$
|
||||
// back, so the previous form was lossy.
|
||||
const inlineMath = node.attrs?.text || "";
|
||||
return `<span data-type="mathInline" data-katex="true" text="${escapeAttr(inlineMath)}"></span>`;
|
||||
}
|
||||
|
||||
case "mathBlock": {
|
||||
// Same as mathInline: the LaTeX must ride in the `text` HTML attribute
|
||||
// for the schema's default parser to recover it.
|
||||
const blockMath = node.attrs?.text || "";
|
||||
return `<div data-type="mathBlock" data-katex="true" text="${escapeAttr(blockMath)}"></div>`;
|
||||
}
|
||||
|
||||
case "mention": {
|
||||
// Emit span[data-type="mention"] with the schema's data-* attributes so
|
||||
// generateJSON rebuilds the mention node instead of leaving "@label"
|
||||
// plain text that cannot re-parse.
|
||||
const attrs = node.attrs || {};
|
||||
const parts: string[] = [`data-type="mention"`];
|
||||
if (attrs.id) parts.push(`data-id="${escapeAttr(attrs.id)}"`);
|
||||
if (attrs.label)
|
||||
parts.push(`data-label="${escapeAttr(attrs.label)}"`);
|
||||
if (attrs.entityType)
|
||||
parts.push(`data-entity-type="${escapeAttr(attrs.entityType)}"`);
|
||||
if (attrs.entityId)
|
||||
parts.push(`data-entity-id="${escapeAttr(attrs.entityId)}"`);
|
||||
if (attrs.slugId)
|
||||
parts.push(`data-slug-id="${escapeAttr(attrs.slugId)}"`);
|
||||
if (attrs.creatorId)
|
||||
parts.push(`data-creator-id="${escapeAttr(attrs.creatorId)}"`);
|
||||
if (attrs.anchorId)
|
||||
parts.push(`data-anchor-id="${escapeAttr(attrs.anchorId)}"`);
|
||||
// Keep the label as visible text content too; the schema reads attrs
|
||||
// from data-*, so the inner text is purely cosmetic and harmless.
|
||||
const mentionLabel = attrs.label || attrs.id || "";
|
||||
// The label is visible element TEXT content here (the data-* attrs above
|
||||
// carry the real values), so escape it for the text context, not attrs.
|
||||
return `<span ${parts.join(" ")}>@${escapeHtmlText(mentionLabel)}</span>`;
|
||||
}
|
||||
|
||||
case "attachment": {
|
||||
// BUG FIX: the old code read node.attrs.fileName / node.attrs.src, but
|
||||
// the schema stores name/url (plus mime/size/attachmentId). Emit the
|
||||
// schema-matching div[data-type="attachment"] with data-attachment-*
|
||||
// attrs so the node round-trips instead of degrading to a markdown link.
|
||||
const attrs = node.attrs || {};
|
||||
const parts: string[] = [
|
||||
`data-type="attachment"`,
|
||||
`data-attachment-url="${escapeAttr(attrs.url ?? "")}"`,
|
||||
];
|
||||
if (attrs.name)
|
||||
parts.push(`data-attachment-name="${escapeAttr(attrs.name)}"`);
|
||||
if (attrs.mime)
|
||||
parts.push(`data-attachment-mime="${escapeAttr(attrs.mime)}"`);
|
||||
if (attrs.size != null)
|
||||
parts.push(`data-attachment-size="${escapeAttr(attrs.size)}"`);
|
||||
if (attrs.attachmentId)
|
||||
parts.push(
|
||||
`data-attachment-id="${escapeAttr(attrs.attachmentId)}"`,
|
||||
);
|
||||
return `<div ${parts.join(" ")}></div>`;
|
||||
}
|
||||
|
||||
case "drawio":
|
||||
case "excalidraw": {
|
||||
// Emit the schema-matching div[data-type=...] carrying the diagram's
|
||||
// attrs as data-* (the schema's diagramAttributes reads src/title/alt/
|
||||
// width/height/size/aspectRatio/align/attachmentId from data-*), so the
|
||||
// diagram round-trips instead of degrading to a lossy placeholder.
|
||||
const attrs = node.attrs || {};
|
||||
const parts: string[] = [
|
||||
`data-type="${type}"`,
|
||||
`data-src="${escapeAttr(attrs.src ?? "")}"`,
|
||||
];
|
||||
if (attrs.title != null)
|
||||
parts.push(`data-title="${escapeAttr(attrs.title)}"`);
|
||||
if (attrs.alt != null) parts.push(`data-alt="${escapeAttr(attrs.alt)}"`);
|
||||
if (attrs.width != null)
|
||||
parts.push(`data-width="${escapeAttr(attrs.width)}"`);
|
||||
if (attrs.height != null)
|
||||
parts.push(`data-height="${escapeAttr(attrs.height)}"`);
|
||||
if (attrs.size != null)
|
||||
parts.push(`data-size="${escapeAttr(attrs.size)}"`);
|
||||
if (attrs.aspectRatio != null)
|
||||
parts.push(`data-aspect-ratio="${escapeAttr(attrs.aspectRatio)}"`);
|
||||
if (attrs.align)
|
||||
parts.push(`data-align="${escapeAttr(attrs.align)}"`);
|
||||
if (attrs.attachmentId)
|
||||
parts.push(
|
||||
`data-attachment-id="${escapeAttr(attrs.attachmentId)}"`,
|
||||
);
|
||||
return `<div ${parts.join(" ")}></div>`;
|
||||
}
|
||||
|
||||
case "embed": {
|
||||
// Emit the schema-matching div[data-type="embed"]; the schema reads
|
||||
// src/provider/align/width/height from data-* attributes so the node
|
||||
// (and its provider iframe info) survives the round-trip.
|
||||
const attrs = node.attrs || {};
|
||||
const parts: string[] = [
|
||||
`data-type="embed"`,
|
||||
`data-src="${escapeAttr(attrs.src ?? "")}"`,
|
||||
`data-provider="${escapeAttr(attrs.provider ?? "")}"`,
|
||||
];
|
||||
if (attrs.align)
|
||||
parts.push(`data-align="${escapeAttr(attrs.align)}"`);
|
||||
if (attrs.width != null)
|
||||
parts.push(`data-width="${escapeAttr(attrs.width)}"`);
|
||||
if (attrs.height != null)
|
||||
parts.push(`data-height="${escapeAttr(attrs.height)}"`);
|
||||
return `<div ${parts.join(" ")}></div>`;
|
||||
}
|
||||
|
||||
case "audio": {
|
||||
// Emit the schema-matching <audio> element (was emitting nothing). The
|
||||
// schema reads src from src and attachmentId/size from data-*.
|
||||
const attrs = node.attrs || {};
|
||||
const parts: string[] = [`src="${escapeAttr(attrs.src ?? "")}"`];
|
||||
if (attrs.attachmentId)
|
||||
parts.push(
|
||||
`data-attachment-id="${escapeAttr(attrs.attachmentId)}"`,
|
||||
);
|
||||
if (attrs.size != null)
|
||||
parts.push(`data-size="${escapeAttr(attrs.size)}"`);
|
||||
// Wrap in a block <div> for the same reason as video: a bare <audio> is
|
||||
// inline-level HTML that marked would wrap in <p>.
|
||||
return `<div><audio ${parts.join(" ")}></audio></div>`;
|
||||
}
|
||||
|
||||
case "pdf": {
|
||||
// Emit the schema-matching div[data-type="pdf"] (was emitting nothing).
|
||||
// The schema reads src/width/height from standard attrs and name/
|
||||
// attachmentId/size from data-*.
|
||||
const attrs = node.attrs || {};
|
||||
const parts: string[] = [
|
||||
`data-type="pdf"`,
|
||||
`src="${escapeAttr(attrs.src ?? "")}"`,
|
||||
];
|
||||
if (attrs.name) parts.push(`data-name="${escapeAttr(attrs.name)}"`);
|
||||
if (attrs.attachmentId)
|
||||
parts.push(
|
||||
`data-attachment-id="${escapeAttr(attrs.attachmentId)}"`,
|
||||
);
|
||||
if (attrs.size != null)
|
||||
parts.push(`data-size="${escapeAttr(attrs.size)}"`);
|
||||
if (attrs.width != null)
|
||||
parts.push(`width="${escapeAttr(attrs.width)}"`);
|
||||
if (attrs.height != null)
|
||||
parts.push(`height="${escapeAttr(attrs.height)}"`);
|
||||
return `<div ${parts.join(" ")}></div>`;
|
||||
}
|
||||
|
||||
case "columns": {
|
||||
// Emit the schema-matching div[data-type="columns"] wrapper so the
|
||||
// multi-column layout survives. Without a case the children were
|
||||
// concatenated with no separator and the text merged. The schema reads
|
||||
// layout from data-layout and widthMode from data-width-mode. The whole
|
||||
// block is raw HTML, so render children via blockToHtml (NOT markdown,
|
||||
// which marked would not re-parse inside a raw HTML block).
|
||||
const attrs = node.attrs || {};
|
||||
const parts: string[] = [`data-type="columns"`];
|
||||
if (attrs.layout)
|
||||
parts.push(`data-layout="${escapeAttr(attrs.layout)}"`);
|
||||
if (attrs.widthMode && attrs.widthMode !== "normal")
|
||||
parts.push(`data-width-mode="${escapeAttr(attrs.widthMode)}"`);
|
||||
const inner = nodeContent.map((n: any) => blockToHtml(n)).join("");
|
||||
return `<div ${parts.join(" ")}>${inner}</div>`;
|
||||
}
|
||||
|
||||
case "column": {
|
||||
// Emit the schema-matching div[data-type="column"]; the schema reads the
|
||||
// column width from data-width. Children are rendered as HTML so their
|
||||
// formatting survives inside this raw HTML block.
|
||||
const attrs = node.attrs || {};
|
||||
const parts: string[] = [`data-type="column"`];
|
||||
if (attrs.width)
|
||||
parts.push(`data-width="${escapeAttr(attrs.width)}"`);
|
||||
const inner = nodeContent.map((n: any) => blockToHtml(n)).join("");
|
||||
return `<div ${parts.join(" ")}>${inner}</div>`;
|
||||
}
|
||||
|
||||
case "subpages":
|
||||
return "{{SUBPAGES}}";
|
||||
|
||||
default:
|
||||
// Fallback: process children
|
||||
return nodeContent.map(processNode).join("");
|
||||
}
|
||||
};
|
||||
|
||||
// Render inline content (text runs + their marks) to HTML. Used by the raw
|
||||
// HTML fallbacks (spanned tables, columns) where marked will NOT re-parse
|
||||
// markdown, so backtick/asterisk/bracket syntax would otherwise leak as
|
||||
// literal characters. Each mark is mirrored to the HTML the schema's parseHTML
|
||||
// accepts so it re-imports as the matching ProseMirror mark.
|
||||
const inlineToHtml = (inlineNodes: any[]): string =>
|
||||
(inlineNodes || [])
|
||||
.map((n: any) => {
|
||||
if (n.type === "hardBreak") return "<br>";
|
||||
if (n.type !== "text") {
|
||||
// Inline atoms (mention, mathInline) already emit schema HTML.
|
||||
return processNode(n);
|
||||
}
|
||||
let t = escapeHtmlText(n.text || "");
|
||||
for (const mark of n.marks || []) {
|
||||
switch (mark.type) {
|
||||
case "bold":
|
||||
t = `<strong>${t}</strong>`;
|
||||
break;
|
||||
case "italic":
|
||||
t = `<em>${t}</em>`;
|
||||
break;
|
||||
case "code":
|
||||
t = `<code>${t}</code>`;
|
||||
break;
|
||||
case "strike":
|
||||
t = `<s>${t}</s>`;
|
||||
break;
|
||||
case "underline":
|
||||
t = `<u>${t}</u>`;
|
||||
break;
|
||||
case "subscript":
|
||||
t = `<sub>${t}</sub>`;
|
||||
break;
|
||||
case "superscript":
|
||||
t = `<sup>${t}</sup>`;
|
||||
break;
|
||||
case "link":
|
||||
t = `<a href="${escapeAttr(mark.attrs?.href || "")}">${t}</a>`;
|
||||
break;
|
||||
case "highlight":
|
||||
t = mark.attrs?.color
|
||||
? `<mark style="background-color: ${escapeAttr(mark.attrs.color)}">${t}</mark>`
|
||||
: `<mark>${t}</mark>`;
|
||||
break;
|
||||
case "textStyle":
|
||||
if (mark.attrs?.color)
|
||||
t = `<span style="color: ${escapeAttr(mark.attrs.color)}">${t}</span>`;
|
||||
break;
|
||||
case "comment":
|
||||
// Inline comment anchor inside a raw-HTML container (columns /
|
||||
// spanned table cells), so commented text there also round-trips.
|
||||
if (mark.attrs?.commentId) {
|
||||
const r = mark.attrs?.resolved ? ` data-resolved="true"` : "";
|
||||
t = `<span data-comment-id="${escapeAttr(mark.attrs.commentId)}"${r}>${t}</span>`;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
return t;
|
||||
})
|
||||
.join("");
|
||||
|
||||
// Emit the schema-matching <img> for an image node. Shared so the image is
|
||||
// emitted as real HTML wherever a raw-HTML container needs it (inside a column
|
||||
// or a spanned table cell), where markdown `` would NOT be re-parsed
|
||||
// and would survive as literal text. The Image extension reads src/alt from
|
||||
// the standard attributes; the Docmost extra attrs (width/height/align/size/
|
||||
// attachmentId/aspectRatio) are global attributes read from same-named DOM
|
||||
// attributes, so emit them by name.
|
||||
const imageToHtml = (node: any): string => {
|
||||
const attrs = node.attrs || {};
|
||||
const parts: string[] = [`src="${escapeAttr(attrs.src ?? "")}"`];
|
||||
if (attrs.alt) parts.push(`alt="${escapeAttr(attrs.alt)}"`);
|
||||
if (attrs.title) parts.push(`title="${escapeAttr(attrs.title)}"`);
|
||||
if (attrs.width != null) parts.push(`width="${escapeAttr(attrs.width)}"`);
|
||||
if (attrs.height != null) parts.push(`height="${escapeAttr(attrs.height)}"`);
|
||||
if (attrs.align) parts.push(`align="${escapeAttr(attrs.align)}"`);
|
||||
if (attrs.size != null) parts.push(`data-size="${escapeAttr(attrs.size)}"`);
|
||||
if (attrs.attachmentId)
|
||||
parts.push(`data-attachment-id="${escapeAttr(attrs.attachmentId)}"`);
|
||||
if (attrs.aspectRatio != null)
|
||||
parts.push(`data-aspect-ratio="${escapeAttr(attrs.aspectRatio)}"`);
|
||||
return `<img ${parts.join(" ")}>`;
|
||||
};
|
||||
|
||||
// Emit the schema-matching div[data-type="callout"] for a callout node. The
|
||||
// schema reads the banner type from data-callout-type. Children are rendered
|
||||
// as HTML so they survive inside a raw-HTML container.
|
||||
const calloutToHtml = (node: any): string => {
|
||||
const type = (node.attrs?.type || "info").toLowerCase();
|
||||
const inner = (node.content || []).map(blockToHtml).join("");
|
||||
return `<div data-type="callout" data-callout-type="${escapeAttr(type)}">${inner}</div>`;
|
||||
};
|
||||
|
||||
// Emit a schema-matching <details> tree. The schema parses <details>,
|
||||
// summary[data-type="detailsSummary"], and div[data-type="detailsContent"].
|
||||
const detailsToHtml = (node: any): string => {
|
||||
const inner = (node.content || []).map(blockToHtml).join("");
|
||||
return `<details>${inner}</details>`;
|
||||
};
|
||||
const detailsSummaryToHtml = (node: any): string =>
|
||||
`<summary data-type="detailsSummary">${inlineToHtml(node.content || [])}</summary>`;
|
||||
const detailsContentToHtml = (node: any): string => {
|
||||
const inner = (node.content || []).map(blockToHtml).join("");
|
||||
return `<div data-type="detailsContent">${inner}</div>`;
|
||||
};
|
||||
|
||||
// Emit the schema-matching taskList/taskItem HTML. bridgeTaskLists (in
|
||||
// collaboration.ts) recognizes ul[data-type="taskList"] with
|
||||
// li[data-type="taskItem"][data-checked]; emitting that directly here keeps
|
||||
// task lists inside columns/cells from degrading to literal "- [ ]" text.
|
||||
const taskListToHtml = (node: any): string => {
|
||||
const items = (node.content || [])
|
||||
.map((it: any) => {
|
||||
const checked = it.attrs?.checked ? "true" : "false";
|
||||
return `<li data-type="taskItem" data-checked="${checked}">${blockChildrenToHtml(it)}</li>`;
|
||||
})
|
||||
.join("");
|
||||
return `<ul data-type="taskList">${items}</ul>`;
|
||||
};
|
||||
|
||||
// Render a block node to HTML for the raw-HTML containers (spanned tables,
|
||||
// columns). marked does NOT re-parse markdown inside a raw-HTML block, so
|
||||
// EVERY block type that can appear inside a column or a spanned cell must be
|
||||
// emitted as schema-matching HTML here — never as markdown, or it would land
|
||||
// as literal text on re-import. Nodes whose processNode case already produces
|
||||
// schema-matching HTML (math/media/embed/attachment/nested columns/spanned
|
||||
// table) are delegated to processNode; the markdown-emitting cases
|
||||
// (image/blockquote/callout/details/hr/taskList) get explicit HTML here.
|
||||
const blockToHtml = (block: any): string => {
|
||||
const children = block.content || [];
|
||||
switch (block.type) {
|
||||
case "paragraph":
|
||||
return `<p>${inlineToHtml(children)}</p>`;
|
||||
case "heading": {
|
||||
const level = block.attrs?.level || 1;
|
||||
return `<h${level}>${inlineToHtml(children)}</h${level}>`;
|
||||
}
|
||||
case "bulletList":
|
||||
return `<ul>${children
|
||||
.map((li: any) => `<li>${blockChildrenToHtml(li)}</li>`)
|
||||
.join("")}</ul>`;
|
||||
case "orderedList":
|
||||
return `<ol>${children
|
||||
.map((li: any) => `<li>${blockChildrenToHtml(li)}</li>`)
|
||||
.join("")}</ol>`;
|
||||
case "codeBlock": {
|
||||
const lang = block.attrs?.language || "";
|
||||
// The code itself is element TEXT content (between <code> tags), so it
|
||||
// must escape < > & — NOT the attribute escaper. The language rides in
|
||||
// a class ATTRIBUTE, so it uses escapeAttr.
|
||||
const code = escapeHtmlText(
|
||||
children
|
||||
.map(processNode)
|
||||
.join("")
|
||||
.replace(/\n+$/, ""),
|
||||
);
|
||||
const cls = lang ? ` class="language-${escapeAttr(lang)}"` : "";
|
||||
return `<pre><code${cls}>${code}</code></pre>`;
|
||||
}
|
||||
case "image":
|
||||
return imageToHtml(block);
|
||||
case "blockquote":
|
||||
return `<blockquote>${children.map(blockToHtml).join("")}</blockquote>`;
|
||||
case "horizontalRule":
|
||||
return "<hr>";
|
||||
case "callout":
|
||||
return calloutToHtml(block);
|
||||
case "details":
|
||||
return detailsToHtml(block);
|
||||
case "detailsSummary":
|
||||
return detailsSummaryToHtml(block);
|
||||
case "detailsContent":
|
||||
return detailsContentToHtml(block);
|
||||
case "taskList":
|
||||
return taskListToHtml(block);
|
||||
case "taskItem":
|
||||
// A bare taskItem (outside a taskList) still needs a wrapping list so
|
||||
// the schema parses it; wrap it in a single-item taskList.
|
||||
return taskListToHtml({ content: [block] });
|
||||
// table (incl. spanned), columns/column, math, media, embed, attachment,
|
||||
// mention, etc. already emit schema-matching HTML from processNode.
|
||||
case "table":
|
||||
case "columns":
|
||||
case "column":
|
||||
case "mathBlock":
|
||||
case "video":
|
||||
case "audio":
|
||||
case "pdf":
|
||||
case "youtube":
|
||||
case "embed":
|
||||
case "attachment":
|
||||
case "drawio":
|
||||
case "excalidraw":
|
||||
return processNode(block);
|
||||
default:
|
||||
// Any still-unhandled block type: NEVER fall back to markdown inside a
|
||||
// raw-HTML block (it would become literal text). Wrap its rendered
|
||||
// children in a <div> so their content is preserved; if it has no block
|
||||
// children, render its inline content instead.
|
||||
if (children.length && children.some((c: any) => c.type !== "text")) {
|
||||
return `<div>${children.map(blockToHtml).join("")}</div>`;
|
||||
}
|
||||
return `<div>${inlineToHtml(children)}</div>`;
|
||||
}
|
||||
};
|
||||
|
||||
// Render the block children of a list item to HTML (a listItem holds block+
|
||||
// content). Mirrors processListItem but for the HTML fallback path.
|
||||
const blockChildrenToHtml = (item: any): string =>
|
||||
(item.content || []).map((b: any) => blockToHtml(b)).join("");
|
||||
|
||||
// Indent the rendered children of a list item under a marker prefix.
|
||||
// Each child block is a (possibly multi-line) string. The very first physical
|
||||
// line of the first child carries the marker (e.g. "- " or "1. "); EVERY
|
||||
// other line — the remaining lines of the first child AND all lines of every
|
||||
// subsequent child (nested lists, code blocks, extra paragraphs) — is indented
|
||||
// to align under the marker. Without indenting these continuation lines, the
|
||||
// 2nd/3rd line of a nested child collapses to column 0 and escapes the list.
|
||||
//
|
||||
// The continuation indent MUST equal the LIST marker width, which is not the
|
||||
// same as the visible prefix width:
|
||||
// - bullet "- " -> 2 columns
|
||||
// - task "- [ ] " -> marker is still "- " (the "[ ] " is content), 2
|
||||
// - ordered "1. "/"10. " -> 3/4 columns, scaling with the number's digits
|
||||
// CommonMark anchors nested content to the marker column, so an ordered item
|
||||
// indented to only 2 columns would be re-parsed as a sibling/loose content on
|
||||
// re-import. Callers therefore pass the exact indent width to use.
|
||||
const indentItemChildren = (
|
||||
childStrings: string[],
|
||||
prefix: string,
|
||||
indentWidth: number,
|
||||
): string => {
|
||||
const indent = " ".repeat(indentWidth);
|
||||
const lines: string[] = [];
|
||||
childStrings.forEach((child, childIndex) => {
|
||||
child.split("\n").forEach((line, lineIndex) => {
|
||||
if (childIndex === 0 && lineIndex === 0) {
|
||||
// First physical line of the first block gets the marker.
|
||||
lines.push(`${prefix} ${line}`);
|
||||
} else {
|
||||
// Indent every continuation line by the marker width; keep blank
|
||||
// lines blank rather than emitting trailing whitespace.
|
||||
lines.push(line.length ? `${indent}${line}` : "");
|
||||
}
|
||||
});
|
||||
});
|
||||
return lines.join("\n");
|
||||
};
|
||||
|
||||
const processListItem = (item: any, prefix: string): string => {
|
||||
const itemContent = item.content || [];
|
||||
const childStrings = itemContent.map(processNode);
|
||||
if (childStrings.length === 0) return prefix;
|
||||
// The rendered marker is `${prefix} ` (prefix + one space), so its width —
|
||||
// and thus the continuation indent — is prefix.length + 1. This is correct
|
||||
// for both bullet ("-" -> 2) and ordered ("1." -> 3, "10." -> 4) markers,
|
||||
// since for those the visible prefix IS the list marker.
|
||||
return indentItemChildren(childStrings, prefix, prefix.length + 1);
|
||||
};
|
||||
|
||||
const processTaskItem = (item: any): string => {
|
||||
const checked = item.attrs?.checked || false;
|
||||
const checkbox = checked ? "[x]" : "[ ]";
|
||||
const prefix = `- ${checkbox}`;
|
||||
const itemContent = item.content || [];
|
||||
const childStrings = itemContent.map(processNode);
|
||||
// An empty task item still needs its checkbox marker; without this guard
|
||||
// the indent below produces "" and the "- [ ]"/"- [x]" row disappears.
|
||||
if (childStrings.length === 0) return prefix;
|
||||
// The list marker for a task item is just "- " (2 columns); the "[ ] "/"[x] "
|
||||
// checkbox is item content, NOT part of the marker. So the continuation
|
||||
// indent is a fixed 2 — do NOT derive it from the wider prefix.length.
|
||||
return indentItemChildren(childStrings, prefix, 2);
|
||||
};
|
||||
|
||||
return processNode(content).trim();
|
||||
}
|
||||
Reference in New Issue
Block a user