/** * Pure markdown -> ProseMirror conversion. * * The converter path is `markdownToProseMirror` (marked -> HTML -> * generateJSON) plus the two pre/post processors it needs (`preprocessCallouts`, * `bridgeTaskLists`). The gitmost server writes the resulting page bodies * natively through the collab gateway, so no websocket/Yjs write-path lives * here. */ import { generateJSON } from "@tiptap/html"; import { JSDOM } from "jsdom"; import { marked } from "marked"; import { docmostExtensions } from "./docmost-schema.js"; // Setup DOM environment for Tiptap HTML parsing in Node.js const dom = new JSDOM(""); global.window = dom.window; global.document = dom.window.document; // @ts-ignore global.Element = dom.window.Element; /** * Hard ceiling above which we skip callout preprocessing entirely. The linear * scanner below has no quadratic blow-up, but we still cap input defensively so * a pathological multi-megabyte payload cannot tie up the event loop; in that * case the markdown is passed through verbatim (callouts are simply not * detected) rather than risking a slow scan. */ const MAX_CALLOUT_PREPROCESS_BYTES = 4 * 1024 * 1024; // 4 MB /** Matches an opening callout fence: `:::type` (type captured, lower-cased). */ const CALLOUT_OPEN_RE = /^:::\s*(\w+)\s*$/; /** Matches a bare closing callout fence: `:::`. */ const CALLOUT_CLOSE_RE = /^:::\s*$/; /** Matches the start/end of a code fence (``` or ~~~), capturing the marker. */ const CODE_FENCE_RE = /^(\s*)(`{3,}|~{3,})/; /** * Pre-process Docmost-flavoured markdown: convert `:::type ... :::` * callout blocks (the syntax our markdown export produces) into HTML * divs that the callout extension parses. The inner content is rendered * through marked as regular markdown. * * Implemented as a single linear pass over the lines (no quadratic regex * rescan). 
It: * - tracks fenced code regions (```...``` and ~~~...~~~) and never treats a * `:::` line that lives inside a code fence as a callout delimiter, so a * callout body that itself contains a fenced code block with a `:::` line is * no longer corrupted; * - matches an opening `:::type` line with the next CLOSING `:::` at the SAME * nesting level, supporting NESTED callouts via a depth counter (an inner * `:::type` opens a deeper level and consumes a matching `:::`); * - emits the same `

` output * (inner rendered through marked) as the previous regex implementation. */ async function preprocessCallouts(markdown) { // Defensive cap: skip preprocessing for pathologically large inputs. if (markdown.length > MAX_CALLOUT_PREPROCESS_BYTES) { return markdown; } // Recursively transform a slice of lines, converting top-level callouts in // that slice into

blocks and rendering their inner content (which may // itself contain nested callouts) through this same function. const transform = async (lines) => { const out = []; let inCodeFence = false; let codeFenceMarker = ""; // the exact run of backticks/tildes that opened it let i = 0; while (i < lines.length) { const line = lines[i]; // Inside a code fence, only its matching closing fence is significant; // everything else (including `:::` lines) is copied through verbatim. if (inCodeFence) { out.push(line); const fence = line.match(CODE_FENCE_RE); if (fence && fence[2].startsWith(codeFenceMarker[0]) && fence[2].length >= codeFenceMarker.length) { inCodeFence = false; codeFenceMarker = ""; } i++; continue; } // A code fence opening outside any callout body: enter code-fence mode. const fenceOpen = line.match(CODE_FENCE_RE); if (fenceOpen) { inCodeFence = true; codeFenceMarker = fenceOpen[2]; out.push(line); i++; continue; } // An opening callout fence: scan forward (with code-fence and nested // callout awareness) for its matching closing `:::` at the same level. 
const open = line.match(CALLOUT_OPEN_RE); if (open) { const type = open[1].toLowerCase(); const bodyLines = []; let depth = 1; let innerInCodeFence = false; let innerCodeFenceMarker = ""; let j = i + 1; for (; j < lines.length; j++) { const bl = lines[j]; if (innerInCodeFence) { const f = bl.match(CODE_FENCE_RE); if (f && f[2].startsWith(innerCodeFenceMarker[0]) && f[2].length >= innerCodeFenceMarker.length) { innerInCodeFence = false; innerCodeFenceMarker = ""; } bodyLines.push(bl); continue; } const innerFence = bl.match(CODE_FENCE_RE); if (innerFence) { innerInCodeFence = true; innerCodeFenceMarker = innerFence[2]; bodyLines.push(bl); continue; } if (CALLOUT_OPEN_RE.test(bl)) { depth++; bodyLines.push(bl); continue; } if (CALLOUT_CLOSE_RE.test(bl)) { depth--; if (depth === 0) break; // matching close for THIS callout bodyLines.push(bl); continue; } bodyLines.push(bl); } if (j < lines.length) { // Found the matching closing fence: render the body (recursively, so // nested callouts are handled) and emit the callout div. const inner = await transform(bodyLines); const renderedInner = await marked.parse(inner); out.push(`\n

${renderedInner}

\n`); i = j + 1; // skip past the closing `:::` continue; } // No matching close (unterminated callout): treat the opener as a // literal line and continue, preserving the original text. out.push(line); i++; continue; } out.push(line); i++; } return out.join("\n"); }; return transform(markdown.split("\n")); } /** * Bridge marked's checkbox lists to TipTap task lists. * * marked renders GitHub task list items (`- [x] done`) as a plain * `
<ul><li><input type="checkbox"> text</li></ul>
` WITHOUT the * markup TipTap's TaskList/TaskItem extensions parse. This rewrites such lists * into the shape those extensions expect: * TaskList parseHTML matches `ul[data-type="taskList"]`, * TaskItem matches `li[data-type="taskItem"]`, * the checked state is read from `data-checked === "true"`. * * A list is only converted when it has at least one `
<li>` and EVERY direct
 * `<li>
` contains a checkbox input. Both `

`s) would otherwise lose its task state. TipTap task lists are unordered, * so a matching `