/**
 * Pure markdown -> ProseMirror conversion.
 *
 * The converter path is `markdownToProseMirror` (marked -> HTML ->
 * generateJSON) plus the two pre/post processors it needs (`preprocessCallouts`,
 * `bridgeTaskLists`). The gitmost server writes the resulting page bodies
 * natively through the collab gateway, so no websocket/Yjs write-path lives
 * here.
 */
import { generateJSON } from "@tiptap/html";
import { JSDOM } from "jsdom";
import { marked } from "marked";
import { docmostExtensions } from "./docmost-schema.js";

// Setup DOM environment for Tiptap HTML parsing in Node.js.
//
// This is a module-level side effect: importing this file assigns jsdom's
// `window`, `document` and `Element` onto Node's `global` object.
//
// NOTE(review): the markup passed to JSDOM reads as a lone line break in this
// view. It is written as an escape so the double-quoted literal stays on one
// line (a raw line break inside it is a syntax error) — confirm the intended
// bootstrap HTML.
const dom = new JSDOM("\n");
global.window = dom.window as any;
global.document = dom.window.document;
// The suppression below must stay directly above the assignment it covers.
// Presumably jsdom's Element constructor is not assignable to the ambient
// global type — confirm before replacing it with a narrower directive.
// @ts-ignore
global.Element = dom.window.Element;

/**
 * Hard ceiling above which we skip callout preprocessing entirely. The linear
 * scanner below has no quadratic blow-up, but we still cap input defensively so
 * a pathological multi-megabyte payload cannot tie up the event loop; in that
 * case the markdown is passed through verbatim (callouts are simply not
 * detected) rather than risking a slow scan.
 */
const MAX_CALLOUT_PREPROCESS_BYTES = 4 * 1024 * 1024; // 4 MB

/**
 * Matches an opening callout fence: `:::type` (type captured in group 1).
 * The pattern itself does not change case; the lower-casing mentioned for the
 * type presumably happens where the match is consumed — confirm at the use
 * site.
 */
const CALLOUT_OPEN_RE = /^:::\s*(\w+)\s*$/;

/** Matches a bare closing callout fence: `:::`. */
const CALLOUT_CLOSE_RE = /^:::\s*$/;

/**
 * Matches an Obsidian-native callout opener: `> [!type]` (type captured). An
 * optional title after the type is allowed but ignored (the Docmost callout
 * schema has no title). The body is the following contiguous blockquote lines.
 */
const CALLOUT_BQ_OPEN_RE = /^>\s*\[!(\w+)\]/;

/** Matches any blockquote continuation line (`>` … ). */
const BLOCKQUOTE_LINE_RE = /^>/;

/**
 * Matches the start/end of a code fence (``` or ~~~), capturing the marker.
 * Group 1 is the leading whitespace, group 2 the fence marker (three or more
 * backticks or tildes).
 */
const CODE_FENCE_RE = /^(\s*)(`{3,}|~{3,})/;

/**
 * Pre-process Docmost-flavoured markdown: convert `:::type ... :::`
 * callout blocks (the syntax our markdown export produces) into HTML
 * divs that the callout extension parses. The inner content is rendered
 * through marked as regular markdown.
 *
 * Implemented as a single linear pass over the lines (no quadratic regex
 * rescan).
It: * - tracks fenced code regions (```...``` and ~~~...~~~) and never treats a * `:::` line that lives inside a code fence as a callout delimiter, so a * callout body that itself contains a fenced code block with a `:::` line is * no longer corrupted; * - matches an opening `:::type` line with the next CLOSING `:::` at the SAME * nesting level, supporting NESTED callouts via a depth counter (an inner * `:::type` opens a deeper level and consumes a matching `:::`); * - emits the same `text
` * wrapper is kept inside the `
// child (the shape marked emits: `
text
; the schema * then HOISTS the block atom out of that paragraph, leaving an EMPTY paragraph * sibling. On the next export that empty `
` renders to "" and the doc "\n\n" * join injects a phantom blank gap, so the markdown is not byte-stable. * * Markdown blank lines are separators, never content, so generateJSON only ever * produces an empty paragraph as such a hoist artifact — removing them is safe * and general (it also subsumes the