diff --git a/apps/server/src/core/page/services/page.service.ts b/apps/server/src/core/page/services/page.service.ts index c6ee150d..ae20d277 100644 --- a/apps/server/src/core/page/services/page.service.ts +++ b/apps/server/src/core/page/services/page.service.ts @@ -52,7 +52,9 @@ import { INTERNAL_LINK_REGEX, extractPageSlugId, } from '../../../integrations/export/utils'; -import { markdownToHtml, canonicalizeFootnotes } from '@docmost/editor-ext'; +import { canonicalizeFootnotes } from '@docmost/editor-ext'; +import { markdownToProseMirror } from '@docmost/prosemirror-markdown'; +import { normalizeForeignMarkdown } from '../../../integrations/import/utils/foreign-markdown'; import { WatcherService } from '../../watcher/watcher.service'; import { sql } from 'kysely'; import { TransclusionService } from '../transclusion/transclusion.service'; @@ -1301,8 +1303,14 @@ export class PageService { switch (format) { case 'markdown': { - const html = await markdownToHtml(content as string); - prosemirrorJson = htmlToJson(html as string); + // Canonical markdown -> ProseMirror JSON directly via + // `@docmost/prosemirror-markdown` (issue #345) — no HTML intermediate, + // no editor-ext markdown layer. Foreign markdown surfaces the strict + // parser rejects (GFM `[^id]` reference footnotes) are normalized to the + // canonical inline form first. 
+ prosemirrorJson = await markdownToProseMirror( + normalizeForeignMarkdown(content as string), + ); break; } case 'html': { diff --git a/apps/server/src/integrations/import/services/file-import-task.service.footnote-canonicalize.spec.ts b/apps/server/src/integrations/import/services/file-import-task.service.footnote-canonicalize.spec.ts index 08ecce15..10d36902 100644 --- a/apps/server/src/integrations/import/services/file-import-task.service.footnote-canonicalize.spec.ts +++ b/apps/server/src/integrations/import/services/file-import-task.service.footnote-canonicalize.spec.ts @@ -17,6 +17,22 @@ jest.mock('image-dimensions', () => ({ __esModule: true, imageDimensionsFromData: () => undefined, })); +// FileImportTaskService -> PageService -> collaboration.gateway -> +// metrics.registry imports `prom-client`, which is not resolvable in this +// workspace's node_modules (types-only stub, no runtime entry). Metrics are +// disabled on this path, so a virtual no-op mock keeps the module graph loadable. +jest.mock( + 'prom-client', + () => ({ + collectDefaultMetrics: () => undefined, + Registry: class {}, + Histogram: class {}, + Gauge: class {}, + Counter: class {}, + Summary: class {}, + }), + { virtual: true }, +); import { promises as fs } from 'fs'; import * as os from 'os'; @@ -26,14 +42,17 @@ import { ImportService } from './import.service'; /** * Binding test for issue #228 / review #5: FileImportTaskService.processGenericImport - * is a NON-editor write path (markdownToHtml -> processHTML -> JSON, never runs - * footnoteSyncPlugin), so it canonicalizes footnotes before persisting. This pins - * that binding — the same one import.service has a spec for — which previously had - * NO spec at all. + * is a NON-editor write path, so a zip-imported `.md` page ends up with canonical + * footnotes before persisting: ordered by first reference, reused refs deduped, + * orphan definitions dropped. 
* - * The markdown -> HTML -> ProseMirror conversion is REAL (a real ImportService, - * its createYdoc stubbed); the filesystem is a real temp dir with one .md file; - * the DB transaction is stubbed to capture the persisted page content. + * Since #345 the `.md` parse runs `normalizeForeignMarkdown` -> + * `markdownToProseMirror` -> `jsonToHtml` (feeding the shared HTML attachment / + * link pipeline) -> `processHTML` -> `canonicalizeFootnotes`. The parser assigns + * fresh `fn-*` ids, so we assert by definition BODY order rather than the source + * labels. The conversion is REAL (a real ImportService, its createYdoc stubbed); + * the filesystem is a real temp dir with one .md file; the DB transaction is + * stubbed to capture the persisted page content. */ // Out-of-order references (c, a, b), a REUSED reference ([^a] twice), and an @@ -49,13 +68,14 @@ const MARKDOWN = [ '[^z]: orphan note', ].join('\n'); -function footnoteListIds(content: any): string[] { +/** Definition body texts of the (single) footnotesList, in list order. */ +function footnoteListBodies(content: any): string[] { const list = (content?.content ?? []).find( (n: any) => n.type === 'footnotesList', ); return (list?.content ?? []) .filter((n: any) => n.type === 'footnoteDefinition') - .map((n: any) => n.attrs?.id); + .map((n: any) => n.content?.[0]?.content?.[0]?.text); } // A permissive chainable stub for the spaces lookup (selectFrom(...).select(...) @@ -134,15 +154,23 @@ describe('FileImportTaskService.processGenericImport — footnote canonicalizati expect(captured).toBeTruthy(); const content = captured.content; - // Reference order is c, a, b (NOT the markdown definition order a, b, c). - expect(footnoteListIds(content)).toEqual(['c', 'a', 'b']); + // Definitions ordered by FIRST REFERENCE (C, A, B), NOT the markdown + // definition order (A, B, C). Ids are the parser's fresh `fn-*`, so pin + // the BODIES. 
+ expect(footnoteListBodies(content)).toEqual([ + 'note C', + 'note A', + 'note B', + ]); // Orphan [^z] dropped; reused [^a] collapses to one definition; one list. - expect(footnoteListIds(content)).not.toContain('z'); + expect(footnoteListBodies(content)).not.toContain('orphan note'); const lists = (content.content ?? []).filter( (n: any) => n.type === 'footnotesList', ); expect(lists).toHaveLength(1); - expect(footnoteListIds(content).filter((id) => id === 'a')).toHaveLength(1); + expect( + footnoteListBodies(content).filter((b) => b === 'note A'), + ).toHaveLength(1); } finally { await fs.rm(extractDir, { recursive: true, force: true }); } diff --git a/apps/server/src/integrations/import/services/file-import-task.service.ts b/apps/server/src/integrations/import/services/file-import-task.service.ts index 5ec2fe8d..a5115c43 100644 --- a/apps/server/src/integrations/import/services/file-import-task.service.ts +++ b/apps/server/src/integrations/import/services/file-import-task.service.ts @@ -1,6 +1,9 @@ import { Inject, Injectable, Logger } from '@nestjs/common'; import * as path from 'path'; -import { jsonToText } from '../../../collaboration/collaboration.util'; +import { + jsonToHtml, + jsonToText, +} from '../../../collaboration/collaboration.util'; import { InjectKysely } from 'nestjs-kysely'; import { KyselyDB } from '@docmost/db/types/kysely.types'; import { @@ -18,9 +21,11 @@ import { generateSlugId } from '../../../common/helpers'; import { v7 } from 'uuid'; import { generateJitteredKeyBetween } from 'fractional-indexing-jittered'; import { FileTask, InsertablePage } from '@docmost/db/types/entity.types'; -import { markdownToHtml, canonicalizeFootnotes } from '@docmost/editor-ext'; +import { canonicalizeFootnotes } from '@docmost/editor-ext'; +import { markdownToProseMirror } from '@docmost/prosemirror-markdown'; import { getProsemirrorContent } from '../../../common/helpers/prosemirror/utils'; import { formatImportHtml } from '../utils/import-formatter'; 
+import { normalizeForeignMarkdown } from '../utils/foreign-markdown'; import { buildAttachmentCandidates, collectMarkdownAndHtmlFiles, @@ -461,7 +466,18 @@ export class FileImportTaskService { content = await fs.readFile(absPath, 'utf-8'); if (page.fileExtension.toLowerCase() === '.md') { - content = await markdownToHtml(content); + // Parse markdown with the single canonical converter + // (`@docmost/prosemirror-markdown`), after normalizing foreign + // reference footnotes, then serialize to HTML so the shared HTML + // pipeline below (processAttachments + formatImportHtml + + // processHTML) keeps handling `.md` and `.html` imports + // uniformly. The markdown PARSE no longer goes through the + // editor-ext markdown layer (issue #345) — the drift source is + // gone. The PM -> HTML -> PM hop that follows is lossless + // plumbing for attachment/link resolution, NOT a second parse. + content = jsonToHtml( + await markdownToProseMirror(normalizeForeignMarkdown(content)), + ); } } catch (err: any) { if (err?.code === 'ENOENT') { @@ -500,10 +516,12 @@ export class FileImportTaskService { this.importService.extractTitleAndRemoveHeading(pmState); // Canonicalize footnote topology on this non-editor write path - // (markdownToHtml/processHTML never runs footnoteSyncPlugin), so a - // zip-imported page's footnotes are reference-ordered, deduped, and + // (the HTML pipeline's processHTML never runs footnoteSyncPlugin), so + // a zip-imported page's footnotes are reference-ordered, deduped, and // orphan-free like the editor's invariant (issue #228). Pure + - // idempotent + shape-safe; a footnote-free doc is unchanged. + // idempotent + shape-safe; a footnote-free doc is unchanged. (For a + // `.md` file the package parser already yields canonical footnotes, + // so this is a no-op there.) 
// (Future consolidation, architecture B: like import.service, this // path persists directly rather than via PageService — a shared // "prepare JSON for persist" helper would centralize this call.) diff --git a/apps/server/src/integrations/import/services/import.service.footnote-canonicalize.spec.ts b/apps/server/src/integrations/import/services/import.service.footnote-canonicalize.spec.ts index e53b17a1..40972b10 100644 --- a/apps/server/src/integrations/import/services/import.service.footnote-canonicalize.spec.ts +++ b/apps/server/src/integrations/import/services/import.service.footnote-canonicalize.spec.ts @@ -12,13 +12,19 @@ import { canonicalizeFootnotes } from '@docmost/editor-ext'; /** * Integration-ish test for the USER-FACING markdown import path - * (`ImportService.importPage`). It exercises the REAL markdown -> HTML -> JSON - * conversion and asserts that the stored page content has its footnotes - * canonicalized — the gap that issue #228 fixes: the import path builds - * ProseMirror JSON directly (never running the editor's footnoteSyncPlugin), so - * before this wiring the stored footnotes kept the markdown's physical - * definition order (out of order vs. references), retained orphan definitions, - * and did not collapse reused references. + * (`ImportService.importPage`). It exercises the REAL markdown -> ProseMirror + * conversion and asserts the stored page's footnotes are canonical: ordered by + * FIRST REFERENCE (not markdown definition order), reused references deduped to a + * single definition, and orphan definitions dropped. 
+ * + * Since #345 the markdown parse runs through the canonical package + * (`normalizeForeignMarkdown` -> `markdownToProseMirror`), which owns this + * canonicalization: the input's GFM `[^id]` reference footnotes are normalized to + * inline `^[…]`, and the parser assigns fresh sequential ids (`fn-*`) in + * reference order while merging identical bodies — so we assert by definition + * BODY order, not by the source labels. `canonicalizeFootnotes` remains wired as + * an idempotent safety net (issue #228) and is a no-op on this already-canonical + * output. * * The DB/ydoc side-effects are stubbed: `getNewPagePosition` (DB query) and * `createYdoc` (Yjs encode) are spied, and `pageRepo.insertPage` captures the @@ -67,24 +73,14 @@ function makeService() { } /** List the footnote-definition ids of the (single) footnotesList, in order. */ -function footnoteListIds(content: any): string[] { +/** Definition body texts of the (single) footnotesList, in list order. */ +function footnoteListBodies(content: any): string[] { const list = (content.content ?? []).find( (n: any) => n.type === 'footnotesList', ); - if (!list) return []; - return (list.content ?? []) + return (list?.content ?? []) .filter((n: any) => n.type === 'footnoteDefinition') - .map((n: any) => n.attrs?.id); -} - -function definitionText(content: any, id: string): string | undefined { - const list = (content.content ?? []).find( - (n: any) => n.type === 'footnotesList', - ); - const def = (list?.content ?? 
[]).find( - (n: any) => n.type === 'footnoteDefinition' && n.attrs?.id === id, - ); - return def?.content?.[0]?.content?.[0]?.text; + .map((n: any) => n.content?.[0]?.content?.[0]?.text); } describe('ImportService.importPage — footnote canonicalization (#228)', () => { @@ -101,23 +97,23 @@ describe('ImportService.importPage — footnote canonicalization (#228)', () => const content = getCaptured().content; expect(content).toBeTruthy(); - // Reference order is c, a, b (NOT the markdown definition order a, b, c). - expect(footnoteListIds(content)).toEqual(['c', 'a', 'b']); - - // Definitions preserved and attached to the right ids. - expect(definitionText(content, 'c')).toBe('note C'); - expect(definitionText(content, 'a')).toBe('note A'); - expect(definitionText(content, 'b')).toBe('note B'); + // Definitions ordered by FIRST REFERENCE (C, A, B) — NOT the markdown + // definition order (A, B, C) — with the orphan [^z] dropped and the reused + // [^a] collapsed to a single definition. (Ids are the parser's fresh `fn-*`, + // so we pin the BODIES.) + expect(footnoteListBodies(content)).toEqual(['note C', 'note A', 'note B']); // Orphan definition [^z] is dropped. - expect(footnoteListIds(content)).not.toContain('z'); + expect(footnoteListBodies(content)).not.toContain('orphan note'); // Reused [^a] yields exactly ONE definition, and exactly one list. const lists = (content.content ?? []).filter( (n: any) => n.type === 'footnotesList', ); expect(lists).toHaveLength(1); - expect(footnoteListIds(content).filter((id) => id === 'a')).toHaveLength(1); + expect( + footnoteListBodies(content).filter((b) => b === 'note A'), + ).toHaveLength(1); }); it('is idempotent: canonicalizing the stored output again is a no-op', async () => { @@ -134,6 +130,6 @@ describe('ImportService.importPage — footnote canonicalization (#228)', () => // time must not change it (safe to wire into every write path). 
const second = canonicalizeFootnotes(stored); expect(second).toEqual(stored); - expect(footnoteListIds(second)).toEqual(['c', 'a', 'b']); + expect(footnoteListBodies(second)).toEqual(['note C', 'note A', 'note B']); }); }); diff --git a/apps/server/src/integrations/import/services/import.service.ts b/apps/server/src/integrations/import/services/import.service.ts index 75418e55..dd86d71e 100644 --- a/apps/server/src/integrations/import/services/import.service.ts +++ b/apps/server/src/integrations/import/services/import.service.ts @@ -17,7 +17,9 @@ import { import { generateJitteredKeyBetween } from 'fractional-indexing-jittered'; import { TiptapTransformer } from '@hocuspocus/transformer'; import * as Y from 'yjs'; -import { markdownToHtml, canonicalizeFootnotes } from '@docmost/editor-ext'; +import { canonicalizeFootnotes } from '@docmost/editor-ext'; +import { markdownToProseMirror } from '@docmost/prosemirror-markdown'; +import { normalizeForeignMarkdown } from '../utils/foreign-markdown'; import { FileTaskStatus, FileTaskType, @@ -85,11 +87,13 @@ export class ImportService { const extracted = this.extractTitleAndRemoveHeading(prosemirrorState); const title = extracted.title; - // Imported markdown/HTML is built via markdownToHtml -> htmlToJson, which - // never runs the editor's footnoteSyncPlugin, so the footnote topology keeps - // the source's PHYSICAL definition order (out of order vs. references), - // retains orphan definitions, and is not deduped. Canonicalize before - // persisting so the stored page matches the editor's invariant (issue #228). + // The markdown path now canonicalizes footnotes itself (the package parser), + // but the HTML path (processHTML -> htmlToJson) does NOT run the editor's + // footnoteSyncPlugin, so an imported HTML doc can keep its source's PHYSICAL + // definition order (out of order vs. references), retain orphan definitions, + // and not be deduped. 
Canonicalize before persisting so the stored page + // matches the editor's invariant (issue #228); it is an idempotent no-op on + // the already-canonical markdown output. // Pure + idempotent + shape-safe: a doc with no footnotes is unchanged. // (Future consolidation, architecture B: this import path persists directly // via pageRepo.insertPage rather than through PageService.createPage, so the @@ -133,12 +137,15 @@ export class ImportService { } async processMarkdown(markdownInput: string): Promise { - try { - const html = await markdownToHtml(markdownInput); - return this.processHTML(html); - } catch (err) { - throw err; - } + // Canonical markdown -> ProseMirror JSON directly via + // `@docmost/prosemirror-markdown` (issue #345) — no HTML intermediate and no + // second editor-ext markdown layer. Foreign markdown surfaces the strict + // canonical parser does not accept (GFM `[^id]` reference footnotes) are + // rewritten to the canonical inline form by `normalizeForeignMarkdown` first. + // The HTML-cleanup pass (`normalizeImportHtml`) is intentionally skipped here: + // it targets foreign *HTML* (Notion/XWiki), which only ever arrives on the + // `.html` path (`processHTML`), never as canonical markdown. + return markdownToProseMirror(normalizeForeignMarkdown(markdownInput)); } async processHTML(htmlInput: string): Promise { diff --git a/apps/server/src/integrations/import/utils/foreign-markdown.spec.ts b/apps/server/src/integrations/import/utils/foreign-markdown.spec.ts new file mode 100644 index 00000000..fb22f1dd --- /dev/null +++ b/apps/server/src/integrations/import/utils/foreign-markdown.spec.ts @@ -0,0 +1,113 @@ +import { + convertProseMirrorToMarkdown, + markdownToProseMirror, +} from '@docmost/prosemirror-markdown'; +import { normalizeForeignMarkdown } from './foreign-markdown'; + +/** + * STEP 2 goldens for issue #345: the foreign-markdown normalizer that runs at the + * import boundary BEFORE the strict canonical parser (`markdownToProseMirror`). 
+ * + * Two layers: + * 1. PURE string→string cases pinning the normalizer's own behavior (GFM + * reference footnotes → inline `^[…]`). + * 2. END-TO-END acceptance: for a foreign corpus, `normalizeForeignMarkdown` + * then `markdownToProseMirror` then `convertProseMirrorToMarkdown` must leave + * NO literal `[^id]` / `:::` garbage in the document and must re-export in the + * canonical forms. + */ + +describe('normalizeForeignMarkdown — GFM reference footnotes', () => { + it('inlines a single-line reference footnote and drops its definition', () => { + const out = normalizeForeignMarkdown( + 'A note[^1] here.\n\n[^1]: The definition.', + ); + expect(out).toBe('A note^[The definition.] here.\n'); + }); + + it('inlines every reference to a reused id (downstream dedups)', () => { + const out = normalizeForeignMarkdown( + 'X[^a] and Y[^a].\n\n[^a]: shared.', + ); + expect(out).toBe('X^[shared.] and Y^[shared.].\n'); + }); + + it('joins indented continuation lines of a definition with a space', () => { + const out = normalizeForeignMarkdown( + 'See[^n].\n\n[^n]: line one\n line two', + ); + expect(out).toBe('See^[line one line two].\n'); + }); + + it('never rewrites a reference inside a fenced code block', () => { + const out = normalizeForeignMarkdown( + '```\ncode[^1] here\n```\n\n[^1]: def.', + ); + expect(out).toContain('code[^1] here'); + // The (now orphaned) definition line is still removed. 
+ expect(out).not.toContain('[^1]: def.'); + }); + + it('leaves a reference with no matching definition literal (no body to inline)', () => { + const out = normalizeForeignMarkdown('Dangling[^x] ref.'); + expect(out).toBe('Dangling[^x] ref.'); + }); + + it('returns the input unchanged when there are no reference footnotes', () => { + const md = '# Title\n\nJust text with `inline code` and a [link](/x).'; + expect(normalizeForeignMarkdown(md)).toBe(md); + }); + + it('does NOT touch callout surfaces — the canonical parser handles them', () => { + const callouts = ':::info\nHi\n:::\n\n> [!warning]\n> Careful'; + expect(normalizeForeignMarkdown(callouts)).toBe(callouts); + }); +}); + +describe('foreign markdown import acceptance (normalizer + canonical parser)', () => { + const FOREIGN = [ + '# Doc', + '', + 'Body refs [^c] and [^a] and [^b] and again [^a].', + '', + ':::info', + 'A legacy callout.', + ':::', + '', + '| h1 | h2 |', + '| --- | --- |', + '| 1 | 2 |', + '', + '[^a]: note A', + '[^b]: note B', + '[^c]: note C', + '[^z]: orphan note', + ].join('\n'); + + it('leaves no literal [^id] or ::: in the imported doc and re-exports canonically', async () => { + const normalized = normalizeForeignMarkdown(FOREIGN); + const doc = await markdownToProseMirror(normalized); + const reexport = convertProseMirrorToMarkdown(doc); + + // No foreign garbage leaks into the document. + expect(reexport).not.toMatch(/\[\^/); // no reference footnote refs/defs + expect(reexport).not.toContain(':::'); // no legacy callout fences + + // Canonical forms are present. + expect(reexport).toContain('^[note C]'); + expect(reexport).toContain('> [!info]'); + expect(reexport).toContain('| h1 | h2 |'); + + // Footnotes: ordered by first reference (C, A, B), reused [^a] deduped to one, + // orphan [^z] dropped (it had no reference after normalization). 
/**
 * Foreign-markdown normalizer — an input-liberal / output-canonical adapter that
 * runs at the IMPORT boundary, BEFORE the canonical parser
 * (`markdownToProseMirror` from `@docmost/prosemirror-markdown`).
 *
 * The canonical parser is deliberately STRICT: it only understands Docmost's
 * canonical markdown surface (Obsidian-style `> [!type]` callouts, Pandoc/Obsidian
 * inline footnotes `^[body]`, `![alt](src)` images, …). Import, however, ingests
 * FOREIGN files (GitHub/GFM, Notion, old Docmost exports). Those use surfaces the
 * canonical parser does not accept, most notably GitHub-flavoured *reference*
 * footnotes:
 *
 *     Text with a note[^1] and another[^long].
 *
 *     [^1]: The first definition.
 *     [^long]: A second one.
 *
 * Left untouched, the parser does NOT recognise `[^id]` (it only parses `^[body]`),
 * so the reference leaks as literal text — and worse, the trailing `[^id]: def`
 * line is a valid CommonMark *link-reference definition*, so `[^id]` is silently
 * rendered as a bogus link. This normalizer rewrites reference footnotes into the
 * canonical inline form so the parser materialises real footnote nodes.
 *
 * This is a TEXT pre-pass, NOT a second parser fork: it does not re-implement any
 * converter logic. Callout surfaces (`:::type` and `> [!type]`) are intentionally
 * NOT touched here — the canonical parser already accepts BOTH natively (its
 * `preprocessCallouts` pass), so normalizing them would be redundant and would
 * only risk degrading the parser's nesting/code-fence-aware handling.
 */

/** Matches a fenced code block delimiter (``` or ~~~), capturing the marker run. */
const CODE_FENCE_RE = /^(\s*)(`{3,}|~{3,})/;

/**
 * Matches a GFM footnote DEFINITION line: `[^id]: body`. The id is any run of
 * non-`]` characters; the body is the remainder of the line (possibly empty).
 *
 * The trailing `\r?` matters: lines are produced by splitting on `\n`, so a CRLF
 * file leaves a `\r` at the end of every line, and `.` never matches `\r`.
 * Without it no definition of a Windows-authored file would ever be recognised.
 */
const FOOTNOTE_DEF_RE = /^\[\^([^\]]+)\]:[ \t]?(.*)\r?$/;

/**
 * Returns the fence marker run (e.g. "```" or "~~~~") when `line` is a code-fence
 * delimiter, or `null` when it is not.
 */
function fenceMarker(line: string): string | null {
  const match = CODE_FENCE_RE.exec(line);
  return match?.[2] ?? null;
}

/**
 * True when `marker` closes the fence opened by `openFence`: same fence
 * character and at least as long as the opening run.
 */
function closesFence(marker: string, openFence: string): boolean {
  return (
    marker.charAt(0) === openFence.charAt(0) &&
    marker.length >= openFence.length
  );
}

/** True when a line is indented (leading space/tab) and not blank — a continuation. */
function isIndentedContinuation(line: string): boolean {
  return /^[ \t]+\S/.test(line);
}

/** Escapes every regular-expression metacharacter in `value`. */
function escapeRegExp(value: string): string {
  return value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

/**
 * Convert GFM reference footnotes (`[^id]` + `[^id]: def`) into canonical inline
 * footnotes (`^[def]`).
 *
 * - Definitions are collected first (a leading `[^id]: text` line plus any
 *   immediately-following indented continuation lines, joined with a space) and
 *   removed from the output. For a duplicated id the last definition wins.
 * - Each in-text reference `[^id]` for which a definition was found is replaced by
 *   `^[def]`. References with no matching definition are left literal (there is
 *   no body to inline).
 * - Code fences are respected: `[^id]` inside a ``` / ~~~ block is never
 *   rewritten, and a `[^id]:` line inside a fence is never treated as a
 *   definition.
 * - Line endings of surviving lines are preserved as-is (a CRLF document stays
 *   CRLF); when the document has no definitions the input is returned untouched.
 *
 * Deduplication / reference-ordering / orphan-dropping of the resulting footnotes
 * is left to the downstream parser; this pass only changes the surface syntax.
 *
 * @param markdown Foreign markdown source.
 * @returns The markdown with reference footnotes rewritten to the inline form.
 */
function convertReferenceFootnotes(markdown: string): string {
  const lines = markdown.split('\n');

  // Pass 1: collect definitions, mark their lines for removal, and remember
  // which lines are fence delimiters / fenced code so pass 2 can copy them
  // verbatim without re-deriving the fence state.
  const defs = new Map<string, string>();
  const dropped = new Array<boolean>(lines.length).fill(false);
  const verbatim = new Array<boolean>(lines.length).fill(false);
  let openFence: string | null = null;

  for (let i = 0; i < lines.length; i++) {
    const line = lines[i] ?? '';
    const marker = fenceMarker(line);

    if (openFence !== null) {
      verbatim[i] = true;
      if (marker !== null && closesFence(marker, openFence)) {
        openFence = null;
      }
      continue;
    }
    if (marker !== null) {
      verbatim[i] = true;
      openFence = marker;
      continue;
    }

    const def = FOOTNOTE_DEF_RE.exec(line);
    if (!def) continue;

    const id = def[1] ?? '';
    const bodyParts: string[] = [(def[2] ?? '').trim()];
    dropped[i] = true;

    // Consume immediately-following indented continuation lines (GFM lazy
    // continuation is not supported by design — keep it simple and predictable).
    let j = i + 1;
    while (j < lines.length && isIndentedContinuation(lines[j] ?? '')) {
      bodyParts.push((lines[j] ?? '').trim());
      dropped[j] = true;
      j++;
    }
    i = j - 1;

    // Last definition wins for a duplicated id (matches CommonMark link-ref
    // semantics closely enough for a foreign-input adapter).
    defs.set(id, bodyParts.filter((part) => part.length > 0).join(' '));
  }

  if (defs.size === 0) {
    return markdown;
  }

  // Compile one pattern per definition ONCE (not once per line). References are
  // substituted id by id in definition order, exactly as before.
  const rewrites = Array.from(defs, ([id, body]) => ({
    pattern: new RegExp('\\[\\^' + escapeRegExp(id) + '\\]', 'g'),
    inline: `^[${body}]`,
  }));

  // Pass 2: rewrite in-text references, skipping fenced code and dropped lines.
  const out: string[] = [];
  for (let i = 0; i < lines.length; i++) {
    if (dropped[i]) continue;
    let line = lines[i] ?? '';
    if (!verbatim[i]) {
      for (const { pattern, inline } of rewrites) {
        // A replacer FUNCTION, not a replacement string: footnote bodies are
        // arbitrary user text, and a string replacement would expand `$$`,
        // `$&`, `$1`, … found inside them instead of copying them literally.
        line = line.replace(pattern, () => inline);
      }
    }
    out.push(line);
  }

  return out.join('\n');
}

/**
 * Normalize a foreign markdown string into Docmost's canonical markdown surface
 * so the strict canonical parser accepts it. Currently this rewrites GFM
 * reference footnotes into inline footnotes; add further fixture-driven
 * foreign-surface cases here as they are found.
 *
 * @param markdown Foreign markdown source; empty input is returned unchanged.
 * @returns The normalized markdown.
 */
export function normalizeForeignMarkdown(markdown: string): string {
  if (!markdown) return markdown;
  return convertReferenceFootnotes(markdown);
}
+// +// The server export/import code paths transitively import editor-ext, whose node +// extensions import from `@tiptap/react`. The real module re-exports all of +// `@tiptap/core` (headless, safe under node) AND adds React view helpers +// (`ReactNodeViewRenderer`, …) that eagerly pull in react-dom — which throws +// `navigator is not defined` under jest's node environment. +// +// So this stub DELEGATES to the real `@tiptap/core` (keeping `mergeAttributes`, +// `Node`, `Mark`, `nodeInputRule`, … working — they are used by +// `jsonToHtml`/`htmlToJson` on the server) and overrides ONLY the React view +// helpers with no-ops. Those helpers are referenced solely inside `addNodeView()` +// — code that runs only in a live browser editor, never on the server; if any +// were actually invoked here it would (correctly) surface as a test failure. +const core = require('@tiptap/core'); + module.exports = { + ...core, ReactNodeViewRenderer: () => () => ({}), NodeViewWrapper: () => null, NodeViewContent: () => null,