diff --git a/packages/prosemirror-markdown/test/fixtures/counterexamples/columns-column-width-percent.json b/packages/prosemirror-markdown/test/fixtures/counterexamples/columns-column-width-percent.json new file mode 100644 index 00000000..c7af7dce --- /dev/null +++ b/packages/prosemirror-markdown/test/fixtures/counterexamples/columns-column-width-percent.json @@ -0,0 +1,16 @@ +{ + "_bug": "BUG #351: a `column` whose `width` is a percentage string (e.g. \"50%\") is NOT byte-stable across export->import->export (violates P2). The `column` schema's parseHTML does `parseFloat(getAttribute('data-width'))`, which silently drops the '%' unit and returns the NUMBER 50. So the first export emits data-width=\"50%\" but the re-import stores width=50, and the second export emits data-width=\"50\": md2 !== md1, a permanent GS-EDIT-REVERT churn (every git-sync pull rewrites the column width). The editor authors column widths as percentages, so this is a real data/round-trip defect. Fix belongs in src/lib/docmost-schema.ts column.width parseHTML (preserve the unit / keep the string), which is OUT OF SCOPE for this test-only PR and must be a separate, maintainer-approved change. 
This flat generator therefore keeps `column.width` frozen (never generates a non-default width).", + "doc": { + "type": "doc", + "content": [ + { + "type": "columns", + "attrs": { "layout": "two_equal", "widthMode": "normal" }, + "content": [ + { "type": "column", "attrs": { "width": "50%" }, "content": [{ "type": "paragraph", "content": [{ "type": "text", "text": "L" }] }] }, + { "type": "column", "attrs": { "width": "50%" }, "content": [{ "type": "paragraph", "content": [{ "type": "text", "text": "R" }] }] } + ] + } + ] + } +} diff --git a/packages/prosemirror-markdown/test/fixtures/counterexamples/ordered-list-start.json b/packages/prosemirror-markdown/test/fixtures/counterexamples/ordered-list-start.json new file mode 100644 index 00000000..36f3de9d --- /dev/null +++ b/packages/prosemirror-markdown/test/fixtures/counterexamples/ordered-list-start.json @@ -0,0 +1,19 @@ +{ + "doc": { + "type": "doc", + "content": [ + { + "type": "orderedList", + "attrs": { "type": null, "start": 5 }, + "content": [ + { + "type": "listItem", + "content": [ + { "type": "paragraph", "content": [{ "type": "text", "text": "alpha" }] } + ] + } + ] + } + ] + } +} diff --git a/packages/prosemirror-markdown/test/generative/attr-arbitraries.ts b/packages/prosemirror-markdown/test/generative/attr-arbitraries.ts new file mode 100644 index 00000000..50e66a5b --- /dev/null +++ b/packages/prosemirror-markdown/test/generative/attr-arbitraries.ts @@ -0,0 +1,294 @@ +/** + * Schema-DERIVED attribute-state fast-check arbitraries (#351, PR 1). + * + * This GENERALIZES the #350 stability-matrix helper (roundtrip-stability.helper.ts) + * to fast-check. 
Where that helper sweeps a HAND-WRITTEN 2-state matrix for one + * node spec, this module reads the attribute list straight from + * `schema.nodes[type].spec.attrs` (never a hand list) and, per attribute, + * generates over the FOUR states the issue calls for: + * + * - `absent` : the attribute is OMITTED entirely (the empty-string-vs- + * absent churn class the #350 fix targets). + * - `default` : the schema default value, authored explicitly. + * - `nonDefault` : a representative legal non-default value. + * - `degenerate` : `""` for strings, `0`/negative for numbers, the flipped + * value for booleans (only `""` is wired up here so far). + * + * ── Why a per-attribute override table ────────────────────────────────────── + * Everything that CAN be derived generically from the default's runtime type is + * (booleans flip; `degen` is NOT derived, it is set per attribute). But two facts + * force a small, DOCUMENTED override table: + * + * 1. CONSTRAINED domains the schema does not encode. `image.align ∈ + * {left,center,right}`, `heading.level ∈ 1..6`, `callout.type ∈ + * {info,success,warning,danger}`, `columns.layout`, table-cell `align`, + * `status.color`, `orderedList.start ≥ 1`, etc. A generic "default + 1" + * would emit an ILLEGAL value, so these get an explicit legal domain. + * 2. ROUND-TRIP-safety, established EMPIRICALLY by probing the live converter + * (the classification captured in flat-roundtrip.property.test.ts). A frozen + * attribute falls into ONE of TWO explicitly-distinguished classes — never a + * silent "it just doesn't round-trip": + * + * (a) ACCEPTED LIMITATION — the attribute has NO markdown representation, + * so the loss is inherent to targeting markdown, not a converter + * defect. GFM/CommonMark simply cannot encode it. These: `paragraph`/ + * `heading` `indent`, `callout.icon`, `orderedList.type` (a/A/i + * markers), table `colspan`/`rowspan`/`colwidth`/`backgroundColor(Name)` + * (GFM tables are span-less/style-less). Each is tagged + * `// ACCEPTED:` inline. 
Freezing them is correct — there is nothing to + * preserve in the target format. + * + * (b) PINNED BUG — the attribute IS representable in markdown but the + * converter drops it anyway (a real defect). These are NOT silently + * frozen: each is captured as a LOUD `it.fails` counterexample in + * test/fixtures/counterexamples/ + counterexamples.test.ts, and the + * freeze here only keeps the P1/P2 union green until a MAINTAINER rules + * on accept-vs-fix (the epic guardrail reserves that call). These: + * `column.width` (parseFloat drops `%`), `orderedList.start` (non-1 + * start renders as `1.`). Tagged `// PINNED-BUG:` inline. + * - Several non-null-default attrs are MATERIALIZED on import but are not + * in canonicalize's KNOWN_DEFAULTS (`callout.type`, `status.color`, + * table `colspan`/`rowspan`, `columns.layout`/`widthMode`, + * `embed.width`/`height`, `heading.level`, `taskItem.checked`, + * `details.open`, `subpages.recursive`, `orderedList.start`). If left + * `absent` they re-materialize as a non-canonical default and diverge + * under P1. We mark them `always` so they are authored explicitly. + * - The documented numeric→string coercion set (`width height size + * aspectRatio`) is generated as STRINGS for the media family (a stored + * number re-parses as a string), EXCEPT `embed.width/height` which the + * embed schema keeps numeric — handled per-attr. + * + * Both PINNED-BUG attrs (`column.width` P2 churn, `orderedList.start` P1 loss) + * are captured as committed `it.fails` counterexamples — NOT hidden here. + */ +import fc from 'fast-check'; +import { getSchema } from '@tiptap/core'; +import { docmostExtensions } from '../../src/lib/index.js'; +import { phraseArb, letterPhraseArb, urlArb } from './text-arbitraries.js'; + +/** The exact ProseMirror schema the converter targets. */ +export const schema = getSchema(docmostExtensions as any); + +/** Sentinel: this attribute is OMITTED (the `absent` state). 
*/ +export const ABSENT = Symbol('ABSENT'); + +/** The documented numeric→string coercion set (issue + roundtrip-stability.helper). */ +export const NUMERIC_STRING_ATTRS = ['width', 'height', 'size', 'aspectRatio']; + +/** Read the schema default for every attribute of a node type. */ +export function schemaAttrDefaults(type: string): Record { + const specAttrs = (schema.nodes[type]?.spec?.attrs ?? {}) as Record< + string, + { default: unknown } + >; + const out: Record = {}; + for (const [k, v] of Object.entries(specAttrs)) out[k] = v.default; + return out; +} + +/** Attribute names for a node type, straight from the schema (never hand-listed). */ +export function schemaAttrNames(type: string): string[] { + return Object.keys((schema.nodes[type]?.spec?.attrs ?? {}) as object); +} + +/** + * Per-attribute policy. Everything unlisted falls back to a generic policy: + * - a BOOLEAN default is fuzzable (its non-default is the flipped value); + * - any other default is `frozen` (only `absent`/`default` are generated) so + * we never invent an unverified non-default that might not round-trip. + * Listed attrs override this with a legal `arb` domain and/or flags. + */ +interface AttrPolicy { + /** Arbitrary for the `nonDefault` state's value. */ + arb?: fc.Arbitrary; + /** Value for the `degenerate` state (fuzz mode only). Omit to skip degenerate. */ + degen?: unknown; + /** Never emit `absent` — the attr must be authored (materialized default class). */ + always?: boolean; + /** Never emit the schema default value (required-ish attrs like `src`). Implies always. */ + noDefault?: boolean; + /** Never emit non-default/degenerate — attr has no md representation or churns. */ + frozen?: boolean; +} + +const num = (...xs: number[]) => fc.constantFrom(...xs); +const str = (...xs: string[]) => fc.constantFrom(...xs); +const widthStr = str('120', '320', '640'); + +// The documented override table, keyed `type.attr`. 
Every entry is grounded in +// the empirical converter probe (see flat-roundtrip.property.test.ts header). +const OVERRIDES: Record = { + // ── block text containers ──────────────────────────────────────────────── + // 'left' is the IMPLICIT default alignment: the converter drops it on export + // (empirically confirmed), so it never round-trips. Only center/right/justify + // carry through the `` comment. + 'paragraph.textAlign': { arb: str('center', 'right', 'justify') }, + 'paragraph.indent': { frozen: true }, // ACCEPTED: no md representation + 'heading.level': { always: true, arb: num(2, 3, 4, 5, 6) }, + 'heading.textAlign': { arb: str('center', 'right', 'justify') }, + 'heading.indent': { frozen: true }, // ACCEPTED: no md representation + // ── lists ──────────────────────────────────────────────────────────────── + // PINNED-BUG: markdown CAN express a non-1 start ("5."), but the converter + // renders "1." and drops it -> P1 loss. See counterexamples.test.ts + // (ordered-list-start.json). Frozen only until the maintainer rules accept-vs-fix. 
+ 'orderedList.start': { always: true, frozen: true }, + 'orderedList.type': { frozen: true }, // ACCEPTED: a/A/i markers not expressible in GFM + 'taskItem.checked': { always: true, arb: fc.constant(true) }, // boolean, default false + // ── codeBlock ──────────────────────────────────────────────────────────── + 'codeBlock.language': { arb: str('js', 'ts', 'python', 'go', 'rust', 'bash') }, + // ── image / media (numeric→string width family) ────────────────────────── + 'image.src': { noDefault: true, arb: urlArb, degen: '' }, + 'image.align': { arb: str('left', 'right') }, + 'image.alt': { arb: letterPhraseArb, degen: '' }, + 'image.title': { arb: letterPhraseArb }, + 'image.width': { arb: widthStr, degen: '' }, + 'image.height': { arb: widthStr, degen: '' }, + 'video.src': { noDefault: true, arb: urlArb, degen: '' }, + 'video.alt': { arb: letterPhraseArb }, + 'video.width': { arb: widthStr }, + 'video.height': { arb: widthStr }, + 'audio.src': { noDefault: true, arb: urlArb, degen: '' }, + 'youtube.src': { noDefault: true, arb: urlArb }, + 'pdf.src': { noDefault: true, arb: urlArb }, + 'pdf.name': { arb: phraseArb }, + 'drawio.src': { noDefault: true, arb: urlArb }, + 'excalidraw.src': { noDefault: true, arb: urlArb }, + 'attachment.url': { noDefault: true, arb: urlArb }, + 'attachment.name': { arb: phraseArb }, + // ── callout / status ───────────────────────────────────────────────────── + 'callout.type': { always: true, arb: str('success', 'warning', 'danger') }, + 'callout.icon': { frozen: true }, // ACCEPTED: no md representation (dropped on export) + 'status.text': { noDefault: true, arb: phraseArb, degen: '' }, + 'status.color': { always: true, arb: str('green', 'orange', 'red', 'blue', 'yellow', 'purple') }, + // ── table cells — ACCEPTED: GFM tables cannot express spans / bg / colwidth ─ + 'tableCell.colspan': { always: true, frozen: true }, + 'tableCell.rowspan': { always: true, frozen: true }, + 'tableCell.colwidth': { frozen: true }, + 
'tableCell.backgroundColor': { frozen: true }, + 'tableCell.backgroundColorName': { frozen: true }, + 'tableCell.align': { arb: str('left', 'center', 'right') }, + 'tableHeader.colspan': { always: true, frozen: true }, + 'tableHeader.rowspan': { always: true, frozen: true }, + 'tableHeader.colwidth': { frozen: true }, + 'tableHeader.backgroundColor': { frozen: true }, + 'tableHeader.backgroundColorName': { frozen: true }, + 'tableHeader.align': { arb: str('left', 'center', 'right') }, + // ── details ────────────────────────────────────────────────────────────── + 'details.open': { always: true, arb: fc.constant(true) }, // boolean, default false + // ── columns ────────────────────────────────────────────────────────────── + 'columns.layout': { always: true, arb: str('three_equal', 'left_sidebar', 'right_sidebar') }, + // widthMode round-trips via the `data-width-mode` attribute (verified P1+P2), + // so it is fuzzed, not frozen. + 'columns.widthMode': { always: true, arb: str('custom') }, + // PINNED-BUG: parseFloat import drops the `%` unit -> P2 churn. See + // counterexamples.test.ts (columns-column-width-percent.json). 
+ 'column.width': { frozen: true }, + // ── embed (schema keeps width/height NUMERIC, not string-coerced) ───────── + 'embed.src': { noDefault: true, arb: urlArb, degen: '' }, + 'embed.provider': { noDefault: true, arb: str('iframe', 'youtube', 'vimeo') }, + 'embed.width': { always: true, frozen: true }, + 'embed.height': { always: true, frozen: true }, + // ── subpages / math / htmlEmbed ────────────────────────────────────────── + 'subpages.recursive': { always: true, arb: fc.constant(true) }, // boolean, default false + 'mathBlock.text': { noDefault: true, arb: str('x^2', 'a < b', '\\frac{1}{2}'), degen: '' }, + 'mathInline.text': { noDefault: true, arb: str('x^2', 'a < b', '\\frac{1}{2}'), degen: '' }, + 'htmlEmbed.source': { noDefault: true, arb: str('hi', 'x', 'y'), degen: '' }, + 'htmlEmbed.height': { arb: num(200, 300, 400) }, + // ── footnotes / transclusion / pageEmbed / mention ─────────────────────── + 'footnoteDefinition.id': { noDefault: true, arb: str('fn1', 'fn2', 'note') }, + 'footnoteReference.id': { noDefault: true, arb: str('fn1', 'fn2', 'note') }, + 'pageEmbed.sourcePageId': { noDefault: true, arb: fc.uuid() }, + 'transclusionSource.id': { noDefault: true, arb: str('src1', 'src2') }, + 'transclusionReference.sourcePageId': { noDefault: true, arb: fc.uuid() }, + 'transclusionReference.transclusionId': { noDefault: true, arb: str('tr1', 'tr2') }, + 'mention.id': { noDefault: true, arb: fc.uuid() }, + 'mention.label': { noDefault: true, arb: phraseArb }, + 'mention.entityType': { noDefault: true, arb: str('user') }, + 'mention.entityId': { noDefault: true, arb: fc.uuid() }, +}; + +/** Resolve the effective policy for one attribute (override merged over generic). 
*/ +function policyFor(type: string, attr: string, def: unknown): AttrPolicy { + const override = OVERRIDES[`${type}.${attr}`]; + if (override) return override; // used as-is: it REPLACES the generic policy (no field merge) + // Generic: booleans are fuzzable via their flipped value; everything else is + // frozen (only absent/default) so no unverified non-default is invented. + if (typeof def === 'boolean') return { arb: fc.constant(!def) }; + return { frozen: true }; +} + +/** + * Whether an attribute is actually exercised at a NON-DEFAULT value: its policy + * is not `frozen` AND has an `arb` (the condition attrValueArb uses to emit the + * nonDefault state; the generic fallback has an `arb` only for booleans). The + * attribute-coverage snapshot test uses this to make the generic-frozen space + * VISIBLE: it pins exactly which attrs are only tested at absent/default and + * forces a reviewer to look when a new attr lands in that invisible bucket. + */ +export function attrIsValueFuzzed(type: string, attr: string): boolean { + const policy = policyFor(type, attr, schemaAttrDefaults(type)[attr]); + return !policy.frozen && policy.arb !== undefined; +} + +/** Every `type.attr` in the schema (excluding the auto `id`), sorted. */ +export function allSchemaAttrKeys(): string[] { + const keys: string[] = []; + for (const type of Object.keys(schema.nodes)) { + for (const attr of schemaAttrNames(type)) { + if (attr === 'id') continue; + keys.push(`${type}.${attr}`); + } + } + return keys.sort(); +} + +export type AttrMode = 'p1' | 'fuzz'; + +/** + * Build an arbitrary for ONE attribute's value (or the ABSENT sentinel) across + * the states legal for `mode`: + * - p1 : absent / default / nonDefault (the round-trip-safe space). + * - fuzz : the above PLUS degenerate (P2 tolerates the one-time + * normalization; P3 only needs totality). 
+ */ +export function attrValueArb( + type: string, + attr: string, + mode: AttrMode, +): fc.Arbitrary { + const def = schemaAttrDefaults(type)[attr]; + const p = policyFor(type, attr, def); + + const states: fc.Arbitrary[] = []; + if (!p.always && !p.noDefault) states.push(fc.constant(ABSENT)); + if (!p.noDefault) states.push(fc.constant(def)); + if (!p.frozen && p.arb) states.push(p.arb); + if (mode === 'fuzz' && !p.frozen && p.degen !== undefined) { + states.push(fc.constant(p.degen)); + } + if (states.length === 0) states.push(fc.constant(def)); + return fc.oneof(...states); +} + +/** + * Build an arbitrary for a node's full `attrs` object over all schema attrs. + * `base` pins caller-required attrs (e.g. a concrete `src`) verbatim; any attr + * present in `base` is NOT re-generated. Omitted (ABSENT) attrs are dropped. + */ +export function nodeAttrsArb( + type: string, + mode: AttrMode, + base: Record = {}, +): fc.Arbitrary> { + const names = schemaAttrNames(type).filter((n) => !(n in base) && n !== 'id'); + if (names.length === 0) return fc.constant({ ...base }); + return fc + .tuple(...names.map((n) => attrValueArb(type, n, mode))) + .map((vals) => { + const attrs: Record = { ...base }; + names.forEach((n, i) => { + if (vals[i] !== ABSENT) attrs[n] = vals[i]; + }); + return attrs; + }); +} diff --git a/packages/prosemirror-markdown/test/generative/counterexamples.test.ts b/packages/prosemirror-markdown/test/generative/counterexamples.test.ts new file mode 100644 index 00000000..4d6afd40 --- /dev/null +++ b/packages/prosemirror-markdown/test/generative/counterexamples.test.ts @@ -0,0 +1,74 @@ +import { describe, expect, it } from 'vitest'; +import { readFileSync } from 'node:fs'; +import { fileURLToPath } from 'node:url'; +import path from 'node:path'; +import { convertProseMirrorToMarkdown } from '../../src/lib/markdown-converter.js'; +import { markdownToProseMirror } from '../../src/lib/markdown-to-prosemirror.js'; +import { docsCanonicallyEqual } from 
'../../src/lib/canonicalize.js'; + +// --------------------------------------------------------------------------- +// #351 committed counterexamples — REAL round-trip bugs surfaced by the flat +// generative probing (attribute level). Each is pinned here as an `it.fails` +// (vitest reports it green ONLY WHILE the body still throws — for ANY reason, +// even a missing fixture), so the day the underlying src/ bug is fixed the body +// stops throwing and vitest turns this test RED — forcing us to delete the +// counterexample and (per the epic guardrail) tighten the generator. A bare `it.fails` +// would ship silent corruption, so every case below carries a loud `// BUG #351:` explanation. +// +// These bugs are NOT worked around by weakening any property: the offending +// attribute is kept OUT of the P1/P2 generators (documented in +// attr-arbitraries.ts), and the exact failing document lives here as the +// regression pin. FIXING the bug is a separate, maintainer-approved src/ change. +// --------------------------------------------------------------------------- + +const here = path.dirname(fileURLToPath(import.meta.url)); +const fixtureDir = path.resolve(here, '../fixtures/counterexamples'); + +function loadDoc(file: string): any { + return JSON.parse(readFileSync(path.join(fixtureDir, file), 'utf8')).doc; +} + +describe('#351 counterexamples (known round-trip bugs, pinned as it.fails)', () => { + // BUG #351: a `column` with a PERCENTAGE width ("50%") is not byte-stable. + // The column schema parses `data-width` with parseFloat, dropping the '%': + // md1 = '...data-width="50%"...' (first export) + // re-import stores width = 50 (number) + // md2 = '...data-width="50"...' (second export) => md2 !== md1 + // A permanent GS-EDIT-REVERT churn on every git-sync pull. The editor stores + // column widths as percentages, so this is a genuine defect. The fix is in + // src/lib/docmost-schema.ts (column.width parseHTML must preserve the unit) + // and is out of scope for this test-only PR. 
+ it.fails('column percentage width is byte-stable (P2)', async () => { + const doc = loadDoc('columns-column-width-percent.json'); + const md1 = convertProseMirrorToMarkdown(doc); + const doc2 = await markdownToProseMirror(md1); + const md2 = convertProseMirrorToMarkdown(doc2); + // This assertion currently FAILS (md2 drops the '%'), which is exactly what + // `it.fails` expects. When the schema is fixed, it will PASS and flip this + // test red — our cue to remove the pin. + expect(md2).toBe(md1); + }); + + // BUG #351: an `orderedList` with a non-1 `start` loses its start number. + // CommonMark CAN express this ("5." starts the list at 5), but the converter + // always emits "1." and ignores `attrs.start` (markdown-converter.ts renders + // `${index + 1}.`; the
    HTML path also omits `start`): + // doc.start = 5 -> md1 = "1. alpha" (start dropped on export) + // re-import stores start = 1 => docsCanonicallyEqual(rt, doc) === false + // This is a P1 (semantic round-trip) loss of the SAME class as column.width: + // representable in markdown, silently dropped by the converter. It is pinned + // here as the LOUD counterexample rather than being masked as an "accepted + // normalization" in the generator — per the epic guardrail, deciding + // accept-vs-fix for a markdown-representable loss is a MAINTAINER call, so this + // stays a visible known-bug until the maintainer rules on it. The fix would be + // in src/lib/markdown-converter.ts (emit the start number on the first item) + // and is out of scope for this test-only PR. + it.fails('ordered list start number is preserved (P1)', async () => { + const doc = loadDoc('ordered-list-start.json'); + const md1 = convertProseMirrorToMarkdown(doc); + const doc2 = await markdownToProseMirror(md1); + // Currently FAILS: doc2.start === 1 while doc.start === 5. When the converter + // preserves `start`, this PASSES and flips the test red — remove the pin then. + expect(docsCanonicallyEqual(doc2, doc)).toBe(true); + }); +}); diff --git a/packages/prosemirror-markdown/test/generative/flat-roundtrip.property.test.ts b/packages/prosemirror-markdown/test/generative/flat-roundtrip.property.test.ts new file mode 100644 index 00000000..8543bc4a --- /dev/null +++ b/packages/prosemirror-markdown/test/generative/flat-roundtrip.property.test.ts @@ -0,0 +1,231 @@ +import { describe, expect, it, vi } from 'vitest'; +import fc from 'fast-check'; +// Real converter, imported the same way the sibling property test does. +import { convertProseMirrorToMarkdown } from '../../src/lib/markdown-converter.js'; +// Importing markdownToProseMirror mutates the global DOM via jsdom at module +// load (expected, required for @tiptap/html's generateJSON under Node). 
+import { markdownToProseMirror } from '../../src/lib/markdown-to-prosemirror.js'; +import { docsCanonicallyEqual, canonicalizeContent } from '../../src/lib/index.js'; +import { firstDivergence } from '../roundtrip-helpers.js'; +import { + schema, + allSchemaAttrKeys, + attrIsValueFuzzed, +} from './attr-arbitraries.js'; +import { + buildGenerators, + coveredTypes, + KNOWN_UNCOVERED, +} from './node-generators.js'; + +// ── Attribute-value coverage allowlist ────────────────────────────────────── +// The node/mark completeness contract guarantees every TYPE is generated, but +// NOT that every attribute is exercised at a NON-DEFAULT value. An attribute +// with no `arb` in attr-arbitraries.ts is only ever tested at absent/default — +// an INVISIBLE coverage hole (the reviewer's concern). This allowlist makes that +// hole EXPLICIT: it is the exact set of attrs deliberately not value-fuzzed, so +// a NEW attribute (or a newly-frozen one) that lands in this bucket flips the +// snapshot test red and forces a reviewer to classify it. Each belongs to one of: +// - internal/opaque ids & placeholders (attachmentId, slugId, placeholder, +// creatorId, anchorId) — no meaningful non-default to assert; +// - dimensions & other media-family attrs with no standalone md form here +// (aspectRatio, size, caption, drawio/excalidraw/pdf/video/youtube w/h/align) +// — round-trip candidates deferred to a later PR, not silently dropped; +// - ACCEPTED limitations with no md representation (indent, callout.icon, +// orderedList.type, table spans/bg/colwidth); +// - PINNED bugs (column.width, orderedList.start) tracked in +// counterexamples.test.ts. 
+const ATTR_VALUE_FUZZ_ALLOWLIST = new Set([ + 'attachment.attachmentId', 'attachment.mime', 'attachment.placeholder', 'attachment.size', + 'audio.attachmentId', 'audio.placeholder', 'audio.size', + 'callout.icon', 'column.width', + 'drawio.align', 'drawio.alt', 'drawio.aspectRatio', 'drawio.attachmentId', + 'drawio.height', 'drawio.size', 'drawio.title', 'drawio.width', + 'embed.align', 'embed.height', 'embed.width', + 'excalidraw.align', 'excalidraw.alt', 'excalidraw.aspectRatio', 'excalidraw.attachmentId', + 'excalidraw.height', 'excalidraw.size', 'excalidraw.title', 'excalidraw.width', + 'heading.indent', + 'image.aspectRatio', 'image.attachmentId', 'image.caption', 'image.placeholder', 'image.size', + 'mention.anchorId', 'mention.creatorId', 'mention.slugId', + 'orderedList.start', 'orderedList.type', 'paragraph.indent', + 'pdf.attachmentId', 'pdf.height', 'pdf.placeholder', 'pdf.size', 'pdf.width', + 'tableCell.backgroundColor', 'tableCell.backgroundColorName', 'tableCell.colspan', + 'tableCell.colwidth', 'tableCell.rowspan', + 'tableHeader.backgroundColor', 'tableHeader.backgroundColorName', 'tableHeader.colspan', + 'tableHeader.colwidth', 'tableHeader.rowspan', + 'video.align', 'video.aspectRatio', 'video.attachmentId', 'video.placeholder', 'video.size', + 'youtube.align', 'youtube.height', 'youtube.width', +]); + +// Each run does a real convert + marked + jsdom parse (~ms). Give ample headroom +// so the suite is deterministic regardless of parallel worker load (like the +// sibling property file). +vi.setConfig({ testTimeout: 30000 }); + +// --------------------------------------------------------------------------- +// #351 PR 1 — GENERATIVE (property-based) round-trip over FLAT (single-node) +// documents at the ATTRIBUTE level. 
+// +// We assert three invariants for ANY generated valid flat document `d` +// (pmToMd = convertProseMirrorToMarkdown, mdToPm = markdownToProseMirror): +// +// P1 — semantic round-trip (nothing lost): +// docsCanonicallyEqual(await mdToPm(pmToMd(d)), d) === true +// P2 — byte fixpoint (anti "GS-EDIT-REVERT" churn): +// pmToMd(await mdToPm(pmToMd(d))) === pmToMd(d) +// P3 — totality: neither converter throws; bounded. +// +// The generators are schema-DERIVED (attribute lists come from +// schema.nodes[type].spec.attrs) and stay inside the round-trip-supported space +// proven empirically by probing the live converter (see attr-arbitraries.ts and +// text-arbitraries.ts). P1 runs over the safe attribute space; P2/P3 run over +// the wider 'fuzz' space that also injects degenerate attribute states, which +// P2 tolerates via a one-time first-pass normalization and P3 via totality only. +// --------------------------------------------------------------------------- + +// Fixed seed so every failure is reproducible; fast-check also prints the +// shrunk counterexample. numRuns starts modest to keep CI under budget — the +// issue's CI target is ~300-500 per property; the nightly / PR 3 will crank +// this up further. Each property runs over the UNION (fc.oneof) of all flat +// node generators, so the runs are shared across node types (one test per +// property keeps the jsdom import cost and memory bounded — a per-generator × +// per-property matrix is ~200 heavy tests that OOMs the worker). +const SEED = 20250705; +const NUM_RUNS = 300; + +const P1_GENERATORS = buildGenerators('p1'); +const FUZZ_GENERATORS = buildGenerators('fuzz'); + +// Union arbitraries: a single draw picks one node generator, then a document +// from it. On failure fast-check prints the shrunk counterexample doc, which +// names the offending node type directly. 
const p1Union = fc.oneof(...P1_GENERATORS.map((g) => g.arb));
const fuzzUnion = fc.oneof(...FUZZ_GENERATORS.map((g) => g.arb));

/**
 * Run one export -> import -> export cycle.
 *
 * @param doc - ProseMirror document JSON to start from.
 * @returns `md1` (first export), `doc2` (re-import of `md1`) and `md2`
 *   (export of `doc2`).
 */
async function roundTrip(doc: unknown): Promise<{ md1: string; md2: string; doc2: any }> {
  const md1 = convertProseMirrorToMarkdown(doc);
  const doc2 = await markdownToProseMirror(md1);
  const md2 = convertProseMirrorToMarkdown(doc2);
  return { md1, md2, doc2 };
}

/**
 * True when `name` is an OWN key of KNOWN_UNCOVERED.
 *
 * The `in` operator also walks the prototype chain, so a schema type named
 * e.g. `constructor` or `toString` would be treated as allowlisted without
 * ever having an allowlist row.
 */
const isAllowlisted = (name: string): boolean =>
  Object.prototype.hasOwnProperty.call(KNOWN_UNCOVERED, name);

/**
 * Ask `attrIsValueFuzzed` about one `allSchemaAttrKeys()` entry. The key is
 * split at its FIRST dot; the part before it and the part after it are passed
 * as the two arguments.
 *
 * @throws Error when the key contains no dot — without this guard
 *   `indexOf` returns -1 and the two `slice` calls silently produce a
 *   truncated first argument and the whole key as the second.
 */
function keyIsValueFuzzed(key: string) {
  const dot = key.indexOf('.');
  if (dot < 0) throw new Error(`malformed schema attr key (no "."): ${key}`);
  return attrIsValueFuzzed(key.slice(0, dot), key.slice(dot + 1));
}

describe('#351 flat generative round-trip — completeness contract', () => {
  it('every schema node and mark is covered by a generator or explicitly allowlisted', () => {
    const covered = coveredTypes();
    const uncovered: string[] = [];

    for (const nodeType of Object.keys(schema.nodes)) {
      if (covered.has(nodeType)) continue;
      if (isAllowlisted(nodeType)) continue;
      uncovered.push(`node:${nodeType}`);
    }
    for (const markType of Object.keys(schema.marks)) {
      // Marks are recorded by coveredTypes() under a `mark:` prefix, but are
      // allowlisted by their bare name.
      if (covered.has(`mark:${markType}`)) continue;
      if (isAllowlisted(markType)) continue;
      uncovered.push(`mark:${markType}`);
    }

    // A new node/mark added to the schema with no generator AND no allowlist
    // entry MUST turn this test red — that is the whole point (no silent blind
    // spots).
    expect(
      uncovered,
      `these schema types have no generator and no KNOWN_UNCOVERED reason:\n ${uncovered.join(
        '\n ',
      )}`,
    ).toEqual([]);
  });

  it('every KNOWN_UNCOVERED entry is a real schema type (no stale allowlist rows)', () => {
    const all = new Set([...Object.keys(schema.nodes), ...Object.keys(schema.marks)]);
    for (const t of Object.keys(KNOWN_UNCOVERED)) {
      expect(all.has(t), `stale KNOWN_UNCOVERED entry: ${t}`).toBe(true);
    }
  });

  it('every attribute is value-fuzzed OR explicitly allowlisted (no invisible hole)', () => {
    // Makes the "generic-frozen" coverage hole VISIBLE: any schema attr not
    // exercised at a non-default value must be a KNOWN entry in the allowlist.
    // A new attr (or one that loses its `arb`) that falls into the not-fuzzed
    // bucket without an allowlist row turns this red — no silent blind spots.
    const unaccounted: string[] = [];
    for (const key of allSchemaAttrKeys()) {
      const fuzzed = keyIsValueFuzzed(key);
      if (!fuzzed && !ATTR_VALUE_FUZZ_ALLOWLIST.has(key)) unaccounted.push(key);
    }
    expect(
      unaccounted,
      `these attrs are not value-fuzzed and not in ATTR_VALUE_FUZZ_ALLOWLIST:\n ${unaccounted.join(
        '\n ',
      )}`,
    ).toEqual([]);
  });

  it('the attribute allowlist has no stale rows (every entry is really not-fuzzed)', () => {
    const notFuzzed = new Set(allSchemaAttrKeys().filter((key) => !keyIsValueFuzzed(key)));
    for (const key of ATTR_VALUE_FUZZ_ALLOWLIST) {
      expect(
        notFuzzed.has(key),
        `stale allowlist row (attr is now value-fuzzed, remove it): ${key}`,
      ).toBe(true);
    }
  });
});

describe('#351 flat generative round-trip — properties', () => {
  it('generator validity: every generated doc passes schema.check()', () => {
    // A generator that emits an invalid ProseMirror document is a GENERATOR bug.
    fc.assert(
      fc.property(fuzzUnion, (doc) => {
        schema.nodeFromJSON(doc).check(); // throws on an invalid doc
        return true;
      }),
      { numRuns: NUM_RUNS, seed: SEED },
    );
  });

  it('P1 — semantic round-trip: docsCanonicallyEqual(mdToPm(pmToMd(d)), d)', async () => {
    await fc.assert(
      fc.asyncProperty(p1Union, async (doc) => {
        const { doc2 } = await roundTrip(doc);
        if (!docsCanonicallyEqual(doc2, doc)) {
          // Surface the precise divergence in the failure message.
          const div = firstDivergence(
            JSON.parse(JSON.stringify(canonicalizeContent(doc2))),
            JSON.parse(JSON.stringify(canonicalizeContent(doc))),
          );
          throw new Error(
            `P1 divergence @ ${div?.path}: got=${JSON.stringify(div?.a)} want=${JSON.stringify(div?.b)}`,
          );
        }
      }),
      { numRuns: NUM_RUNS, seed: SEED },
    );
  });

  it('P2 — byte fixpoint: pmToMd(mdToPm(pmToMd(d))) === pmToMd(d)', async () => {
    await fc.assert(
      fc.asyncProperty(fuzzUnion, async (doc) => {
        const { md1, md2 } = await roundTrip(doc);
        expect(md2).toBe(md1);
      }),
      { numRuns: NUM_RUNS, seed: SEED },
    );
  });

  it('P3 — totality: neither converter throws', async () => {
    await fc.assert(
      fc.asyncProperty(fuzzUnion, async (doc) => {
        // Throwing here fails the property; fast-check shrinks to a minimal doc.
        await roundTrip(doc);
      }),
      { numRuns: NUM_RUNS, seed: SEED },
    );
  });
});
diff --git a/packages/prosemirror-markdown/test/generative/node-generators.ts b/packages/prosemirror-markdown/test/generative/node-generators.ts
new file mode 100644
index 00000000..f3d72881
--- /dev/null
+++ b/packages/prosemirror-markdown/test/generative/node-generators.ts
@@ -0,0 +1,310 @@
/**
 * Flat single-node document generators (#351, PR 1).
 *
 * For every schema node type that can stand alone, a fast-check arbitrary
 * producing `{ type:'doc', content:[ node ] }` with generated attrs
 * (via nodeAttrsArb) and the minimal REQUIRED immediate children the schema
 * demands (a heading's inline text, a listItem's one paragraph, a table's
 * minimal rows, details' summary+content, a callout's one paragraph). Kept
 * FLAT: a single target node, no deep nesting — nested structural generation is
 * PR 2.
 *
 * The `mode` threads through to the attribute arbitraries:
 * - 'p1' : the round-trip-safe attribute space (P1 semantic round-trip).
 * - 'fuzz' : adds degenerate attribute states (P2 byte-fixpoint tolerates the
 *   one-time normalization; P3 only needs totality).
+ * + * A COMPLETENESS CONTRACT (see flat-roundtrip.property.test.ts) enumerates the + * whole schema and asserts every node/mark is EITHER produced by a generator + * here OR listed in KNOWN_UNCOVERED with a reason — so a new schema type with no + * generator turns the suite RED. + */ +import fc from 'fast-check'; +import { type AttrMode, nodeAttrsArb } from './attr-arbitraries.js'; +import { + inlineContentArb, + headingInlineContentArb, + plainInlineContentArb, + phraseArb, + markedTextRunArb, +} from './text-arbitraries.js'; + +const doc = (node: any) => ({ type: 'doc', content: [node] }); +const para = (content: any[]) => ({ type: 'paragraph', content }); + +/** A named flat-document generator. */ +export interface NamedGen { + name: string; + arb: fc.Arbitrary; +} + +// --------------------------------------------------------------------------- +// Per-target generators, each a function of mode. +// --------------------------------------------------------------------------- + +const gen = { + paragraph: (m: AttrMode) => + fc.tuple(nodeAttrsArb('paragraph', m), inlineContentArb).map(([attrs, content]) => + doc({ type: 'paragraph', attrs, content }), + ), + + heading: (m: AttrMode) => + fc.tuple(nodeAttrsArb('heading', m), headingInlineContentArb).map(([attrs, content]) => + doc({ type: 'heading', attrs, content }), + ), + + blockquote: (_m: AttrMode) => + inlineContentArb.map((content) => doc({ type: 'blockquote', content: [para(content)] })), + + bulletList: (_m: AttrMode) => + fc + .array(inlineContentArb, { minLength: 1, maxLength: 3 }) + .map((items) => + doc({ + type: 'bulletList', + content: items.map((c) => ({ type: 'listItem', content: [para(c)] })), + }), + ), + + orderedList: (m: AttrMode) => + fc + .tuple(nodeAttrsArb('orderedList', m), fc.array(inlineContentArb, { minLength: 1, maxLength: 3 })) + .map(([attrs, items]) => + doc({ + type: 'orderedList', + attrs, + content: items.map((c) => ({ type: 'listItem', content: [para(c)] })), + }), + ), + + 
taskList: (m: AttrMode) => + fc + .array(fc.tuple(nodeAttrsArb('taskItem', m), inlineContentArb), { minLength: 1, maxLength: 3 }) + .map((items) => + doc({ + type: 'taskList', + content: items.map(([attrs, c]) => ({ type: 'taskItem', attrs, content: [para(c)] })), + }), + ), + + codeBlock: (m: AttrMode) => + fc + .tuple( + nodeAttrsArb('codeBlock', m), + // A fenced code block always re-imports with a TRAILING NEWLINE in its + // text (empirically confirmed). Author the newline so the doc is already + // at the round-trip fixpoint (supported-space shaping, not a masked bug). + fc.array(phraseArb, { minLength: 1, maxLength: 3 }).map((lines) => lines.join('\n') + '\n'), + ) + .map(([attrs, code]) => + doc({ type: 'codeBlock', attrs, content: [{ type: 'text', text: code }] }), + ), + + horizontalRule: (_m: AttrMode) => fc.constant(doc({ type: 'horizontalRule' })), + + pageBreak: (_m: AttrMode) => fc.constant(doc({ type: 'pageBreak' })), + + image: (m: AttrMode) => nodeAttrsArb('image', m).map((attrs) => doc({ type: 'image', attrs })), + + callout: (m: AttrMode) => + fc.tuple(nodeAttrsArb('callout', m), inlineContentArb).map(([attrs, content]) => + doc({ type: 'callout', attrs, content: [para(content)] }), + ), + + mathBlock: (m: AttrMode) => + nodeAttrsArb('mathBlock', m).map((attrs) => doc({ type: 'mathBlock', attrs })), + + details: (m: AttrMode) => + fc + .tuple(nodeAttrsArb('details', m), plainInlineContentArb, inlineContentArb) + .map(([attrs, summary, body]) => + doc({ + type: 'details', + attrs, + content: [ + { type: 'detailsSummary', content: summary }, + { type: 'detailsContent', content: [para(body)] }, + ], + }), + ), + + table: (_m: AttrMode) => + fc.integer({ min: 1, max: 3 }).chain((cols) => { + // GFM alignment is column-wide (encoded in the header separator), so a + // column's alignment must be identical on the header and every body cell, + // else the second export re-aligns and churns. Pick ONE align per column. 
+ const alignsArb = fc.array(fc.constantFrom(undefined, 'left', 'center', 'right'), { + minLength: cols, + maxLength: cols, + }); + const cell = (header: boolean, align?: string) => + phraseArb.map((t) => ({ + type: header ? 'tableHeader' : 'tableCell', + // colspan/rowspan pinned to 1 (GFM cannot express spans); optional + // column-consistent align. + attrs: { colspan: 1, rowspan: 1, ...(align ? { align } : {}) }, + content: [para([{ type: 'text', text: t }])], + })); + return alignsArb.chain((aligns) => { + const headerRow = fc + .tuple(...aligns.map((a) => cell(true, a))) + .map((cells) => ({ type: 'tableRow', content: cells })); + const bodyRow = fc + .tuple(...aligns.map((a) => cell(false, a))) + .map((cells) => ({ type: 'tableRow', content: cells })); + return fc + .tuple(headerRow, fc.array(bodyRow, { minLength: 1, maxLength: 2 })) + .map(([h, body]) => doc({ type: 'table', content: [h, ...body] })); + }); + }), + + columns: (m: AttrMode) => + // Couple the column count to the layout so the two stay consistent + // (two_equal/left_sidebar/right_sidebar -> 2, three_equal -> 3). + fc + .constantFrom('two_equal', 'three_equal', 'left_sidebar', 'right_sidebar') + .chain((layout) => { + const count = layout === 'three_equal' ? 
3 : 2; + return fc + .tuple( + nodeAttrsArb('columns', m, { layout, widthMode: 'normal' }), + fc.array(inlineContentArb, { minLength: count, maxLength: count }), + ) + .map(([attrs, bodies]) => + doc({ + type: 'columns', + attrs, + content: bodies.map((c) => ({ type: 'column', content: [para(c)] })), + }), + ); + }), + + subpages: (m: AttrMode) => + nodeAttrsArb('subpages', m).map((attrs) => doc({ type: 'subpages', attrs })), + + audio: (m: AttrMode) => nodeAttrsArb('audio', m).map((attrs) => doc({ type: 'audio', attrs })), + video: (m: AttrMode) => nodeAttrsArb('video', m).map((attrs) => doc({ type: 'video', attrs })), + pdf: (m: AttrMode) => nodeAttrsArb('pdf', m).map((attrs) => doc({ type: 'pdf', attrs })), + youtube: (m: AttrMode) => nodeAttrsArb('youtube', m).map((attrs) => doc({ type: 'youtube', attrs })), + embed: (m: AttrMode) => nodeAttrsArb('embed', m).map((attrs) => doc({ type: 'embed', attrs })), + drawio: (m: AttrMode) => nodeAttrsArb('drawio', m).map((attrs) => doc({ type: 'drawio', attrs })), + excalidraw: (m: AttrMode) => + nodeAttrsArb('excalidraw', m).map((attrs) => doc({ type: 'excalidraw', attrs })), + attachment: (m: AttrMode) => + nodeAttrsArb('attachment', m).map((attrs) => doc({ type: 'attachment', attrs })), + htmlEmbed: (m: AttrMode) => + nodeAttrsArb('htmlEmbed', m).map((attrs) => doc({ type: 'htmlEmbed', attrs })), + pageEmbed: (m: AttrMode) => + nodeAttrsArb('pageEmbed', m).map((attrs) => doc({ type: 'pageEmbed', attrs })), + transclusionReference: (m: AttrMode) => + nodeAttrsArb('transclusionReference', m).map((attrs) => + doc({ type: 'transclusionReference', attrs }), + ), + + transclusionSource: (m: AttrMode) => + fc.tuple(nodeAttrsArb('transclusionSource', m), inlineContentArb).map(([attrs, content]) => + doc({ type: 'transclusionSource', attrs, content: [para(content)] }), + ), + + // A footnote reference PLUS its definition (the reference has no standalone + // markdown form without its definition — see KNOWN_UNCOVERED note for 
the + // bare reference). Both carry the same id. The definition body uses + // headingInlineContentArb (NO hard breaks): a footnote is serialized inline as + // `^[...]`, so a hard break inside it collapses to a single space on re-parse + // (empirically confirmed) — that is the container's markdown limitation, not + // an attribute-level concern. The reference-bearing paragraph is a NORMAL + // paragraph and keeps the full inline corpus. + footnotes: (m: AttrMode) => + fc.tuple(fc.constantFrom('fn1', 'fn2', 'note'), inlineContentArb, headingInlineContentArb).map( + ([id, refText, noteBody]) => ({ + type: 'doc', + content: [ + para([...refText, { type: 'footnoteReference', attrs: { id } }]), + { + type: 'footnotesList', + content: [{ type: 'footnoteDefinition', attrs: { id }, content: [para(noteBody)] }], + }, + ], + }), + ), + + // ── inline targets wrapped in a paragraph ──────────────────────────────── + mention: (m: AttrMode) => + nodeAttrsArb('mention', m).map((attrs) => doc(para([{ type: 'mention', attrs }]))), + + mathInline: (m: AttrMode) => + fc.tuple(phraseArb, nodeAttrsArb('mathInline', m)).map(([t, attrs]) => + doc(para([{ type: 'text', text: t }, { type: 'mathInline', attrs }])), + ), + + status: (m: AttrMode) => + nodeAttrsArb('status', m).map((attrs) => doc(para([{ type: 'status', attrs }]))), + + hardBreak: (_m: AttrMode) => + fc.tuple(phraseArb, phraseArb).map(([a, b]) => + doc(para([{ type: 'text', text: a }, { type: 'hardBreak' }, { type: 'text', text: b }])), + ), + + // ── marks: a paragraph of marked runs (covers every mark type) ─────────── + marksOnText: (_m: AttrMode) => + fc.array(markedTextRunArb, { minLength: 1, maxLength: 5 }).map((runs) => { + // Merge adjacent same-mark runs (see text-arbitraries.normalizeInline). + const out: any[] = []; + for (const r of runs) { + const prev = out[out.length - 1]; + if (prev && JSON.stringify(prev.marks ?? []) === JSON.stringify(r.marks ?? 
[])) { + prev.text += r.text; + } else out.push({ ...r }); + } + return doc(para(out)); + }), +}; + +/** Build the full list of named generators for a given mode. */ +export function buildGenerators(mode: AttrMode): NamedGen[] { + return Object.entries(gen).map(([name, f]) => ({ name, arb: f(mode) })); +} + +// --------------------------------------------------------------------------- +// Completeness contract support. +// --------------------------------------------------------------------------- + +/** + * Schema node/mark types deliberately NOT covered by a P1/P2 generator, each + * with a one-line reason. Excluding a type means it is kept OUT of the round- + * trip generators — it does NOT weaken any property. + * + * NOTE (empirical): the candidates the issue flagged for review — pageEmbed, + * subpages, transclusionSource/Reference, mention, status — were PROBED against + * the live converter and DO round-trip P1/P2 with placeholder ids, so they are + * COVERED by real generators rather than allowlisted here. The allowlist below + * holds only types with no standalone flat generator by construction. + */ +export const KNOWN_UNCOVERED: Record = { + // The root node; it is the wrapper every generated doc already is, never a + // "target" content node, so it has no standalone generator of its own. + doc: 'the document root wrapper, not a content node with a standalone generator', +}; + +/** Recursively collect every node type and `mark:` under a tree. */ +export function collectTypes(node: any, seen = new Set()): Set { + if (!node || typeof node !== 'object') return seen; + if (node.type) seen.add(node.type); + for (const m of node.marks ?? []) if (m?.type) seen.add(`mark:${m.type}`); + for (const c of node.content ?? []) collectTypes(c, seen); + return seen; +} + +/** + * Sample every generator and return the union of node/mark types they produce. + * Deterministic (fixed seed) so the completeness contract is stable. 
+ */ +export function coveredTypes(seed = 12345, perGen = 60): Set { + const seen = new Set(); + for (const { arb } of buildGenerators('p1')) { + for (const sample of fc.sample(arb, { numRuns: perGen, seed })) { + collectTypes(sample, seen); + } + } + return seen; +} diff --git a/packages/prosemirror-markdown/test/generative/text-arbitraries.ts b/packages/prosemirror-markdown/test/generative/text-arbitraries.ts new file mode 100644 index 00000000..76353b76 --- /dev/null +++ b/packages/prosemirror-markdown/test/generative/text-arbitraries.ts @@ -0,0 +1,258 @@ +/** + * Hostile inline-text corpus for the generative flat-document round-trip suite + * (#351, PR 1). + * + * These arbitraries are a DIRECT PORT of the "supported space" guardrails that + * `test/markdown-roundtrip.property.test.ts` proved empirically against the live + * converter. That file's long header documents WHY each guardrail exists; rather + * than re-derive them, we reuse the exact same shapes here so the attribute-level + * generative suite inherits the same byte-stable text space. Each guardrail is + * cited back to that file below. + * + * The corpus deliberately spans the CommonMark / canon hostile alphabet + * (`* _ [ ] ( ) { } | < > & # ! ~ = + -`), unicode / emoji / RTL, and the legal + * mark combinations on runs (including the `code` mark, which the schema's + * `excludes: "_"` makes suppress every co-occurring mark — so it is never + * combined with another mark in the byte-stable space). + */ +import fc from 'fast-check'; + +// --------------------------------------------------------------------------- +// Words and the hostile special-character alphabet. +// (Ported from markdown-roundtrip.property.test.ts, "Inline text arbitraries".) +// --------------------------------------------------------------------------- + +/** Alphanumeric "word" (no markdown-significant characters). Length 1..6. 
 */
export const wordArb = fc
  .stringMatching(/^[A-Za-z0-9]{1,6}$/)
  // Belt-and-braces: the anchored `{1,6}` quantifier already rules out the
  // empty string, so this filter is not expected to reject anything.
  .filter((w) => w.length > 0);

/**
 * A SINGLE markdown-significant character, emitted only as an isolated,
 * space-flanked token. Every char the task calls out plus a few more; each was
 * verified byte-stable in this position by the sibling property test.
 *
 * NOTE: the backtick (`) is DELIBERATELY excluded from free-floating plain text
 * (it is a code-span delimiter that re-pairs globally). It is exercised only via
 * the `code` mark and code blocks — see markdown-roundtrip.property.test.ts.
 */
export const specialCharArb = fc.constantFrom(
  '*', '_', '[', ']', '(', ')', '{', '}', '|', '<', '>', '&', '#', '!', '~', '=', '+', '-',
);

// A pinch of unicode / emoji / RTL, always word-like (no markdown specials) so
// it stays inside the space-flanked corpus. Kept letter/emoji-bearing so it is
// never coerced to a number (see letterPhraseArb rationale).
export const unicodeWordArb = fc.constantFrom(
  'café', 'naïve', 'Zürich', 'Москва', 'こんにちは', '你好', '😀', '🚀x', 'مرحبا', 'שלום',
);

/**
 * A "safe special" text string: a space-joined sequence of tokens that always
 * BEGINS and ENDS with an alphanumeric word, with any isolated special chars (or
 * unicode words) confined to the MIDDLE, each space-flanked by words.
 *
 * Both boundary guarantees matter (verbatim from the sibling test):
 *   * Leading word: the line never opens with a block/inline trigger
 *     (">", "*", "-", "#", "1." ...).
 *   * Trailing word: adjacent text runs CONCATENATE with no separator, so a run
 *     ending in a bare "<" beside a run starting with a letter would form a fake
 *     HTML tag. Ending every run with a word keeps every special internal and
 *     space-flanked even after concatenation.
+ */ +export const safeTextArb: fc.Arbitrary = fc + .tuple( + wordArb, + fc.array(fc.oneof(wordArb, specialCharArb, unicodeWordArb), { + minLength: 0, + maxLength: 3, + }), + wordArb, + ) + .map(([first, middle, last]) => [first, ...middle, last].join(' ')); + +/** + * A plain alphanumeric phrase (1..3 words) for places where even isolated + * specials are not wanted (e.g. code-block language, mention labels, status + * text, table cells rendered on the plain-markdown path). + */ +export const phraseArb: fc.Arbitrary = fc + .array(wordArb, { minLength: 1, maxLength: 3 }) + .map((ws) => ws.join(' ')); + +/** + * A phrase guaranteed to contain at least one letter. Used for image/media alt + * text and link titles: a PURELY numeric alt/title (e.g. "0") is parsed back as + * a NUMBER and then dropped by the converter's `value || ""` coercion — not + * byte-stable. A letter anywhere keeps it a string. (Ported verbatim.) + */ +export const letterPhraseArb: fc.Arbitrary = fc + .tuple( + fc.stringMatching(/^[A-Za-z]{1,4}$/), + fc.array(wordArb, { minLength: 0, maxLength: 2 }), + ) + .map(([head, rest]) => [head, ...rest].join(' ')); + +/** A paren/space-free URL — safe inside markdown link/image `(...)` syntax. */ +export const urlArb: fc.Arbitrary = fc + .webUrl() + .filter((u) => !/[()\s]/.test(u)); + +// --------------------------------------------------------------------------- +// Marked inline runs. +// (Ported from markdown-roundtrip.property.test.ts "markedTextRunArb".) +// --------------------------------------------------------------------------- + +/** + * A text run with an OPTIONAL single non-code formatting mark (bold/italic/ + * strike/underline/superscript/subscript/spoiler), or a SOLE `code` mark, or a + * link, or an inline comment anchor. `code` is NEVER combined with another mark + * in the byte-stable space (that combination is a documented converter + * limitation — the schema's `code` mark declares `excludes: "_"`). 
Marks wrap + * `safeTextArb`, which stays stable even when it contains isolated specials. + * + * The mark set here is broadened past the sibling test's {bold,italic,strike} + * to also cover underline / superscript / subscript / spoiler / textStyle / + * highlight (all single, non-code marks), so the marks-on-text generator + * exercises every mark the schema declares except the deliberately-excluded + * `code`+other combination. + */ +export const markedTextRunArb: fc.Arbitrary = fc.oneof( + // Plain text. + safeTextArb.map((t) => ({ type: 'text', text: t })), + // Single formatting mark (attribute-free marks). + fc + .tuple( + safeTextArb, + fc.constantFrom('bold', 'italic', 'strike', 'underline', 'superscript', 'subscript', 'spoiler'), + ) + .map(([t, m]) => ({ type: 'text', text: t, marks: [{ type: m }] })), + // highlight with a color attr. + fc + .tuple(safeTextArb, fc.constantFrom('#ffcc00', '#a0e0ff', 'yellow')) + .map(([t, color]) => ({ type: 'text', text: t, marks: [{ type: 'highlight', attrs: { color } }] })), + // textStyle with a color attr. + fc + .tuple(safeTextArb, fc.constantFrom('#123456', '#ff0000', '#00aa88')) + .map(([t, color]) => ({ type: 'text', text: t, marks: [{ type: 'textStyle', attrs: { color } }] })), + // Sole code mark (backtick span). safeTextArb is backtick-free, so the span + // content cannot contain an inner backtick. + safeTextArb.map((t) => ({ type: 'text', text: t, marks: [{ type: 'code' }] })), + // Link with safe text, a paren/space-free href, optionally a letter-bearing + // title (a purely numeric title is coerced to a number and dropped). + fc + .tuple(phraseArb, urlArb, fc.option(letterPhraseArb, { nil: undefined })) + .map(([t, href, title]) => ({ + type: 'text', + text: t, + marks: [{ type: 'link', attrs: title ? { href, title } : { href } }], + })), + // Inline comment anchor: a span[data-comment-id] that must survive byte-for- + // byte. commentId is an alphanumeric token; `resolved` rides only when true. 
+ fc + .tuple(safeTextArb, fc.stringMatching(/^[A-Za-z0-9]{4,10}$/), fc.boolean()) + .map(([t, commentId, resolved]) => ({ + type: 'text', + text: t, + marks: [ + { type: 'comment', attrs: resolved ? { commentId, resolved: true } : { commentId } }, + ], + })), +); + +// --------------------------------------------------------------------------- +// Inline atoms and inline-content assembly. +// (Ported from markdown-roundtrip.property.test.ts.) +// --------------------------------------------------------------------------- + +/** Inline math node carrying LaTeX that includes the `a < b` the task asks for. */ +export const mathInlineArb: fc.Arbitrary = fc + .constantFrom('a < b', 'x^2 + y^2', 'a < b < c', '\\frac{1}{2}', 'E = mc^2') + .map((text) => ({ type: 'mathInline', attrs: { text } })); + +/** Mention node; label/id/entity are plain phrases / uuids. */ +export const mentionArb: fc.Arbitrary = fc + .tuple(phraseArb, fc.uuid(), fc.uuid()) + .map(([label, id, entityId]) => ({ + type: 'mention', + attrs: { id, label, entityType: 'user', entityId }, + })); + +export const hardBreakArb: fc.Arbitrary = fc.constant({ type: 'hardBreak' }); + +const sameMarks = (a: any[] | undefined, b: any[] | undefined): boolean => + JSON.stringify(a ?? []) === JSON.stringify(b ?? []); + +/** + * Canonicalize a generated inline-content array the way ProseMirror stores it, + * then trim the markdown-fragile edges. (Ported verbatim from + * markdown-roundtrip.property.test.ts "normalizeInline":) + * 1) MERGE adjacent text runs with IDENTICAL marks (the editor coalesces + * them; split same-mark runs export to ambiguous "**a****b**"). + * 2) Collapse CONSECUTIVE hard breaks (two render a blank line marked eats). + * 3) Drop a TRAILING hard break (removed by the converter's .trim()). 
+ */ +export function normalizeInline(nodes: any[]): any[] { + const out: any[] = []; + for (const node of nodes) { + const prev = out[out.length - 1]; + if (node.type === 'hardBreak' && prev && prev.type === 'hardBreak') continue; + if ( + node.type === 'text' && + prev && + prev.type === 'text' && + sameMarks(prev.marks, node.marks) + ) { + prev.text += node.text; + continue; + } + out.push(node.type === 'text' ? { ...node } : node); + } + while (out.length > 1 && out[out.length - 1].type === 'hardBreak') out.pop(); + return out; +} + +/** + * Inline content for a paragraph: at least one marked text run, optionally with + * inline atoms (math/mention) and hard breaks interspersed. Always starts with a + * text run so the paragraph never opens with a block trigger. (Ported.) + */ +export const inlineContentArb: fc.Arbitrary = fc + .tuple( + markedTextRunArb, + fc.array( + fc.oneof( + { weight: 5, arbitrary: markedTextRunArb }, + { weight: 1, arbitrary: mathInlineArb }, + { weight: 1, arbitrary: mentionArb }, + { weight: 1, arbitrary: hardBreakArb }, + ), + { minLength: 0, maxLength: 4 }, + ), + ) + .map(([first, rest]) => normalizeInline([first, ...rest])); + +/** + * Inline content for a HEADING — identical to a paragraph's, but WITHOUT hard + * breaks. A hard break inside an ATX heading is not byte-stable (marked splits + * the heading). (Ported.) + */ +export const headingInlineContentArb: fc.Arbitrary = fc + .tuple( + markedTextRunArb, + fc.array( + fc.oneof( + { weight: 5, arbitrary: markedTextRunArb }, + { weight: 1, arbitrary: mathInlineArb }, + { weight: 1, arbitrary: mentionArb }, + ), + { minLength: 0, maxLength: 4 }, + ), + ) + .map(([first, rest]) => normalizeInline([first, ...rest])); + +/** Simple plain-text inline content (single run) for containers rendered on the + * raw-HTML path (table cells / column bodies) where fancy inline is undesirable. 
*/ +export const plainInlineContentArb: fc.Arbitrary = phraseArb.map((t) => [ + { type: 'text', text: t }, +]);