fix(git-sync,converter): 3 HIGH round-trip/data-loss fixes found by QA (#359)

Implements the 3 HIGH fixes agent_qa found via the RALPH test cycle (analysis theirs,
implementation here), fixtures-first.

- T2E-1 [converter]: a multi-paragraph callout collapsed to one paragraph on round-trip.
  markdown-converter `case "callout"` now joins block children with "\n>\n" (a blank ">"
  separator), byte-identical to the blockquote serializer, so two paragraphs survive
  re-import. Fixture 12-callout-multiblock; two existing tests updated (their old
  expectations encoded the collapse bug).
- T6-listnest [converter]: a callout/blockquote nested in a list item corrupted on
  round-trip (callout type lost, `[!type]` leaked into text — confirmed DB corruption).
  Add bridgeNestedCallouts: a post-marked, nesting-agnostic JSDOM pass that reconstructs
  callouts from `[!type]`-opening blockquotes at ANY depth (the indented `  > [!type]`
  form the column-0-anchored preprocessor misses), emitting the same callout div as the
  top-level path (disjoint inputs, no double-processing). Strict `^[!type]` guard, so a
  real blockquote — or one with `[!` mid-line — is NOT converted. Fixture 13-callout-in-list
  + a no-lead control + false-positive controls (proven non-vacuous).
- D-P3-1 [git-sync]: a "ghost" file (a tracked id that was never a page) was silently
  absence-deleted on pull-reconcile -> data loss. Add GitSyncClient.pageIdsExist (datasource:
  workspace-scoped SELECT over any space incl. trashed/moved, non-UUID ids filtered first);
  cycle.ts computes the candidate-delete set and passes existing ids as `deletableIds` into
  planReconciliation, which now absence-deletes ONLY real page rows — a ghost is preserved.
  A cycle-level e2e test proves the ghost survives runCycle end-to-end.

Suites green: prosemirror-markdown 687, git-sync 272 (no type errors), datasource 36,
orchestrator 26; both packages tsc clean. Scope: the 3 fixes + tests/fixtures only, no
develop re-merge. T4E-esc (LOW escaping) left for the maintainer, per agent_qa.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-07-05 21:27:21 +03:00
parent 6956563961
commit 5ac08b8f4b
17 changed files with 657 additions and 23 deletions
@@ -114,6 +114,10 @@ function build(opts: BuildOptions = {}): Built {
// The read-side / write-side client the datasource hands back.
const client: Record<string, AnyMock> = {
listSpaceTree: jest.fn(async () => ({ pages: [], complete: true })),
// Historical behavior: every candidate id is a real page row. Present so the
// fake does not rely on `listTrackedFiles -> []` to avoid the ghost-guard
// probe, and won't break if a future test seeds tracked files.
pageIdsExist: jest.fn(async (ids: string[]) => ids),
deletePage: jest.fn(async () => undefined),
createPage: jest.fn(async () => undefined),
updatePageBody: jest.fn(async () => undefined),
@@ -200,6 +200,51 @@ describe('GitmostDataSourceService', () => {
});
});
// D-P3-1 ghost guard: the pull reconciler asks which candidate-delete ids are
// REAL page rows before deleting a tracked vault file. Only rows returned here
// may be absence-deleted; a ghost id (never a page) is preserved.
describe('pageIdsExist (D-P3-1 ghost guard)', () => {
const UUID_A = '11111111-1111-4111-8111-111111111111';
const UUID_B = '22222222-2222-4222-8222-222222222222';
it('returns the subset that are real page rows, workspace-scoped (any space/state)', async () => {
// The DB reports only UUID_A exists as a row.
const { service, mocks } = build([{ id: UUID_A }]);
const res = await service.bind(CTX).pageIdsExist([UUID_A, UUID_B]);
expect(res).toEqual([UUID_A]);
// The query is workspace-scoped and restricted to the candidate ids.
// Crucially there is NO deletedAt / spaceId filter: a trashed page or a
// page moved to another space still counts as "exists" (its file is
// cleaned up), while a ghost has no row (its file is preserved).
expect(mocks.db.selectFrom).toHaveBeenCalledWith('pages');
const qb = mocks.db.selectFrom.mock.results[0].value;
expect(qb.where).toHaveBeenCalledWith('workspaceId', '=', 'ws-1');
expect(qb.where).toHaveBeenCalledWith('id', 'in', [UUID_A, UUID_B]);
const filterCols = qb.where.mock.calls.map((c: any[]) => c[0]);
expect(filterCols).not.toContain('deletedAt');
expect(filterCols).not.toContain('spaceId');
});
it('filters malformed non-UUID ids BEFORE the predicate (and skips the query when none remain)', async () => {
// A mix of one valid uuid + malformed tokens: only the valid uuid reaches
// the predicate, so a bad `gitmost_id` cannot trigger a 22P02 that would
// reject the whole existence probe.
const { service, mocks } = build([]);
const res = await service
.bind(CTX)
.pageIdsExist([UUID_A, 'not-a-uuid', '[unclosed']);
const qb = mocks.db.selectFrom.mock.results[0].value;
expect(qb.where).toHaveBeenCalledWith('id', 'in', [UUID_A]);
expect(res).toEqual([]);
// When EVERY candidate is malformed, the query is skipped entirely.
const { service: s2, mocks: m2 } = build([]);
const res2 = await s2.bind(CTX).pageIdsExist(['not-a-uuid', '[unclosed']);
expect(res2).toEqual([]);
expect(m2.db.selectFrom).not.toHaveBeenCalled();
});
});
describe('getPageJson', () => {
it('returns the engine page shape with ISO updatedAt + content', async () => {
const { service, mocks } = build();
@@ -78,6 +78,7 @@ export class GitmostDataSourceService {
return {
listSpaceTree: (spaceId, rootPageId) =>
this.listSpaceTree(ctx, spaceId, rootPageId),
pageIdsExist: (pageIds) => this.pageIdsExist(ctx, pageIds),
getPageJson: (pageId) => this.getPageJson(ctx, pageId),
// The id-scoped WRITE ops are wrapped so a malformed (non-UUID) `pageId`
// from a broken vault `gitmost_id` frontmatter cannot wedge the space's sync
@@ -191,6 +192,36 @@ export class GitmostDataSourceService {
return { pages, complete: true };
}
/**
* Existence probe for the pull-side ghost guard (D-P3-1). Return the subset of
* `pageIds` that correspond to a REAL page ROW in this workspace — INCLUDING
* soft-deleted (trashed) rows and pages in ANY OTHER space (no `deletedAt` /
* `spaceId` filter), i.e. any id that has ever been a page. The engine only
* absence-deletes a tracked vault file whose id is returned here; a GHOST id
* (a git file whose id was never a page) has no row and is preserved.
*
* Malformed (non-UUID) ids are filtered out BEFORE the `id = ANY(...)`
* predicate: an invalid uuid literal would make Postgres reject the whole
* query (error 22P02), and a ghost id is often exactly such a hand-edited
* token. A filtered-out id is never returned, so its file is preserved.
*/
private async pageIdsExist(
ctx: GitSyncBindContext,
pageIds: string[],
): Promise<string[]> {
const validIds = pageIds.filter((id) => isValidUUID(id));
if (validIds.length === 0) return [];
const rows = await this.db
.selectFrom('pages')
.select(['id'])
.where('workspaceId', '=', ctx.workspaceId)
.where('id', 'in', validIds)
.execute();
return rows.map((row) => row.id);
}
/**
* One page WITH its ProseMirror body content (editor-ext schema). `updatedAt`
* is serialized to an ISO string for the loop-guard.
@@ -51,6 +51,23 @@ export interface GitSyncClient {
rootPageId?: string,
): Promise<{ pages: GitSyncPageNodeLite[]; complete: boolean }>;
/**
* Existence probe for the pull-side ghost guard (D-P3-1). Given a set of
* candidate pageIds, return the subset that corresponds to a REAL page ROW —
* INCLUDING soft-deleted (trashed) and pages in ANY OTHER space, i.e. any
* `id` that has ever been a page (`SELECT id FROM pages WHERE id = ANY(...)`,
* workspace-scoped). Malformed (non-UUID) ids are filtered out before the
* query and are never returned.
*
* The pull reconciler only absence-deletes a tracked file whose pageId is
* returned here: a deleted/moved/trashed page HAS a row -> its vault file is
* cleaned up; a GHOST id (a git file whose id was NEVER a page) has NO row ->
* it is preserved rather than silently deleted (the data-loss bug). Called on
* only the small candidate-delete set (tracked ids absent from the live tree),
* so the query is bounded and usually empty.
*/
pageIdsExist(pageIds: string[]): Promise<string[]>;
/**
* One page WITH its ProseMirror body content. `applyPullActions` reads
* `id`, `slugId`, `title`, `parentPageId`, `spaceId` (for the file meta) and
+28
View File
@@ -170,10 +170,38 @@ export async function runCycle(deps: RunCycleDeps): Promise<RunCycleResult> {
});
const tree = await client.listSpaceTree(spaceId);
// D-P3-1 ghost guard: an absence-delete must not silently remove a git file
// whose pageId was NEVER a page (a hand-authored file with an unknown id).
// Compute the candidate-delete set (tracked ids absent from the live tree)
// and ask the datasource which of them are REAL page rows (incl. trashed /
// other spaces). Only those may be absence-deleted; a ghost id is preserved
// (adopted/skipped by the push side).
//
// Size note: on a COMPLETE fetch this set is usually small/empty (only
// genuinely removed pages look absent), so the `id IN (...)` probe is cheap.
// On an INCOMPLETE fetch (`tree.complete === false`) MANY live pages look
// absent, so the set — and the probe — can be large. That is only a perf
// consideration, not a correctness one: `decideAbsenceDeletions` (inside
// `computePullActions`) still SUPPRESSES every absence delete on an
// incomplete fetch, so no ghost-guarded deletion is applied that cycle
// regardless of what the probe returns.
const livePageIds = new Set(
tree.pages.filter((p) => p && p.id).map((p) => p.id),
);
const candidateDeleteIds = existing
.map((e) => e.pageId)
.filter((id) => !livePageIds.has(id));
const deletableIds =
candidateDeleteIds.length > 0
? await client.pageIdsExist(candidateDeleteIds)
: [];
const pullActions = computePullActions({
pages: tree.pages,
treeComplete: tree.complete,
existing,
deletableIds,
});
// Bail before the first destructive write phase if the lock was lost.
+17 -3
View File
@@ -148,6 +148,14 @@ export interface PullActionsInput {
treeComplete: boolean;
/** Parsed tracked files: `{ pageId, relPath }` (from `readExisting`). */
existing: { pageId: string; relPath: string }[];
/**
* The subset of tracked pageIds that correspond to a REAL page row (D-P3-1
* ghost guard, from `client.pageIdsExist`). Only ids in this set may be
* absence-deleted; a ghost id (never a page) is preserved. When omitted, all
* absent ids are deletable (the historical behavior; pure unit callers that
* do not model ghosts).
*/
deletableIds?: string[];
}
/**
@@ -191,7 +199,7 @@ export interface PullActions {
* thin `applyPullActions`.
*/
export function computePullActions(input: PullActionsInput): PullActions {
const { pages, treeComplete, existing } = input;
const { pages, treeComplete, existing, deletableIds } = input;
const layout = buildVaultLayout(pages);
const live: LiveEntry[] = [];
@@ -206,8 +214,14 @@ export function computePullActions(input: PullActionsInput): PullActions {
}
// Plan reconciliation (pure). `plan.toDelete` is ABSENCE-based only;
// `plan.moved` carries move old-path removals separately.
const plan = planReconciliation(live, existing);
// `plan.moved` carries move old-path removals separately. The ghost guard
// (D-P3-1) gates absence-deletes to ids that are a real page row; when
// `deletableIds` is omitted, all absent ids are deletable (historical).
const plan = planReconciliation(
live,
existing,
deletableIds === undefined ? undefined : new Set(deletableIds),
);
// Decide whether the ABSENCE-based deletions may be applied this cycle
// (SPEC §8): incomplete-fetch suppression + empty-live + mass-delete guard.
+20 -2
View File
@@ -96,6 +96,16 @@ export interface ReconciliationPlan {
export function planReconciliation(
live: LiveEntry[],
existing: ExistingEntry[],
/**
* The subset of tracked pageIds that correspond to a REAL page row (D-P3-1
* ghost guard). When provided, a tracked file whose pageId is ABSENT from
* `live` is absence-deleted ONLY if its id is in this set — a deleted/moved/
* trashed page has a row (delete its stale vault file), while a GHOST id (a
* git file whose id was never a page) has NO row and is PRESERVED. When
* `undefined` (pure unit callers that do not model ghosts), every absent id is
* treated as deletable — the historical behavior.
*/
deletableIds?: ReadonlySet<string>,
): ReconciliationPlan {
// Desired path for each live pageId.
const liveByPageId = new Map<string, string>();
@@ -119,9 +129,17 @@ export function planReconciliation(
for (const ex of existing) {
const liveRel = liveByPageId.get(ex.pageId);
if (liveRel === undefined) {
// Tracked page is gone from the live tree -> absence delete.
// Tracked page is gone from the live tree -> candidate absence delete.
// D-P3-1: a candidate is only deleted when its id corresponds to a real
// page row (deleted/moved/trashed). A GHOST id (never a page) is NOT in
// `deletableIds` and is preserved rather than silently deleted. When the
// gate is not supplied (pure unit callers), fall back to the historical
// "all absent ids deletable" behavior.
const deletable = deletableIds === undefined || deletableIds.has(ex.pageId);
// Never queue a path a live page will (re)write (path reuse -> no loss).
if (!liveTargetPaths.has(ex.relPath)) toDeleteSet.add(ex.relPath);
if (deletable && !liveTargetPaths.has(ex.relPath)) {
toDeleteSet.add(ex.relPath);
}
continue;
}
if (liveRel !== ex.relPath) {
@@ -77,6 +77,8 @@ const nodeFs: CycleFs = {
function makeEmptyClientFake() {
return {
listSpaceTree: vi.fn(async () => ({ pages: [], complete: true })),
// Default: every candidate id is a real page row (historical behavior).
pageIdsExist: vi.fn(async (ids: string[]) => ids),
getPageJson: vi.fn(),
importPageMarkdown: vi.fn(async () => ({ updatedAt: "2026-06-20T00:00:00.000Z" })),
createPage: vi.fn(async (title: string) => ({
+87
View File
@@ -1,5 +1,6 @@
import { describe, it, expect, vi } from "vitest";
import { runCycle, type RunCycleDeps } from "../src/engine/cycle";
import { serializePageFile } from "@docmost/prosemirror-markdown";
// A fake VaultGit recording the staging calls. An EMPTY vault/tree lets the real
// readExisting/computePullActions/applyPullActions/runPush run trivially (no
@@ -46,6 +47,8 @@ function baseDeps(vault: any, over: Partial<RunCycleDeps> = {}): RunCycleDeps {
spaceId: "space-1",
client: {
listSpaceTree: vi.fn(async () => ({ pages: [], complete: true })),
// Default: every candidate id is a real page row (historical behavior).
pageIdsExist: vi.fn(async (ids: string[]) => ids),
getPageJson: vi.fn(),
importPageMarkdown: vi.fn(),
createPage: vi.fn(),
@@ -235,4 +238,88 @@ describe("runCycle (composition)", () => {
expect(deps.client.listSpaceTree).toHaveBeenCalledTimes(1);
expect(vault.diffNameStatus).not.toHaveBeenCalled();
});
// D-P3-1 ghost guard, end-to-end through runCycle: a tracked file whose id is
// absent from live AND is NOT returned by `pageIdsExist` (a ghost — never a
// page) must SURVIVE the whole cycle (no rm), while a sibling id that IS
// returned (a genuinely deleted page row) is absence-deleted. This proves the
// guard flows from pageIdsExist -> computePullActions -> applyPullActions,
// not just at the pure `planReconciliation` unit.
it("GHOST GUARD (e2e): a ghost tracked file SURVIVES runCycle; a real deleted-page file is removed", async () => {
const ghostId = "019f2500-dead-7000-8000-000000000009"; // never a page
const deletedId = "019f2500-0000-7000-8000-000000000001"; // a real (deleted) row
const liveId = "019f2500-0000-7000-8000-0000000000aa"; // keeps empty-live from firing
// Two tracked files, both ABSENT from the live tree (deletion candidates).
const vault = fakeVault({
listTrackedFiles: vi.fn(async () => ["Ghost.md", "Deleted.md"]),
});
// The datasource reports ONLY the deleted page as a real row; the ghost has
// no row (a PROPER SUBSET of the probed ids, not the identity shim).
const pageIdsExist = vi.fn(async (ids: string[]) =>
ids.filter((id) => id === deletedId),
);
const rm = vi.fn(async () => undefined);
const deps = baseDeps(vault, {
fs: {
readFile: vi.fn(async (absPath: string) => {
if (absPath.includes("Ghost.md"))
return serializePageFile(ghostId, "a body authored in git");
if (absPath.includes("Deleted.md"))
return serializePageFile(deletedId, "a deleted page body");
return "";
}),
writeFile: vi.fn(async () => undefined),
mkdir: vi.fn(async () => undefined),
rm,
lstat: vi.fn(async () => ({ isSymbolicLink: false })),
realpath: vi.fn(async (p: string) => p),
},
client: {
...baseDeps(vault).client,
// A single live page so the empty-live suppression does NOT fire (which
// would mask the guard by suppressing every delete this cycle).
listSpaceTree: vi.fn(async () => ({
pages: [
{
id: liveId,
slugId: "live",
title: "Live",
parentPageId: null,
position: "a0",
hasChildren: false,
},
],
complete: true,
})),
pageIdsExist,
getPageJson: vi.fn(async (pageId: string) => ({
id: pageId,
slugId: "live",
title: "Live",
parentPageId: null,
spaceId: "space-1",
updatedAt: "2026-06-20T00:00:00.000Z",
content: { type: "doc", content: [] },
})),
} as any,
});
const res = await runCycle(deps);
expect(res.ran).toBe(true);
// The guard probed the datasource for EXACTLY the absent candidate ids.
expect(pageIdsExist).toHaveBeenCalledTimes(1);
expect(new Set(pageIdsExist.mock.calls[0][0])).toEqual(
new Set([ghostId, deletedId]),
);
// The real deleted-page file is removed; the ghost file is PRESERVED.
const rmPaths = rm.mock.calls.map((c) => c[0] as string);
expect(rmPaths.some((p) => p.includes("Deleted.md"))).toBe(true);
expect(rmPaths.some((p) => p.includes("Ghost.md"))).toBe(false);
expect(res.pull?.deleted).toBe(1);
});
});
@@ -65,6 +65,16 @@ describe('GitSyncClient contract (type-level)', () => {
expect(true).toBe(true);
});
it('pageIdsExist(ids) -> ids subset (D-P3-1 ghost guard seam)', () => {
expectTypeOf<GitSyncClient['pageIdsExist']>().parameters.toEqualTypeOf<
[string[]]
>();
expectTypeOf<
Awaited<ReturnType<GitSyncClient['pageIdsExist']>>
>().toEqualTypeOf<string[]>();
expect(true).toBe(true);
});
it('a structurally-correct adapter satisfies GitSyncClient (drift => compile error)', () => {
// A minimal dummy adapter mirroring the EXACT result shapes the engine reads.
// The `satisfies GitSyncClient` clause is the contract guard: any drift in a
@@ -74,6 +84,7 @@ describe('GitSyncClient contract (type-level)', () => {
pages: [] as GitSyncPageNodeLite[],
complete: true,
}),
pageIdsExist: async (_pageIds: string[]) => [] as string[],
getPageJson: async (pageId: string) => ({
id: pageId,
slugId: 'slug',
@@ -130,6 +141,7 @@ describe('GitSyncClient contract (type-level)', () => {
// in BOTH directions).
const bad = {
listSpaceTree: async () => ({ pages: [] as GitSyncPageNodeLite[], complete: true }),
pageIdsExist: async () => [] as string[],
getPageJson: async (pageId: string) => ({
id: pageId,
slugId: 's',
+37
View File
@@ -59,6 +59,43 @@ describe('planReconciliation', () => {
expect(plan.moved).toEqual([]);
});
// D-P3-1 ghost guard: when `deletableIds` is supplied, an absence-delete fires
// ONLY for an id that is a REAL page row. A tracked file whose id was NEVER a
// page (a hand-authored git file with an unknown id) is absent from `live` AND
// absent from `deletableIds` -> it MUST be preserved, not silently deleted.
it('GHOST GUARD: an absent id NOT in deletableIds is PRESERVED (not deleted)', () => {
const live: LiveEntry[] = [{ pageId: 'p1', relPath: 'Space/Keep.md' }];
const existing: ExistingEntry[] = [
{ pageId: 'p1', relPath: 'Space/Keep.md' },
// A ghost: its id is not live and not a real page row.
{ pageId: 'ghost', relPath: 'Space/Ghost.md' },
// A genuinely deleted page: absent from live but IS a real row.
{ pageId: 'gone', relPath: 'Space/Gone.md' },
];
// Only the real page row ('gone') is deletable; 'ghost' has no row.
const deletableIds = new Set(['gone']);
const plan = planReconciliation(live, existing, deletableIds);
expect(plan.toWrite).toEqual([{ pageId: 'p1', relPath: 'Space/Keep.md' }]);
// The genuine delete is applied; the ghost file is preserved.
expect(plan.toDelete).toEqual(['Space/Gone.md']);
expect(plan.toDelete).not.toContain('Space/Ghost.md');
expect(plan.moved).toEqual([]);
});
// The empty gate is the maximally-safe case: no id is a real row, so NOTHING
// is absence-deleted (every absent tracked file is treated as a ghost). This
// is what a cycle sees when `pageIdsExist` returns nothing for the candidates.
it('GHOST GUARD: an EMPTY deletableIds set suppresses every absence delete', () => {
const live: LiveEntry[] = [{ pageId: 'p1', relPath: 'Space/Keep.md' }];
const existing: ExistingEntry[] = [
{ pageId: 'p1', relPath: 'Space/Keep.md' },
{ pageId: 'gone', relPath: 'Space/Gone.md' },
];
const plan = planReconciliation(live, existing, new Set<string>());
expect(plan.toDelete).toEqual([]);
expect(plan.moved).toEqual([]);
});
it('NO-OP: live and existing identical -> writes (re-emit) but no deletes/moves', () => {
const live: LiveEntry[] = [
{ pageId: 'p1', relPath: 'A.md' },
@@ -787,12 +787,18 @@ export function convertProseMirrorToMarkdown(
// blockquote-prefixed; a blank line becomes a bare `>` so the callout is
// not split.
const calloutType = (node.attrs?.type || "info").toLowerCase();
// Prefix EVERY line of EVERY child with "> " and separate block-level
// children with a blank ">" line (mirrors the blockquote serializer),
// so a multi-paragraph callout does not collapse into one paragraph on
// re-import.
const calloutBody = nodeContent
.map(processNode)
.join("\n")
.split("\n")
.map((l: string) => (l.length ? `> ${l}` : ">"))
.join("\n");
.map((n: any) =>
processNode(n)
.split("\n")
.map((line: string) => (line.length ? `> ${line}` : ">"))
.join("\n"),
)
.join("\n>\n");
return `> [!${calloutType}]\n${calloutBody}`;
}
@@ -548,6 +548,77 @@ function bridgeTaskLists(html: string): string {
return document.body.innerHTML;
}
/**
* Matches a callout marker at the START of a blockquote's first paragraph:
* `[!type]` plus an optional Obsidian title that runs to the end of that line.
* The whole match (marker + title + trailing newline) is stripped; the Docmost
* callout schema has no title slot, so exactly like the line-based
* `preprocessCallouts` opener the title is dropped. The `\w+` requires a bare
* `[!word]`, so a real blockquote whose text merely contains `[!` does not match.
*/
const CALLOUT_MARKER_HTML_RE = /^(\s*)\[!(\w+)\][^\n]*(?:\n|$)/;
/**
* Reconstruct callouts that survived `marked` as `[!type]`-prefixed blockquotes.
*
* `preprocessCallouts` recognizes a callout only when its `> [!type]` opener sits
* at column 0 (the regex is `^`-anchored). A callout NESTED inside a list item is
* exported INDENTED (` > [!info]`), so the line-based preprocessor cannot see it
* and `marked` renders it as an ordinary `<blockquote>` with the `[!type]` marker
* leaking into the body text losing the callout type and, for a lead-less list
* item, breaking the list marker.
*
* This post-`marked` DOM pass is NESTING-AGNOSTIC: it walks EVERY `<blockquote>`
* (at any depth inside `<li>`, inside another callout, ) whose first `<p>`
* opens with a `[!type]` marker and rewrites it into the same
* `<div data-type="callout" data-callout-type="TYPE">` element the top-level
* path emits, so a nested callout is parsed identically to a top-level one.
* Top-level callouts are already `<div>`s (converted before `marked`), so they
* are not `<blockquote>`s here and are left untouched.
*/
function bridgeNestedCallouts(html: string): string {
// Cheap early-out: no callout marker anywhere -> nothing to bridge.
if (!html.includes("[!")) {
return html;
}
// Defensive cap (consistent with the other post-marked passes): skip the
// bridge for pathologically large inputs rather than running a JSDOM parse.
if (html.length > MAX_CALLOUT_PREPROCESS_BYTES) {
return html;
}
const dom = new JSDOM(html);
const document = dom.window.document;
// Process deepest-first (document order reversed) so a callout nested inside
// another callout's blockquote is converted before its parent.
const blockquotes = Array.from(
document.querySelectorAll("blockquote"),
).reverse();
for (const bq of blockquotes) {
// marked wraps each blockquote text line in a <p>; the marker lives in the
// first direct <p> child.
const firstP = bq.querySelector(":scope > p");
if (!firstP) continue;
const match = firstP.innerHTML.match(CALLOUT_MARKER_HTML_RE);
if (!match) continue;
const type = match[2].toLowerCase();
// Strip the marker (and dropped title) from the first paragraph. If that
// leaves the paragraph empty (opener on its own line, body in later <p>s),
// remove the now-empty paragraph so the callout has no phantom lead block.
firstP.innerHTML = firstP.innerHTML.slice(match[0].length);
if (firstP.innerHTML.trim() === "") {
firstP.remove();
}
const div = document.createElement("div");
div.setAttribute("data-type", "callout");
div.setAttribute("data-callout-type", type);
while (bq.firstChild) {
div.appendChild(bq.firstChild);
}
bq.replaceWith(div);
}
return document.body.innerHTML;
}
/**
* Re-apply ATTACHED HTML comments (#293 canon) before the DOM/generateJSON
* stage drops them.
@@ -999,10 +1070,15 @@ export async function markdownToProseMirror(
): Promise<any> {
const withCallouts = await preprocessCallouts(markdownContent);
const html = await markedInstance.parse(withCallouts);
// Reconstruct callouts that survived as `[!type]`-prefixed blockquotes because
// they were nested (e.g. inside a list item) and the column-0-anchored line
// preprocessor could not see them. Runs first so the rest of the pipeline
// treats a nested callout identically to a top-level one.
const withNestedCallouts = bridgeNestedCallouts(html);
// Materialize comment directives (#293 #9 attached textAlign; #5 standalone
// subpages/pageBreak) while the comment nodes still exist, before generateJSON
// drops them.
const withAttrs = applyCommentDirectives(html);
const withAttrs = applyCommentDirectives(withNestedCallouts);
// #293 canon #2: assemble the doc-level footnote list from the `<sup
// data-fn-text>` markers (from `^[…]` or the raw-HTML column form) before
// generateJSON, so references + definitions materialize into the schema model.
@@ -0,0 +1,173 @@
import { describe, expect, it } from 'vitest';
import {
convertProseMirrorToMarkdown,
markdownToProseMirror,
} from 'docmost-client';
// T6-listnest regression: a callout nested inside a list item is exported
// INDENTED (` > [!info]`), which the column-0-anchored line preprocessor cannot
// see, so on re-import it degraded to a plain blockquote with the `[!type]`
// marker leaking into the body text (confirmed DB corruption). The post-marked
// `bridgeNestedCallouts` pass reconstructs the callout regardless of nesting.
/** Collect every node of a given type anywhere in the doc tree. */
function collect(node: any, type: string, acc: any[] = []): any[] {
if (!node || typeof node !== 'object') return acc;
if (node.type === type) acc.push(node);
if (Array.isArray(node.content)) {
for (const child of node.content) collect(child, type, acc);
}
return acc;
}
/** Concatenate all text in a doc tree. */
function allText(node: any): string {
if (!node || typeof node !== 'object') return '';
let s = node.type === 'text' ? node.text ?? '' : '';
if (Array.isArray(node.content)) {
for (const child of node.content) s += allText(child);
}
return s;
}
// The primary case (a callout WITH a lead paragraph inside the list item) is
// covered by the round-trip corpus fixture `13-callout-in-list.json`, which
// asserts both byte-stability and canonical equality (doc === doc2). This file
// adds the no-lead CONTROL, which the corpus cannot cover: with no lead
// paragraph the StarterKit listItem schema hoists the callout out of the item,
// so the doc is not canonically stable and cannot live in the corpus loop.
describe('T6-listnest: callout nested in a list item', () => {
it('no-lead control: a callout as the first list-item child keeps its type and does not leak the list marker', async () => {
// With no lead paragraph the StarterKit listItem schema (`paragraph block*`)
// cannot hold a callout-first child, so generateJSON hoists the callout out
// of the item — that is correct schema normalization. What MUST hold is that
// the callout type survives, the `[!danger]` marker does not leak into text,
// and the literal `-` list marker does not leak as text (both were the
// confirmed pre-fix corruption).
const doc = {
type: 'doc',
content: [
{
type: 'bulletList',
content: [
{
type: 'listItem',
content: [
{
type: 'callout',
attrs: { type: 'danger' },
content: [
{ type: 'paragraph', content: [{ type: 'text', text: 'DangerText' }] },
],
},
],
},
],
},
],
};
const md1 = convertProseMirrorToMarkdown(doc);
const doc2 = await markdownToProseMirror(md1);
// The callout survives with its type; it did NOT degrade to a blockquote.
const callouts = collect(doc2, 'callout');
expect(callouts).toHaveLength(1);
expect(callouts[0].attrs.type).toBe('danger');
expect(allText(callouts[0])).toBe('DangerText');
expect(collect(doc2, 'blockquote')).toHaveLength(0);
// No marker leak and no literal list-marker leak.
const text = allText(doc2);
expect(text).not.toContain('[!');
expect(text).not.toContain('-');
});
});
// False-positive controls for `bridgeNestedCallouts`: it must reconstruct a
// callout ONLY from a blockquote whose FIRST paragraph OPENS with a `[!type]`
// marker (mirroring the column-0 `preprocessCallouts` rule), and must NOT
// over-convert an ordinary nested blockquote. If the detection were loosened to
// fire on any blockquote merely CONTAINING `[!`, both tests below would fail.
describe('T6-listnest: bridgeNestedCallouts does NOT over-convert', () => {
it('a REAL blockquote nested in a list item round-trips STILL as a blockquote', async () => {
const doc = {
type: 'doc',
content: [
{
type: 'bulletList',
content: [
{
type: 'listItem',
content: [
{ type: 'paragraph', content: [{ type: 'text', text: 'item lead' }] },
{
type: 'blockquote',
content: [
{
type: 'paragraph',
content: [{ type: 'text', text: 'just a quote' }],
},
],
},
],
},
],
},
],
};
const md1 = convertProseMirrorToMarkdown(doc);
const doc2 = await markdownToProseMirror(md1);
// It stays a blockquote nested in the list item; NO callout is invented.
const listItems = collect(doc2, 'listItem');
expect(listItems).toHaveLength(1);
expect(listItems[0].content.map((c: any) => c.type)).toEqual([
'paragraph',
'blockquote',
]);
expect(collect(doc2, 'callout')).toHaveLength(0);
const blockquotes = collect(doc2, 'blockquote');
expect(blockquotes).toHaveLength(1);
expect(allText(blockquotes[0])).toBe('just a quote');
// Byte-stable on re-export (no callout div injected on the way in).
expect(convertProseMirrorToMarkdown(doc2)).toBe(md1);
});
it('a nested blockquote with `[!` MID-line (not at the opener) stays a blockquote', async () => {
const doc = {
type: 'doc',
content: [
{
type: 'bulletList',
content: [
{
type: 'listItem',
content: [
{ type: 'paragraph', content: [{ type: 'text', text: 'item lead' }] },
{
type: 'blockquote',
content: [
{
type: 'paragraph',
content: [{ type: 'text', text: 'see [!note] inline' }],
},
],
},
],
},
],
},
],
};
const md1 = convertProseMirrorToMarkdown(doc);
const doc2 = await markdownToProseMirror(md1);
// The `[!note]` is mid-paragraph, not an opener -> NO callout is created.
expect(collect(doc2, 'callout')).toHaveLength(0);
const blockquotes = collect(doc2, 'blockquote');
expect(blockquotes).toHaveLength(1);
// The literal `[!note]` text is preserved inside the surviving blockquote.
expect(allText(blockquotes[0])).toContain('[!note]');
expect(convertProseMirrorToMarkdown(doc2)).toBe(md1);
});
});
@@ -0,0 +1,31 @@
{
"type": "doc",
"content": [
{
"type": "callout",
"attrs": {
"type": "info"
},
"content": [
{
"type": "paragraph",
"content": [
{
"type": "text",
"text": "line1"
}
]
},
{
"type": "paragraph",
"content": [
{
"type": "text",
"text": "line2"
}
]
}
]
}
]
}
@@ -0,0 +1,50 @@
{
"type": "doc",
"content": [
{
"type": "bulletList",
"content": [
{
"type": "listItem",
"content": [
{
"type": "paragraph",
"content": [
{
"type": "text",
"text": "item lead"
}
]
},
{
"type": "callout",
"attrs": {
"type": "info"
},
"content": [
{
"type": "paragraph",
"content": [
{
"type": "text",
"text": "ca"
}
]
},
{
"type": "paragraph",
"content": [
{
"type": "text",
"text": "cb"
}
]
}
]
}
]
}
]
}
]
}
@@ -581,16 +581,18 @@ describe('convertProseMirrorToMarkdown', () => {
}),
);
// NOTE(review): the spec predicted ':::warning\nline1\n\nline2\n:::' (a
// The converter joins the callout's rendered children with a single '\n'
// and emits an Obsidian-native callout: a `> [!type]` opener plus one
// `>`-prefixed body line per content line. We pin the lowercasing
// (WARNING -> warning) and the multi-child join.
expect(out).toBe('> [!warning]\n> line1\n> line2');
// The converter emits an Obsidian-native callout: a `> [!type]` opener plus
// one `>`-prefixed body line per content line, with block-level children
// separated by a blank `>` line (mirrors the blockquote serializer) so a
// multi-paragraph callout does not collapse into one paragraph on re-import.
// We pin the lowercasing (WARNING -> warning) and the multi-child join.
expect(out).toBe('> [!warning]\n> line1\n>\n> line2');
// The type is lowercased (an uppercase `[!WARNING]` would not re-import).
expect(out.startsWith('> [!warning]\n')).toBe(true);
expect(out).not.toContain('[!WARNING]');
// Both paragraph children are present, each blockquote-prefixed.
expect(out).toContain('> line1\n> line2');
// Both paragraph children are present, each blockquote-prefixed and
// separated by a blank `>` line.
expect(out).toContain('> line1\n>\n> line2');
});
// Spec 4 — blockquote per-line prefixer over a multi-line nested callout.
@@ -609,11 +611,12 @@ describe('convertProseMirrorToMarkdown', () => {
);
// NOTE(review): the spec predicted '> :::info\n> a\n>\n> b\n> :::',
// assuming the nested callout body contains a blank line between 'a' and
// The nested callout renders as an Obsidian callout '> [!info]\n> a\n> b'
// (single-'\n' join, no blank line). The outer blockquote prefixer then
// prefixes each of those lines with '> ' again, yielding a doubly-nested
// blockquote — the realistic per-line-prefix loop over a multi-line child.
expect(out).toBe('> > [!info]\n> > a\n> > b');
// The nested callout renders as an Obsidian callout '> [!info]\n> a\n>\n> b'
// (block-level children separated by a blank `>` line). The outer blockquote
// prefixer then prefixes each of those lines with '> ' again, yielding a
// doubly-nested blockquote — the realistic per-line-prefix loop over a
// multi-block child.
expect(out).toBe('> > [!info]\n> > a\n> >\n> > b');
// Every produced line carries the '> ' prefix (no line escapes to col 0).
for (const line of out.split('\n')) {
expect(line.startsWith('>')).toBe(true);