refactor(transclusion): unify the ProseMirror collectors into collectNodes (#94)

The three collect*FromPmJson collectors shared the same recursion (and the #55
depth cap) but were copy-pasted, so a future edit could diverge them. Extract a
generic collectNodes(doc, {type, map, key, lastWins, skipChildrenOfType}) and
reimplement all three on it, byte-output-identical (transclusions last-wins;
references/embeds first-wins + transclusionSource skip). Documents (not removes)
the write-only page_template_references graph and the near-duplicate client
lookup-context as tracked follow-ups, per the issue's guidance.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
claude code agent 227
2026-06-21 04:04:09 +03:00
parent 3f46496192
commit 31fcb764d7
3 changed files with 136 additions and 91 deletions

View File

@@ -25,6 +25,15 @@ const PageEmbedLookupContext = createContext<ContextValue | null>(null);
* transclusion lookup context but keys purely on `sourcePageId`. On public
* shares there is no lookup in MVP, so the context simply isn't mounted (the
* node view renders a placeholder when the context is absent).
*
* NOTE (intentional near-duplicate of `transclusion-lookup-context.tsx`): this
* provider duplicates that file's batching / de-dup / cache machinery; only the
* lookup key (sourcePageId here vs sourcePageId+transclusionId there) and the
* API call differ. Unifying them now would mean a generic, parameterised lookup
* provider — a larger client refactor that isn't worth it for just two
* consumers. Per Gitea #94, extract a shared generic provider when a THIRD
* lookup consumer appears; until then keep the two in sync by hand. (Tracked,
* deliberately deferred — not forgotten.)
*/
export function PageEmbedLookupProvider({
children,

View File

@@ -273,6 +273,18 @@ export class TransclusionService {
* accumulate cross-workspace edges, but rows are still NOT per-viewer
* permission-filtered. EVERY consumer of these rows MUST permission-filter at
* read time (as `lookupTemplate` does via `filterViewerAccessiblePageIds`).
*
* NOTE (write-only graph — intentional, not dead): as of now the
* `page_template_references` table is WRITE-ONLY in production. It is populated
* by three paths (this diff-sync, `insertTemplateReferencesForPages` for new
* pages, and the collab persistence flush) but has NO production reader: the
* only read of the table is `findByReferencePageId` below, used purely to
* compute this sync's insert/delete diff — there is no reverse-navigation
* consumer yet (issue #34's dead `findReferencePageIdsBySource` reader was
* already removed). The graph is retained deliberately for an upcoming
* "used in N pages" reverse-navigation consumer; keep writing it so that
* feature has correct history when it lands. Do not remove the write graph or
* its migration just because nothing reads it today. (See Gitea #94.)
*/
async syncPageTemplateReferences(
referencePageId: string,

View File

@@ -20,6 +20,79 @@ export type PageEmbedSnapshot = {
sourcePageId: string;
};
/**
* Generic, internal "collect every node of one PM type from a doc" walker that
* the three public `collect*FromPmJson` collectors are built on. They all share
* the exact same recursion (block-container descent + the #55 depth cap), and
* differed only in (target type, how a matched node maps to an output snapshot,
* how matches are deduped, and whether the walk descends into a
* `transclusionSource`). Centralising the recursion here keeps that shared logic
* — especially the depth guard — in one place so the collectors can't drift.
*
* Behaviour knobs (each collector wires these to reproduce its EXACT prior output):
* - `type`: only nodes whose `node.type` equals this are passed to `map`.
* - `map`: turns a matched node into a snapshot, or returns `undefined` to skip
* it (e.g. a transclusion with no id, or a reference missing attrs).
* - `key`: dedup key for a produced snapshot. Snapshots sharing a key collapse
* to a single entry; `lastWins` decides which one survives.
* - `lastWins`: when true (transclusions), a later duplicate overwrites the
* earlier one (Map semantics); when false (references, page embeds), the
* first occurrence wins and later duplicates are ignored. Either way the
* surviving entries keep first-seen insertion order.
* - `skipChildrenOfType`: a node type whose subtree the walk must NOT enter.
* References/embeds pass `transclusionSource` here (the schema forbids them
* inside a source, so a malformed inbound doc can't smuggle one in). The
* transclusion collector leaves this undefined because the matched type IS
* `transclusionSource` and matched nodes already short-circuit recursion.
*
* A matched node never recurses into its own children: every target type here is
* either an atom (reference/pageEmbed) or a boundary we deliberately don't nest
* into (transclusionSource), exactly as the original collectors behaved.
*/
function collectNodes<T>(
doc: unknown,
opts: {
type: string;
map: (node: any) => T | undefined;
key: (snapshot: T) => string;
lastWins?: boolean;
skipChildrenOfType?: string;
},
): T[] {
if (!doc || typeof doc !== 'object') return [];
const { type, map, key, lastWins = false, skipChildrenOfType } = opts;
const byKey = new Map<string, T>();
const visit = (node: any, depth: number): void => {
if (!node || typeof node !== 'object') return;
// Depth guard against a pathological/cyclic non-JSON input (see
// MAX_PM_WALK_DEPTH); unreachable on real docs.
if (depth > MAX_PM_WALK_DEPTH) return;
if (node.type === type) {
const snapshot = map(node);
if (snapshot !== undefined) {
const k = key(snapshot);
if (lastWins || !byKey.has(k)) byKey.set(k, snapshot);
}
return; // matched node: atom or boundary — do not recurse into children
}
// Don't descend into an isolated subtree (schema-enforced boundary).
if (skipChildrenOfType !== undefined && node.type === skipChildrenOfType) {
return;
}
if (Array.isArray(node.content)) {
for (const child of node.content) visit(child, depth + 1);
}
};
visit(doc, 0);
return Array.from(byKey.values());
}
/**
* Walks a ProseMirror JSON document and returns one snapshot per top-level
* `transclusion` node. Does not recurse into transclusions (schema disallows
@@ -30,34 +103,23 @@ export type PageEmbedSnapshot = {
export function collectTransclusionsFromPmJson(
doc: unknown,
): TransclusionNodeSnapshot[] {
if (!doc || typeof doc !== 'object') return [];
const byId = new Map<string, TransclusionNodeSnapshot>();
const visit = (node: any, depth: number): void => {
if (!node || typeof node !== 'object') return;
// Depth guard against a pathological/cyclic non-JSON input (see
// MAX_PM_WALK_DEPTH); unreachable on real docs.
if (depth > MAX_PM_WALK_DEPTH) return;
if (node.type === TRANSCLUSION_TYPE) {
// last-wins on duplicate ids (Map.set overwrites) — matches prior behaviour.
return collectNodes<TransclusionNodeSnapshot>(doc, {
type: TRANSCLUSION_TYPE,
map: (node) => {
const id = node.attrs?.id;
if (typeof id === 'string' && id.length > 0) {
byId.set(id, {
transclusionId: id,
content: { type: 'doc', content: node.content ?? [] },
});
}
return; // do not recurse into transclusion children
}
if (Array.isArray(node.content)) {
for (const child of node.content) visit(child, depth + 1);
}
};
visit(doc, 0);
return Array.from(byId.values());
if (typeof id !== 'string' || id.length === 0) return undefined;
return {
transclusionId: id,
content: { type: 'doc', content: node.content ?? [] },
};
},
key: (snapshot) => snapshot.transclusionId,
lastWins: true,
// No skipChildrenOfType: TRANSCLUSION_TYPE is itself the matched type, and a
// matched node already short-circuits recursion (the schema also forbids a
// transclusion nested inside another).
});
}
/**
@@ -70,46 +132,26 @@ export function collectTransclusionsFromPmJson(
export function collectReferencesFromPmJson(
doc: unknown,
): TransclusionReferenceSnapshot[] {
if (!doc || typeof doc !== 'object') return [];
const seen = new Set<string>();
const out: TransclusionReferenceSnapshot[] = [];
const visit = (node: any, depth: number): void => {
if (!node || typeof node !== 'object') return;
// Depth guard against a pathological/cyclic non-JSON input (see
// MAX_PM_WALK_DEPTH); unreachable on real docs.
if (depth > MAX_PM_WALK_DEPTH) return;
if (node.type === REFERENCE_TYPE) {
// first-wins dedup on (sourcePageId, transclusionId); skip recursing into a
// transclusionSource (schema forbids references inside one).
return collectNodes<TransclusionReferenceSnapshot>(doc, {
type: REFERENCE_TYPE,
map: (node) => {
const sourcePageId = node.attrs?.sourcePageId;
const transclusionId = node.attrs?.transclusionId;
if (
typeof sourcePageId === 'string' &&
sourcePageId.length > 0 &&
typeof transclusionId === 'string' &&
transclusionId.length > 0
typeof sourcePageId !== 'string' ||
sourcePageId.length === 0 ||
typeof transclusionId !== 'string' ||
transclusionId.length === 0
) {
const key = `${sourcePageId}::${transclusionId}`;
if (!seen.has(key)) {
seen.add(key);
out.push({ sourcePageId, transclusionId });
}
return undefined;
}
return; // atom node - no children
}
// References cannot live inside a source (schema-enforced); skip recursing
// so a malformed inbound doc can't sneak in a nested reference here.
if (node.type === TRANSCLUSION_TYPE) return;
if (Array.isArray(node.content)) {
for (const child of node.content) visit(child, depth + 1);
}
};
visit(doc, 0);
return out;
return { sourcePageId, transclusionId };
},
key: (snapshot) => `${snapshot.sourcePageId}::${snapshot.transclusionId}`,
skipChildrenOfType: TRANSCLUSION_TYPE,
});
}
/**
@@ -182,35 +224,17 @@ export function remapPageEmbedSourceIds<T>(
export function collectPageEmbedsFromPmJson(
doc: unknown,
): PageEmbedSnapshot[] {
if (!doc || typeof doc !== 'object') return [];
const seen = new Set<string>();
const out: PageEmbedSnapshot[] = [];
const visit = (node: any, depth: number): void => {
if (!node || typeof node !== 'object') return;
// Stop before the stack can overflow on a pathological/cyclic non-JSON
// input; far above any real doc nesting so genuine docs are unaffected.
if (depth > MAX_PM_WALK_DEPTH) return;
if (node.type === PAGE_EMBED_TYPE) {
// first-wins dedup on sourcePageId; skip recursing into a transclusionSource.
return collectNodes<PageEmbedSnapshot>(doc, {
type: PAGE_EMBED_TYPE,
map: (node) => {
const sourcePageId = node.attrs?.sourcePageId;
if (typeof sourcePageId === 'string' && sourcePageId.length > 0) {
if (!seen.has(sourcePageId)) {
seen.add(sourcePageId);
out.push({ sourcePageId });
}
if (typeof sourcePageId !== 'string' || sourcePageId.length === 0) {
return undefined;
}
return; // atom node - no children
}
if (node.type === TRANSCLUSION_TYPE) return;
if (Array.isArray(node.content)) {
for (const child of node.content) visit(child, depth + 1);
}
};
visit(doc, 0);
return out;
return { sourcePageId };
},
key: (snapshot) => snapshot.sourcePageId,
skipChildrenOfType: TRANSCLUSION_TYPE,
});
}