From b7ea8c850ebec2731716c9c6bb6726a90b101992 Mon Sep 17 00:00:00 2001 From: claude code agent 227 Date: Sat, 20 Jun 2026 23:02:01 +0300 Subject: [PATCH] fix(html-embed): preserve admin's existing embed on a non-admin co-editor's store (#29) The collab persist strip keyed to the storing connection's user, so when a non-admin co-editor stored, it removed an admin's legitimately-authored embed too (data loss). Now: toggle OFF still strips all (feature disabled); toggle ON + non-admin storer strips only NEWLY-introduced embeds and preserves those already present in the persisted content (admin-vetted), via new helpers collectHtmlEmbedSources + stripDisallowedHtmlEmbedNodes (identity = attrs.source, already-vetted HTML). The ydoc reflect is now guarded by a deep-equal check so an unrelated non-admin edit that touches no new embed doesn't churn the doc. A non-admin still cannot add a new embed. Documents the allow-list TOCTOU (best-effort snapshot read outside the lock; converges on next store). Co-Authored-By: Claude Opus 4.8 --- .../persistence.extension.html-embed.spec.ts | 67 ++++++- .../extensions/persistence.extension.ts | 85 ++++++--- .../helpers/prosemirror/html-embed.spec.ts | 165 ++++++++++++++++++ .../helpers/prosemirror/html-embed.util.ts | 101 +++++++++++ 4 files changed, 394 insertions(+), 24 deletions(-) diff --git a/apps/server/src/collaboration/extensions/persistence.extension.html-embed.spec.ts b/apps/server/src/collaboration/extensions/persistence.extension.html-embed.spec.ts index a78173a6..55f06ea9 100644 --- a/apps/server/src/collaboration/extensions/persistence.extension.html-embed.spec.ts +++ b/apps/server/src/collaboration/extensions/persistence.extension.html-embed.spec.ts @@ -3,6 +3,7 @@ import { TiptapTransformer } from '@hocuspocus/transformer'; import { PersistenceExtension } from './persistence.extension'; import { tiptapExtensions } from '../collaboration.util'; import { + collectHtmlEmbedSources, hasHtmlEmbedNode, HTML_EMBED_NODE_NAME, } from '../../common/helpers/prosemirror/html-embed.util'; @@ -121,7 +122,7 @@ function nodeTypeCounts(json: any): Record { * onStoreDocument to reach the strip + persist branch, and capture the content * that would be written to the page row. */ -function buildExtension(featureEnabled = true) { +function buildExtension(featureEnabled = true, priorContent?: any) { const captured: { content?: any } = {}; const existingPage = { @@ -131,7 +132,10 @@ function buildExtension(featureEnabled = true) { workspaceId: 'ws-1', creatorId: 'creator-1', contributorIds: [], - content: { type: 'doc', content: [] }, // differs from new content -> persist runs + // The currently-persisted content. Defaults to an empty doc (differs from + // new content -> persist runs); a test may pass a prior admin embed here to + // exercise the preserve-admin-embed branch. + content: priorContent ?? { type: 'doc', content: [] }, createdAt: new Date(), lastUpdatedSource: 'user', }; @@ -186,8 +190,9 @@ async function runStore( role: string | null | undefined, doc: Y.Doc, featureEnabled = true, + priorContent?: any, ) { - const { ext, captured } = buildExtension(featureEnabled); + const { ext, captured } = buildExtension(featureEnabled, priorContent); // hocuspocus augments the Y.Doc with broadcastStateless; a bare Y.Doc has // none, so stub it (the post-persist broadcast is not under test here). (doc as any).broadcastStateless = () => undefined; @@ -268,6 +273,62 @@ describe('PersistenceExtension.onStoreDocument htmlEmbed admin gate (real code)' ).toBe(false); }); + it('toggle ON + non-admin store: PRESERVES an admin embed already in the persisted content through an unrelated edit', async () => { + // Prior persisted content already holds an admin-authored embed. + const ADMIN_SOURCE = ''; + const prior = { + type: 'doc', + content: [ + { type: 'paragraph', content: [{ type: 'text', text: 'intro' }] }, + { type: HTML_EMBED_NODE_NAME, attrs: { source: ADMIN_SOURCE } }, + ], + }; + // A non-admin makes an UNRELATED edit (tweaks the paragraph) but the embed + // is still present in the merged doc. + const edited = { + type: 'doc', + content: [ + { type: 'paragraph', content: [{ type: 'text', text: 'intro edited' }] }, + { type: HTML_EMBED_NODE_NAME, attrs: { source: ADMIN_SOURCE } }, + ], + }; + + const captured = await runStore('member', buildYdoc(edited), true, prior); + expect(captured.content).toBeDefined(); + // The admin's pre-existing embed survives the non-admin store. + expect(collectHtmlEmbedSources(captured.content)).toEqual( + new Set([ADMIN_SOURCE]), + ); + }); + + it('toggle ON + non-admin store: strips a NEWLY-added embed while keeping the prior admin one', async () => { + const ADMIN_SOURCE = ''; + const prior = { + type: 'doc', + content: [ + { type: 'paragraph', content: [{ type: 'text', text: 'intro' }] }, + { type: HTML_EMBED_NODE_NAME, attrs: { source: ADMIN_SOURCE } }, + ], + }; + // Non-admin keeps the admin embed, makes an unrelated paragraph edit (so the + // store is not a no-op and is persisted), and ALSO adds a brand-new embed. + const edited = { + type: 'doc', + content: [ + { type: 'paragraph', content: [{ type: 'text', text: 'intro edited' }] }, + { type: HTML_EMBED_NODE_NAME, attrs: { source: ADMIN_SOURCE } }, + { type: HTML_EMBED_NODE_NAME, attrs: { source: '' } }, + ], + }; + + const captured = await runStore('member', buildYdoc(edited), true, prior); + expect(captured.content).toBeDefined(); + // Only the admin-vetted source remains; the newly-introduced one is stripped. + expect(collectHtmlEmbedSources(captured.content)).toEqual( + new Set([ADMIN_SOURCE]), + ); + }); + it('empty-fragment ydoc (no content) does not throw and persists no embed', async () => { const emptyDoc = buildYdoc({ type: 'doc', diff --git a/apps/server/src/collaboration/extensions/persistence.extension.ts b/apps/server/src/collaboration/extensions/persistence.extension.ts index 23c94631..7c2ff963 100644 --- a/apps/server/src/collaboration/extensions/persistence.extension.ts +++ b/apps/server/src/collaboration/extensions/persistence.extension.ts @@ -40,9 +40,11 @@ import { } from '../constants'; import { TransclusionService } from '../../core/page/transclusion/transclusion.service'; import { + collectHtmlEmbedSources, hasHtmlEmbedNode, htmlEmbedAllowed, isHtmlEmbedFeatureEnabled, + stripDisallowedHtmlEmbedNodes, stripHtmlEmbedNodes, } from '../../common/helpers/prosemirror/html-embed.util'; import { WorkspaceRepo } from '@docmost/db/repos/workspace/workspace.repo'; @@ -130,12 +132,31 @@ export class PersistenceExtension implements Extension { // every htmlEmbed node before it is written to the page row AND before the // ydoc state is re-encoded, so the node cannot be reintroduced by a // non-admin via the collab socket. - // NOTE (residual risk): the gate is keyed to the storing connection's user. - // If an admin already authored an htmlEmbed and a non-admin's later store - // does not touch it, this strip would remove the admin's embed on that - // non-admin store. This is intentionally conservative (fail closed): the - // admin re-adds/keeps the node on their own next edit. A future refinement - // could diff against the previously persisted admin-authored embeds. + // NOTE (defense-in-depth refinement, Gitea #29): the gate is keyed to the + // storing connection's user, but it no longer blindly strips EVERY embed on + // a non-admin store. We distinguish two cases inside the !allowed branch: + // - Feature toggle OFF => strip ALL embeds (the feature is disabled for + // everyone; existing embeds get cleaned up on the next save). + // - Toggle ON but the storer is a NON-admin => strip only NEWLY-introduced + // embeds and PRESERVE embeds already present in the currently-persisted + // page content (admin-authored, already vetted). So a non-admin still + // cannot ADD an embed, but an unrelated edit (e.g. a paragraph tweak) no + // longer destroys an admin's existing embed (the prior data-loss bug). + // The pre-existing-embed identity is the raw `attrs.source` (see + // collectHtmlEmbedSources). A non-admin who copies an existing admin embed's + // exact source elsewhere passes — acceptable, that HTML is already vetted. + // + // ACCEPTED RESIDUAL RISK (toggle-ON allow-list TOCTOU): the allow-list is a + // best-effort snapshot read OUTSIDE the locked transaction (the prior content + // is pre-read above, but inside executeTx the row is re-read withLock without + // recomputing the allow-list). A concurrent admin store that changes the + // persisted embeds between the pre-read and this write can make the preserve + // decision use a slightly stale snapshot — worst case one embed transiently + // kept or dropped; it converges on the next store, with no auth bypass or + // broader data loss. The race is accepted because it only affects concurrent + // authenticated editors on the (rare) toggle-ON non-admin path, converges on + // the next store, and the persisted row plus every share/readonly read path + // remain protected by the strip. // // ACCEPTED RESIDUAL RISK (pre-persist broadcast window): this strip runs in // the debounced onStoreDocument, but hocuspocus broadcasts each inbound Yjs @@ -157,22 +178,44 @@ export class PersistenceExtension implements Extension { ); if (!htmlEmbedAllowed(htmlEmbedEnabled, context?.user?.role)) { if (hasHtmlEmbedNode(tiptapJson)) { - this.logger.warn( - `Stripping htmlEmbed node(s) from collab store by user ${context?.user?.id} on ${documentName}`, - ); - tiptapJson = stripHtmlEmbedNodes(tiptapJson); - // Reflect the stripped content back into the shared ydoc so the node is - // removed for all connected clients, not just the persisted row. - const fragment = document.getXmlFragment('default'); - if (fragment.length > 0) { - fragment.delete(0, fragment.length); + let strippedJson: typeof tiptapJson; + if (htmlEmbedEnabled === false) { + // Toggle OFF: feature disabled for everyone -> strip ALL embeds. + strippedJson = stripHtmlEmbedNodes(tiptapJson); + } else { + // Toggle ON, non-admin storer: preserve embeds already present in the + // currently-persisted (admin-vetted) page content; strip only the + // newly-introduced ones. Pre-read the prior content — a small extra + // query only on this rare non-admin + toggle-ON path. + const prior = await this.pageRepo.findById(pageId, { + includeContent: true, + }); + const allowed = collectHtmlEmbedSources(prior?.content); + strippedJson = stripDisallowedHtmlEmbedNodes(tiptapJson, allowed); + } + + // Only mutate the ydoc + log when the strip actually removed something; + // an unnecessary ydoc rewrite would churn the doc for all clients. With + // the toggle-ON branch a non-admin store that only touches admin-vetted + // embeds leaves the content unchanged here. + if (!isDeepStrictEqual(strippedJson, tiptapJson)) { + this.logger.warn( + `Stripping htmlEmbed node(s) from collab store by user ${context?.user?.id} on ${documentName}`, + ); + tiptapJson = strippedJson; + // Reflect the stripped content back into the shared ydoc so the node + // is removed for all connected clients, not just the persisted row. + const fragment = document.getXmlFragment('default'); + if (fragment.length > 0) { + fragment.delete(0, fragment.length); + } + const cleanDoc = TiptapTransformer.toYdoc( + tiptapJson, + 'default', + tiptapExtensions, + ); + Y.applyUpdate(document, Y.encodeStateAsUpdate(cleanDoc)); } - const cleanDoc = TiptapTransformer.toYdoc( - tiptapJson, - 'default', - tiptapExtensions, - ); - Y.applyUpdate(document, Y.encodeStateAsUpdate(cleanDoc)); } } diff --git a/apps/server/src/common/helpers/prosemirror/html-embed.spec.ts b/apps/server/src/common/helpers/prosemirror/html-embed.spec.ts index b48e9e73..4351673a 100644 --- a/apps/server/src/common/helpers/prosemirror/html-embed.spec.ts +++ b/apps/server/src/common/helpers/prosemirror/html-embed.spec.ts @@ -1,8 +1,10 @@ import { canAuthorHtmlEmbed, + collectHtmlEmbedSources, hasHtmlEmbedNode, htmlEmbedAllowed, isHtmlEmbedFeatureEnabled, + stripDisallowedHtmlEmbedNodes, stripHtmlEmbedNodes, } from './html-embed.util'; import { htmlToJson, jsonToHtml } from '../../../collaboration/collaboration.util'; @@ -105,6 +107,169 @@ describe('stripHtmlEmbedNodes', () => { }); }); +describe('collectHtmlEmbedSources', () => { + it('collects the source of every htmlEmbed node, including nested ones', () => { + const doc = { + type: 'doc', + content: [ + { type: 'htmlEmbed', attrs: { source: 'top' } }, + { + type: 'columns', + content: [ + { + type: 'column', + content: [ + { type: 'htmlEmbed', attrs: { source: 'nested' } }, + { type: 'paragraph', content: [{ type: 'text', text: 'x' }] }, + ], + }, + ], + }, + ], + }; + const sources = collectHtmlEmbedSources(doc); + expect(sources).toEqual(new Set(['top', 'nested'])); + }); + + it('returns an empty set for a doc with no embeds', () => { + const doc = { + type: 'doc', + content: [{ type: 'paragraph', content: [{ type: 'text', text: 'hi' }] }], + }; + expect(collectHtmlEmbedSources(doc).size).toBe(0); + }); + + it('gracefully skips embeds with absent attrs or non-string source', () => { + const doc = { + type: 'doc', + content: [ + { type: 'htmlEmbed' }, // no attrs + { type: 'htmlEmbed', attrs: {} }, // no source + { type: 'htmlEmbed', attrs: { source: 42 } }, // non-string + { type: 'htmlEmbed', attrs: { source: '' } }, + ], + }; + expect(collectHtmlEmbedSources(doc)).toEqual(new Set([''])); + }); + + it('returns an empty set for non-object input', () => { + expect(collectHtmlEmbedSources(null).size).toBe(0); + expect(collectHtmlEmbedSources(undefined).size).toBe(0); + expect(collectHtmlEmbedSources('x' as any).size).toBe(0); + }); +}); + +describe('stripDisallowedHtmlEmbedNodes', () => { + it('keeps an embed whose source is allowed and removes the rest', () => { + const doc = { + type: 'doc', + content: [ + { type: 'htmlEmbed', attrs: { source: '' } }, + { type: 'htmlEmbed', attrs: { source: '' } }, + { type: 'paragraph', content: [{ type: 'text', text: 'keep' }] }, + ], + }; + const result = stripDisallowedHtmlEmbedNodes(doc, new Set([''])); + expect(collectHtmlEmbedSources(result)).toEqual(new Set([''])); + // The allowed embed and the paragraph survive; the new embed is gone. + expect(result.content).toHaveLength(2); + expect(result.content[0].attrs.source).toBe(''); + expect(result.content[1].type).toBe('paragraph'); + }); + + it('keeps BOTH embeds when two nodes share the same allowed source', () => { + // Source-identity semantics: identity is the raw `attrs.source`, so a + // non-admin who duplicates an existing admin-vetted source keeps both copies. + // This is intended — the raw HTML is already vetted, so a duplicate is safe. + const doc = { + type: 'doc', + content: [ + { type: 'htmlEmbed', attrs: { source: '' } }, + { type: 'paragraph', content: [{ type: 'text', text: 'mid' }] }, + { type: 'htmlEmbed', attrs: { source: '' } }, + ], + }; + const result = stripDisallowedHtmlEmbedNodes(doc, new Set([''])); + expect(hasHtmlEmbedNode(result)).toBe(true); + const embeds = result.content.filter( + (n: any) => n.type === 'htmlEmbed', + ); + expect(embeds).toHaveLength(2); + expect(embeds.every((n: any) => n.attrs.source === '')).toBe(true); + }); + + it('removes a newly-introduced embed when nothing is allowed', () => { + const doc = { + type: 'doc', + content: [{ type: 'htmlEmbed', attrs: { source: '' } }], + }; + const result = stripDisallowedHtmlEmbedNodes(doc, new Set()); + expect(hasHtmlEmbedNode(result)).toBe(false); + }); + + it('filters nested embeds by the allow-list (e.g. inside columns)', () => { + const doc = { + type: 'doc', + content: [ + { + type: 'columns', + content: [ + { + type: 'column', + content: [ + { type: 'htmlEmbed', attrs: { source: '' } }, + { type: 'htmlEmbed', attrs: { source: '' } }, + ], + }, + ], + }, + ], + }; + const result = stripDisallowedHtmlEmbedNodes(doc, new Set([''])); + const col = findFirstChild(result, 'column'); + expect(col.content).toHaveLength(1); + expect(col.content[0].attrs.source).toBe(''); + }); + + it('treats an embed with absent/non-string source as not allowed (stripped)', () => { + const doc = { + type: 'doc', + content: [ + { type: 'htmlEmbed' }, + { type: 'htmlEmbed', attrs: {} }, + ], + }; + const result = stripDisallowedHtmlEmbedNodes(doc, new Set([''])); + expect(hasHtmlEmbedNode(result)).toBe(false); + }); + + it('does not mutate the input document', () => { + const doc = { + type: 'doc', + content: [{ type: 'htmlEmbed', attrs: { source: '' } }], + }; + stripDisallowedHtmlEmbedNodes(doc, new Set()); + expect(doc.content).toHaveLength(1); + expect(doc.content[0].type).toBe('htmlEmbed'); + }); + + it('neutralizes a root node that is itself a disallowed htmlEmbed', () => { + const root = { type: 'htmlEmbed', attrs: { source: '' } }; + const result = stripDisallowedHtmlEmbedNodes(root, new Set()); + expect(hasHtmlEmbedNode(result)).toBe(false); + }); + + it('keeps a root node that is an allowed htmlEmbed (defensive branch)', () => { + const root = { type: 'htmlEmbed', attrs: { source: '' } }; + const result = stripDisallowedHtmlEmbedNodes(root, new Set([''])); + expect(collectHtmlEmbedSources(result)).toEqual(new Set([''])); + }); + + it('returns non-object input unchanged', () => { + expect(stripDisallowedHtmlEmbedNodes(null as any, new Set())).toBeNull(); + }); +}); + describe('canAuthorHtmlEmbed', () => { it('allows owner and admin', () => { expect(canAuthorHtmlEmbed('owner')).toBe(true); diff --git a/apps/server/src/common/helpers/prosemirror/html-embed.util.ts b/apps/server/src/common/helpers/prosemirror/html-embed.util.ts index aa5d579d..146ea9d3 100644 --- a/apps/server/src/common/helpers/prosemirror/html-embed.util.ts +++ b/apps/server/src/common/helpers/prosemirror/html-embed.util.ts @@ -48,6 +48,107 @@ export function stripHtmlEmbedNodes(pmJson: T): T { return { ...node } as unknown as T; } +/** + * Walk the document and collect a stable identity for every `htmlEmbed` node. + * + * The identity is the node's `attrs.source` string — the raw HTML the embed + * renders. Two embeds that render the exact same HTML are treated as the same + * identity. Used by the collab persist path to know which embeds are ALREADY + * present in the currently-persisted (admin-vetted) page content, so a later + * non-admin store can strip only NEWLY-introduced embeds while preserving the + * pre-existing admin-authored ones. + * + * Absent attrs or a non-string/absent `source` are skipped gracefully (such a + * node contributes no identity to the set). + */ +export function collectHtmlEmbedSources(pmJson: unknown): Set { + const sources = new Set(); + + const walk = (node: unknown): void => { + if (!node || typeof node !== 'object') { + return; + } + const n = node as JSONContent; + if (n.type === HTML_EMBED_NODE_NAME) { + const source = (n.attrs as Record | undefined)?.source; + if (typeof source === 'string') { + sources.add(source); + } + } + if (Array.isArray(n.content)) { + for (const child of n.content) { + walk(child); + } + } + }; + + walk(pmJson); + return sources; +} + +/** + * Like {@link stripHtmlEmbedNodes}, but KEEP any `htmlEmbed` node whose + * `attrs.source` is in `allowedSources`; remove the rest. + * + * Used on the collab persist path when the feature toggle is ON but the storing + * user is a NON-admin: `allowedSources` is the set of embed sources already + * present in the currently-persisted page content (admin-authored, already + * vetted). A non-admin therefore cannot ADD a new embed, but their unrelated + * edit also cannot destroy an admin's existing one. + * + * NOTE: identity is the raw source string, so a non-admin who COPIES an existing + * admin embed's exact source into a NEW location passes this check. That is + * acceptable — the source is already admin-vetted content present in the doc; no + * new untrusted HTML is introduced. + * + * Returns a NEW document; the input is not mutated. Same defensive root-type + * check pattern as {@link stripHtmlEmbedNodes}. + */ +export function stripDisallowedHtmlEmbedNodes( + pmJson: T, + allowedSources: Set, +): T { + if (!pmJson || typeof pmJson !== 'object') { + return pmJson; + } + + const node = pmJson as unknown as JSONContent; + + // Defensive root-type check (mirrors stripHtmlEmbedNodes): if the ROOT node is + // itself an htmlEmbed and its source is NOT allowed, the children-filtering + // below could never drop it, so neutralize it here. Unreachable in normal use + // (the PM document root is always a `doc`). + if (node.type === HTML_EMBED_NODE_NAME) { + const source = (node.attrs as Record | undefined)?.source; + if (typeof source === 'string' && allowedSources.has(source)) { + return { ...node } as unknown as T; + } + return { type: 'doc', content: [] } as unknown as T; + } + + if (Array.isArray(node.content)) { + const filtered: JSONContent[] = []; + for (const child of node.content) { + // Drop a disallowed htmlEmbed child (newly introduced); keep an allowed + // one (already present in the persisted, admin-vetted content). + if (child && child.type === HTML_EMBED_NODE_NAME) { + const source = (child.attrs as Record | undefined) + ?.source; + if (typeof source === 'string' && allowedSources.has(source)) { + filtered.push({ ...child }); + } + continue; + } + // Recurse so nested htmlEmbed nodes (e.g. inside columns/callouts) are + // also filtered by the same allow-list. + filtered.push(stripDisallowedHtmlEmbedNodes(child, allowedSources)); + } + return { ...node, content: filtered } as unknown as T; + } + + return { ...node } as unknown as T; +} + /** * Returns true if the document contains at least one `htmlEmbed` node anywhere * in its tree. Useful to decide whether a strip pass actually changed anything