fix(html-embed): preserve admin's existing embed on a non-admin co-editor's store (#29)

The collab persist strip keyed to the storing connection's user, so when a
non-admin co-editor stored, it removed an admin's legitimately-authored embed
too (data loss). Now: toggle OFF still strips all (feature disabled); toggle ON
+ non-admin storer strips only NEWLY-introduced embeds and preserves those
already present in the persisted content (admin-vetted), via new helpers
collectHtmlEmbedSources + stripDisallowedHtmlEmbedNodes (identity = attrs.source,
already-vetted HTML). The ydoc reflect is now guarded by a deep-equal check so
an unrelated non-admin edit that touches no new embed doesn't churn the doc.
A non-admin still cannot add a new embed. Documents the allow-list TOCTOU
(best-effort snapshot read outside the lock; converges on next store).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
claude code agent 227
2026-06-20 23:02:01 +03:00
parent 8191c37daa
commit b7ea8c850e
4 changed files with 394 additions and 24 deletions

View File

@@ -3,6 +3,7 @@ import { TiptapTransformer } from '@hocuspocus/transformer';
import { PersistenceExtension } from './persistence.extension';
import { tiptapExtensions } from '../collaboration.util';
import {
collectHtmlEmbedSources,
hasHtmlEmbedNode,
HTML_EMBED_NODE_NAME,
} from '../../common/helpers/prosemirror/html-embed.util';
@@ -121,7 +122,7 @@ function nodeTypeCounts(json: any): Record<string, number> {
* onStoreDocument to reach the strip + persist branch, and capture the content
* that would be written to the page row.
*/
function buildExtension(featureEnabled = true) {
function buildExtension(featureEnabled = true, priorContent?: any) {
const captured: { content?: any } = {};
const existingPage = {
@@ -131,7 +132,10 @@ function buildExtension(featureEnabled = true) {
workspaceId: 'ws-1',
creatorId: 'creator-1',
contributorIds: [],
content: { type: 'doc', content: [] }, // differs from new content -> persist runs
// The currently-persisted content. Defaults to an empty doc (differs from
// new content -> persist runs); a test may pass a prior admin embed here to
// exercise the preserve-admin-embed branch.
content: priorContent ?? { type: 'doc', content: [] },
createdAt: new Date(),
lastUpdatedSource: 'user',
};
@@ -186,8 +190,9 @@ async function runStore(
role: string | null | undefined,
doc: Y.Doc,
featureEnabled = true,
priorContent?: any,
) {
const { ext, captured } = buildExtension(featureEnabled);
const { ext, captured } = buildExtension(featureEnabled, priorContent);
// hocuspocus augments the Y.Doc with broadcastStateless; a bare Y.Doc has
// none, so stub it (the post-persist broadcast is not under test here).
(doc as any).broadcastStateless = () => undefined;
@@ -268,6 +273,62 @@ describe('PersistenceExtension.onStoreDocument htmlEmbed admin gate (real code)'
).toBe(false);
});
it('toggle ON + non-admin store: PRESERVES an admin embed already in the persisted content through an unrelated edit', async () => {
// Prior persisted content already holds an admin-authored embed.
const ADMIN_SOURCE = '<script>adminAuthored()</script>';
const prior = {
type: 'doc',
content: [
{ type: 'paragraph', content: [{ type: 'text', text: 'intro' }] },
{ type: HTML_EMBED_NODE_NAME, attrs: { source: ADMIN_SOURCE } },
],
};
// A non-admin makes an UNRELATED edit (tweaks the paragraph) but the embed
// is still present in the merged doc.
const edited = {
type: 'doc',
content: [
{ type: 'paragraph', content: [{ type: 'text', text: 'intro edited' }] },
{ type: HTML_EMBED_NODE_NAME, attrs: { source: ADMIN_SOURCE } },
],
};
const captured = await runStore('member', buildYdoc(edited), true, prior);
expect(captured.content).toBeDefined();
// The admin's pre-existing embed survives the non-admin store.
expect(collectHtmlEmbedSources(captured.content)).toEqual(
new Set([ADMIN_SOURCE]),
);
});
it('toggle ON + non-admin store: strips a NEWLY-added embed while keeping the prior admin one', async () => {
const ADMIN_SOURCE = '<script>adminAuthored()</script>';
const prior = {
type: 'doc',
content: [
{ type: 'paragraph', content: [{ type: 'text', text: 'intro' }] },
{ type: HTML_EMBED_NODE_NAME, attrs: { source: ADMIN_SOURCE } },
],
};
// Non-admin keeps the admin embed, makes an unrelated paragraph edit (so the
// store is not a no-op and is persisted), and ALSO adds a brand-new embed.
const edited = {
type: 'doc',
content: [
{ type: 'paragraph', content: [{ type: 'text', text: 'intro edited' }] },
{ type: HTML_EMBED_NODE_NAME, attrs: { source: ADMIN_SOURCE } },
{ type: HTML_EMBED_NODE_NAME, attrs: { source: '<script>evil()</script>' } },
],
};
const captured = await runStore('member', buildYdoc(edited), true, prior);
expect(captured.content).toBeDefined();
// Only the admin-vetted source remains; the newly-introduced one is stripped.
expect(collectHtmlEmbedSources(captured.content)).toEqual(
new Set([ADMIN_SOURCE]),
);
});
it('empty-fragment ydoc (no content) does not throw and persists no embed', async () => {
const emptyDoc = buildYdoc({
type: 'doc',

View File

@@ -40,9 +40,11 @@ import {
} from '../constants';
import { TransclusionService } from '../../core/page/transclusion/transclusion.service';
import {
collectHtmlEmbedSources,
hasHtmlEmbedNode,
htmlEmbedAllowed,
isHtmlEmbedFeatureEnabled,
stripDisallowedHtmlEmbedNodes,
stripHtmlEmbedNodes,
} from '../../common/helpers/prosemirror/html-embed.util';
import { WorkspaceRepo } from '@docmost/db/repos/workspace/workspace.repo';
@@ -130,12 +132,31 @@ export class PersistenceExtension implements Extension {
// every htmlEmbed node before it is written to the page row AND before the
// ydoc state is re-encoded, so the node cannot be reintroduced by a
// non-admin via the collab socket.
// NOTE (residual risk): the gate is keyed to the storing connection's user.
// If an admin already authored an htmlEmbed and a non-admin's later store
// does not touch it, this strip would remove the admin's embed on that
// non-admin store. This is intentionally conservative (fail closed): the
// admin re-adds/keeps the node on their own next edit. A future refinement
// could diff against the previously persisted admin-authored embeds.
// NOTE (defense-in-depth refinement, Gitea #29): the gate is keyed to the
// storing connection's user, but it no longer blindly strips EVERY embed on
// a non-admin store. We distinguish two cases inside the !allowed branch:
// - Feature toggle OFF => strip ALL embeds (the feature is disabled for
// everyone; existing embeds get cleaned up on the next save).
// - Toggle ON but the storer is a NON-admin => strip only NEWLY-introduced
// embeds and PRESERVE embeds already present in the currently-persisted
// page content (admin-authored, already vetted). So a non-admin still
// cannot ADD an embed, but an unrelated edit (e.g. a paragraph tweak) no
// longer destroys an admin's existing embed (the prior data-loss bug).
// The pre-existing-embed identity is the raw `attrs.source` (see
// collectHtmlEmbedSources). A non-admin who copies an existing admin embed's
// exact source elsewhere passes — acceptable, that HTML is already vetted.
//
// ACCEPTED RESIDUAL RISK (toggle-ON allow-list TOCTOU): the allow-list is a
// best-effort snapshot read OUTSIDE the locked transaction (the prior content
// is pre-read above, but inside executeTx the row is re-read withLock without
// recomputing the allow-list). A concurrent admin store that changes the
// persisted embeds between the pre-read and this write can make the preserve
// decision use a slightly stale snapshot — worst case one embed transiently
// kept or dropped; it converges on the next store, with no auth bypass or
// broader data loss. The race is accepted because it only affects concurrent
// authenticated editors on the (rare) toggle-ON non-admin path, converges on
// the next store, and the persisted row plus every share/readonly read path
// remain protected by the strip.
//
// ACCEPTED RESIDUAL RISK (pre-persist broadcast window): this strip runs in
// the debounced onStoreDocument, but hocuspocus broadcasts each inbound Yjs
@@ -157,22 +178,44 @@ export class PersistenceExtension implements Extension {
);
if (!htmlEmbedAllowed(htmlEmbedEnabled, context?.user?.role)) {
if (hasHtmlEmbedNode(tiptapJson)) {
this.logger.warn(
`Stripping htmlEmbed node(s) from collab store by user ${context?.user?.id} on ${documentName}`,
);
tiptapJson = stripHtmlEmbedNodes(tiptapJson);
// Reflect the stripped content back into the shared ydoc so the node is
// removed for all connected clients, not just the persisted row.
const fragment = document.getXmlFragment('default');
if (fragment.length > 0) {
fragment.delete(0, fragment.length);
let strippedJson: typeof tiptapJson;
if (htmlEmbedEnabled === false) {
// Toggle OFF: feature disabled for everyone -> strip ALL embeds.
strippedJson = stripHtmlEmbedNodes(tiptapJson);
} else {
// Toggle ON, non-admin storer: preserve embeds already present in the
// currently-persisted (admin-vetted) page content; strip only the
// newly-introduced ones. Pre-read the prior content — a small extra
// query only on this rare non-admin + toggle-ON path.
const prior = await this.pageRepo.findById(pageId, {
includeContent: true,
});
const allowed = collectHtmlEmbedSources(prior?.content);
strippedJson = stripDisallowedHtmlEmbedNodes(tiptapJson, allowed);
}
// Only mutate the ydoc + log when the strip actually removed something;
// an unnecessary ydoc rewrite would churn the doc for all clients. With
// the toggle-ON branch a non-admin store that only touches admin-vetted
// embeds leaves the content unchanged here.
if (!isDeepStrictEqual(strippedJson, tiptapJson)) {
this.logger.warn(
`Stripping htmlEmbed node(s) from collab store by user ${context?.user?.id} on ${documentName}`,
);
tiptapJson = strippedJson;
// Reflect the stripped content back into the shared ydoc so the node
// is removed for all connected clients, not just the persisted row.
const fragment = document.getXmlFragment('default');
if (fragment.length > 0) {
fragment.delete(0, fragment.length);
}
const cleanDoc = TiptapTransformer.toYdoc(
tiptapJson,
'default',
tiptapExtensions,
);
Y.applyUpdate(document, Y.encodeStateAsUpdate(cleanDoc));
}
const cleanDoc = TiptapTransformer.toYdoc(
tiptapJson,
'default',
tiptapExtensions,
);
Y.applyUpdate(document, Y.encodeStateAsUpdate(cleanDoc));
}
}

View File

@@ -1,8 +1,10 @@
import {
canAuthorHtmlEmbed,
collectHtmlEmbedSources,
hasHtmlEmbedNode,
htmlEmbedAllowed,
isHtmlEmbedFeatureEnabled,
stripDisallowedHtmlEmbedNodes,
stripHtmlEmbedNodes,
} from './html-embed.util';
import { htmlToJson, jsonToHtml } from '../../../collaboration/collaboration.util';
@@ -105,6 +107,169 @@ describe('stripHtmlEmbedNodes', () => {
});
});
describe('collectHtmlEmbedSources', () => {
it('collects the source of every htmlEmbed node, including nested ones', () => {
const doc = {
type: 'doc',
content: [
{ type: 'htmlEmbed', attrs: { source: '<b>top</b>' } },
{
type: 'columns',
content: [
{
type: 'column',
content: [
{ type: 'htmlEmbed', attrs: { source: '<i>nested</i>' } },
{ type: 'paragraph', content: [{ type: 'text', text: 'x' }] },
],
},
],
},
],
};
const sources = collectHtmlEmbedSources(doc);
expect(sources).toEqual(new Set(['<b>top</b>', '<i>nested</i>']));
});
it('returns an empty set for a doc with no embeds', () => {
const doc = {
type: 'doc',
content: [{ type: 'paragraph', content: [{ type: 'text', text: 'hi' }] }],
};
expect(collectHtmlEmbedSources(doc).size).toBe(0);
});
it('gracefully skips embeds with absent attrs or non-string source', () => {
const doc = {
type: 'doc',
content: [
{ type: 'htmlEmbed' }, // no attrs
{ type: 'htmlEmbed', attrs: {} }, // no source
{ type: 'htmlEmbed', attrs: { source: 42 } }, // non-string
{ type: 'htmlEmbed', attrs: { source: '<ok/>' } },
],
};
expect(collectHtmlEmbedSources(doc)).toEqual(new Set(['<ok/>']));
});
it('returns an empty set for non-object input', () => {
expect(collectHtmlEmbedSources(null).size).toBe(0);
expect(collectHtmlEmbedSources(undefined).size).toBe(0);
expect(collectHtmlEmbedSources('x' as any).size).toBe(0);
});
});
describe('stripDisallowedHtmlEmbedNodes', () => {
it('keeps an embed whose source is allowed and removes the rest', () => {
const doc = {
type: 'doc',
content: [
{ type: 'htmlEmbed', attrs: { source: '<vetted/>' } },
{ type: 'htmlEmbed', attrs: { source: '<new-evil/>' } },
{ type: 'paragraph', content: [{ type: 'text', text: 'keep' }] },
],
};
const result = stripDisallowedHtmlEmbedNodes(doc, new Set(['<vetted/>']));
expect(collectHtmlEmbedSources(result)).toEqual(new Set(['<vetted/>']));
// The allowed embed and the paragraph survive; the new embed is gone.
expect(result.content).toHaveLength(2);
expect(result.content[0].attrs.source).toBe('<vetted/>');
expect(result.content[1].type).toBe('paragraph');
});
it('keeps BOTH embeds when two nodes share the same allowed source', () => {
// Source-identity semantics: identity is the raw `attrs.source`, so a
// non-admin who duplicates an existing admin-vetted source keeps both copies.
// This is intended — the raw HTML is already vetted, so a duplicate is safe.
const doc = {
type: 'doc',
content: [
{ type: 'htmlEmbed', attrs: { source: '<vetted/>' } },
{ type: 'paragraph', content: [{ type: 'text', text: 'mid' }] },
{ type: 'htmlEmbed', attrs: { source: '<vetted/>' } },
],
};
const result = stripDisallowedHtmlEmbedNodes(doc, new Set(['<vetted/>']));
expect(hasHtmlEmbedNode(result)).toBe(true);
const embeds = result.content.filter(
(n: any) => n.type === 'htmlEmbed',
);
expect(embeds).toHaveLength(2);
expect(embeds.every((n: any) => n.attrs.source === '<vetted/>')).toBe(true);
});
it('removes a newly-introduced embed when nothing is allowed', () => {
const doc = {
type: 'doc',
content: [{ type: 'htmlEmbed', attrs: { source: '<new/>' } }],
};
const result = stripDisallowedHtmlEmbedNodes(doc, new Set());
expect(hasHtmlEmbedNode(result)).toBe(false);
});
it('filters nested embeds by the allow-list (e.g. inside columns)', () => {
const doc = {
type: 'doc',
content: [
{
type: 'columns',
content: [
{
type: 'column',
content: [
{ type: 'htmlEmbed', attrs: { source: '<vetted/>' } },
{ type: 'htmlEmbed', attrs: { source: '<new/>' } },
],
},
],
},
],
};
const result = stripDisallowedHtmlEmbedNodes(doc, new Set(['<vetted/>']));
const col = findFirstChild(result, 'column');
expect(col.content).toHaveLength(1);
expect(col.content[0].attrs.source).toBe('<vetted/>');
});
it('treats an embed with absent/non-string source as not allowed (stripped)', () => {
const doc = {
type: 'doc',
content: [
{ type: 'htmlEmbed' },
{ type: 'htmlEmbed', attrs: {} },
],
};
const result = stripDisallowedHtmlEmbedNodes(doc, new Set(['<vetted/>']));
expect(hasHtmlEmbedNode(result)).toBe(false);
});
it('does not mutate the input document', () => {
const doc = {
type: 'doc',
content: [{ type: 'htmlEmbed', attrs: { source: '<new/>' } }],
};
stripDisallowedHtmlEmbedNodes(doc, new Set());
expect(doc.content).toHaveLength(1);
expect(doc.content[0].type).toBe('htmlEmbed');
});
it('neutralizes a root node that is itself a disallowed htmlEmbed', () => {
const root = { type: 'htmlEmbed', attrs: { source: '<new/>' } };
const result = stripDisallowedHtmlEmbedNodes(root, new Set());
expect(hasHtmlEmbedNode(result)).toBe(false);
});
it('keeps a root node that is an allowed htmlEmbed (defensive branch)', () => {
const root = { type: 'htmlEmbed', attrs: { source: '<vetted/>' } };
const result = stripDisallowedHtmlEmbedNodes(root, new Set(['<vetted/>']));
expect(collectHtmlEmbedSources(result)).toEqual(new Set(['<vetted/>']));
});
it('returns non-object input unchanged', () => {
expect(stripDisallowedHtmlEmbedNodes(null as any, new Set())).toBeNull();
});
});
describe('canAuthorHtmlEmbed', () => {
it('allows owner and admin', () => {
expect(canAuthorHtmlEmbed('owner')).toBe(true);

View File

@@ -48,6 +48,107 @@ export function stripHtmlEmbedNodes<T = JSONContent>(pmJson: T): T {
return { ...node } as unknown as T;
}
/**
* Walk the document and collect a stable identity for every `htmlEmbed` node.
*
* The identity is the node's `attrs.source` string — the raw HTML the embed
* renders. Two embeds that render the exact same HTML are treated as the same
* identity. Used by the collab persist path to know which embeds are ALREADY
* present in the currently-persisted (admin-vetted) page content, so a later
* non-admin store can strip only NEWLY-introduced embeds while preserving the
* pre-existing admin-authored ones.
*
* Absent attrs or a non-string/absent `source` are skipped gracefully (such a
* node contributes no identity to the set).
*/
export function collectHtmlEmbedSources(pmJson: unknown): Set<string> {
const sources = new Set<string>();
const walk = (node: unknown): void => {
if (!node || typeof node !== 'object') {
return;
}
const n = node as JSONContent;
if (n.type === HTML_EMBED_NODE_NAME) {
const source = (n.attrs as Record<string, unknown> | undefined)?.source;
if (typeof source === 'string') {
sources.add(source);
}
}
if (Array.isArray(n.content)) {
for (const child of n.content) {
walk(child);
}
}
};
walk(pmJson);
return sources;
}
/**
* Like {@link stripHtmlEmbedNodes}, but KEEP any `htmlEmbed` node whose
* `attrs.source` is in `allowedSources`; remove the rest.
*
* Used on the collab persist path when the feature toggle is ON but the storing
* user is a NON-admin: `allowedSources` is the set of embed sources already
* present in the currently-persisted page content (admin-authored, already
* vetted). A non-admin therefore cannot ADD a new embed, but their unrelated
* edit also cannot destroy an admin's existing one.
*
* NOTE: identity is the raw source string, so a non-admin who COPIES an existing
* admin embed's exact source into a NEW location passes this check. That is
* acceptable — the source is already admin-vetted content present in the doc; no
* new untrusted HTML is introduced.
*
* Returns a NEW document; the input is not mutated. Same defensive root-type
* check pattern as {@link stripHtmlEmbedNodes}.
*/
export function stripDisallowedHtmlEmbedNodes<T = JSONContent>(
pmJson: T,
allowedSources: Set<string>,
): T {
if (!pmJson || typeof pmJson !== 'object') {
return pmJson;
}
const node = pmJson as unknown as JSONContent;
// Defensive root-type check (mirrors stripHtmlEmbedNodes): if the ROOT node is
// itself an htmlEmbed and its source is NOT allowed, the children-filtering
// below could never drop it, so neutralize it here. Unreachable in normal use
// (the PM document root is always a `doc`).
if (node.type === HTML_EMBED_NODE_NAME) {
const source = (node.attrs as Record<string, unknown> | undefined)?.source;
if (typeof source === 'string' && allowedSources.has(source)) {
return { ...node } as unknown as T;
}
return { type: 'doc', content: [] } as unknown as T;
}
if (Array.isArray(node.content)) {
const filtered: JSONContent[] = [];
for (const child of node.content) {
// Drop a disallowed htmlEmbed child (newly introduced); keep an allowed
// one (already present in the persisted, admin-vetted content).
if (child && child.type === HTML_EMBED_NODE_NAME) {
const source = (child.attrs as Record<string, unknown> | undefined)
?.source;
if (typeof source === 'string' && allowedSources.has(source)) {
filtered.push({ ...child });
}
continue;
}
// Recurse so nested htmlEmbed nodes (e.g. inside columns/callouts) are
// also filtered by the same allow-list.
filtered.push(stripDisallowedHtmlEmbedNodes(child, allowedSources));
}
return { ...node, content: filtered } as unknown as T;
}
return { ...node } as unknown as T;
}
/**
* Returns true if the document contains at least one `htmlEmbed` node anywhere
* in its tree. Useful to decide whether a strip pass actually changed anything