diff --git a/apps/server/src/collaboration/extensions/persistence.extension.html-embed.spec.ts b/apps/server/src/collaboration/extensions/persistence.extension.html-embed.spec.ts index 55f06ea9..7941a1e6 100644 --- a/apps/server/src/collaboration/extensions/persistence.extension.html-embed.spec.ts +++ b/apps/server/src/collaboration/extensions/persistence.extension.html-embed.spec.ts @@ -339,3 +339,118 @@ describe('PersistenceExtension.onStoreDocument htmlEmbed admin gate (real code)' await expect(runStore('member', emptyDoc)).resolves.toBeDefined(); }); }); + +// Exercises the REAL early onChange guard (Gitea #26): guardHtmlEmbed converges +// the shared ydoc sub-second, before the 10s store debounce. We call it directly +// (it is the debounced timer body) and assert the ydoc fragment no longer yields +// an htmlEmbed for the non-admin's transient embed, while admin-vetted embeds +// already in the persisted content survive. +describe('PersistenceExtension.guardHtmlEmbed early onChange guard (real code)', () => { + async function runGuard( + role: string | null | undefined, + doc: Y.Doc, + featureEnabled = true, + priorContent?: any, + ) { + const { ext } = buildExtension(featureEnabled, priorContent); + await (ext as any).guardHtmlEmbed( + 'page-1', + doc, + { user: { id: 'u1', role, workspaceId: 'ws-1' } }, + ); + } + + it('toggle ON + non-admin: strips a newly-added embed from the shared ydoc', async () => { + // Prior persisted content has NO embed; the live doc has one a non-admin + // just added. + const doc = buildYdoc({ + type: 'doc', + content: [ + { type: 'paragraph', content: [{ type: 'text', text: 'hi' }] }, + { type: HTML_EMBED_NODE_NAME, attrs: { source: '' } }, + ], + }); + expect(hasHtmlEmbedNode(TiptapTransformer.fromYdoc(doc, 'default'))).toBe( + true, + ); + + await runGuard('member', doc, true, { type: 'doc', content: [] }); + + // The shared ydoc fragment no longer yields any htmlEmbed. + expect(hasHtmlEmbedNode(TiptapTransformer.fromYdoc(doc, 'default'))).toBe( + false, + ); + }); + + it('toggle ON + non-admin: preserves a prior admin embed, strips the new one', async () => { + const ADMIN_SOURCE = ''; + const prior = { + type: 'doc', + content: [ + { type: 'paragraph', content: [{ type: 'text', text: 'intro' }] }, + { type: HTML_EMBED_NODE_NAME, attrs: { source: ADMIN_SOURCE } }, + ], + }; + // Live doc keeps the admin embed AND adds a brand-new one. + const doc = buildYdoc({ + type: 'doc', + content: [ + { type: 'paragraph', content: [{ type: 'text', text: 'intro' }] }, + { type: HTML_EMBED_NODE_NAME, attrs: { source: ADMIN_SOURCE } }, + { type: HTML_EMBED_NODE_NAME, attrs: { source: '' } }, + ], + }); + + await runGuard('member', doc, true, prior); + + // Only the admin-vetted source survives in the shared ydoc. + expect( + collectHtmlEmbedSources(TiptapTransformer.fromYdoc(doc, 'default')), + ).toEqual(new Set([ADMIN_SOURCE])); + }); + + it('toggle OFF + non-admin: strips ALL embeds (allow-list is null)', async () => { + // Even an embed that matches the prior content is stripped when the toggle + // is OFF, because the OFF path passes allowed=null (strip everything) and + // never reads the prior content for an allow-list. + const SOURCE = ''; + const doc = buildYdoc({ + type: 'doc', + content: [ + { type: 'paragraph', content: [{ type: 'text', text: 'hi' }] }, + { type: HTML_EMBED_NODE_NAME, attrs: { source: SOURCE } }, + ], + }); + await runGuard('member', doc, false, { + type: 'doc', + content: [{ type: HTML_EMBED_NODE_NAME, attrs: { source: SOURCE } }], + }); + expect(hasHtmlEmbedNode(TiptapTransformer.fromYdoc(doc, 'default'))).toBe( + false, + ); + }); + + it('admin role: guard is a defensive no-op (embed preserved)', async () => { + const doc = buildYdoc({ + type: 'doc', + content: [ + { type: HTML_EMBED_NODE_NAME, attrs: { source: '' } }, + ], + }); + await runGuard('admin', doc, true, { type: 'doc', content: [] }); + expect(hasHtmlEmbedNode(TiptapTransformer.fromYdoc(doc, 'default'))).toBe( + true, + ); + }); + + it('no embed present: guard is a cheap no-op (loop-safe re-fire)', async () => { + const doc = buildYdoc({ + type: 'doc', + content: [{ type: 'paragraph', content: [{ type: 'text', text: 'plain' }] }], + }); + await runGuard('member', doc, true, { type: 'doc', content: [] }); + expect(hasHtmlEmbedNode(TiptapTransformer.fromYdoc(doc, 'default'))).toBe( + false, + ); + }); +}); diff --git a/apps/server/src/collaboration/extensions/persistence.extension.ts b/apps/server/src/collaboration/extensions/persistence.extension.ts index 7c2ff963..b5dd49ed 100644 --- a/apps/server/src/collaboration/extensions/persistence.extension.ts +++ b/apps/server/src/collaboration/extensions/persistence.extension.ts @@ -40,6 +40,7 @@ import { } from '../constants'; import { TransclusionService } from '../../core/page/transclusion/transclusion.service'; import { + canAuthorHtmlEmbed, collectHtmlEmbedSources, hasHtmlEmbedNode, htmlEmbedAllowed, @@ -58,6 +59,21 @@ export class PersistenceExtension implements Extension { // coalescing window" per document and OR it across all edits in the window, // so the snapshot is marked 'agent' regardless of who wrote last. private agentTouched: Map = new Map(); + // Per-document debounce timers for the early htmlEmbed guard (Gitea #26). + // onChange schedules a short (~300ms) debounced strip that converges the + // shared ydoc for all connected clients well before the 10s store debounce, + // shrinking the pre-persist broadcast window of a non-admin's transient embed. + private htmlEmbedGuardTimers: Map = new Map(); + // Per-document cache of the workspace htmlEmbed toggle (Gitea #26). Populated + // in onLoadDocument (which already loads the page + has workspace context) and + // read in onChange to gate early-guard scheduling: when the toggle is OFF (the + // common default) we schedule NOTHING — no timer, no fromYdoc, no DB read — and + // rely on the onStoreDocument strip as the backstop (when OFF the embed does + // not execute in editable mode anyway). Cleared in afterUnloadDocument. + // STALENESS: if an admin flips the toggle ON mid-session this cache stays OFF + // until the document is reloaded, so the early guard won't schedule — accepted, + // the onStoreDocument backstop still strips on persist. + private htmlEmbedToggleByDoc: Map = new Map(); constructor( private readonly pageRepo: PageRepo, @@ -89,6 +105,23 @@ export class PersistenceExtension implements Extension { return; } + // Cache the workspace htmlEmbed toggle for this document (Gitea #26). We + // already have the page (hence its workspaceId) here, so resolve the toggle + // once and cache it keyed by documentName. onChange reads this to decide + // whether to schedule the early guard at all — when OFF we skip the guard + // entirely (no timer, no fromYdoc, no DB read). Cleared in + // afterUnloadDocument. See htmlEmbedToggleByDoc for the staleness note. + try { + const enabled = isHtmlEmbedFeatureEnabled( + (await this.workspaceRepo.findById(page.workspaceId))?.settings, + ); + this.htmlEmbedToggleByDoc.set(documentName, enabled); + } catch (err) { + // Fail OFF: if the toggle can't be resolved, never schedule the early + // guard; the onStoreDocument backstop still strips on persist. + this.htmlEmbedToggleByDoc.set(documentName, false); + } + if (page.ydoc) { this.logger.debug(`ydoc loaded from db: ${pageId}`); @@ -158,18 +191,25 @@ export class PersistenceExtension implements Extension { // the next store, and the persisted row plus every share/readonly read path // remain protected by the strip. // - // ACCEPTED RESIDUAL RISK (pre-persist broadcast window): this strip runs in - // the debounced onStoreDocument, but hocuspocus broadcasts each inbound Yjs - // update to connected clients immediately, so a non-admin's transient - // htmlEmbed can execute in OTHER open editors' browsers in the brief window - // before this persist strips it. The exposure is limited to concurrent - // AUTHENTICATED space members who have the doc open with Edit rights - // (semi-trusted) — anonymous public-share/readonly viewers do NOT open a - // collab socket (ReadonlyPageEditor renders fetched, already-stripped - // content; HocuspocusProvider is only used by the authenticated editable - // page-editor), and the PERSISTED page row plus every share/readonly read - // path are protected by this strip. The window is therefore accepted rather - // than mitigated with an inbound beforeBroadcast strip. + // RESIDUAL RISK (pre-persist broadcast window) — NOW MITIGATED (Gitea #26): + // this strip runs in the debounced onStoreDocument (up to 10s), but + // hocuspocus broadcasts each inbound Yjs update to connected clients + // immediately, so a non-admin's transient htmlEmbed can execute in OTHER open + // editors' browsers in the window before this persist strips it. The exposure + // is limited to concurrent AUTHENTICATED space members who have the doc open + // with Edit rights (semi-trusted) — anonymous public-share/readonly viewers do + // NOT open a collab socket (ReadonlyPageEditor renders fetched, + // already-stripped content; HocuspocusProvider is only used by the + // authenticated editable page-editor), and the PERSISTED page row plus every + // share/readonly read path are protected by this strip. + // The window is now SHRUNK to sub-second by an onChange-debounced early guard + // (~300ms) — see guardHtmlEmbed() — which runs the SAME preserve/strip gate as + // this block and re-encodes the cleaned ydoc, converging the doc for all + // clients long before this 10s store debounce fires. This onStoreDocument + // strip remains the authoritative backstop for persistence. The irreducible + // residual is only the VERY FIRST inbound broadcast before the ~300ms debounce + // fires: hocuspocus exposes no synchronous beforeBroadcast filter to drop the + // node before that first relay, so it cannot be eliminated entirely. // Toggle-AND-admin gate: htmlEmbed survives only when the workspace feature // toggle is ON and the storing user is an admin/owner. OFF (default) => // stripped for everyone (existing embeds get cleaned up on next save). @@ -389,12 +429,168 @@ export class PersistenceExtension implements Extension { if (data.context?.actor === 'agent') { this.agentTouched.set(documentName, true); } + + // Early htmlEmbed guard scheduling (Gitea #26). Schedule the short debounced + // guard ONLY when (a) this document's workspace toggle is cached ON and + // (b) the changing connection's user is a NON-admin (cannot author + // htmlEmbed). When the toggle is OFF/unknown we schedule NOTHING — no timer, + // no fromYdoc, no DB read — killing the OFF-case overhead (the common + // default); the onStoreDocument strip is the backstop and an OFF embed does + // not execute in editable mode anyway. We do NO expensive work here — we only + // (re)schedule the timer; the debounce coalesces rapid edits into a single + // guard check. + if ( + userId && + this.htmlEmbedToggleByDoc.get(documentName) === true && + !canAuthorHtmlEmbed(data.context?.user?.role) + ) { + const existing = this.htmlEmbedGuardTimers.get(documentName); + if (existing) { + clearTimeout(existing); + } + const timer = setTimeout(() => { + this.htmlEmbedGuardTimers.delete(documentName); + void this.guardHtmlEmbed(documentName, data.document, data.context); + }, 300); + this.htmlEmbedGuardTimers.set(documentName, timer); + } + } + + /** + * Early, onChange-debounced htmlEmbed strip (Gitea #26). Mirrors the + * onStoreDocument admin gate but runs ~300ms after a non-admin edit instead of + * waiting for the 10s store debounce, so a non-admin's transient embed is + * removed from the shared ydoc — and re-broadcast as cleaned state — for all + * connected clients in sub-second time. onStoreDocument remains the + * authoritative persistence backstop; this is an ADDITIONAL early pass. + * + * CONCURRENCY (the critical invariant): the Y.Doc mutation is a single + * SYNCHRONOUS block with NO `await` between the fromYdoc snapshot and the + * applyUpdate write. ALL async work (the workspace toggle lookup and the + * persisted-content read for the allow-list) happens FIRST, before that block. + * Because JS is single-threaded, a synchronous block cannot interleave with + * inbound Yjs update handlers, so a concurrent edit that lands while we await + * cannot be CLOBBERED: we re-snapshot the live doc only after all awaits, then + * delete + rebuild + applyUpdate without yielding. (An earlier version awaited + * DB reads BETWEEN the snapshot and the write, so a concurrent edit in that gap + * was lost — this restructure fixes that.) + * + * The allow-list is a best-effort snapshot read outside any lock (TOCTOU + * accepted, same as onStoreDocument): worst case one embed is transiently kept + * or dropped; it converges on the next guard/store, with no auth bypass. + * + * Loop-safety: the corrective applyUpdate has a null origin, so the re-fired + * onChange carries no userId and is not rescheduled; and after a strip no + * htmlEmbed remains, so a subsequent guard fire is a cheap no-op (the + * hasHtmlEmbedNode early-exit). NEVER throws — an unhandled rejection in a timer + * would crash the process — so the whole body is wrapped in try/catch. + */ + private async guardHtmlEmbed( + documentName: string, + document: Y.Doc, + context: any, + ): Promise { + // Defensive: ensure no stale timer entry survives for this document. + this.htmlEmbedGuardTimers.delete(documentName); + try { + // Re-check defensively: onChange only schedules for non-admins, but if an + // admin/owner somehow reaches here, the embed is authored content — do + // nothing (onStoreDocument's toggle-AND-admin gate handles persistence). + if (canAuthorHtmlEmbed(context?.user?.role)) { + return; + } + + // ---- ASYNC PHASE: do ALL awaits up front, before touching the ydoc. ---- + // Resolve the workspace toggle exactly as onStoreDocument does. When OFF we + // strip everything; when ON we use the preserve logic (keep admin-vetted + // embeds, strip only the non-admin's newly-introduced ones). + const enabled = isHtmlEmbedFeatureEnabled( + (await this.workspaceRepo.findById(context?.user?.workspaceId)) + ?.settings, + ); + + // The allow-list (admin-vetted sources already in the persisted content). + // null => strip ALL (toggle OFF). Read here, BEFORE the synchronous block, + // so no await sits between the doc snapshot and the doc write. + let allowed: Set | null = null; + if (enabled !== false) { + const prior = await this.pageRepo.findById(getPageId(documentName), { + includeContent: true, + }); + allowed = collectHtmlEmbedSources(prior?.content); + } + + // The awaits above may have let the document be unloaded/destroyed. If so, + // bail — mutating a destroyed doc is pointless and could throw (the + // try/catch is the ultimate safety net regardless). + if ((document as { isDestroyed?: boolean }).isDestroyed) { + return; + } + + // ---- SYNCHRONOUS PHASE: snapshot -> strip -> reflect, NO await here. ---- + // Because there is no await between fromYdoc and applyUpdate, no inbound + // Yjs update can interleave, so a concurrent edit cannot be lost. + const json = TiptapTransformer.fromYdoc(document, 'default'); + + // Cheap exit: nothing to guard if the doc has no embed at all. This is also + // why a post-strip re-fire is a no-op (loop-safe). + if (!hasHtmlEmbedNode(json)) { + return; + } + + const strippedJson = + allowed === null + ? stripHtmlEmbedNodes(json) + : stripDisallowedHtmlEmbedNodes(json, allowed); + + // Nothing was stripped (e.g. the only embed is an admin-vetted one) — do + // not churn the shared ydoc for all clients. + if (isDeepStrictEqual(strippedJson, json)) { + return; + } + + // Reflect the stripped content back into the shared ydoc EXACTLY as + // onStoreDocument does, so the node is removed for all connected clients, + // not just on the eventual persist. This re-encode broadcasts the cleaned + // state; after it hasHtmlEmbedNode is false, so any later guard fire is a + // cheap no-op (loop-safe). + const fragment = document.getXmlFragment('default'); + if (fragment.length > 0) { + fragment.delete(0, fragment.length); + } + const cleanDoc = TiptapTransformer.toYdoc( + strippedJson, + 'default', + tiptapExtensions, + ); + Y.applyUpdate(document, Y.encodeStateAsUpdate(cleanDoc)); + + this.logger.warn( + `Stripping htmlEmbed node(s) via early onChange guard by user ${context?.user?.id} on ${documentName}`, + ); + } catch (err) { + // NEVER rethrow out of a timer-scheduled call. + this.logger.error( + `Early htmlEmbed guard failed on ${documentName}`, + err, + ); + } } async afterUnloadDocument(data: afterUnloadDocumentPayload) { const documentName = data.documentName; this.contributors.delete(documentName); this.agentTouched.delete(documentName); + // Drop the cached toggle for this document so a reload re-resolves it (and + // picks up a mid-session admin toggle flip). + this.htmlEmbedToggleByDoc.delete(documentName); + // Clear any pending early-guard timer so it cannot fire after the document + // is unloaded (leak / use-after-unload prevention). + const timer = this.htmlEmbedGuardTimers.get(documentName); + if (timer) { + clearTimeout(timer); + this.htmlEmbedGuardTimers.delete(documentName); + } } private consumeContributors(documentName: string): string[] {