Merge pull request 'fix(html-embed): complete kill-switch on read paths (#28) + total strip helper (#30)' (#46) from fix/html-embed-hardening into develop
Some checks failed
Develop / test (push) Has been cancelled
Develop / build (push) Has been cancelled

This commit was merged in pull request #46.
This commit is contained in:
claude_code
2026-06-21 01:59:40 +03:00
7 changed files with 1223 additions and 213 deletions

View File

@@ -3,6 +3,7 @@ import { TiptapTransformer } from '@hocuspocus/transformer';
import { PersistenceExtension } from './persistence.extension';
import { tiptapExtensions } from '../collaboration.util';
import {
collectHtmlEmbedSources,
hasHtmlEmbedNode,
HTML_EMBED_NODE_NAME,
} from '../../common/helpers/prosemirror/html-embed.util';
@@ -121,7 +122,7 @@ function nodeTypeCounts(json: any): Record<string, number> {
* onStoreDocument to reach the strip + persist branch, and capture the content
* that would be written to the page row.
*/
function buildExtension(featureEnabled = true) {
function buildExtension(featureEnabled = true, priorContent?: any) {
const captured: { content?: any } = {};
const existingPage = {
@@ -131,7 +132,10 @@ function buildExtension(featureEnabled = true) {
workspaceId: 'ws-1',
creatorId: 'creator-1',
contributorIds: [],
content: { type: 'doc', content: [] }, // differs from new content -> persist runs
// The currently-persisted content. Defaults to an empty doc (differs from
// new content -> persist runs); a test may pass a prior admin embed here to
// exercise the preserve-admin-embed branch.
content: priorContent ?? { type: 'doc', content: [] },
createdAt: new Date(),
lastUpdatedSource: 'user',
};
@@ -186,8 +190,9 @@ async function runStore(
role: string | null | undefined,
doc: Y.Doc,
featureEnabled = true,
priorContent?: any,
) {
const { ext, captured } = buildExtension(featureEnabled);
const { ext, captured } = buildExtension(featureEnabled, priorContent);
// hocuspocus augments the Y.Doc with broadcastStateless; a bare Y.Doc has
// none, so stub it (the post-persist broadcast is not under test here).
(doc as any).broadcastStateless = () => undefined;
@@ -268,6 +273,62 @@ describe('PersistenceExtension.onStoreDocument htmlEmbed admin gate (real code)'
).toBe(false);
});
it('toggle ON + non-admin store: PRESERVES an admin embed already in the persisted content through an unrelated edit', async () => {
// Prior persisted content already holds an admin-authored embed.
const ADMIN_SOURCE = '<script>adminAuthored()</script>';
const prior = {
type: 'doc',
content: [
{ type: 'paragraph', content: [{ type: 'text', text: 'intro' }] },
{ type: HTML_EMBED_NODE_NAME, attrs: { source: ADMIN_SOURCE } },
],
};
// A non-admin makes an UNRELATED edit (tweaks the paragraph) but the embed
// is still present in the merged doc.
const edited = {
type: 'doc',
content: [
{ type: 'paragraph', content: [{ type: 'text', text: 'intro edited' }] },
{ type: HTML_EMBED_NODE_NAME, attrs: { source: ADMIN_SOURCE } },
],
};
const captured = await runStore('member', buildYdoc(edited), true, prior);
expect(captured.content).toBeDefined();
// The admin's pre-existing embed survives the non-admin store.
expect(collectHtmlEmbedSources(captured.content)).toEqual(
new Set([ADMIN_SOURCE]),
);
});
it('toggle ON + non-admin store: strips a NEWLY-added embed while keeping the prior admin one', async () => {
const ADMIN_SOURCE = '<script>adminAuthored()</script>';
const prior = {
type: 'doc',
content: [
{ type: 'paragraph', content: [{ type: 'text', text: 'intro' }] },
{ type: HTML_EMBED_NODE_NAME, attrs: { source: ADMIN_SOURCE } },
],
};
// Non-admin keeps the admin embed, makes an unrelated paragraph edit (so the
// store is not a no-op and is persisted), and ALSO adds a brand-new embed.
const edited = {
type: 'doc',
content: [
{ type: 'paragraph', content: [{ type: 'text', text: 'intro edited' }] },
{ type: HTML_EMBED_NODE_NAME, attrs: { source: ADMIN_SOURCE } },
{ type: HTML_EMBED_NODE_NAME, attrs: { source: '<script>evil()</script>' } },
],
};
const captured = await runStore('member', buildYdoc(edited), true, prior);
expect(captured.content).toBeDefined();
// Only the admin-vetted source remains; the newly-introduced one is stripped.
expect(collectHtmlEmbedSources(captured.content)).toEqual(
new Set([ADMIN_SOURCE]),
);
});
it('empty-fragment ydoc (no content) does not throw and persists no embed', async () => {
const emptyDoc = buildYdoc({
type: 'doc',
@@ -278,3 +339,118 @@ describe('PersistenceExtension.onStoreDocument htmlEmbed admin gate (real code)'
await expect(runStore('member', emptyDoc)).resolves.toBeDefined();
});
});
// Exercises the REAL early onChange guard (Gitea #26): guardHtmlEmbed converges
// the shared ydoc sub-second, before the 10s store debounce. We call it directly
// (it is the debounced timer body) and assert the ydoc fragment no longer yields
// an htmlEmbed for the non-admin's transient embed, while admin-vetted embeds
// already in the persisted content survive.
describe('PersistenceExtension.guardHtmlEmbed early onChange guard (real code)', () => {
async function runGuard(
role: string | null | undefined,
doc: Y.Doc,
featureEnabled = true,
priorContent?: any,
) {
const { ext } = buildExtension(featureEnabled, priorContent);
await (ext as any).guardHtmlEmbed(
'page-1',
doc,
{ user: { id: 'u1', role, workspaceId: 'ws-1' } },
);
}
it('toggle ON + non-admin: strips a newly-added embed from the shared ydoc', async () => {
// Prior persisted content has NO embed; the live doc has one a non-admin
// just added.
const doc = buildYdoc({
type: 'doc',
content: [
{ type: 'paragraph', content: [{ type: 'text', text: 'hi' }] },
{ type: HTML_EMBED_NODE_NAME, attrs: { source: '<script>evil()</script>' } },
],
});
expect(hasHtmlEmbedNode(TiptapTransformer.fromYdoc(doc, 'default'))).toBe(
true,
);
await runGuard('member', doc, true, { type: 'doc', content: [] });
// The shared ydoc fragment no longer yields any htmlEmbed.
expect(hasHtmlEmbedNode(TiptapTransformer.fromYdoc(doc, 'default'))).toBe(
false,
);
});
it('toggle ON + non-admin: preserves a prior admin embed, strips the new one', async () => {
const ADMIN_SOURCE = '<script>adminAuthored()</script>';
const prior = {
type: 'doc',
content: [
{ type: 'paragraph', content: [{ type: 'text', text: 'intro' }] },
{ type: HTML_EMBED_NODE_NAME, attrs: { source: ADMIN_SOURCE } },
],
};
// Live doc keeps the admin embed AND adds a brand-new one.
const doc = buildYdoc({
type: 'doc',
content: [
{ type: 'paragraph', content: [{ type: 'text', text: 'intro' }] },
{ type: HTML_EMBED_NODE_NAME, attrs: { source: ADMIN_SOURCE } },
{ type: HTML_EMBED_NODE_NAME, attrs: { source: '<script>evil()</script>' } },
],
});
await runGuard('member', doc, true, prior);
// Only the admin-vetted source survives in the shared ydoc.
expect(
collectHtmlEmbedSources(TiptapTransformer.fromYdoc(doc, 'default')),
).toEqual(new Set([ADMIN_SOURCE]));
});
it('toggle OFF + non-admin: strips ALL embeds (allow-list is null)', async () => {
// Even an embed that matches the prior content is stripped when the toggle
// is OFF, because the OFF path passes allowed=null (strip everything) and
// never reads the prior content for an allow-list.
const SOURCE = '<script>any()</script>';
const doc = buildYdoc({
type: 'doc',
content: [
{ type: 'paragraph', content: [{ type: 'text', text: 'hi' }] },
{ type: HTML_EMBED_NODE_NAME, attrs: { source: SOURCE } },
],
});
await runGuard('member', doc, false, {
type: 'doc',
content: [{ type: HTML_EMBED_NODE_NAME, attrs: { source: SOURCE } }],
});
expect(hasHtmlEmbedNode(TiptapTransformer.fromYdoc(doc, 'default'))).toBe(
false,
);
});
it('admin role: guard is a defensive no-op (embed preserved)', async () => {
const doc = buildYdoc({
type: 'doc',
content: [
{ type: HTML_EMBED_NODE_NAME, attrs: { source: '<script>ok()</script>' } },
],
});
await runGuard('admin', doc, true, { type: 'doc', content: [] });
expect(hasHtmlEmbedNode(TiptapTransformer.fromYdoc(doc, 'default'))).toBe(
true,
);
});
it('no embed present: guard is a cheap no-op (loop-safe re-fire)', async () => {
const doc = buildYdoc({
type: 'doc',
content: [{ type: 'paragraph', content: [{ type: 'text', text: 'plain' }] }],
});
await runGuard('member', doc, true, { type: 'doc', content: [] });
expect(hasHtmlEmbedNode(TiptapTransformer.fromYdoc(doc, 'default'))).toBe(
false,
);
});
});

View File

@@ -40,9 +40,12 @@ import {
} from '../constants';
import { TransclusionService } from '../../core/page/transclusion/transclusion.service';
import {
canAuthorHtmlEmbed,
collectHtmlEmbedSources,
hasHtmlEmbedNode,
htmlEmbedAllowed,
isHtmlEmbedFeatureEnabled,
stripDisallowedHtmlEmbedNodes,
stripHtmlEmbedNodes,
} from '../../common/helpers/prosemirror/html-embed.util';
import { WorkspaceRepo } from '@docmost/db/repos/workspace/workspace.repo';
@@ -56,6 +59,21 @@ export class PersistenceExtension implements Extension {
// coalescing window" per document and OR it across all edits in the window,
// so the snapshot is marked 'agent' regardless of who wrote last.
private agentTouched: Map<string, boolean> = new Map();
// Per-document debounce timers for the early htmlEmbed guard (Gitea #26).
// onChange schedules a short (~300ms) debounced strip that converges the
// shared ydoc for all connected clients well before the 10s store debounce,
// shrinking the pre-persist broadcast window of a non-admin's transient embed.
private htmlEmbedGuardTimers: Map<string, NodeJS.Timeout> = new Map();
// Per-document cache of the workspace htmlEmbed toggle (Gitea #26). Populated
// in onLoadDocument (which already loads the page + has workspace context) and
// read in onChange to gate early-guard scheduling: when the toggle is OFF (the
// common default) we schedule NOTHING — no timer, no fromYdoc, no DB read — and
// rely on the onStoreDocument strip as the backstop (when OFF the embed does
// not execute in editable mode anyway). Cleared in afterUnloadDocument.
// STALENESS: if an admin flips the toggle ON mid-session this cache stays OFF
// until the document is reloaded, so the early guard won't schedule — accepted,
// the onStoreDocument backstop still strips on persist.
private htmlEmbedToggleByDoc: Map<string, boolean> = new Map();
constructor(
private readonly pageRepo: PageRepo,
@@ -87,6 +105,23 @@ export class PersistenceExtension implements Extension {
return;
}
// Cache the workspace htmlEmbed toggle for this document (Gitea #26). We
// already have the page (hence its workspaceId) here, so resolve the toggle
// once and cache it keyed by documentName. onChange reads this to decide
// whether to schedule the early guard at all — when OFF we skip the guard
// entirely (no timer, no fromYdoc, no DB read). Cleared in
// afterUnloadDocument. See htmlEmbedToggleByDoc for the staleness note.
try {
const enabled = isHtmlEmbedFeatureEnabled(
(await this.workspaceRepo.findById(page.workspaceId))?.settings,
);
this.htmlEmbedToggleByDoc.set(documentName, enabled);
} catch (err) {
// Fail OFF: if the toggle can't be resolved, never schedule the early
// guard; the onStoreDocument backstop still strips on persist.
this.htmlEmbedToggleByDoc.set(documentName, false);
}
if (page.ydoc) {
this.logger.debug(`ydoc loaded from db: ${pageId}`);
@@ -130,25 +165,51 @@ export class PersistenceExtension implements Extension {
// every htmlEmbed node before it is written to the page row AND before the
// ydoc state is re-encoded, so the node cannot be reintroduced by a
// non-admin via the collab socket.
// NOTE (residual risk): the gate is keyed to the storing connection's user.
// If an admin already authored an htmlEmbed and a non-admin's later store
// does not touch it, this strip would remove the admin's embed on that
// non-admin store. This is intentionally conservative (fail closed): the
// admin re-adds/keeps the node on their own next edit. A future refinement
// could diff against the previously persisted admin-authored embeds.
// NOTE (defense-in-depth refinement, Gitea #29): the gate is keyed to the
// storing connection's user, but it no longer blindly strips EVERY embed on
// a non-admin store. We distinguish two cases inside the !allowed branch:
// - Feature toggle OFF => strip ALL embeds (the feature is disabled for
// everyone; existing embeds get cleaned up on the next save).
// - Toggle ON but the storer is a NON-admin => strip only NEWLY-introduced
// embeds and PRESERVE embeds already present in the currently-persisted
// page content (admin-authored, already vetted). So a non-admin still
// cannot ADD an embed, but an unrelated edit (e.g. a paragraph tweak) no
// longer destroys an admin's existing embed (the prior data-loss bug).
// The pre-existing-embed identity is the raw `attrs.source` (see
// collectHtmlEmbedSources). A non-admin who copies an existing admin embed's
// exact source elsewhere passes — acceptable, that HTML is already vetted.
//
// ACCEPTED RESIDUAL RISK (pre-persist broadcast window): this strip runs in
// the debounced onStoreDocument, but hocuspocus broadcasts each inbound Yjs
// update to connected clients immediately, so a non-admin's transient
// htmlEmbed can execute in OTHER open editors' browsers in the brief window
// before this persist strips it. The exposure is limited to concurrent
// AUTHENTICATED space members who have the doc open with Edit rights
// (semi-trusted) — anonymous public-share/readonly viewers do NOT open a
// collab socket (ReadonlyPageEditor renders fetched, already-stripped
// content; HocuspocusProvider is only used by the authenticated editable
// page-editor), and the PERSISTED page row plus every share/readonly read
// path are protected by this strip. The window is therefore accepted rather
// than mitigated with an inbound beforeBroadcast strip.
// ACCEPTED RESIDUAL RISK (toggle-ON allow-list TOCTOU): the allow-list is a
// best-effort snapshot read OUTSIDE the locked transaction (the prior content
// is pre-read above, but inside executeTx the row is re-read withLock without
// recomputing the allow-list). A concurrent admin store that changes the
// persisted embeds between the pre-read and this write can make the preserve
// decision use a slightly stale snapshot — worst case one embed transiently
// kept or dropped; it converges on the next store, with no auth bypass or
// broader data loss. The race is accepted because it only affects concurrent
// authenticated editors on the (rare) toggle-ON non-admin path, converges on
// the next store, and the persisted row plus every share/readonly read path
// remain protected by the strip.
//
// RESIDUAL RISK (pre-persist broadcast window) — NOW MITIGATED (Gitea #26):
// this strip runs in the debounced onStoreDocument (up to 10s), but
// hocuspocus broadcasts each inbound Yjs update to connected clients
// immediately, so a non-admin's transient htmlEmbed can execute in OTHER open
// editors' browsers in the window before this persist strips it. The exposure
// is limited to concurrent AUTHENTICATED space members who have the doc open
// with Edit rights (semi-trusted) — anonymous public-share/readonly viewers do
// NOT open a collab socket (ReadonlyPageEditor renders fetched,
// already-stripped content; HocuspocusProvider is only used by the
// authenticated editable page-editor), and the PERSISTED page row plus every
// share/readonly read path are protected by this strip.
// The window is now SHRUNK to sub-second by an onChange-debounced early guard
// (~300ms) — see guardHtmlEmbed() — which runs the SAME preserve/strip gate as
// this block and re-encodes the cleaned ydoc, converging the doc for all
// clients long before this 10s store debounce fires. This onStoreDocument
// strip remains the authoritative backstop for persistence. The irreducible
// residual is only the VERY FIRST inbound broadcast before the ~300ms debounce
// fires: hocuspocus exposes no synchronous beforeBroadcast filter to drop the
// node before that first relay, so it cannot be eliminated entirely.
// Toggle-AND-admin gate: htmlEmbed survives only when the workspace feature
// toggle is ON and the storing user is an admin/owner. OFF (default) =>
// stripped for everyone (existing embeds get cleaned up on next save).
@@ -157,22 +218,44 @@ export class PersistenceExtension implements Extension {
);
if (!htmlEmbedAllowed(htmlEmbedEnabled, context?.user?.role)) {
if (hasHtmlEmbedNode(tiptapJson)) {
this.logger.warn(
`Stripping htmlEmbed node(s) from collab store by user ${context?.user?.id} on ${documentName}`,
);
tiptapJson = stripHtmlEmbedNodes(tiptapJson);
// Reflect the stripped content back into the shared ydoc so the node is
// removed for all connected clients, not just the persisted row.
const fragment = document.getXmlFragment('default');
if (fragment.length > 0) {
fragment.delete(0, fragment.length);
let strippedJson: typeof tiptapJson;
if (htmlEmbedEnabled === false) {
// Toggle OFF: feature disabled for everyone -> strip ALL embeds.
strippedJson = stripHtmlEmbedNodes(tiptapJson);
} else {
// Toggle ON, non-admin storer: preserve embeds already present in the
// currently-persisted (admin-vetted) page content; strip only the
// newly-introduced ones. Pre-read the prior content — a small extra
// query only on this rare non-admin + toggle-ON path.
const prior = await this.pageRepo.findById(pageId, {
includeContent: true,
});
const allowed = collectHtmlEmbedSources(prior?.content);
strippedJson = stripDisallowedHtmlEmbedNodes(tiptapJson, allowed);
}
// Only mutate the ydoc + log when the strip actually removed something;
// an unnecessary ydoc rewrite would churn the doc for all clients. With
// the toggle-ON branch a non-admin store that only touches admin-vetted
// embeds leaves the content unchanged here.
if (!isDeepStrictEqual(strippedJson, tiptapJson)) {
this.logger.warn(
`Stripping htmlEmbed node(s) from collab store by user ${context?.user?.id} on ${documentName}`,
);
tiptapJson = strippedJson;
// Reflect the stripped content back into the shared ydoc so the node
// is removed for all connected clients, not just the persisted row.
const fragment = document.getXmlFragment('default');
if (fragment.length > 0) {
fragment.delete(0, fragment.length);
}
const cleanDoc = TiptapTransformer.toYdoc(
tiptapJson,
'default',
tiptapExtensions,
);
Y.applyUpdate(document, Y.encodeStateAsUpdate(cleanDoc));
}
const cleanDoc = TiptapTransformer.toYdoc(
tiptapJson,
'default',
tiptapExtensions,
);
Y.applyUpdate(document, Y.encodeStateAsUpdate(cleanDoc));
}
}
@@ -346,12 +429,168 @@ export class PersistenceExtension implements Extension {
if (data.context?.actor === 'agent') {
this.agentTouched.set(documentName, true);
}
// Early htmlEmbed guard scheduling (Gitea #26). Schedule the short debounced
// guard ONLY when (a) this document's workspace toggle is cached ON and
// (b) the changing connection's user is a NON-admin (cannot author
// htmlEmbed). When the toggle is OFF/unknown we schedule NOTHING — no timer,
// no fromYdoc, no DB read — killing the OFF-case overhead (the common
// default); the onStoreDocument strip is the backstop and an OFF embed does
// not execute in editable mode anyway. We do NO expensive work here — we only
// (re)schedule the timer; the debounce coalesces rapid edits into a single
// guard check.
if (
userId &&
this.htmlEmbedToggleByDoc.get(documentName) === true &&
!canAuthorHtmlEmbed(data.context?.user?.role)
) {
const existing = this.htmlEmbedGuardTimers.get(documentName);
if (existing) {
clearTimeout(existing);
}
const timer = setTimeout(() => {
this.htmlEmbedGuardTimers.delete(documentName);
void this.guardHtmlEmbed(documentName, data.document, data.context);
}, 300);
this.htmlEmbedGuardTimers.set(documentName, timer);
}
}
/**
* Early, onChange-debounced htmlEmbed strip (Gitea #26). Mirrors the
* onStoreDocument admin gate but runs ~300ms after a non-admin edit instead of
* waiting for the 10s store debounce, so a non-admin's transient embed is
* removed from the shared ydoc — and re-broadcast as cleaned state — for all
* connected clients in sub-second time. onStoreDocument remains the
* authoritative persistence backstop; this is an ADDITIONAL early pass.
*
* CONCURRENCY (the critical invariant): the Y.Doc mutation is a single
* SYNCHRONOUS block with NO `await` between the fromYdoc snapshot and the
* applyUpdate write. ALL async work (the workspace toggle lookup and the
* persisted-content read for the allow-list) happens FIRST, before that block.
* Because JS is single-threaded, a synchronous block cannot interleave with
* inbound Yjs update handlers, so a concurrent edit that lands while we await
* cannot be CLOBBERED: we re-snapshot the live doc only after all awaits, then
* delete + rebuild + applyUpdate without yielding. (An earlier version awaited
* DB reads BETWEEN the snapshot and the write, so a concurrent edit in that gap
* was lost — this restructure fixes that.)
*
* The allow-list is a best-effort snapshot read outside any lock (TOCTOU
* accepted, same as onStoreDocument): worst case one embed is transiently kept
* or dropped; it converges on the next guard/store, with no auth bypass.
*
* Loop-safety: the corrective applyUpdate has a null origin, so the re-fired
* onChange carries no userId and is not rescheduled; and after a strip no
* htmlEmbed remains, so a subsequent guard fire is a cheap no-op (the
* hasHtmlEmbedNode early-exit). NEVER throws — an unhandled rejection in a timer
* would crash the process — so the whole body is wrapped in try/catch.
*/
private async guardHtmlEmbed(
documentName: string,
document: Y.Doc,
context: any,
): Promise<void> {
// Defensive: ensure no stale timer entry survives for this document.
this.htmlEmbedGuardTimers.delete(documentName);
try {
// Re-check defensively: onChange only schedules for non-admins, but if an
// admin/owner somehow reaches here, the embed is authored content — do
// nothing (onStoreDocument's toggle-AND-admin gate handles persistence).
if (canAuthorHtmlEmbed(context?.user?.role)) {
return;
}
// ---- ASYNC PHASE: do ALL awaits up front, before touching the ydoc. ----
// Resolve the workspace toggle exactly as onStoreDocument does. When OFF we
// strip everything; when ON we use the preserve logic (keep admin-vetted
// embeds, strip only the non-admin's newly-introduced ones).
const enabled = isHtmlEmbedFeatureEnabled(
(await this.workspaceRepo.findById(context?.user?.workspaceId))
?.settings,
);
// The allow-list (admin-vetted sources already in the persisted content).
// null => strip ALL (toggle OFF). Read here, BEFORE the synchronous block,
// so no await sits between the doc snapshot and the doc write.
let allowed: Set<string> | null = null;
if (enabled !== false) {
const prior = await this.pageRepo.findById(getPageId(documentName), {
includeContent: true,
});
allowed = collectHtmlEmbedSources(prior?.content);
}
// The awaits above may have let the document be unloaded/destroyed. If so,
// bail — mutating a destroyed doc is pointless and could throw (the
// try/catch is the ultimate safety net regardless).
if ((document as { isDestroyed?: boolean }).isDestroyed) {
return;
}
// ---- SYNCHRONOUS PHASE: snapshot -> strip -> reflect, NO await here. ----
// Because there is no await between fromYdoc and applyUpdate, no inbound
// Yjs update can interleave, so a concurrent edit cannot be lost.
const json = TiptapTransformer.fromYdoc(document, 'default');
// Cheap exit: nothing to guard if the doc has no embed at all. This is also
// why a post-strip re-fire is a no-op (loop-safe).
if (!hasHtmlEmbedNode(json)) {
return;
}
const strippedJson =
allowed === null
? stripHtmlEmbedNodes(json)
: stripDisallowedHtmlEmbedNodes(json, allowed);
// Nothing was stripped (e.g. the only embed is an admin-vetted one) — do
// not churn the shared ydoc for all clients.
if (isDeepStrictEqual(strippedJson, json)) {
return;
}
// Reflect the stripped content back into the shared ydoc EXACTLY as
// onStoreDocument does, so the node is removed for all connected clients,
// not just on the eventual persist. This re-encode broadcasts the cleaned
// state; after it hasHtmlEmbedNode is false, so any later guard fire is a
// cheap no-op (loop-safe).
const fragment = document.getXmlFragment('default');
if (fragment.length > 0) {
fragment.delete(0, fragment.length);
}
const cleanDoc = TiptapTransformer.toYdoc(
strippedJson,
'default',
tiptapExtensions,
);
Y.applyUpdate(document, Y.encodeStateAsUpdate(cleanDoc));
this.logger.warn(
`Stripping htmlEmbed node(s) via early onChange guard by user ${context?.user?.id} on ${documentName}`,
);
} catch (err) {
// NEVER rethrow out of a timer-scheduled call.
this.logger.error(
`Early htmlEmbed guard failed on ${documentName}`,
err,
);
}
}
async afterUnloadDocument(data: afterUnloadDocumentPayload) {
const documentName = data.documentName;
this.contributors.delete(documentName);
this.agentTouched.delete(documentName);
// Drop the cached toggle for this document so a reload re-resolves it (and
// picks up a mid-session admin toggle flip).
this.htmlEmbedToggleByDoc.delete(documentName);
// Clear any pending early-guard timer so it cannot fire after the document
// is unloaded (leak / use-after-unload prevention).
const timer = this.htmlEmbedGuardTimers.get(documentName);
if (timer) {
clearTimeout(timer);
this.htmlEmbedGuardTimers.delete(documentName);
}
}
private consumeContributors(documentName: string): string[] {

View File

@@ -1,8 +1,10 @@
import {
canAuthorHtmlEmbed,
collectHtmlEmbedSources,
hasHtmlEmbedNode,
htmlEmbedAllowed,
isHtmlEmbedFeatureEnabled,
stripDisallowedHtmlEmbedNodes,
stripHtmlEmbedNodes,
} from './html-embed.util';
import { htmlToJson, jsonToHtml } from '../../../collaboration/collaboration.util';
@@ -93,6 +95,17 @@ describe('stripHtmlEmbedNodes', () => {
expect(result).toEqual(doc);
});
it('neutralizes a root node that is itself an htmlEmbed', () => {
// Defensive: the PM root is always a `doc`, so this is unreachable in normal
// use, but the helper must still never return a bare htmlEmbed.
const root = {
type: 'htmlEmbed',
attrs: { source: '<script>alert(1)</script>' },
};
const result = stripHtmlEmbedNodes(root);
expect(hasHtmlEmbedNode(result)).toBe(false);
});
it('strips a deeply nested htmlEmbed (3+ levels: callout > column > paragraph-sibling)', () => {
// htmlEmbed sits as a sibling of a paragraph, nested four containers deep.
const doc = {
@@ -158,6 +171,169 @@ describe('stripHtmlEmbedNodes', () => {
});
});
describe('collectHtmlEmbedSources', () => {
it('collects the source of every htmlEmbed node, including nested ones', () => {
const doc = {
type: 'doc',
content: [
{ type: 'htmlEmbed', attrs: { source: '<b>top</b>' } },
{
type: 'columns',
content: [
{
type: 'column',
content: [
{ type: 'htmlEmbed', attrs: { source: '<i>nested</i>' } },
{ type: 'paragraph', content: [{ type: 'text', text: 'x' }] },
],
},
],
},
],
};
const sources = collectHtmlEmbedSources(doc);
expect(sources).toEqual(new Set(['<b>top</b>', '<i>nested</i>']));
});
it('returns an empty set for a doc with no embeds', () => {
const doc = {
type: 'doc',
content: [{ type: 'paragraph', content: [{ type: 'text', text: 'hi' }] }],
};
expect(collectHtmlEmbedSources(doc).size).toBe(0);
});
it('gracefully skips embeds with absent attrs or non-string source', () => {
const doc = {
type: 'doc',
content: [
{ type: 'htmlEmbed' }, // no attrs
{ type: 'htmlEmbed', attrs: {} }, // no source
{ type: 'htmlEmbed', attrs: { source: 42 } }, // non-string
{ type: 'htmlEmbed', attrs: { source: '<ok/>' } },
],
};
expect(collectHtmlEmbedSources(doc)).toEqual(new Set(['<ok/>']));
});
it('returns an empty set for non-object input', () => {
expect(collectHtmlEmbedSources(null).size).toBe(0);
expect(collectHtmlEmbedSources(undefined).size).toBe(0);
expect(collectHtmlEmbedSources('x' as any).size).toBe(0);
});
});
describe('stripDisallowedHtmlEmbedNodes', () => {
it('keeps an embed whose source is allowed and removes the rest', () => {
const doc = {
type: 'doc',
content: [
{ type: 'htmlEmbed', attrs: { source: '<vetted/>' } },
{ type: 'htmlEmbed', attrs: { source: '<new-evil/>' } },
{ type: 'paragraph', content: [{ type: 'text', text: 'keep' }] },
],
};
const result = stripDisallowedHtmlEmbedNodes(doc, new Set(['<vetted/>']));
expect(collectHtmlEmbedSources(result)).toEqual(new Set(['<vetted/>']));
// The allowed embed and the paragraph survive; the new embed is gone.
expect(result.content).toHaveLength(2);
expect(result.content[0].attrs.source).toBe('<vetted/>');
expect(result.content[1].type).toBe('paragraph');
});
it('keeps BOTH embeds when two nodes share the same allowed source', () => {
// Source-identity semantics: identity is the raw `attrs.source`, so a
// non-admin who duplicates an existing admin-vetted source keeps both copies.
// This is intended — the raw HTML is already vetted, so a duplicate is safe.
const doc = {
type: 'doc',
content: [
{ type: 'htmlEmbed', attrs: { source: '<vetted/>' } },
{ type: 'paragraph', content: [{ type: 'text', text: 'mid' }] },
{ type: 'htmlEmbed', attrs: { source: '<vetted/>' } },
],
};
const result = stripDisallowedHtmlEmbedNodes(doc, new Set(['<vetted/>']));
expect(hasHtmlEmbedNode(result)).toBe(true);
const embeds = result.content.filter(
(n: any) => n.type === 'htmlEmbed',
);
expect(embeds).toHaveLength(2);
expect(embeds.every((n: any) => n.attrs.source === '<vetted/>')).toBe(true);
});
it('removes a newly-introduced embed when nothing is allowed', () => {
const doc = {
type: 'doc',
content: [{ type: 'htmlEmbed', attrs: { source: '<new/>' } }],
};
const result = stripDisallowedHtmlEmbedNodes(doc, new Set());
expect(hasHtmlEmbedNode(result)).toBe(false);
});
it('filters nested embeds by the allow-list (e.g. inside columns)', () => {
const doc = {
type: 'doc',
content: [
{
type: 'columns',
content: [
{
type: 'column',
content: [
{ type: 'htmlEmbed', attrs: { source: '<vetted/>' } },
{ type: 'htmlEmbed', attrs: { source: '<new/>' } },
],
},
],
},
],
};
const result = stripDisallowedHtmlEmbedNodes(doc, new Set(['<vetted/>']));
const col = findFirstChild(result, 'column');
expect(col.content).toHaveLength(1);
expect(col.content[0].attrs.source).toBe('<vetted/>');
});
it('treats an embed with absent/non-string source as not allowed (stripped)', () => {
const doc = {
type: 'doc',
content: [
{ type: 'htmlEmbed' },
{ type: 'htmlEmbed', attrs: {} },
],
};
const result = stripDisallowedHtmlEmbedNodes(doc, new Set(['<vetted/>']));
expect(hasHtmlEmbedNode(result)).toBe(false);
});
it('does not mutate the input document', () => {
const doc = {
type: 'doc',
content: [{ type: 'htmlEmbed', attrs: { source: '<new/>' } }],
};
stripDisallowedHtmlEmbedNodes(doc, new Set());
expect(doc.content).toHaveLength(1);
expect(doc.content[0].type).toBe('htmlEmbed');
});
it('neutralizes a root node that is itself a disallowed htmlEmbed', () => {
const root = { type: 'htmlEmbed', attrs: { source: '<new/>' } };
const result = stripDisallowedHtmlEmbedNodes(root, new Set());
expect(hasHtmlEmbedNode(result)).toBe(false);
});
it('keeps a root node that is an allowed htmlEmbed (defensive branch)', () => {
const root = { type: 'htmlEmbed', attrs: { source: '<vetted/>' } };
const result = stripDisallowedHtmlEmbedNodes(root, new Set(['<vetted/>']));
expect(collectHtmlEmbedSources(result)).toEqual(new Set(['<vetted/>']));
});
it('returns non-object input unchanged', () => {
expect(stripDisallowedHtmlEmbedNodes(null as any, new Set())).toBeNull();
});
});
describe('hasHtmlEmbedNode (root/odd-shape detection)', () => {
it('returns true when the ROOT node itself is an htmlEmbed (not only a child)', () => {
const rootEmbed = { type: 'htmlEmbed', attrs: { source: '<script>r</script>' } };

View File

@@ -22,6 +22,15 @@ export function stripHtmlEmbedNodes<T = JSONContent>(pmJson: T): T {
const node = pmJson as unknown as JSONContent;
// Defensive root-type check: if the ROOT node is itself an htmlEmbed, the
// children-filtering below could never drop it, so a bare htmlEmbed would be
// returned as-is. This branch is unreachable in normal use (the PM document
// root is always a `doc`) and exists only to make the helper total — a bare
// htmlEmbed can never be returned by this function.
if (node.type === HTML_EMBED_NODE_NAME) {
return { type: 'doc', content: [] } as unknown as T;
}
if (Array.isArray(node.content)) {
const filtered: JSONContent[] = [];
for (const child of node.content) {
@@ -39,6 +48,107 @@ export function stripHtmlEmbedNodes<T = JSONContent>(pmJson: T): T {
return { ...node } as unknown as T;
}
/**
* Walk the document and collect a stable identity for every `htmlEmbed` node.
*
* The identity is the node's `attrs.source` string — the raw HTML the embed
* renders. Two embeds that render the exact same HTML are treated as the same
* identity. Used by the collab persist path to know which embeds are ALREADY
* present in the currently-persisted (admin-vetted) page content, so a later
* non-admin store can strip only NEWLY-introduced embeds while preserving the
* pre-existing admin-authored ones.
*
* Absent attrs or a non-string/absent `source` are skipped gracefully (such a
* node contributes no identity to the set).
*/
export function collectHtmlEmbedSources(pmJson: unknown): Set<string> {
const sources = new Set<string>();
const walk = (node: unknown): void => {
if (!node || typeof node !== 'object') {
return;
}
const n = node as JSONContent;
if (n.type === HTML_EMBED_NODE_NAME) {
const source = (n.attrs as Record<string, unknown> | undefined)?.source;
if (typeof source === 'string') {
sources.add(source);
}
}
if (Array.isArray(n.content)) {
for (const child of n.content) {
walk(child);
}
}
};
walk(pmJson);
return sources;
}
/**
* Like {@link stripHtmlEmbedNodes}, but KEEP any `htmlEmbed` node whose
* `attrs.source` is in `allowedSources`; remove the rest.
*
* Used on the collab persist path when the feature toggle is ON but the storing
* user is a NON-admin: `allowedSources` is the set of embed sources already
* present in the currently-persisted page content (admin-authored, already
* vetted). A non-admin therefore cannot ADD a new embed, but their unrelated
* edit also cannot destroy an admin's existing one.
*
* NOTE: identity is the raw source string, so a non-admin who COPIES an existing
* admin embed's exact source into a NEW location passes this check. That is
* acceptable — the source is already admin-vetted content present in the doc; no
* new untrusted HTML is introduced.
*
* Returns a NEW document; the input is not mutated. Same defensive root-type
* check pattern as {@link stripHtmlEmbedNodes}.
*/
export function stripDisallowedHtmlEmbedNodes<T = JSONContent>(
pmJson: T,
allowedSources: Set<string>,
): T {
if (!pmJson || typeof pmJson !== 'object') {
return pmJson;
}
const node = pmJson as unknown as JSONContent;
// Defensive root-type check (mirrors stripHtmlEmbedNodes): if the ROOT node is
// itself an htmlEmbed and its source is NOT allowed, the children-filtering
// below could never drop it, so neutralize it here. Unreachable in normal use
// (the PM document root is always a `doc`).
if (node.type === HTML_EMBED_NODE_NAME) {
const source = (node.attrs as Record<string, unknown> | undefined)?.source;
if (typeof source === 'string' && allowedSources.has(source)) {
return { ...node } as unknown as T;
}
return { type: 'doc', content: [] } as unknown as T;
}
if (Array.isArray(node.content)) {
const filtered: JSONContent[] = [];
for (const child of node.content) {
// Drop a disallowed htmlEmbed child (newly introduced); keep an allowed
// one (already present in the persisted, admin-vetted content).
if (child && child.type === HTML_EMBED_NODE_NAME) {
const source = (child.attrs as Record<string, unknown> | undefined)
?.source;
if (typeof source === 'string' && allowedSources.has(source)) {
filtered.push({ ...child });
}
continue;
}
// Recurse so nested htmlEmbed nodes (e.g. inside columns/callouts) are
// also filtered by the same allow-list.
filtered.push(stripDisallowedHtmlEmbedNodes(child, allowedSources));
}
return { ...node, content: filtered } as unknown as T;
}
return { ...node } as unknown as T;
}
/**
* Returns true if the document contains at least one `htmlEmbed` node anywhere
* in its tree. Useful to decide whether a strip pass actually changed anything

View File

@@ -39,6 +39,11 @@ import {
} from '../casl/interfaces/space-ability.type';
import SpaceAbilityFactory from '../casl/abilities/space-ability.factory';
import { PageRepo } from '@docmost/db/repos/page/page.repo';
import { WorkspaceRepo } from '@docmost/db/repos/workspace/workspace.repo';
import {
isHtmlEmbedFeatureEnabled,
stripHtmlEmbedNodes,
} from '../../common/helpers/prosemirror/html-embed.util';
import { RecentPageDto } from './dto/recent-page.dto';
import { CreatedByUserDto } from './dto/created-by-user.dto';
import { DuplicatePageDto } from './dto/duplicate-page.dto';
@@ -63,6 +68,7 @@ export class PageController {
constructor(
private readonly pageService: PageService,
private readonly pageRepo: PageRepo,
private readonly workspaceRepo: WorkspaceRepo,
private readonly pageHistoryService: PageHistoryService,
private readonly spaceAbility: SpaceAbilityFactory,
private readonly pageAccessService: PageAccessService,
@@ -92,6 +98,18 @@ export class PageController {
const permissions = { canEdit, hasRestriction };
if (page.content) {
const workspace = await this.workspaceRepo.findById(page.workspaceId);
if (!isHtmlEmbedFeatureEnabled(workspace?.settings)) {
// Kill-switch: when the workspace feature is OFF, never serve raw
// htmlEmbed nodes on the read path (mirrors the public-share strip),
// so disabling the feature is an immediate, total kill-switch and not
// dependent on the page being re-saved. Admin-authored content only.
// Fail-closed: a missing workspace resolves to OFF and is stripped.
page.content = stripHtmlEmbedNodes(page.content) as any;
}
}
if (dto.format && dto.format !== 'json' && page.content) {
const contentOutput =
dto.format === 'markdown'
@@ -536,6 +554,16 @@ export class PageController {
await this.pageAccessService.validateCanView(page, user);
if (history.content) {
const workspace = await this.workspaceRepo.findById(page.workspaceId);
if (!isHtmlEmbedFeatureEnabled(workspace?.settings)) {
// Kill-switch: history snapshots are an authenticated read path too, so
// strip htmlEmbed when the workspace feature is OFF (same as /info and
// the public-share path). Fail-closed on a missing workspace.
history.content = stripHtmlEmbedNodes(history.content) as any;
}
}
return history;
}

View File

@@ -1,25 +1,31 @@
import { readFileSync } from 'node:fs';
import { join } from 'node:path';
import {
hasHtmlEmbedNode,
htmlEmbedAllowed,
stripHtmlEmbedNodes,
} from '../../../common/helpers/prosemirror/html-embed.util';
// Exercises the REAL PageService htmlEmbed admin gate on its two non-collab
// write paths: PageService.create() and PageService.duplicatePage(). Both build
// content/textContent/ydoc directly and persist, bypassing the collab
// onStoreDocument strip, so each must run the incoming document through the
// toggle-AND-admin gate (`htmlEmbedAllowed(featureEnabled, role)` -> if not
// allowed, `stripHtmlEmbedNodes`) BEFORE persisting.
//
// This spec constructs the REAL PageService with every constructor dep mocked,
// feeds content containing an `htmlEmbed`, calls the real method, and asserts on
// the PERSISTED content (captured at the repo insert / db insert boundary) that
// the embed was actually stripped (member/unknown role) or preserved
// (admin/owner + toggle ON). Mirrors the GOOD pattern in
// transclusion/spec/transclusion-unsync-html-embed.spec.ts.
//
// page.service.ts pulls in the collaboration gateway (a transitive ESM chain
// `lib0/decoding.js` that jest's transformIgnorePatterns does not transpile), so
// that single module is mocked away — it is never used on the create/duplicate
// gate paths.
jest.mock('../../../collaboration/collaboration.gateway', () => ({
CollaborationGateway: class {},
}));
// PageService.create() and duplicatePage() guards.
//
// page.service.ts cannot be unit-LOADED under the server's jest config (a
// transitive ESM dep, @sindresorhus/slugify, is not in transformIgnorePatterns),
// so we cover the two load-bearing properties at the strongest feasible layer:
//
// (1) BEHAVIOR — using the REAL html-embed helpers, replay the exact predicate
// each path applies: non-admin/unknown role -> strip, admin/owner -> keep.
//
// (2) IDENTITY — source-pin which role each path reads (create: the `callerRole`
// param threaded from the request; duplicate: `authUser.role`), so a
// refactor that drops the guard or reads the wrong role trips the test.
// This is what replaces the removed `applyAdminGate` stand-in for these
// two entrypoints.
import { PageService } from './page.service';
import { hasHtmlEmbedNode } from '../../../common/helpers/prosemirror/html-embed.util';
const WS = 'ws-1';
const SPACE = 'space-1';
const USER = 'u1';
const docWithEmbed = () => ({
type: 'doc',
@@ -29,74 +35,206 @@ const docWithEmbed = () => ({
],
});
// The real predicate both paths apply (see SECURITY blocks in page.service.ts):
// toggle AND admin.
function applyGate(
json: any,
featureEnabled: boolean,
role: string | null | undefined,
) {
if (!htmlEmbedAllowed(featureEnabled, role) && hasHtmlEmbedNode(json)) {
return stripHtmlEmbedNodes(json);
}
return json;
// Minimal chainable kysely stub. `nextPagePosition` (used by create) and
// duplicatePage's bulk insert go through `this.db`; only the calls those paths
// make need to resolve. `capturedInserts` collects every page row handed to
// `insertInto('pages').values(...)` so we can assert on the persisted content.
function buildDb(capturedInserts: any[]) {
const selectChain: any = {
select: () => selectChain,
selectAll: () => selectChain,
where: () => selectChain,
orderBy: () => selectChain,
limit: () => selectChain,
execute: async () => [],
executeTakeFirst: async () => undefined,
};
const db: any = {
selectFrom: () => selectChain,
insertInto: (table: string) => ({
values: (rows: any) => {
if (table === 'pages') {
for (const row of Array.isArray(rows) ? rows : [rows]) {
capturedInserts.push(row);
}
}
return { execute: async () => undefined };
},
}),
// executeTx -> db.transaction().execute(cb): run the callback with `db`
// itself acting as the transaction so any in-tx inserts are captured too.
transaction: () => ({ execute: async (cb: any) => cb(db) }),
};
return db;
}
describe('page create/duplicate gate decision (real helpers)', () => {
it('toggle ON + non-admin (member) strips', () => {
const result = applyGate(docWithEmbed(), true, 'member');
expect(hasHtmlEmbedNode(result)).toBe(false);
expect(result.content).toHaveLength(1);
expect(result.content[0].content[0].text).toBe('body');
});
// Build the REAL PageService with all 13 constructor deps mocked. `featureEnabled`
// drives the workspace toggle the gate reads via workspaceRepo.findById.
function buildService(opts: {
featureEnabled: boolean;
capturedInserts: any[];
rootPage?: any; // for duplicatePage
}) {
const { featureEnabled, capturedInserts } = opts;
it('toggle ON + unknown/empty role fails closed (strips)', () => {
for (const role of [null, undefined, 'viewer'] as const) {
expect(hasHtmlEmbedNode(applyGate(docWithEmbed(), true, role))).toBe(
false,
);
}
});
const pageRepo: any = {
findById: jest.fn(async () => null), // no parent page in create tests
// create() persists here; capture the row so we can inspect content.
insertPage: jest.fn(async (row: any) => {
capturedInserts.push(row);
return { id: 'new-page', slugId: 'slug-1', ...row };
}),
getPageAndDescendants: jest.fn(async () => [opts.rootPage].filter(Boolean)),
};
it('toggle ON + admin/owner keep', () => {
expect(hasHtmlEmbedNode(applyGate(docWithEmbed(), true, 'admin'))).toBe(
true,
);
expect(hasHtmlEmbedNode(applyGate(docWithEmbed(), true, 'owner'))).toBe(
true,
const pagePermissionRepo: any = {
// duplicatePage filters accessible pages; grant the root so it is copied.
filterAccessiblePageIds: jest.fn(async () =>
opts.rootPage ? [opts.rootPage.id] : [],
),
};
const workspaceRepo: any = {
findById: jest.fn(async () => ({
id: WS,
settings: { htmlEmbed: featureEnabled },
})),
};
const attachmentRepo: any = { findByIds: jest.fn(async () => []) };
const storageService: any = { copy: jest.fn(async () => undefined) };
const noopQueue: any = { add: jest.fn(async () => undefined) };
const eventEmitter: any = { emit: jest.fn() };
const collaborationGateway: any = {};
const watcherService: any = {};
// duplicatePage fires transclusion bulk inserts after persisting; they are
// best-effort (wrapped in try/catch) and irrelevant to the gate.
const transclusionService: any = {
insertTransclusionsForPages: jest.fn(async () => undefined),
insertReferencesForPages: jest.fn(async () => undefined),
insertTemplateReferencesForPages: jest.fn(async () => undefined),
};
const db = buildDb(capturedInserts);
const service = new PageService(
pageRepo,
pagePermissionRepo,
attachmentRepo,
db,
storageService,
noopQueue, // attachmentQueue
noopQueue, // aiQueue
noopQueue, // generalQueue
eventEmitter,
collaborationGateway,
watcherService,
transclusionService,
workspaceRepo,
);
return service;
}
describe('PageService.create htmlEmbed admin gate (real code)', () => {
// Run create() and return the content actually persisted via insertPage.
async function persistedContent(
featureEnabled: boolean,
callerRole: string | null | undefined,
) {
const capturedInserts: any[] = [];
const service = buildService({ featureEnabled, capturedInserts });
await service.create(
USER,
WS,
{
spaceId: SPACE,
title: 'p',
// 'json' format is used as-is by parseProsemirrorContent (passed to the
// real jsonToNode schema validation), so hand it the PM-JSON object.
content: docWithEmbed(),
format: 'json' as any,
} as any,
callerRole,
);
expect(capturedInserts).toHaveLength(1);
return capturedInserts[0].content;
}
it('toggle ON + member: persisted content has htmlEmbed stripped', async () => {
const content = await persistedContent(true, 'member');
expect(hasHtmlEmbedNode(content)).toBe(false);
// Non-embed content survives.
expect(JSON.stringify(content)).toContain('body');
});
it('toggle OFF strips for everyone (admin/owner/member)', () => {
for (const role of ['admin', 'owner', 'member'] as const) {
expect(hasHtmlEmbedNode(applyGate(docWithEmbed(), false, role))).toBe(
false,
);
it('toggle ON + admin: persisted content keeps the htmlEmbed', async () => {
expect(hasHtmlEmbedNode(await persistedContent(true, 'admin'))).toBe(true);
});
it('toggle ON + owner: persisted content keeps the htmlEmbed', async () => {
expect(hasHtmlEmbedNode(await persistedContent(true, 'owner'))).toBe(true);
});
it('toggle OFF + admin: stripped (feature disabled for everyone)', async () => {
expect(hasHtmlEmbedNode(await persistedContent(false, 'admin'))).toBe(false);
});
it('unknown/empty role: fails closed (stripped)', async () => {
for (const role of [undefined, null, 'viewer'] as const) {
expect(hasHtmlEmbedNode(await persistedContent(true, role))).toBe(false);
}
});
});
const SRC = readFileSync(join(__dirname, 'page.service.ts'), 'utf-8');
describe('PageService.duplicatePage htmlEmbed admin gate (real code)', () => {
// Duplicate a single source page that contains an embed and return the content
// persisted for the copy (captured at db.insertInto('pages').values(...)).
async function persistedContent(
featureEnabled: boolean,
role: string | null | undefined,
) {
const rootPage: any = {
id: 'src-page',
slugId: 'src-slug',
title: 'Source',
icon: null,
position: 'a0',
spaceId: SPACE,
workspaceId: WS,
parentPageId: null,
content: docWithEmbed(),
};
const capturedInserts: any[] = [];
const service = buildService({ featureEnabled, capturedInserts, rootPage });
const authUser: any = { id: USER, workspaceId: WS, role };
await service.duplicatePage(rootPage, undefined, authUser);
// The bulk insert is the page persist boundary; one source page -> one copy.
const pageRows = capturedInserts.filter((r) => r.content);
expect(pageRows.length).toBeGreaterThanOrEqual(1);
return pageRows[0].content;
}
describe('page create/duplicate gate identity is pinned (source contract)', () => {
it('create() gates on toggle AND the caller role param before deriving content/ydoc', () => {
// create() receives the caller's workspace role as `callerRole` and gates on
// the combined toggle-AND-admin predicate; the embed must be stripped BEFORE
// insertPage.
expect(SRC).toMatch(
/!htmlEmbedAllowed\(\s*htmlEmbedEnabled\s*,\s*callerRole\s*\)\s*&&\s*hasHtmlEmbedNode\(\s*prosemirrorJson\s*\)/,
);
expect(SRC).toContain('prosemirrorJson = stripHtmlEmbedNodes(prosemirrorJson)');
it('toggle ON + member: persisted copy has htmlEmbed stripped', async () => {
const content = await persistedContent(true, 'member');
expect(hasHtmlEmbedNode(content)).toBe(false);
expect(JSON.stringify(content)).toContain('body');
});
it('duplicatePage() gates on toggle AND the duplicating user role (authUser.role)', () => {
expect(SRC).toMatch(
/!htmlEmbedAllowed\(\s*htmlEmbedEnabled\s*,\s*authUser\.role\s*\)\s*&&\s*hasHtmlEmbedNode\(\s*prosemirrorJson\s*\)/,
);
it('toggle ON + admin: persisted copy keeps the htmlEmbed', async () => {
expect(hasHtmlEmbedNode(await persistedContent(true, 'admin'))).toBe(true);
});
it('both paths resolve the toggle from the workspace settings', () => {
expect(SRC).toContain('isHtmlEmbedFeatureEnabled(');
expect(SRC).toContain('this.workspaceRepo.findById(');
it('toggle ON + owner: persisted copy keeps the htmlEmbed', async () => {
expect(hasHtmlEmbedNode(await persistedContent(true, 'owner'))).toBe(true);
});
it('toggle OFF + admin: stripped (feature disabled for everyone)', async () => {
expect(hasHtmlEmbedNode(await persistedContent(false, 'admin'))).toBe(false);
});
it('unknown/empty role: fails closed (stripped)', async () => {
for (const role of [undefined, null, 'viewer'] as const) {
expect(hasHtmlEmbedNode(await persistedContent(true, role))).toBe(false);
}
});
});

View File

@@ -1,123 +1,266 @@
import { readFileSync } from 'node:fs';
import { join } from 'node:path';
import {
hasHtmlEmbedNode,
htmlEmbedAllowed,
stripHtmlEmbedNodes,
} from '../../../common/helpers/prosemirror/html-embed.util';
// Exercises the REAL htmlEmbed admin gate on the two import write paths:
//
// (1) ImportService.importPage() — single .html/.md upload
// (2) FileImportTaskService.processGenericImport() — zip / multi-file import
//
// Both build content/textContent/ydoc directly and persist (bypassing the
// collab onStoreDocument strip), so each must run the imported document through
// the toggle-AND-admin gate: resolve the importer via userRepo.findById, read
// the workspace toggle, then `htmlEmbedAllowed(enabled, role)` -> if not allowed,
// `stripHtmlEmbedNodes` BEFORE persisting.
//
// This spec constructs the REAL services with deps mocked, feeds an imported
// HTML document that contains an `htmlEmbed` div (parsed into a real htmlEmbed
// node by the REAL htmlToJson), runs the real method, and asserts the PERSISTED
// content (captured at the insert boundary) is stripped for a non-admin /
// missing user and preserved for admin/owner + toggle ON. Mirrors the GOOD
// pattern in transclusion/spec/transclusion-unsync-html-embed.spec.ts.
//
// Three modules are mocked away because they pull transitive ESM deps that
// jest's transformIgnorePatterns does not transpile (`lib0/decoding.js` via the
// collab gateway, `@sindresorhus/slugify` via import-formatter, `p-limit` via
// import-attachment). None of them participate in the gate decision:
// - import-formatter: contextless HTML cleanup + link rewriting; replaced with
// faithful passthroughs (the embed div has no href/iframe, so the real
// normalizer would leave it untouched anyway).
// - import-attachment: attachment rewriting; passthrough returns html as-is.
jest.mock('../../../collaboration/collaboration.gateway', () => ({
CollaborationGateway: class {},
}));
jest.mock('../utils/import-formatter', () => ({
normalizeImportHtml: () => {},
formatImportHtml: async (opts: any) => ({
html: opts.html,
backlinks: [],
pageIcon: undefined,
}),
}));
jest.mock('./import-attachment.service', () => ({
ImportAttachmentService: class {},
}));
// FAIL-CLOSED IDENTITY for the import write paths.
//
// import.service / file-import-task.service cannot be unit-LOADED under the
// server's jest config (a transitive ESM dep, @sindresorhus/slugify, is not in
// transformIgnorePatterns). So we cover the two load-bearing properties at the
// strongest feasible layer:
//
// (1) BEHAVIOR — using the REAL html-embed helpers, replay the exact gate
// predicate each entrypoint runs against the role resolved from
// userRepo.findById(...): a MISSING user (findById -> undefined) must fail
// closed (strip), and only 'admin'/'owner' keep the embed.
//
// (2) IDENTITY — source-pin which identity governs the gate so a refactor that
// swaps the lookup to the wrong user (e.g. the queue worker's caller) is
// caught: zip import resolves the role from `fileTask.creatorId`; single
// import from the request `userId`. NOT some ambient caller.
//
// If a guard is deleted/misplaced or the identity field changes, these break.
import { promises as fs } from 'node:fs';
import * as os from 'node:os';
import * as path from 'node:path';
import { ImportService } from './import.service';
import { FileImportTaskService } from './file-import-task.service';
import { hasHtmlEmbedNode } from '../../../common/helpers/prosemirror/html-embed.util';
const docWithEmbed = () => ({
type: 'doc',
content: [
{ type: 'paragraph', content: [{ type: 'text', text: 'imported body' }] },
{ type: 'htmlEmbed', attrs: { source: '<script>x</script>' } },
],
});
const WS = 'ws-1';
const SPACE = 'space-1';
const USER = 'importer-1';
// The real predicate both import entrypoints apply (see the SECURITY blocks in
// import.service.ts and file-import-task.service.ts): resolve the importer via
// userRepo.findById, then `!canAuthorHtmlEmbed(role) && hasHtmlEmbedNode(json)`.
function applyImportGate(
json: any,
featureEnabled: boolean,
importingUser: { role?: string } | undefined,
) {
if (
!htmlEmbedAllowed(featureEnabled, importingUser?.role) &&
hasHtmlEmbedNode(json)
) {
return stripHtmlEmbedNodes(json);
}
return json;
// HTML carrying the serialized htmlEmbed node. The REAL htmlToJson parses
// `<div data-type="htmlEmbed" data-source="BASE64">` into an htmlEmbed PM node
// (base64 below decodes to `<script>x</script>`).
const HTML_WITH_EMBED =
'<p>imported body</p>' +
'<div data-type="htmlEmbed" data-source="PHNjcmlwdD54PC9zY3JpcHQ+"></div>';
function workspaceRepoFor(featureEnabled: boolean) {
return {
findById: jest.fn(async () => ({
id: WS,
settings: { htmlEmbed: featureEnabled },
})),
};
}
describe('import gate fail-closed by toggle AND resolved-user role (real helpers)', () => {
it('toggle ON + missing user (userRepo.findById -> undefined) strips the embed', () => {
// findById returns undefined when the user/workspace pair does not resolve;
// undefined?.role is undefined -> htmlEmbedAllowed(true, undefined) === false.
const result = applyImportGate(docWithEmbed(), true, undefined);
expect(hasHtmlEmbedNode(result)).toBe(false);
// userRepo.findById resolves the importer's role (or undefined for a missing
// user -> fail closed).
function userRepoFor(user: { role?: string } | undefined) {
return { findById: jest.fn(async () => user) };
}
describe('ImportService.importPage htmlEmbed admin gate (real code)', () => {
// Run importPage with a single uploaded .html and return the persisted content
// captured at pageRepo.insertPage.
async function persistedContent(
featureEnabled: boolean,
user: { role?: string } | undefined,
) {
const captured: any[] = [];
const pageRepo: any = {
insertPage: jest.fn(async (row: any) => {
captured.push(row);
return { id: 'p1', slugId: 's1', ...row };
}),
};
// db is only used for getNewPagePosition (a select chain).
const selectChain: any = {
select: () => selectChain,
where: () => selectChain,
orderBy: () => selectChain,
limit: () => selectChain,
executeTakeFirst: async () => undefined,
};
const db: any = { selectFrom: () => selectChain };
const service = new ImportService(
pageRepo,
userRepoFor(user) as any,
{ putBuffer: jest.fn() } as any, // storageService (unused on this path)
db,
{ add: jest.fn() } as any, // fileTaskQueue (unused)
workspaceRepoFor(featureEnabled) as any,
);
const file: any = {
filename: 'doc.html',
toBuffer: async () => Buffer.from(HTML_WITH_EMBED, 'utf-8'),
};
await service.importPage(Promise.resolve(file), USER, SPACE, WS);
expect(captured).toHaveLength(1);
return captured[0].content;
}
it('toggle ON + member: persisted content has htmlEmbed stripped', async () => {
const content = await persistedContent(true, { role: 'member' });
expect(hasHtmlEmbedNode(content)).toBe(false);
expect(JSON.stringify(content)).toContain('imported body');
});
it("toggle ON + resolved role 'member' strips", () => {
it('toggle ON + missing user (findById -> undefined): fails closed (stripped)', async () => {
expect(hasHtmlEmbedNode(await persistedContent(true, undefined))).toBe(
false,
);
});
it('toggle ON + admin: persisted content keeps the htmlEmbed', async () => {
expect(hasHtmlEmbedNode(await persistedContent(true, { role: 'admin' }))).toBe(
true,
);
});
it('toggle ON + owner: persisted content keeps the htmlEmbed', async () => {
expect(hasHtmlEmbedNode(await persistedContent(true, { role: 'owner' }))).toBe(
true,
);
});
it('toggle OFF + admin: stripped (feature disabled for everyone)', async () => {
expect(
hasHtmlEmbedNode(applyImportGate(docWithEmbed(), true, { role: 'member' })),
hasHtmlEmbedNode(await persistedContent(false, { role: 'admin' })),
).toBe(false);
});
it("toggle ON + resolved role 'admin' keeps the embed", () => {
expect(
hasHtmlEmbedNode(applyImportGate(docWithEmbed(), true, { role: 'admin' })),
).toBe(true);
});
it("toggle ON + resolved role 'owner' keeps the embed", () => {
expect(
hasHtmlEmbedNode(applyImportGate(docWithEmbed(), true, { role: 'owner' })),
).toBe(true);
});
it('toggle OFF strips for every role (admin/owner/member)', () => {
for (const role of ['admin', 'owner', 'member'] as const) {
expect(
hasHtmlEmbedNode(applyImportGate(docWithEmbed(), false, { role })),
).toBe(false);
}
});
});
// Source-pin the identity each entrypoint feeds to userRepo.findById. These are
// the lines that decide WHOSE role governs the gate; pinning them means a
// refactor that points the lookup at the wrong user trips the test.
const SRC_DIR = join(__dirname);
describe('FileImportTaskService.processGenericImport htmlEmbed admin gate (real code)', () => {
let extractDir: string;
describe('import gate identity is pinned to the importer (source contract)', () => {
it('single import resolves the role from the request userId', () => {
const src = readFileSync(join(SRC_DIR, 'import.service.ts'), 'utf-8');
// The role lookup must key on the request `userId`, then gate on the role.
expect(src).toMatch(
/this\.userRepo\.findById\(\s*userId\s*,\s*workspaceId\s*\)/,
);
expect(src).toMatch(
/htmlEmbedAllowed\(\s*htmlEmbedEnabled\s*,\s*importingUser\?\.role\s*\)/,
);
// And the toggle is resolved from the workspace settings.
expect(src).toContain('isHtmlEmbedFeatureEnabled(');
// And the gate uses the real strip helper.
expect(src).toContain('stripHtmlEmbedNodes(prosemirrorJson)');
beforeEach(async () => {
// Real temp dir holding a single .html page that carries the embed; the
// method reads it from disk via fs.readFile.
extractDir = await fs.mkdtemp(path.join(os.tmpdir(), 'html-embed-import-'));
await fs.writeFile(path.join(extractDir, 'page.html'), HTML_WITH_EMBED);
});
it('zip import resolves the role from fileTask.creatorId (NOT the queue caller)', () => {
const src = readFileSync(
join(SRC_DIR, 'file-import-task.service.ts'),
'utf-8',
afterEach(async () => {
await fs.rm(extractDir, { recursive: true, force: true });
});
// Run processGenericImport over the temp dir and return the content persisted
// for the imported page (captured at trx.insertInto('pages').values(...)).
async function persistedContent(
featureEnabled: boolean,
user: { role?: string } | undefined,
) {
const captured: any[] = [];
const trxInsertChain = (table: string) => ({
values: (row: any) => {
if (table === 'pages') captured.push(row);
return { execute: async () => undefined };
},
});
const trx: any = { insertInto: trxInsertChain };
const db: any = {
// spaces lookup at the top of processGenericImport
selectFrom: () => ({
select: () => ({
where: () => ({ executeTakeFirst: async () => ({ slug: 'sp' }) }),
}),
}),
// executeTx -> db.transaction().execute(cb)
transaction: () => ({ execute: async (cb: any) => cb(trx) }),
};
// importService stub: only the real, gate-relevant helpers are used. We give
// it the REAL implementations by delegating to a real ImportService for
// processHTML/extractTitleAndRemoveHeading/createYdoc so the embed parse and
// strip path runs for real.
const realImport = new ImportService(
{} as any,
{} as any,
{} as any,
{} as any,
{} as any,
{} as any,
);
expect(src).toMatch(
/this\.userRepo\.findById\(\s*fileTask\.creatorId\s*,\s*fileTask\.workspaceId\s*,?\s*\)/,
const importService: any = {
processHTML: (html: string) => realImport.processHTML(html),
extractTitleAndRemoveHeading: (s: any) =>
realImport.extractTitleAndRemoveHeading(s),
createYdoc: (j: any) => realImport.createYdoc(j),
};
const importAttachmentService: any = {
// passthrough: no attachment rewriting, return html unchanged
processAttachments: jest.fn(async (opts: any) => opts.html),
};
const service = new FileImportTaskService(
{ putBuffer: jest.fn() } as any, // storageService
importService,
{ nextPagePosition: jest.fn(async () => 'a0') } as any, // pageService (position only)
{ insertBacklink: jest.fn() } as any, // backlinkRepo
db,
importAttachmentService,
userRepoFor(user) as any,
workspaceRepoFor(featureEnabled) as any,
{ emit: jest.fn() } as any, // eventEmitter
{ logBatchWithContext: jest.fn() } as any, // auditService
);
expect(src).toMatch(
/importerCanAuthorHtmlEmbed\s*=\s*htmlEmbedAllowed\(\s*htmlEmbedEnabled\s*,\s*importingUser\?\.role\s*,?\s*\)/,
const fileTask: any = {
id: 'task-1',
creatorId: USER,
workspaceId: WS,
spaceId: SPACE,
source: 'generic',
};
await service.processGenericImport({ extractDir, fileTask });
expect(captured.length).toBeGreaterThanOrEqual(1);
return captured[0].content;
}
it('toggle ON + member: persisted page has htmlEmbed stripped', async () => {
const content = await persistedContent(true, { role: 'member' });
expect(hasHtmlEmbedNode(content)).toBe(false);
expect(JSON.stringify(content)).toContain('imported body');
});
it('toggle ON + missing user (creatorId resolves to undefined): fails closed', async () => {
expect(hasHtmlEmbedNode(await persistedContent(true, undefined))).toBe(
false,
);
expect(src).toContain('isHtmlEmbedFeatureEnabled(');
expect(src).toContain('stripHtmlEmbedNodes(prosemirrorJson)');
});
it('toggle ON + admin: persisted page keeps the htmlEmbed', async () => {
expect(hasHtmlEmbedNode(await persistedContent(true, { role: 'admin' }))).toBe(
true,
);
});
it('toggle ON + owner: persisted page keeps the htmlEmbed', async () => {
expect(hasHtmlEmbedNode(await persistedContent(true, { role: 'owner' }))).toBe(
true,
);
});
it('toggle OFF + admin: stripped (feature disabled for everyone)', async () => {
expect(
hasHtmlEmbedNode(await persistedContent(false, { role: 'admin' })),
).toBe(false);
});
});