Merge remote-tracking branch 'gitea/develop' into fix/review-batch-2

# Conflicts:
#	AGENTS.md
#	CHANGELOG.md
#	README.md
#	apps/server/src/collaboration/collaboration.handler.ts
#	apps/server/src/common/helpers/prosemirror/html-embed.spec.ts
#	apps/server/src/common/helpers/prosemirror/html-embed.util.ts
#	apps/server/src/core/ai-chat/public-share-chat.service.ts
#	apps/server/src/core/ai-chat/public-share-chat.spec.ts
#	apps/server/src/core/ai-chat/public-share-workspace-limiter.ts
#	apps/server/src/core/page/services/page.service.ts
#	apps/server/src/core/page/transclusion/transclusion.service.ts
#	apps/server/src/integrations/import/services/file-import-task.service.ts
#	apps/server/src/integrations/import/services/import.service.ts
This commit is contained in:
claude code agent 227
2026-06-21 05:32:44 +03:00
65 changed files with 1448 additions and 2927 deletions

View File

@@ -29,6 +29,11 @@ PORT=3000
# `127.0.0.1, 10.0.0.0/8`
# TRUST_PROXY=
# APP_SECRET has a DUAL role: it signs JWTs AND derives the AES-256-GCM key that
# encrypts stored AI-provider credentials (API keys) at rest. CONSEQUENCE: if you
# change APP_SECRET after setup, every stored AI API key becomes undecryptable —
# you must re-enter them in AI settings — and all existing sessions/JWTs are
# invalidated. Choose it ONCE, keep it stable, and back it up alongside your DB.
# Must be at least 32 characters long. Generate one with: openssl rand -hex 32
APP_SECRET=REPLACE_WITH_LONG_SECRET
@@ -139,7 +144,12 @@ MCP_DOCMOST_PASSWORD=
#
# Backstop: a cluster-wide, sliding-window cap per workspace (IP-independent,
# keyed by the server-resolved workspace id) bounds the owner's bill even if the
# per-IP limit is fully evaded. It is a COST backstop, not an access control,
# and FAILS OPEN if Redis is unavailable. Override the hourly cap below
# per-IP limit is fully evaded. It is a COST backstop, not an access control, and
# FAILS CLOSED if Redis is unavailable (an optional assistant briefly going
# offline is safer than an unbounded bill). Override the hourly cap below
# (default: 300 calls per workspace per rolling hour).
# SHARE_AI_WORKSPACE_MAX_PER_HOUR=300
#
# Per-request output-token ceiling for the anonymous assistant (default: 512).
# Worst-case output per accepted call = agent steps (5) × this value.
# SHARE_AI_MAX_OUTPUT_TOKENS=512

3
.gitignore vendored
View File

@@ -42,3 +42,6 @@ lerna-debug.log*
.nx/installation
.nx/cache
.claude/worktrees/
# TypeScript incremental build artifacts
*.tsbuildinfo

View File

@@ -280,4 +280,4 @@ The git tag is the source of truth for the displayed version (UI reads `git desc
## Planning docs
`docs/*.md` hold design plans for in-progress / planned features (mobile app, offline sync, RAG improvements, streaming dictation). Arbitrary HTML embed has **shipped** (admin-gated by the `htmlEmbed` workspace toggle in Workspace settings) and is no longer a planning doc. `docs/backlog/*.md` track known issues / follow-ups (e.g. AI-chat review follow-ups). Consult the relevant plan before working on one of those areas.
`docs/*.md` hold design plans for in-progress / planned features (mobile app, offline sync, RAG improvements, voice dictation). Arbitrary HTML embed has **shipped** — it renders inside a sandboxed iframe and, when the `htmlEmbed` workspace toggle is on, is insertable by any member (no longer admin-only); turning the toggle off hides/stops serving existing embeds on public share pages. `docs/backlog/*.md` track known issues / follow-ups (e.g. AI-chat review follow-ups). Consult the relevant plan before working on one of those areas.

View File

@@ -10,6 +10,24 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
### Added
- Admin-only "Analytics / tracker" workspace setting: a raw HTML/JS snippet
injected into the `<head>` of public share pages only (for analytics such as
Google Analytics or Yandex.Metrika).
### Changed
- HTML embed blocks now render inside a sandboxed iframe (separate origin) and,
when the workspace HTML-embed toggle is on, can be inserted by any member
(previously admin-only). Turning the toggle off hides existing embeds and
stops serving them on public share pages.
- Removed the server-side role-based stripping of HTML-embed blocks from the
write paths (collab/REST/MCP, page create/duplicate, import, transclusion
unsync); sandboxing makes per-write gating unnecessary. The only remaining
server-side strip is the public-share read path, which still honors the
workspace HTML-embed toggle.
### Breaking Changes
- **MCP shared-token auth moved to its own header.** The `/mcp` shared guard

View File

@@ -102,6 +102,8 @@ community feature, with no enterprise license. Open it from the page header; the
-**AI chat** — built-in AI agent chat over your wiki content (read + write, RAG search, configurable provider, optional web access via external MCP).
-**Voice dictation** — microphone button in the AI agent chat and the page editor; audio is transcribed server-side (Whisper / OpenAI-compatible STT) via the workspace AI provider, with an admin toggle to show/hide it.
-**Page templates** — flag a page as a template and embed its whole content live into other pages; edits to the template propagate to every place it is inserted (whole-page transclusion on top of the existing synced blocks).
-**Public-share AI assistant** — anonymous visitors of a shared page can ask the AI agent, scoped strictly to that share's page tree (read-only, share-scoped search), behind a workspace toggle.
-**Footnotes** — academic-style footnotes: a numbered superscript reference inline (read it in place via a hover popover), with the note text living as a real, editable block at the bottom of the page; auto-numbered, collaboration-safe, and round-trips through Markdown export/import and the AI agent / MCP.
### In progress
@@ -110,12 +112,10 @@ community feature, with no enterprise license. Open it from the page header; the
### Planned
- 🔭 **Viewer comments** — let read-only viewers leave comments.
- 🔭 **Public-share AI assistant** — let anonymous visitors of a shared page ask the AI agent, scoped strictly to that share's page tree (read-only, share-scoped search), behind a workspace toggle. See [docs/public-share-assistant-plan.md](docs/public-share-assistant-plan.md).
- 🔭 **Password-protected pages** — protect individual pages / shares with a password.
- 🔭 **Windows / Linux app** — native desktop app for Windows and Linux.
- 🔭 **Mobile app** — mobile apps (iOS first, Android to follow), reusing the existing responsive web UI and editor via a Capacitor wrapper, with offline planned for later. See [docs/mobile-app-plan.md](docs/mobile-app-plan.md).
- 🔭 **Offline mode** — offline sync & PWA support.
- 🔭 **Footnotes** — academic-style footnotes: a numbered superscript reference inline (read it in place via a hover popover), with the note text living as a real, editable block at the bottom of the page; auto-numbered, collaboration-safe, and round-trips through Markdown export/import and the AI agent / MCP. See [docs/footnotes-plan.md](docs/footnotes-plan.md).
- 🔭 **Editor & UX improvements** — blocks inside tables (lists, to-do items), column layout, additional heading levels, highlight blocks, custom emoji in callouts, floating images, anchor links for page mentions, toggles (shared-page width, aside/sidebar, spellcheck, ligatures), sanitized space-tree export, and mentions in breadcrumbs.
## Getting started
@@ -158,6 +158,11 @@ the existing data directory is reused as-is:
start the new migrations apply on top of your existing schema (`CREATE EXTENSION vector` plus the
`page_embeddings` and AI tables); watch the logs for `Migration "..." executed successfully`.
> ⚠️ **Never change `APP_SECRET` after setup.** It does double duty: it signs JWTs *and* derives the
> AES-256-GCM key that encrypts stored AI-provider credentials (API keys). Rotating it makes every
> saved AI API key undecryptable (you'd have to re-enter them in AI settings) and invalidates all
> existing sessions. Pick it once, keep it stable, and back it up together with your database.
### Notes
- **Back up first.** Take a `pg_dump` before swapping — migrations apply in place, and the

View File

@@ -102,6 +102,9 @@ real-time-коллаборации Docmost, поэтому запись нико
-**Приложение для macOS** — нативное приложение для macOS ([gitmost-app](https://github.com/vvzvlad/gitmost-app)), встраивающее UI с вкладками для нескольких серверов.
-**AI-чат** — встроенный чат с AI-агентом по содержимому вики (чтение + запись, RAG-поиск, настраиваемый провайдер, опциональный доступ в интернет через внешние MCP).
-**Голосовая диктовка** — кнопка-микрофон в чате AI-агента и в редакторе страниц; аудио распознаётся на сервере (Whisper / OpenAI-совместимый STT) через AI-провайдер воркспейса, с тумблером админа для показа/скрытия.
-**Шаблоны страниц** — пометить страницу шаблоном и вставлять её содержимое живой ссылкой в другие страницы; правки шаблона распространяются на все места вставки (whole-page-транслюзия поверх существующих synced-блоков).
-**AI-ассистент на публичных шарах** — анонимный зритель расшаренной страницы может спросить AI-агента, который ищет строго по дереву этой шары (read-only, share-scoped поиск), за тумблером воркспейса.
-**Сноски** — сноски академического вида: нумерованная ссылка-надстрочник прямо в тексте (читается на месте во всплывающем окне по наведению), а текст сноски живёт реальным редактируемым блоком внизу страницы; авто-нумерация, безопасна для совместного редактирования, переживает экспорт/импорт Markdown и доступна AI-агенту / MCP.
### В процессе
@@ -109,14 +112,11 @@ real-time-коллаборации Docmost, поэтому запись нико
### В планах
- 🔭 **Шаблоны страниц** — пометить страницу шаблоном и вставлять её содержимое живой ссылкой в другие страницы; правки шаблона распространяются на все места вставки (whole-page-транслюзия поверх существующих synced-блоков). См. [docs/page-templates-plan.md](docs/page-templates-plan.md).
- 🔭 **Комментарии зрителей** — возможность комментировать для пользователей с доступом только на чтение.
- 🔭 **AI-ассистент на публичных шарах** — возможность анонимному зрителю расшаренной страницы спросить AI-агента, который ищет строго по дереву этой шары (read-only, share-scoped поиск), за тумблером воркспейса. См. [docs/public-share-assistant-plan.md](docs/public-share-assistant-plan.md).
- 🔭 **Защищённые паролем страницы** — защита отдельных страниц / шар паролем.
- 🔭 **Приложение для Windows / Linux** — нативное десктоп-приложение для Windows и Linux.
- 🔭 **Мобильное приложение** — мобильные приложения (iOS обязательно, Android как пойдёт) на базе существующей адаптивной веб-версии и редактора через обёртку Capacitor; оффлайн запланирован на будущее. См. [docs/mobile-app-plan.md](docs/mobile-app-plan.md).
- 🔭 **Офлайн-режим** — офлайн-синхронизация и поддержка PWA.
- 🔭 **Сноски** — сноски академического вида: нумерованная ссылка-надстрочник прямо в тексте (читается на месте во всплывающем окне по наведению), а текст сноски живёт реальным редактируемым блоком внизу страницы; авто-нумерация, безопасна для совместного редактирования, переживает экспорт/импорт Markdown и доступна AI-агенту / MCP. См. [docs/footnotes-plan.md](docs/footnotes-plan.md).
- 🔭 **Улучшения редактора и UX** — блоки внутри таблиц (списки, чек-листы), колоночная вёрстка, дополнительные уровни заголовков, highlight-блоки, кастомные эмодзи в callout-ах, плавающие изображения, anchor-ссылки на упоминания страниц, тоглы (ширина шары, aside/сайдбар, spellcheck, лигатуры), санитизация экспорта дерева спейса и mentions в хлебных крошках.
## С чего начать
@@ -159,6 +159,12 @@ dump/restore, существующий каталог данных переис
новые миграции применяются поверх вашей схемы (`CREATE EXTENSION vector` плюс таблицы
`page_embeddings` и AI-таблицы); следите в логах за строками `Migration "..." executed successfully`.
> ⚠️ **Никогда не меняйте `APP_SECRET` после установки.** Он выполняет двойную роль: подписывает JWT
> *и* служит материалом для ключа AES-256-GCM, которым шифруются сохранённые ключи AI-провайдеров
> (API-ключи). Смена секрета сделает все сохранённые AI-ключи нерасшифровываемыми (придётся вводить
> их заново в настройках AI) и инвалидирует все текущие сессии. Задайте его один раз, держите
> неизменным и бэкапьте вместе с базой данных.
## Возможности

View File

@@ -1145,6 +1145,7 @@
"Current context size": "Current context size",
"AI agent": "AI agent",
"AI agent is typing…": "AI agent is typing…",
"{{name}} is typing…": "{{name}} is typing…",
"Send": "Send",
"Stop": "Stop",
"Chat menu": "Chat menu",
@@ -1239,5 +1240,20 @@
"Reusable presets that shape the agent's behavior (and optionally its model). Picked when starting a new chat.": "Reusable presets that shape the agent's behavior (and optionally its model). Picked when starting a new chat.",
"No roles configured": "No roles configured",
"Delete role": "Delete role",
"Are you sure you want to delete this role?": "Are you sure you want to delete this role?"
"Are you sure you want to delete this role?": "Are you sure you want to delete this role?",
"HTML embed": "HTML embed",
"Edit HTML embed": "Edit HTML embed",
"HTML embed is disabled in this workspace": "HTML embed is disabled in this workspace",
"Click to add HTML / CSS / JS": "Click to add HTML / CSS / JS",
"This HTML/CSS/JS runs in a sandboxed frame and cannot access the viewer's session, cookies, or API.": "This HTML/CSS/JS runs in a sandboxed frame and cannot access the viewer's session, cookies, or API.",
"<script>...</script>": "<script>...</script>",
"Height (px, blank = auto)": "Height (px, blank = auto)",
"advanced": "advanced",
"Enable HTML embed": "Enable HTML embed",
"Allow members to insert raw HTML/CSS/JavaScript blocks. The block renders in a sandboxed frame and cannot access the viewer's session, cookies, or API. Off by default.": "Allow members to insert raw HTML/CSS/JavaScript blocks. The block renders in a sandboxed frame and cannot access the viewer's session, cookies, or API. Off by default.",
"When enabled, any member can insert an HTML embed block. The toggle just enables or disables the block type workspace-wide.": "When enabled, any member can insert an HTML embed block. The toggle just enables or disables the block type workspace-wide.",
"Embeds run inside a sandboxed iframe with a separate origin, so they cannot read or modify the page they are embedded in.": "Embeds run inside a sandboxed iframe with a separate origin, so they cannot read or modify the page they are embedded in.",
"Turning this off hides existing embeds (they render as a disabled placeholder) and stops serving them on public share pages.": "Turning this off hides existing embeds (they render as a disabled placeholder) and stops serving them on public share pages.",
"Analytics / tracker": "Analytics / tracker",
"Injected verbatim into the <head> of PUBLIC SHARE pages only (same-origin). For analytics snippets (Google Analytics, Yandex.Metrika, etc.). Admin only.": "Injected verbatim into the <head> of PUBLIC SHARE pages only (same-origin). For analytics snippets (Google Analytics, Yandex.Metrika, etc.). Admin only."
}

View File

@@ -668,6 +668,7 @@
"AI search": "Поиск ИИ",
"AI Answer": "Ответ ИИ",
"Ask AI": "Спросить ИИ",
"{{name}} is typing…": "{{name}} печатает…",
"AI is thinking...": "ИИ обрабатывает запрос...",
"Thinking": "Думаю",
"Ask a question...": "Задайте вопрос...",

View File

@@ -22,6 +22,11 @@ interface MessageItemProps {
* UUIDs/routes in the assistant's markdown don't leak as clickable links.
*/
neutralizeInternalLinks?: boolean;
/**
* Display name for the dimmed assistant label. Defaults to "AI agent" when
* absent; the public share passes the configured identity (agent role) name.
*/
assistantName?: string;
}
/**
@@ -40,6 +45,7 @@ export default function MessageItem({
message,
showCitations = true,
neutralizeInternalLinks = false,
assistantName,
}: MessageItemProps) {
const { t } = useTranslation();
const isUser = message.role === "user";
@@ -61,7 +67,7 @@ export default function MessageItem({
return (
<Box className={classes.messageRow}>
<Text size="xs" c="dimmed" mb={4}>
{t("AI agent")}
{assistantName?.trim() || t("AI agent")}
</Text>
{message.parts.map((part, index) => {
if (part.type === "text") {

View File

@@ -30,6 +30,12 @@ interface MessageListProps {
* UUIDs/routes don't leak as clickable links to anonymous readers.
*/
neutralizeInternalLinks?: boolean;
/**
* Display name for the assistant's dimmed row label and typing indicator.
* Defaults to "AI agent" when absent. The public share passes the configured
* identity (agent role) name; the internal chat omits it.
*/
assistantName?: string;
}
// Distance (px) from the bottom within which the viewport still counts as
@@ -67,6 +73,7 @@ export default function MessageList({
emptyState,
showCitations = true,
neutralizeInternalLinks = false,
assistantName,
}: MessageListProps) {
const { t } = useTranslation();
const viewportRef = useRef<HTMLDivElement>(null);
@@ -148,9 +155,10 @@ export default function MessageList({
message={message}
showCitations={showCitations}
neutralizeInternalLinks={neutralizeInternalLinks}
assistantName={assistantName}
/>
))}
{typing && <TypingIndicator />}
{typing && <TypingIndicator assistantName={assistantName} />}
</Stack>
</ScrollArea>
);

View File

@@ -2,22 +2,33 @@ import { Box, Group, Text } from "@mantine/core";
import { useTranslation } from "react-i18next";
import classes from "@/features/ai-chat/components/ai-chat.module.css";
interface TypingIndicatorProps {
/**
* Optional display name shown in the dimmed label and in the
* "{{name}} is typing…" line. When it is omitted (or blank after trimming),
* the component falls back to the translated "AI agent" label and the generic
* "AI agent is typing…" line. Intended for the public share, which is
* described as passing the configured identity (agent role) name.
*/
assistantName?: string;
}
/**
* Live "AI agent is typing…" placeholder shown while a turn is in flight but the
* latest assistant message has no visible content yet (no rendered text/tool
* parts). It covers the gap between sending and the first streamed token, and is
* replaced by the real assistant message once content starts arriving.
* Live "{{name}} is typing…" placeholder shown while a turn is in flight but the latest
* assistant message has no visible content yet (no rendered text/tool parts). It
* covers the gap between sending and the first streamed token, and is replaced by
* the real assistant message once content starts arriving.
*
* Mirrors the assistant row layout in MessageItem (the dimmed "AI agent" label),
* so it reads as the assistant's bubble taking shape.
* Mirrors the assistant row layout in MessageItem (the dimmed label), so it reads
* as the assistant's bubble taking shape. The label and typing line use the
* configured identity name when provided, otherwise the generic "AI agent".
*/
export default function TypingIndicator() {
export default function TypingIndicator({ assistantName }: TypingIndicatorProps) {
const { t } = useTranslation();
const name = assistantName?.trim();
return (
<Box className={classes.messageRow}>
<Text size="xs" c="dimmed" mb={4}>
{t("AI agent")}
{name || t("AI agent")}
</Text>
<Group gap={8} align="center">
<span className={classes.typingDots} aria-hidden="true">
@@ -26,7 +37,7 @@ export default function TypingIndicator() {
<span />
</span>
<Text size="sm" c="dimmed">
{t("AI agent is typing…")}
{name ? t("{{name}} is typing…", { name }) : t("AI agent is typing…")}
</Text>
</Group>
</Box>

View File

@@ -0,0 +1,63 @@
import { describe, it, expect } from "vitest";
import {
buildSandboxSrcdoc,
canEdit,
HTML_EMBED_HEIGHT_MESSAGE,
shouldRender,
} from "./html-embed-sandbox";
// Covers the srcdoc builder: the author's markup must survive untouched and the
// height-reporting bootstrap must follow it.
describe("buildSandboxSrcdoc", () => {
  it("embeds the user source verbatim", () => {
    const source = "<div id='x'>hello</div>";
    expect(buildSandboxSrcdoc(source)).toContain(source);
  });

  it("injects the height-postMessage bootstrap after the source", () => {
    const source = "<p>body</p>";
    const out = buildSandboxSrcdoc(source);
    const sourceAt = out.indexOf(source);
    const messageAt = out.indexOf(HTML_EMBED_HEIGHT_MESSAGE);
    // indexOf returns -1 on a miss, and `-1 < n` would let the ordering check
    // below pass vacuously — so require both markers to be present first.
    expect(sourceAt).toBeGreaterThanOrEqual(0);
    expect(messageAt).toBeGreaterThanOrEqual(0);
    // The bootstrap is appended AFTER the source.
    expect(sourceAt).toBeLessThan(messageAt);
    // It reports its height to the parent via postMessage with the agreed type.
    expect(out).toContain("parent.postMessage");
    expect(out).toContain(HTML_EMBED_HEIGHT_MESSAGE);
    // It observes resizes so the parent can keep the iframe sized to fit.
    expect(out).toContain("ResizeObserver");
    expect(out).toContain('addEventListener("load"');
  });

  it("handles an empty source (still injects the bootstrap)", () => {
    const out = buildSandboxSrcdoc("");
    expect(out).toContain(HTML_EMBED_HEIGHT_MESSAGE);
  });
});
// Render policy: read-only views always render; the editable editor follows
// the workspace toggle.
describe("shouldRender (render policy)", () => {
  it("read-only renders regardless of the workspace toggle", () => {
    // isEditable=false → the content was already gated before it reached us.
    const withToggleOff = shouldRender(false, false);
    const withToggleOn = shouldRender(false, true);
    expect(withToggleOff).toBe(true);
    expect(withToggleOn).toBe(true);
  });

  it("editable + toggle OFF does NOT render", () => {
    const rendered = shouldRender(true, false);
    expect(rendered).toBe(false);
  });

  it("editable + toggle ON renders", () => {
    const rendered = shouldRender(true, true);
    expect(rendered).toBe(true);
  });
});
// Edit policy: editing needs BOTH an editable editor and the workspace toggle;
// the helper takes no role argument at all.
describe("canEdit (edit policy)", () => {
  it("any member can edit when editable and the toggle is ON (no admin gate)", () => {
    const allowed = canEdit(true, true);
    expect(allowed).toBe(true);
  });

  it("cannot edit when the toggle is OFF", () => {
    const allowed = canEdit(true, false);
    expect(allowed).toBe(false);
  });

  it("cannot edit in read-only mode (no edit affordance)", () => {
    const allowed = canEdit(false, true);
    expect(allowed).toBe(false);
  });
});

View File

@@ -0,0 +1,100 @@
/**
* Pure helpers for the HTML embed node view. Kept out of the React component so
* the sandbox srcdoc builder and the render/edit policy can be unit-tested
* against a bare environment with no Tiptap/Mantine providers.
*/
/** `type` tag carried by the height reports the sandboxed iframe posts to its parent. */
export const HTML_EMBED_HEIGHT_MESSAGE = "gitmost-html-embed-height";

/**
 * Assemble the `srcdoc` markup for the sandboxed embed iframe.
 *
 * The result is the author's `source`, untouched, immediately followed by a
 * small inline <script> (the "height reporter"). The iframe that hosts this
 * markup is rendered with a sandbox lacking `allow-same-origin`, so the embed
 * runs in an opaque ("null") origin, away from the viewer's cookies, session
 * and API.
 *
 * What the height reporter does inside the frame:
 *  - measures the larger of the root element's and the body's `scrollHeight`;
 *  - posts `{ type: HTML_EMBED_HEIGHT_MESSAGE, height }` to `parent`, using
 *    "*" as the target origin;
 *  - skips reports that differ from the previously sent height by 1px or less,
 *    to damp the resize feedback loop between frame and parent;
 *  - coalesces bursts of triggers into a single animation frame when
 *    `requestAnimationFrame` exists;
 *  - fires once immediately, again on `load`, and on every `ResizeObserver`
 *    notification for the root element when that API is available.
 *
 * @param source Raw HTML/CSS/JS authored for the embed; a falsy value is
 *   treated as the empty string.
 * @returns The markup to hand to the iframe's `srcdoc` attribute.
 */
export function buildSandboxSrcdoc(source: string): string {
  // Serialise the tag once so it lands in the script as a quoted JS string literal.
  const messageTypeLiteral = JSON.stringify(HTML_EMBED_HEIGHT_MESSAGE);

  // NOTE: everything between the backticks is runtime text shipped into the
  // iframe — keep it byte-for-byte stable.
  const heightReporter = `
<script>
(function () {
var lastSent = -1;
var scheduled = false;
function measure() {
var doc = document.documentElement;
var body = document.body;
return Math.max(
doc ? doc.scrollHeight : 0,
body ? body.scrollHeight : 0
);
}
function flush() {
scheduled = false;
var height = measure();
// Only report when the height actually changed by more than 1px. This
// damps the iframe self-measure feedback loop: content sized to the iframe
// viewport would otherwise oscillate as the parent resizes the frame in
// response to each report.
if (Math.abs(height - lastSent) <= 1) return;
lastSent = height;
parent.postMessage(
{ type: ${messageTypeLiteral}, height: height },
"*"
);
}
function reportHeight() {
if (scheduled) return;
scheduled = true;
if (typeof requestAnimationFrame === "function") {
requestAnimationFrame(flush);
} else {
flush();
}
}
window.addEventListener("load", reportHeight);
// Report an initial height now (runs during parse, before load/images
// settle); the load handler and ResizeObserver refine it as content changes.
reportHeight();
if (typeof ResizeObserver !== "undefined") {
try {
var ro = new ResizeObserver(reportHeight);
ro.observe(document.documentElement);
} catch (e) {
// ResizeObserver unavailable/failed: the load handler still reports once.
}
}
})();
</script>`;

  const userMarkup = source || "";
  return userMarkup + heightReporter;
}
/**
 * Decide whether the embed's content should be rendered at all.
 *
 * - Read-only view (`isEditable` false, e.g. a public share): always render.
 *   The workspace flag is deliberately ignored here, because a viewer without
 *   a loaded workspace reads it as false; stripping embeds when the toggle is
 *   OFF is left to the server-side public-share read path.
 * - Editable editor: follow the per-workspace master toggle, so an author sees
 *   the inert placeholder while the feature is OFF.
 *
 * @param isEditable Whether the editor instance is in edit mode.
 * @param featureEnabled Whether the workspace HTML-embed toggle is ON.
 * @returns `true` when the embed should be rendered.
 */
export function shouldRender(
  isEditable: boolean,
  featureEnabled: boolean,
): boolean {
  if (!isEditable) {
    // Read-only: render whatever content was served.
    return true;
  }
  return featureEnabled;
}
/**
 * Decide whether the edit affordance should be offered for an embed.
 *
 * Editing requires both an editable editor and the workspace master toggle
 * being ON. No role is consulted: the embed is rendered inside a sandboxed
 * iframe, so authoring is open to any member.
 *
 * @param isEditable Whether the editor instance is in edit mode.
 * @param featureEnabled Whether the workspace HTML-embed toggle is ON.
 * @returns `true` when the edit controls should be shown.
 */
export function canEdit(isEditable: boolean, featureEnabled: boolean): boolean {
  return isEditable ? featureEnabled : false;
}

View File

@@ -2,11 +2,18 @@
position: relative;
}
/* The container the raw source is injected into. */
/* Fallback container used only for the empty, non-editor case. */
.htmlEmbedContent {
width: 100%;
}
/* The sandboxed iframe the embed source is rendered into. */
.htmlEmbedFrame {
display: block;
width: 100%;
border: none;
}
/* Edit affordance overlay, only shown while editing the document. */
.htmlEmbedToolbar {
position: absolute;

View File

@@ -1,85 +1,118 @@
import { NodeViewProps, NodeViewWrapper } from "@tiptap/react";
import React, { useCallback, useEffect, useRef, useState } from "react";
import React, {
useCallback,
useEffect,
useMemo,
useRef,
useState,
} from "react";
import clsx from "clsx";
import {
ActionIcon,
Button,
Group,
Modal,
NumberInput,
Text,
Textarea,
} from "@mantine/core";
import { IconCode, IconEdit } from "@tabler/icons-react";
import { useTranslation } from "react-i18next";
import { useAtomValue } from "jotai";
import useUserRole from "@/hooks/use-user-role.tsx";
import { workspaceAtom } from "@/features/user/atoms/current-user-atom.ts";
import classes from "./html-embed-view.module.css";
import {
buildSandboxSrcdoc,
canEdit as computeCanEdit,
renderRawHtml,
shouldExecute as computeShouldExecute,
} from "./render-raw-html.ts";
HTML_EMBED_HEIGHT_MESSAGE,
shouldRender as computeShouldRender,
} from "./html-embed-sandbox.ts";
// Height limits (px) for the embed iframe: the floor and ceiling keep a runaway
// embed from wrecking the page layout, and the default is what the frame uses
// until its first height report arrives.
const MIN_IFRAME_HEIGHT = 40;
const MAX_IFRAME_HEIGHT = 4000;
const DEFAULT_IFRAME_HEIGHT = 150;

/**
 * Pin a reported or author-configured height to the allowed iframe range.
 * Values below the floor become the floor, values above the ceiling become the
 * ceiling, and anything else (including NaN) is returned unchanged.
 */
const clampHeight = (h: number) => {
  if (h < MIN_IFRAME_HEIGHT) return MIN_IFRAME_HEIGHT;
  if (h > MAX_IFRAME_HEIGHT) return MAX_IFRAME_HEIGHT;
  return h;
};
export default function HtmlEmbedView(props: NodeViewProps) {
const { t } = useTranslation();
const { node, selected, updateAttributes, editor } = props;
const { source } = node.attrs as { source: string };
const { isAdmin } = useUserRole();
const { source, height } = node.attrs as {
source: string;
height: number | null;
};
// Defense in depth: only execute the raw HTML/JS when the workspace HTML embed
// feature toggle is ON. When OFF (the default), we render a neutral disabled
// placeholder and inject nothing — so turning the feature off neutralizes
// existing embeds at render time as well as on the next server-side save.
// The HTML embed renders inside a SANDBOXED iframe (no same-origin access), so
// the workspace toggle is a feature switch, not a security gate. When OFF (the
// default) we render a neutral placeholder in the editor and nothing else.
const workspace = useAtomValue(workspaceAtom);
const htmlEmbedEnabled = workspace?.settings?.htmlEmbed === true;
// Execution policy split by editor mode:
// - READ-ONLY / public-share view: the SERVER already decided whether to
// include the embed (it strips htmlEmbed from shared content when the
// workspace toggle is OFF). An anonymous viewer has no workspace and thus
// reads `htmlEmbedEnabled` as false, so we must NOT gate execution on it
// here — we execute exactly the `source` the server chose to serve.
// - EDITABLE editor (admin authoring): keep gating on the per-workspace
// toggle so an admin sees the inert placeholder when the feature is OFF.
const shouldExecute = computeShouldExecute(
const shouldRender = computeShouldRender(
editor.isEditable,
htmlEmbedEnabled,
);
const contentRef = useRef<HTMLDivElement | null>(null);
const iframeRef = useRef<HTMLIFrameElement | null>(null);
const [modalOpen, setModalOpen] = useState(false);
const [draft, setDraft] = useState<string>(source || "");
const [draftHeight, setDraftHeight] = useState<number | "">(height ?? "");
// (Re)render the raw source whenever it changes. This runs in BOTH the
// editable editor and the read-only / public-share editor (same NodeView),
// so trackers fire for readers too — that is the intended behaviour. When the
// feature toggle is OFF we clear the container and inject/execute nothing.
// True when the author pinned an explicit height; otherwise we auto-resize to
// the iframe's reported content height.
const hasFixedHeight = typeof height === "number" && Number.isFinite(height);
// Auto-resize height tracked in state. Seeded to the default and updated from
// the iframe's postMessage reports (see effect below) regardless of mode, so
// switching a fixed-height embed back to auto immediately reflects the last
// reported content height instead of staying pinned to the old fixed value.
const [autoHeight, setAutoHeight] = useState<number>(DEFAULT_IFRAME_HEIGHT);
const srcdoc = useMemo(() => buildSandboxSrcdoc(source || ""), [source]);
// Auto-resize: accept height messages ONLY from this iframe's own content
// window. The sandboxed srcdoc has an opaque ("null") origin, so we cannot
// match by event.origin — we match by event.source instead. We track the
// reported height even while a fixed height is in effect, so toggling back to
// auto shows the current content height with no iframe reload.
useEffect(() => {
if (!contentRef.current) return;
if (shouldExecute) {
renderRawHtml(contentRef.current, source || "");
} else {
contentRef.current.innerHTML = "";
function onMessage(event: MessageEvent) {
if (event.source !== iframeRef.current?.contentWindow) return;
const data = event.data as { type?: string; height?: number };
if (data?.type !== HTML_EMBED_HEIGHT_MESSAGE) return;
const next = Number(data.height);
if (!Number.isFinite(next)) return;
setAutoHeight(clampHeight(next));
}
}, [source, shouldExecute]);
window.addEventListener("message", onMessage);
return () => window.removeEventListener("message", onMessage);
}, []);
const effectiveHeight = hasFixedHeight ? clampHeight(height) : autoHeight;
const openEditor = useCallback(() => {
setDraft(source || "");
setDraftHeight(height ?? "");
setModalOpen(true);
}, [source]);
}, [source, height]);
const onSave = useCallback(() => {
if (editor.isEditable) {
updateAttributes({ source: draft });
updateAttributes({
source: draft,
height: draftHeight === "" ? null : Number(draftHeight),
});
}
setModalOpen(false);
}, [draft, editor.isEditable, updateAttributes]);
}, [draft, draftHeight, editor.isEditable, updateAttributes]);
// The edit affordance is only meaningful in edit mode, is restricted to admins
// (the server strips the node for non-admins anyway), and is offered only when
// the workspace feature toggle is ON.
const canEdit = computeCanEdit(editor.isEditable, isAdmin, htmlEmbedEnabled);
// The edit affordance is only meaningful in edit mode and is offered only when
// the workspace master toggle is ON. Any member can edit (sandboxed = safe).
const canEdit = computeCanEdit(editor.isEditable, htmlEmbedEnabled);
return (
<NodeViewWrapper
@@ -101,12 +134,12 @@ export default function HtmlEmbedView(props: NodeViewProps) {
</div>
)}
{!shouldExecute ? (
{!shouldRender ? (
// Feature disabled for this workspace AND we're in the editable editor:
// never inject/execute the source. Show a neutral placeholder so an
// existing embed is visibly inert for the authoring admin. Read-only /
// share viewers never hit this branch (`shouldExecute` is always true
// there) — they execute exactly the source the server chose to serve.
// render a neutral placeholder so an existing embed is visibly inert for
// the author. Read-only / share viewers never hit this branch
// (`shouldRender` is always true there) — they render exactly the
// source the server chose to serve.
<div className={classes.htmlEmbedPlaceholder}>
<IconCode size={18} />
<Text size="sm">
@@ -114,9 +147,18 @@ export default function HtmlEmbedView(props: NodeViewProps) {
</Text>
</div>
) : source ? (
// Raw HTML/CSS/JS rendered into the wiki origin. Scripts are re-created
// in renderRawHtml so they execute.
<div ref={contentRef} className={classes.htmlEmbedContent} />
// Raw HTML/CSS/JS rendered inside a sandboxed iframe (no same-origin):
// scripts run in an opaque origin and cannot touch the viewer's
// session/cookies/API.
<iframe
ref={iframeRef}
className={classes.htmlEmbedFrame}
sandbox="allow-scripts allow-popups allow-forms"
srcDoc={srcdoc}
title={t("HTML embed")}
referrerPolicy="no-referrer"
style={{ height: effectiveHeight }}
/>
) : canEdit ? (
<div className={classes.htmlEmbedPlaceholder} onClick={openEditor}>
<IconCode size={18} />
@@ -124,7 +166,7 @@ export default function HtmlEmbedView(props: NodeViewProps) {
</div>
) : (
// Empty source, non-editor: render nothing visible.
<div ref={contentRef} className={classes.htmlEmbedContent} />
<div className={classes.htmlEmbedContent} />
)}
<Modal
@@ -135,7 +177,7 @@ export default function HtmlEmbedView(props: NodeViewProps) {
>
<Text size="xs" c="dimmed" mb="xs">
{t(
"This HTML/CSS/JS runs in the page origin for everyone who views it. Admins only.",
"This HTML/CSS/JS runs in a sandboxed frame and cannot access the viewer's session, cookies, or API.",
)}
</Text>
<Textarea
@@ -148,6 +190,19 @@ export default function HtmlEmbedView(props: NodeViewProps) {
styles={{ input: { fontFamily: "monospace" } }}
data-autofocus
/>
<NumberInput
mt="md"
label={t("Height (px, blank = auto)")}
value={draftHeight}
onChange={(value) =>
setDraftHeight(
value === "" || value === null ? "" : Number(value),
)
}
min={MIN_IFRAME_HEIGHT}
max={MAX_IFRAME_HEIGHT}
allowDecimal={false}
/>
<Group justify="flex-end" mt="md">
<Button variant="default" onClick={() => setModalOpen(false)}>
{t("Cancel")}

View File

@@ -1,112 +0,0 @@
import { describe, it, expect, beforeEach, afterEach } from "vitest";
import { JSDOM } from "jsdom";
import { renderRawHtml, shouldExecute, canEdit } from "./render-raw-html";
// jsdom does NOT execute <script> nodes unless its instance was created with
// `runScripts: "dangerously"`. The whole point of renderRawHtml is to make
// re-created scripts run, so the execution tests drive a dedicated script-
// running JSDOM and pass it a container from THAT document (renderRawHtml uses
// `container.ownerDocument`, so it creates the fresh scripts in the running
// instance). The default vitest jsdom (no runScripts) is used for the
// structural and policy assertions.
// Every test in this suite gets a fresh script-running JSDOM: `container` is a
// <div> appended to that document's body, and the window is closed afterwards.
describe("renderRawHtml (script execution against a runScripts jsdom)", () => {
let dom: JSDOM;
let container: HTMLElement;
beforeEach(() => {
// `runScripts: "dangerously"` is what allows inserted <script> nodes to run.
dom = new JSDOM("<!doctype html><html><body></body></html>", {
runScripts: "dangerously",
});
container = dom.window.document.createElement("div");
dom.window.document.body.appendChild(container);
});
afterEach(() => {
dom.window.close();
});
it("re-creates and executes an inline <script> (observable side effect)", () => {
renderRawHtml(
container,
"<div>hello</div><script>window.__htmlEmbedFlag = true;</script>",
);
// The re-created inline script ran inside the jsdom window.
expect((dom.window as unknown as Record<string, unknown>).__htmlEmbedFlag).toBe(
true,
);
// The non-script markup is preserved.
expect(container.querySelector("div")?.textContent).toBe("hello");
});
// Only the attributes copied onto the re-created element are asserted here.
it("copies src/async/defer onto a re-created external <script src>", () => {
renderRawHtml(
container,
'<script src="https://example.com/t.js" async defer></script>',
);
const script = container.querySelector("script");
expect(script).not.toBeNull();
expect(script?.getAttribute("src")).toBe("https://example.com/t.js");
expect(script?.hasAttribute("async")).toBe(true);
expect(script?.hasAttribute("defer")).toBe(true);
});
it("clears the container when the source is empty", () => {
container.innerHTML = "<p>stale</p>";
renderRawHtml(container, "");
expect(container.innerHTML).toBe("");
});
// Two consecutive renders into the same container: the second must replace
// the first render's markup and run its own script.
it("clears prior content first on a re-render with new source", () => {
const win = dom.window as unknown as Record<string, unknown>;
renderRawHtml(
container,
"<span id='first'>one</span><script>window.__htmlEmbedCount = 1;</script>",
);
expect(win.__htmlEmbedCount).toBe(1);
expect(container.querySelector("#first")).not.toBeNull();
renderRawHtml(
container,
"<span id='second'>two</span><script>window.__htmlEmbedCount = 2;</script>",
);
// Prior content is gone; only the new render remains.
expect(container.querySelector("#first")).toBeNull();
expect(container.querySelector("#second")).not.toBeNull();
expect(win.__htmlEmbedCount).toBe(2);
});
});
// Truth table for shouldExecute; the calls below pass (isEditable, featureEnabled)
// in that order.
describe("shouldExecute (execution policy)", () => {
it("read-only executes regardless of the workspace toggle", () => {
// isEditable=false → the server already gated the content.
expect(shouldExecute(false, false)).toBe(true);
expect(shouldExecute(false, true)).toBe(true);
});
it("editable + toggle OFF does NOT execute", () => {
expect(shouldExecute(true, false)).toBe(false);
});
it("editable + toggle ON executes", () => {
expect(shouldExecute(true, true)).toBe(true);
});
});
// Truth table for canEdit; the calls below pass
// (isEditable, isAdmin, featureEnabled) in that order.
describe("canEdit (edit policy)", () => {
it("a member (non-admin) can never edit", () => {
// isAdmin=false must lose in both editable and read-only mode.
expect(canEdit(true, false, true)).toBe(false);
expect(canEdit(false, false, true)).toBe(false);
});
it("an admin with the toggle OFF cannot edit", () => {
expect(canEdit(true, true, false)).toBe(false);
});
it("an admin with the toggle ON in editable mode can edit", () => {
expect(canEdit(true, true, true)).toBe(true);
});
it("an admin in read-only mode cannot edit (no edit affordance)", () => {
expect(canEdit(false, true, true)).toBe(false);
});
});

View File

@@ -1,73 +0,0 @@
/**
* Pure DOM helpers for the HTML embed node view. Kept out of the React
* component so the script re-creation/execution mechanism and the execution/
* edit policy can be unit-tested against a bare jsdom container with no
* Tiptap/Mantine providers.
*/
/**
 * Render a raw HTML snippet into `container` and make its <script> tags run.
 *
 * Markup assigned through `innerHTML` is parsed, but its <script> elements
 * never execute (the HTML spec flags them as "already started"). A <script>
 * that is created programmatically and then inserted DOES execute, so every
 * parsed script is swapped for a freshly created element carrying the same
 * attributes and inline code. This is what lets tracker/analytics snippets
 * behave normally in the wiki origin (Variant C).
 *
 * @param container Element whose content is replaced by the rendered source.
 * @param source Raw HTML/CSS/JS markup; an empty value only clears `container`.
 */
export function renderRawHtml(container: HTMLElement, source: string): void {
  // Always start from an empty container so a re-render never stacks content.
  container.innerHTML = "";
  if (source) {
    container.innerHTML = source;

    // Build the replacements in the container's own document, so this works
    // for the live page and for a standalone jsdom instance alike instead of
    // relying on the ambient global `document`.
    const ownerDoc = container.ownerDocument;
    Array.from(container.querySelectorAll("script")).forEach((parsed) => {
      const runnable = ownerDoc.createElement("script");
      // Mirror every attribute (src, type, async, defer, data-*, ...).
      Array.from(parsed.attributes).forEach(({ name, value }) => {
        runnable.setAttribute(name, value);
      });
      // Carry over the inline code.
      runnable.text = parsed.textContent ?? "";
      // Swapping the node in place is what triggers execution.
      parsed.parentNode?.replaceChild(runnable, parsed);
    });
  }
}
/**
 * Decide whether an embed's source may be executed, based on the editor mode.
 *
 * - READ-ONLY / public-share view: always execute. The SERVER has already
 *   decided whether to include the embed (it strips htmlEmbed from shared
 *   content when the workspace toggle is OFF), and an anonymous viewer has no
 *   workspace and therefore reads `featureEnabled` as false, so the toggle
 *   must NOT be consulted here.
 * - EDITABLE editor (admin authoring): follow the per-workspace toggle, so the
 *   admin sees the inert placeholder while the feature is OFF.
 *
 * @param isEditable Whether the editor is in editable mode.
 * @param featureEnabled The workspace HTML embed toggle as seen by the client.
 * @returns `true` when the source should be executed.
 */
export function shouldExecute(
  isEditable: boolean,
  featureEnabled: boolean,
): boolean {
  if (!isEditable) {
    return true;
  }
  return featureEnabled;
}
/**
 * Decide whether the edit affordance is offered for an embed.
 *
 * Editing only makes sense in edit mode, is limited to admins (the server
 * strips the node for non-admins anyway), and requires the workspace feature
 * toggle to be ON — all three conditions must hold.
 *
 * @param isEditable Whether the editor is in editable mode.
 * @param isAdmin Whether the current user is a workspace admin.
 * @param featureEnabled The workspace HTML embed toggle.
 * @returns `true` only when every condition is satisfied.
 */
export function canEdit(
  isEditable: boolean,
  isAdmin: boolean,
  featureEnabled: boolean,
): boolean {
  if (!isEditable || !isAdmin) {
    return false;
  }
  return featureEnabled;
}

View File

@@ -623,10 +623,9 @@ const CommandGroups: SlashMenuGroupedItemsType = {
},
{
title: "HTML embed",
description: "Embed raw HTML, CSS and JavaScript (admins only).",
description: "Embed raw HTML, CSS and JavaScript (sandboxed).",
searchTerms: ["html", "css", "js", "javascript", "script", "tracker", "analytics", "raw", "embed"],
icon: IconCode,
adminOnly: true,
requiresHtmlEmbedFeature: true,
command: ({ editor, range }: CommandProps) => {
editor
@@ -795,30 +794,12 @@ const CommandGroups: SlashMenuGroupedItemsType = {
};
/**
 * Tell whether the persisted `currentUser` (the localStorage entry that
 * `currentUserAtom` writes) belongs to a workspace owner or admin.
 *
 * Used to hide admin-only slash items such as the raw HTML embed. This is a UI
 * gate only; the server independently strips admin-only nodes from non-admin
 * writes. A missing entry, unparsable JSON, or an unavailable storage all
 * count as "not an admin".
 */
function isCurrentUserAdmin(): boolean {
  try {
    const persisted = localStorage.getItem("currentUser");
    if (persisted) {
      const role = JSON.parse(persisted)?.user?.role;
      return ["owner", "admin"].includes(role);
    }
  } catch {
    // Storage access or JSON parsing failed: fall through to the safe default.
  }
  return false;
}
/**
* Read the workspace-level HTML embed feature toggle from the persisted
* Read the workspace-level HTML embed master toggle from the persisted
* `currentUser` payload (the same localStorage entry `currentUserAtom` writes,
* carrying `workspace.settings`). ABSENT/false => OFF (the default). The slash
* `getSuggestionItems` is a plain function (no React/atom context), so we read
* the persisted state the same way `isCurrentUserAdmin()` does. UI gate only;
* the server independently strips htmlEmbed from every non-allowed write.
* the persisted state directly. UI gate only; an anonymous public-share read is
* served already-stripped content by the server when the toggle is OFF.
*/
function isHtmlEmbedFeatureEnabled(): boolean {
try {
@@ -840,7 +821,6 @@ export const getSuggestionItems = ({
}): SlashMenuGroupedItemsType => {
const search = query.toLowerCase();
const filteredGroups: SlashMenuGroupedItemsType = {};
const isAdmin = isCurrentUserAdmin();
const htmlEmbedFeatureEnabled = isHtmlEmbedFeatureEnabled();
const fuzzyMatch = (query: string, target: string) => {
@@ -856,9 +836,7 @@ export const getSuggestionItems = ({
for (const [group, items] of Object.entries(CommandGroups)) {
const filteredItems = items.filter((item) => {
if (excludeItems?.has(item.title)) return false;
// Hide admin-only items (raw HTML embed) from non-admins.
if (item.adminOnly && !isAdmin) return false;
// Hide HTML-embed-gated items unless the workspace feature toggle is ON.
// Hide the HTML embed item unless the workspace master toggle is ON.
if (item.requiresHtmlEmbedFeature && !htmlEmbedFeatureEnabled)
return false;
return (

View File

@@ -21,13 +21,9 @@ export type SlashMenuItemType = {
searchTerms: string[];
command: (props: CommandProps) => void;
disable?: (editor: ReturnType<typeof useEditor>) => boolean;
// When true, the item is only offered to workspace admins/owners. This is a
// UI convenience only — the real authoring gate is enforced server-side.
adminOnly?: boolean;
// When true, the item is hidden unless the workspace HTML embed feature toggle
// is ON. Combined with adminOnly, the item shows only for admins in workspaces
// where the feature is enabled. UI gate only — the server strips htmlEmbed on
// every write where the toggle is OFF or the user is not an admin.
// When true, the item is hidden unless the workspace HTML embed master toggle
// is ON. UI gate only — for anonymous public-share reads the server serves
// already-stripped content when the toggle is OFF.
requiresHtmlEmbedFeature?: boolean;
};

View File

@@ -28,6 +28,8 @@ interface ShareAiWidgetProps {
shareId: string;
/** The page the reader currently has open (context for "this page"). */
pageId: string;
/** Display name of the configured assistant identity; falls back to 'AI agent' when absent. */
assistantName?: string;
}
/**
@@ -48,7 +50,11 @@ interface ShareAiWidgetProps {
* links (so internal UUIDs/auth-gated routes in the answer don't leak as
* clickable links), and a documentation-focused empty state.
*/
export default function ShareAiWidget({ shareId, pageId }: ShareAiWidgetProps) {
export default function ShareAiWidget({
shareId,
pageId,
assistantName,
}: ShareAiWidgetProps) {
const { t } = useTranslation();
const [open, setOpen] = useState(false);
const [input, setInput] = useState("");
@@ -153,6 +159,7 @@ export default function ShareAiWidget({ shareId, pageId }: ShareAiWidgetProps) {
<MessageList
messages={messages}
isStreaming={isStreaming}
assistantName={assistantName}
showCitations={false}
// Anonymous reader: neutralize internal/relative links in the
// assistant's markdown so internal UUIDs/auth-gated routes don't

View File

@@ -45,6 +45,10 @@ export interface ISharedPage extends IShare {
// Whether the anonymous public-share AI assistant is enabled for the
// workspace (server-resolved). Gates the "Ask AI" widget.
aiAssistant?: boolean;
// Display name of the configured assistant identity (agent role name), used
// to label the public-share chat. Null/absent when no identity is set →
// the widget falls back to the generic "AI agent" label.
aiAssistantName?: string | null;
}
export interface IShareForPage extends IShare {

View File

@@ -1,57 +1,32 @@
import { workspaceAtom } from "@/features/user/atoms/current-user-atom.ts";
import { useAtom } from "jotai";
import { useState } from "react";
import { updateWorkspace } from "@/features/workspace/services/workspace-service.ts";
import { useWorkspaceSetting } from "@/features/workspace/hooks/use-workspace-setting.ts";
import { Switch, Stack, Paper, Group, Text, List } from "@mantine/core";
import { notifications } from "@mantine/notifications";
import useUserRole from "@/hooks/use-user-role.tsx";
import { useTranslation } from "react-i18next";
/**
* Admin toggle for the workspace HTML embed feature.
* Workspace master toggle that enables/disables the HTML embed block type.
*
* SECURITY: when ON, workspace admins/owners can embed raw HTML/CSS/JS that
* EXECUTES in the wiki page origin for every reader (a deliberate stored-XSS
* surface, e.g. for analytics trackers). OFF by default. The server strips
* htmlEmbed nodes on every write where the toggle is OFF or the saver is not an
* admin, so this switch fully enables/disables the feature workspace-wide.
* The block renders inside a SANDBOXED iframe (no same-origin access), so it
* cannot touch the viewer's session/cookies/API — it is a feature switch, not a
* security gate. When ON, ANY member can insert the block. OFF by default; for
* anonymous public-share reads the server serves already-stripped content when
* the toggle is OFF. The toggle itself is managed by workspace admins.
*/
export default function HtmlEmbedSettings() {
const { t } = useTranslation();
const [workspace, setWorkspace] = useAtom(workspaceAtom);
const { workspace, isLoading, save } = useWorkspaceSetting("htmlEmbed");
const { isAdmin } = useUserRole();
const [checked, setChecked] = useState<boolean>(
workspace?.settings?.htmlEmbed ?? false,
);
const [isLoading, setIsLoading] = useState(false);
async function handleToggle(value: boolean) {
setIsLoading(true);
const previous = checked;
setChecked(value); // optimistic update
try {
const updated = await updateWorkspace({ htmlEmbed: value });
// Force settings.htmlEmbed to the new value so the atom is consistent even
// if the response shape omits it.
setWorkspace({
...updated,
settings: {
...updated.settings,
htmlEmbed: value,
},
});
notifications.show({ message: t("Updated successfully") });
} catch (err) {
console.log(err);
setChecked(previous); // revert on failure
notifications.show({
message: t("Failed to update data"),
color: "red",
});
} finally {
setIsLoading(false);
}
const ok = await save(value);
if (!ok) setChecked(previous); // revert on failure
}
return (
@@ -69,7 +44,7 @@ export default function HtmlEmbedSettings() {
<Switch
label={t("Enable HTML embed")}
description={t(
"Allow workspace admins to insert raw HTML/CSS/JavaScript that EXECUTES in the wiki page origin for everyone who views the page (a deliberate stored-XSS surface, e.g. for analytics trackers). Off by default.",
"Allow members to insert raw HTML/CSS/JavaScript blocks. The block renders in a sandboxed frame and cannot access the viewer's session, cookies, or API. Off by default.",
)}
checked={checked}
disabled={!isAdmin || isLoading}
@@ -79,17 +54,17 @@ export default function HtmlEmbedSettings() {
<List size="xs" c="dimmed" mt="md" spacing={4}>
<List.Item>
{t(
"Only workspace admins/owners can insert HTML embeds. Members never can: the editor option is hidden for them and the server strips the embed on save at every write path.",
"When enabled, any member can insert an HTML embed block. The toggle just enables or disables the block type workspace-wide.",
)}
</List.Item>
<List.Item>
{t(
"If a non-admin edits and saves a page that contains an admin's embed, that save strips the embed (fail-closed). An admin must re-add it.",
"Embeds run inside a sandboxed iframe with a separate origin, so they cannot read or modify the page they are embedded in.",
)}
</List.Item>
<List.Item>
{t(
"Turning this off strips existing embeds on their next save and immediately disables execution (existing embeds render as a disabled placeholder).",
"Turning this off hides existing embeds (they render as a disabled placeholder) and stops serving them on public share pages.",
)}
</List.Item>
</List>

View File

@@ -0,0 +1,76 @@
import { useState } from "react";
import { useWorkspaceSetting } from "@/features/workspace/hooks/use-workspace-setting.ts";
import {
Button,
Group,
Paper,
Stack,
Text,
Textarea,
} from "@mantine/core";
import useUserRole from "@/hooks/use-user-role.tsx";
import { useTranslation } from "react-i18next";
/**
 * Settings panel for the workspace analytics/tracker snippet.
 *
 * The saved text is injected VERBATIM into the <head> of PUBLIC SHARE pages
 * only, in the page's own (same-origin) context — the deliberate same-origin
 * surface for analytics snippets (Google Analytics, Yandex.Metrika, etc.).
 * Admin only: the workspace settings write is admin-gated server-side, and the
 * textarea and Save button are disabled for non-admins.
 */
export default function TrackerSettings() {
  const { t } = useTranslation();
  const { workspace, isLoading, save } = useWorkspaceSetting("trackerHead");
  const { isAdmin } = useUserRole();
  // Seeded once from the persisted setting; edits stay local until Save.
  const [value, setValue] = useState<string>(
    workspace?.settings?.trackerHead ?? "",
  );

  const sectionTitle = t("Analytics / tracker");
  // The input is locked for non-admins and while a save is in flight.
  const inputLocked = !isAdmin || isLoading;

  const handleSave = async () => {
    await save(value);
  };

  return (
    <Stack mt="sm">
      <Group justify="space-between" align="center">
        <Text fw={700} size="lg">
          {sectionTitle}
        </Text>
        <Text size="xs" c="dimmed" tt="uppercase" fw={600}>
          {t("advanced")}
        </Text>
      </Group>
      <Paper withBorder radius="md" p="lg">
        <Text size="xs" c="dimmed" mb="xs">
          {t(
            "Injected verbatim into the <head> of PUBLIC SHARE pages only (same-origin). For analytics snippets (Google Analytics, Yandex.Metrika, etc.). Admin only.",
          )}
        </Text>
        <Textarea
          autosize
          minRows={6}
          maxRows={20}
          aria-label={sectionTitle}
          value={value}
          onChange={(event) => setValue(event.currentTarget.value)}
          placeholder={t("<script>...</script>")}
          styles={{ input: { fontFamily: "monospace" } }}
          disabled={inputLocked}
        />
        <Group justify="flex-end" mt="md">
          <Button onClick={handleSave} loading={isLoading} disabled={!isAdmin}>
            {t("Save")}
          </Button>
        </Group>
      </Paper>
    </Stack>
  );
}

View File

@@ -0,0 +1,65 @@
import { workspaceAtom } from "@/features/user/atoms/current-user-atom.ts";
import { useAtom } from "jotai";
import { useCallback, useState } from "react";
import { updateWorkspace } from "@/features/workspace/services/workspace-service.ts";
import { IWorkspace } from "@/features/workspace/types/workspace.types.ts";
import { notifications } from "@mantine/notifications";
import { useTranslation } from "react-i18next";
/**
* Workspace setting keys that this hook can persist. Each key is both a
* write-only field on the update payload and a read field under
* `workspace.settings`, so the value type is derived from the settings shape.
*/
type WorkspaceSettingKey = "htmlEmbed" | "trackerHead";
// Value type for setting key K: the type of `settings[K]` on IWorkspace with
// null/undefined removed.
type WorkspaceSettingValue<K extends WorkspaceSettingKey> =
NonNullable<IWorkspace["settings"][K]>;
/**
 * Shared plumbing for persisting a single workspace setting, extracted from
 * the individual settings components.
 *
 * The hook owns the `isLoading` flag and the persist-then-merge flow: it calls
 * `updateWorkspace`, writes the response back into the workspace atom while
 * forcing `settings[key]` to the value that was just saved, and shows a
 * success or error notification. Callers layer their own interaction model
 * (optimistic toggle, edit-then-save, ...) on top.
 *
 * @param key The workspace setting this hook instance persists.
 * @returns The current workspace, the in-flight flag, and `save(value)`, which
 *   resolves to `true` on success and `false` on failure.
 */
export function useWorkspaceSetting<K extends WorkspaceSettingKey>(key: K) {
  const [workspace, setWorkspace] = useAtom(workspaceAtom);
  const { t } = useTranslation();
  const [isLoading, setIsLoading] = useState(false);

  const save = useCallback(
    async (value: WorkspaceSettingValue<K>): Promise<boolean> => {
      let succeeded = false;
      setIsLoading(true);
      try {
        const payload = { [key]: value } as Partial<IWorkspace>;
        const saved = await updateWorkspace(payload);
        // Pin settings[key] to the saved value so the atom stays consistent
        // even when the response shape leaves it out.
        setWorkspace({
          ...saved,
          settings: {
            ...saved.settings,
            [key]: value,
          },
        });
        notifications.show({ message: t("Updated successfully") });
        succeeded = true;
      } catch (err) {
        console.error(`Failed to update workspace setting "${key}"`, err);
        // Prefer the server-provided message; fall back to the generic one.
        const serverMessage = (err as any)?.response?.data?.message;
        notifications.show({
          message: serverMessage ?? t("Failed to update data"),
          color: "red",
        });
      } finally {
        setIsLoading(false);
      }
      return succeeded;
    },
    [key, setWorkspace, t],
  );

  return { workspace, isLoading, save };
}

View File

@@ -33,6 +33,9 @@ export interface IWorkspace {
// Write-only field for updateWorkspace({ htmlEmbed }). Read state lives at
// settings.htmlEmbed.
htmlEmbed?: boolean;
// Write-only field for updateWorkspace({ trackerHead }). Read state lives at
// settings.trackerHead.
trackerHead?: string;
}
export interface IWorkspaceSettings {
@@ -40,8 +43,13 @@ export interface IWorkspaceSettings {
sharing?: IWorkspaceSharingSettings;
api?: IWorkspaceApiSettings;
templates?: IWorkspaceTemplateSettings;
// Admin-only HTML embed feature toggle. ABSENT/false => OFF (default).
// HTML embed master toggle (enables/disables the block type). The block
// renders in a sandboxed iframe, so this is a feature switch, not a security
// gate. ABSENT/false => OFF (default).
htmlEmbed?: boolean;
// Admin-only analytics/tracker snippet injected into the <head> of public
// share pages (same-origin). ABSENT/empty => none.
trackerHead?: string;
}
export interface IWorkspaceApiSettings {

View File

@@ -2,6 +2,7 @@ import SettingsTitle from "@/components/settings/settings-title.tsx";
import WorkspaceNameForm from "@/features/workspace/components/settings/components/workspace-name-form";
import WorkspaceIcon from "@/features/workspace/components/settings/components/workspace-icon.tsx";
import HtmlEmbedSettings from "@/features/workspace/components/settings/components/html-embed-settings.tsx";
import TrackerSettings from "@/features/workspace/components/settings/components/tracker-settings.tsx";
import { useTranslation } from "react-i18next";
import { getAppName } from "@/lib/config.ts";
import { Helmet } from "react-helmet-async";
@@ -17,6 +18,7 @@ export default function WorkspaceSettings() {
<WorkspaceIcon />
<WorkspaceNameForm />
<HtmlEmbedSettings />
<TrackerSettings />
</>
);
}

View File

@@ -79,7 +79,11 @@ export default function SharedPage() {
{/* Anonymous "Ask AI" widget — only when the workspace enables the
public-share assistant (server-resolved flag on /shares/page-info). */}
{data?.aiAssistant && data.share?.id && data.page?.id && (
<ShareAiWidget shareId={data.share.id} pageId={data.page.id} />
<ShareAiWidget
shareId={data.share.id}
pageId={data.page.id}
assistantName={data.aiAssistantName ?? undefined}
/>
)}
</div>
);

View File

@@ -1,120 +0,0 @@
import * as Y from 'yjs';
import { TiptapTransformer } from '@hocuspocus/transformer';
import { CollaborationHandler } from './collaboration.handler';
import { hasHtmlEmbedNode } from '../common/helpers/prosemirror/html-embed.util';
// Exercises the REAL CollaborationHandler.updatePageContent admin gate (the
// REST/MCP/AI content-update entrypoint, used by the page update endpoint and
// the MCP/AI agent). updatePageContent reads `user?.role` and strips htmlEmbed
// BEFORE handing the json to withYdocConnection. We stub only
// withYdocConnection (which would otherwise open a real hocuspocus connection):
// the role-extraction (`user?.role`) + strip that run upstream of it are REAL
// production code. The 'replace' branch then runs the production
// TiptapTransformer.toYdoc on the gated json against a real Y.Doc, which we
// decode back to JSON and assert on. This replaces the re-implemented
// `applyAdminGate` stand-in for this entrypoint.
// Fixture factory: returns a fresh document on every call containing a
// paragraph, a two-column layout (the left column holds a nested htmlEmbed),
// and a top-level htmlEmbed.
const docWithEmbed = () => {
  const paragraph = (text: string) => ({
    type: 'paragraph',
    content: [{ type: 'text', text }],
  });
  const embed = (source: string) => ({
    type: 'htmlEmbed',
    attrs: { source },
  });
  const column = <T>(position: string, content: T[]) => ({
    type: 'column',
    attrs: { position },
    content,
  });

  return {
    type: 'doc',
    content: [
      paragraph('keep'),
      {
        type: 'columns',
        content: [
          column('left', [embed('<script>nested</script>'), paragraph('inner')]),
          column('right', [paragraph('r')]),
        ],
      },
      embed('<script>top</script>'),
    ],
  };
};
/**
 * Run the REAL updatePageContent('replace') with withYdocConnection stubbed
 * out, and return the content that ended up in the ydoc.
 *
 * The stub hands the production callback a real Y.Doc (`captureDoc`). The
 * production fn calls TiptapTransformer.toYdoc(<gated json>) and applies it to
 * that doc, so decoding the doc afterward yields exactly the gated content.
 *
 * @param role Workspace role placed on the editing user.
 * @param featureEnabled Value the stubbed workspace repo reports as
 *   `settings.htmlEmbed` (defaults to true).
 * @returns The captured doc decoded with TiptapTransformer.fromYdoc.
 */
async function gatedContentFor(
  role: string | null | undefined,
  featureEnabled = true,
) {
  // Workspace settings read used by the toggle-AND-admin gate.
  const workspaceRepo = {
    findById: jest.fn(async () => ({
      id: 'ws-1',
      settings: { htmlEmbed: featureEnabled },
    })),
  };
  const handler = new CollaborationHandler(workspaceRepo as any);

  const captureDoc = new Y.Doc();
  jest
    .spyOn(handler, 'withYdocConnection')
    .mockImplementation(async (_hp, _name, _ctx, fn: any) => {
      // The fn does: fragment.delete(0,len) then
      // Y.applyUpdate(doc, encodeStateAsUpdate(toYdoc(gatedJson))), so it must
      // receive a real Y.Doc target. Await it so that, should the callback
      // ever become async, the stub only resolves once the doc is written.
      await fn(captureDoc);
    });

  const handlers = handler.getHandlers({} as any);
  await handlers.updatePageContent('page-1', {
    prosemirrorJson: docWithEmbed(),
    operation: 'replace',
    user: { id: 'u1', role, workspaceId: 'ws-1' } as any,
  });

  return TiptapTransformer.fromYdoc(captureDoc, 'default');
}
// Role x toggle matrix for the gate. gatedContentFor(role, featureEnabled)
// defaults featureEnabled to true, so the first two cases run with the toggle ON.
describe('CollaborationHandler.updatePageContent htmlEmbed admin gate (real code)', () => {
it('non-admin (member): every htmlEmbed (top-level + nested) stripped before the ydoc', async () => {
const gated = await gatedContentFor('member');
expect(hasHtmlEmbedNode(gated)).toBe(false);
// Non-embed siblings survive.
const json = JSON.stringify(gated);
expect(json).toContain('keep');
expect(json).toContain('inner');
});
it('unknown/empty role: fails closed (stripped)', async () => {
// Missing roles and a role that is neither admin nor owner are all stripped.
for (const role of [undefined, null, 'viewer'] as const) {
expect(hasHtmlEmbedNode(await gatedContentFor(role))).toBe(false);
}
});
it('toggle ON + admin: htmlEmbed preserved', async () => {
expect(hasHtmlEmbedNode(await gatedContentFor('admin', true))).toBe(true);
});
it('toggle ON + owner: htmlEmbed preserved', async () => {
expect(hasHtmlEmbedNode(await gatedContentFor('owner', true))).toBe(true);
});
it('toggle OFF + admin: stripped (feature disabled for everyone)', async () => {
expect(hasHtmlEmbedNode(await gatedContentFor('admin', false))).toBe(false);
});
it('toggle OFF + member: stripped', async () => {
expect(hasHtmlEmbedNode(await gatedContentFor('member', false))).toBe(false);
});
});

View File

@@ -8,11 +8,6 @@ import {
import { setYjsMark, updateYjsMarkAttribute, YjsSelection } from './yjs.util';
import * as Y from 'yjs';
import { User } from '@docmost/db/types/entity.types';
import {
isHtmlEmbedFeatureEnabled,
stripHtmlEmbedIfNotAllowed,
} from '../common/helpers/prosemirror/html-embed.util';
import { WorkspaceRepo } from '@docmost/db/repos/workspace/workspace.repo';
export type CollabEventHandlers = ReturnType<
CollaborationHandler['getHandlers']
@@ -22,8 +17,6 @@ export type CollabEventHandlers = ReturnType<
export class CollaborationHandler {
private readonly logger = new Logger(CollaborationHandler.name);
constructor(private readonly workspaceRepo: WorkspaceRepo) {}
getHandlers(hocuspocus: Hocuspocus) {
return {
alterState: async (documentName: string, payload: { pageId: string }) => {
@@ -89,30 +82,9 @@ export class CollaborationHandler {
},
) => {
const { operation, user } = payload;
let { prosemirrorJson } = payload;
const { prosemirrorJson } = payload;
this.logger.debug('Updating page content via yjs', documentName);
// SECURITY (Variant C admin gate, REST/MCP/AI write path):
// updatePageContent is the server-side entrypoint used by the REST page
// update endpoint and by the MCP/AI agent. Raw `htmlEmbed` nodes execute
// arbitrary JS in every reader's browser, so a NON-admin caller must not
// be able to persist them here. If the editing user is not a workspace
// admin/owner, strip every htmlEmbed node before it reaches the ydoc.
// Toggle-AND-admin gate: htmlEmbed survives only when the workspace
// feature toggle is ON and the editing user is an admin/owner. OFF
// (default) => stripped for everyone.
const htmlEmbedEnabled = isHtmlEmbedFeatureEnabled(
(await this.workspaceRepo.findById(user?.workspaceId))?.settings,
);
prosemirrorJson = stripHtmlEmbedIfNotAllowed(prosemirrorJson, {
featureEnabled: htmlEmbedEnabled,
role: user?.role,
onStrip: () =>
this.logger.warn(
`Stripping htmlEmbed node(s) from update by user ${user?.id} on ${documentName}`,
),
});
await this.withYdocConnection(
hocuspocus,
documentName,

View File

@@ -1,456 +0,0 @@
import * as Y from 'yjs';
import { TiptapTransformer } from '@hocuspocus/transformer';
import { PersistenceExtension } from './persistence.extension';
import { tiptapExtensions } from '../collaboration.util';
import {
collectHtmlEmbedSources,
hasHtmlEmbedNode,
HTML_EMBED_NODE_NAME,
} from '../../common/helpers/prosemirror/html-embed.util';
// Exercises the REAL PersistenceExtension.onStoreDocument (the primary collab
// WebSocket write path) against a REAL ydoc, with thin repo/db/queue mocks.
// This replaces the prior re-implemented `applyAdminGate` stand-in for this
// entrypoint: if the role-extraction expression (`context?.user?.role`), the
// strip call, or the ydoc-rebuild branch is deleted/changed, these tests fail.
// Shared fixture for the onStoreDocument gate tests. It mixes ordinary node
// types (paragraphs, a two-column layout, a user mention, a table with one
// header row and one body row) with exactly TWO htmlEmbed nodes: one nested
// inside the left column and one at the top level of the doc. The tests rely
// on that count of two and on every non-embed node surviving a strip.
const RICH_DOC = {
type: 'doc',
content: [
{
type: 'paragraph',
content: [{ type: 'text', text: 'intro paragraph' }],
},
{
type: 'columns',
content: [
{
type: 'column',
attrs: { position: 'left' },
content: [
{
type: 'paragraph',
content: [
{ type: 'text', text: 'left col, mentioning ' },
{
type: 'mention',
attrs: {
id: 'mention-1',
label: 'Alice',
entityType: 'user',
entityId: 'user-123',
creatorId: 'creator-1',
},
},
],
},
// Nested embed inside a column — must be stripped recursively.
{
type: HTML_EMBED_NODE_NAME,
attrs: { source: '<script>nested()</script>' },
},
],
},
{
type: 'column',
attrs: { position: 'right' },
content: [
{
type: 'table',
content: [
{
type: 'tableRow',
content: [
{
type: 'tableHeader',
attrs: { colspan: 1, rowspan: 1 },
content: [
{ type: 'paragraph', content: [{ type: 'text', text: 'H' }] },
],
},
],
},
{
type: 'tableRow',
content: [
{
type: 'tableCell',
attrs: { colspan: 1, rowspan: 1 },
content: [
{ type: 'paragraph', content: [{ type: 'text', text: 'cell' }] },
],
},
],
},
],
},
],
},
],
},
// Top-level embed — must be stripped.
{
type: HTML_EMBED_NODE_NAME,
attrs: { source: '<script>top()</script>' },
},
{
type: 'paragraph',
content: [{ type: 'text', text: 'outro paragraph' }],
},
],
};
/**
 * Converts a document JSON tree into a Y.Doc by handing it to
 * TiptapTransformer.toYdoc, stored under the 'default' field and built with
 * the shared tiptapExtensions.
 */
function buildYdoc(json: any): Y.Doc {
  const fieldName = 'default';
  const ydoc = TiptapTransformer.toYdoc(json, fieldName, tiptapExtensions);
  return ydoc;
}
// Tally how many nodes of each `type` appear anywhere in the tree, visiting
// nodes in document (pre-order) sequence. EVERY typed node is counted —
// htmlEmbed included — so callers that want to compare only the non-embed
// types must skip the htmlEmbed key themselves. Non-object entries and nodes
// without a truthy `type` contribute nothing.
function nodeTypeCounts(json: any): Record<string, number> {
  const tally: Record<string, number> = {};
  // Explicit stack instead of recursion; children are pushed in reverse so
  // they are popped (and therefore counted) in their original order.
  const pending: any[] = [json];
  while (pending.length > 0) {
    const node = pending.pop();
    if (!node || typeof node !== 'object') continue;
    if (node.type) tally[node.type] = (tally[node.type] ?? 0) + 1;
    const children = node.content;
    if (Array.isArray(children)) {
      for (let i = children.length - 1; i >= 0; i--) {
        pending.push(children[i]);
      }
    }
  }
  return tally;
}
/**
 * Wires a real PersistenceExtension to in-memory stand-ins for its
 * collaborators so onStoreDocument / guardHtmlEmbed can run without a database
 * or queues.
 *
 * @param featureEnabled value served as `settings.htmlEmbed` by the mocked
 *   workspace lookup (defaults to true).
 * @param priorContent content reported as already persisted for the page;
 *   when omitted an empty doc is used.
 * @returns the extension under test, a `captured` holder that receives the
 *   `content` passed to `pageRepo.updatePage`, and the page repo mock itself.
 */
function buildExtension(featureEnabled = true, priorContent?: any) {
  const captured: { content?: any } = {};

  // Factory for the many "async, resolves to undefined" mocks used below.
  const asyncNoopMock = () => jest.fn(async () => undefined);

  // Page row as the repo would return it. `content` is what is "currently
  // persisted": an empty doc unless the caller supplies prior content (e.g. a
  // doc that already holds an admin-authored embed).
  const storedPage = {
    id: 'page-1',
    slugId: 'slug-1',
    spaceId: 'space-1',
    workspaceId: 'ws-1',
    creatorId: 'creator-1',
    contributorIds: [],
    content: priorContent ?? { type: 'doc', content: [] },
    createdAt: new Date(),
    lastUpdatedSource: 'user',
  };

  const pageRepo = {
    // Hands out a shallow copy on every lookup.
    findById: jest.fn(async () => ({ ...storedPage })),
    // Records the content that would have been written to the page row.
    updatePage: jest.fn(async (values: any) => {
      captured.content = values.content;
    }),
  };

  const pageHistoryRepo = {
    findPageLastHistory: jest.fn(async () => null),
    saveHistory: asyncNoopMock(),
  };

  // No real DB: transaction().execute(cb) simply invokes cb with a dummy trx.
  const db = {
    transaction: () => ({
      execute: (cb: any) => cb({} as any),
    }),
  };

  // One queue stub is reused for all three queue constructor slots.
  const noopQueue = { add: asyncNoopMock() } as any;
  const collabHistory = { addContributors: asyncNoopMock() } as any;
  const transclusionService = {
    syncPageTransclusions: asyncNoopMock(),
    syncPageReferences: asyncNoopMock(),
  } as any;

  // Source of the workspace toggle read by the toggle-AND-admin gate. The mock
  // ignores its argument and always answers for workspace 'ws-1'.
  const workspaceRepo = {
    findById: jest.fn(async () => ({
      id: 'ws-1',
      settings: { htmlEmbed: featureEnabled },
    })),
  };

  const ext = new PersistenceExtension(
    pageRepo as any,
    pageHistoryRepo as any,
    db as any,
    noopQueue,
    noopQueue,
    noopQueue,
    collabHistory,
    transclusionService,
    workspaceRepo as any,
  );

  return { ext, captured, pageRepo };
}
/**
 * Runs one onStoreDocument pass for document 'page-1' against a freshly built
 * extension and returns the holder of the content that was "persisted".
 *
 * @param role role placed on the storing user's context (may be null/undefined).
 * @param doc the Y.Doc to store; it gains a stub `broadcastStateless` method.
 * @param featureEnabled workspace htmlEmbed toggle forwarded to buildExtension.
 * @param priorContent already-persisted content forwarded to buildExtension.
 * @returns the `captured` object from buildExtension; `content` is set only if
 *   `pageRepo.updatePage` was called.
 */
async function runStore(
  role: string | null | undefined,
  doc: Y.Doc,
  featureEnabled = true,
  priorContent?: any,
) {
  const harness = buildExtension(featureEnabled, priorContent);

  // A bare Y.Doc lacks the broadcastStateless method hocuspocus adds to its
  // documents; stub it, since the post-persist broadcast is not under test.
  const liveDoc = doc as any;
  liveDoc.broadcastStateless = () => undefined;

  // Note: the context carries no workspaceId; the mocked workspace lookup in
  // buildExtension does not look at its argument.
  const storePayload = {
    documentName: 'page-1',
    document: doc,
    context: { user: { id: 'u1', role } },
  } as any;
  await harness.ext.onStoreDocument(storePayload);

  return harness.captured;
}
// Drives the real onStoreDocument through runStore for combinations of the
// workspace toggle and the storing user's role, asserting on the content handed
// to pageRepo.updatePage (captured.content):
//   - toggle ON  + admin/owner  -> embeds kept
//   - toggle OFF + any role     -> embeds stripped
//   - toggle ON  + other roles  -> embeds stripped, except those whose source
//                                  already exists in the persisted page content
// runStore's third argument is the toggle (default true) and its fourth is the
// already-persisted content (default: empty doc).
describe('PersistenceExtension.onStoreDocument htmlEmbed admin gate (real code)', () => {
it('non-admin store: strips EVERY htmlEmbed but preserves every other node', async () => {
const doc = buildYdoc(RICH_DOC);
// Snapshot BEFORE the store: the store may rewrite `doc` in place.
const before = TiptapTransformer.fromYdoc(doc, 'default');
expect(hasHtmlEmbedNode(before)).toBe(true);
const beforeCounts = nodeTypeCounts(before);
const captured = await runStore('member', doc);
expect(captured.content).toBeDefined();
// htmlEmbed gone from the persisted content.
expect(hasHtmlEmbedNode(captured.content)).toBe(false);
// Every non-embed node type is preserved with the SAME count (guards against
// data loss if a node were missing from tiptapExtensions and dropped on the
// toYdoc rebuild).
const afterCounts = nodeTypeCounts(captured.content);
for (const [type, count] of Object.entries(beforeCounts)) {
if (type === HTML_EMBED_NODE_NAME) continue;
expect(afterCounts[type]).toBe(count);
}
// The two embeds are gone.
expect(beforeCounts[HTML_EMBED_NODE_NAME]).toBe(2);
expect(afterCounts[HTML_EMBED_NODE_NAME]).toBeUndefined();
// The shared ydoc fragment was also rewritten clean (re-decode it).
const reDecoded = TiptapTransformer.fromYdoc(doc, 'default');
expect(hasHtmlEmbedNode(reDecoded)).toBe(false);
});
it('toggle ON + admin store: htmlEmbed preserved in persisted content', async () => {
const captured = await runStore('admin', buildYdoc(RICH_DOC), true);
expect(captured.content).toBeDefined();
expect(hasHtmlEmbedNode(captured.content)).toBe(true);
expect(nodeTypeCounts(captured.content)[HTML_EMBED_NODE_NAME]).toBe(2);
});
it('toggle ON + owner store: htmlEmbed preserved', async () => {
const captured = await runStore('owner', buildYdoc(RICH_DOC), true);
expect(hasHtmlEmbedNode(captured.content)).toBe(true);
});
it('toggle OFF + admin store: stripped (feature disabled for everyone)', async () => {
const captured = await runStore('admin', buildYdoc(RICH_DOC), false);
expect(hasHtmlEmbedNode(captured.content)).toBe(false);
});
it('toggle OFF + owner store: stripped', async () => {
const captured = await runStore('owner', buildYdoc(RICH_DOC), false);
expect(hasHtmlEmbedNode(captured.content)).toBe(false);
});
it('toggle OFF + member store: stripped', async () => {
const captured = await runStore('member', buildYdoc(RICH_DOC), false);
expect(hasHtmlEmbedNode(captured.content)).toBe(false);
});
it('unknown/empty role: fails closed (stripped)', async () => {
// Toggle is left at runStore's default (ON), so the strip here is caused by
// the role alone.
expect(
hasHtmlEmbedNode((await runStore(undefined, buildYdoc(RICH_DOC))).content),
).toBe(false);
expect(
hasHtmlEmbedNode((await runStore(null, buildYdoc(RICH_DOC))).content),
).toBe(false);
expect(
hasHtmlEmbedNode((await runStore('viewer', buildYdoc(RICH_DOC))).content),
).toBe(false);
});
it('toggle ON + non-admin store: PRESERVES an admin embed already in the persisted content through an unrelated edit', async () => {
// Prior persisted content already holds an admin-authored embed.
const ADMIN_SOURCE = '<script>adminAuthored()</script>';
const prior = {
type: 'doc',
content: [
{ type: 'paragraph', content: [{ type: 'text', text: 'intro' }] },
{ type: HTML_EMBED_NODE_NAME, attrs: { source: ADMIN_SOURCE } },
],
};
// A non-admin makes an UNRELATED edit (tweaks the paragraph) but the embed
// is still present in the merged doc.
const edited = {
type: 'doc',
content: [
{ type: 'paragraph', content: [{ type: 'text', text: 'intro edited' }] },
{ type: HTML_EMBED_NODE_NAME, attrs: { source: ADMIN_SOURCE } },
],
};
const captured = await runStore('member', buildYdoc(edited), true, prior);
expect(captured.content).toBeDefined();
// The admin's pre-existing embed survives the non-admin store.
expect(collectHtmlEmbedSources(captured.content)).toEqual(
new Set([ADMIN_SOURCE]),
);
});
it('toggle ON + non-admin store: strips a NEWLY-added embed while keeping the prior admin one', async () => {
const ADMIN_SOURCE = '<script>adminAuthored()</script>';
const prior = {
type: 'doc',
content: [
{ type: 'paragraph', content: [{ type: 'text', text: 'intro' }] },
{ type: HTML_EMBED_NODE_NAME, attrs: { source: ADMIN_SOURCE } },
],
};
// Non-admin keeps the admin embed, makes an unrelated paragraph edit (so the
// store is not a no-op and is persisted), and ALSO adds a brand-new embed.
const edited = {
type: 'doc',
content: [
{ type: 'paragraph', content: [{ type: 'text', text: 'intro edited' }] },
{ type: HTML_EMBED_NODE_NAME, attrs: { source: ADMIN_SOURCE } },
{ type: HTML_EMBED_NODE_NAME, attrs: { source: '<script>evil()</script>' } },
],
};
const captured = await runStore('member', buildYdoc(edited), true, prior);
expect(captured.content).toBeDefined();
// Only the admin-vetted source remains; the newly-introduced one is stripped.
expect(collectHtmlEmbedSources(captured.content)).toEqual(
new Set([ADMIN_SOURCE]),
);
});
it('empty-fragment ydoc (no content) does not throw and persists no embed', async () => {
const emptyDoc = buildYdoc({
type: 'doc',
content: [{ type: 'paragraph' }],
});
// Non-admin path with an empty/embed-free fragment must be a no-op strip,
// not throw.
await expect(runStore('member', emptyDoc)).resolves.toBeDefined();
});
});
// Exercises the REAL early onChange guard (Gitea #26): guardHtmlEmbed converges
// the shared ydoc sub-second, before the 10s store debounce. We call it directly
// (it is the debounced timer body) and assert the ydoc fragment no longer yields
// an htmlEmbed for the non-admin's transient embed, while admin-vetted embeds
// already in the persisted content survive.
describe('PersistenceExtension.guardHtmlEmbed early onChange guard (real code)', () => {
// Invokes the private guardHtmlEmbed (reached through an `any` cast) for
// document 'page-1' with a context carrying id, role and workspaceId. It
// returns nothing: every test re-decodes `doc` afterwards, because the guard's
// observable effect is the state of the ydoc passed in.
async function runGuard(
role: string | null | undefined,
doc: Y.Doc,
featureEnabled = true,
priorContent?: any,
) {
const { ext } = buildExtension(featureEnabled, priorContent);
await (ext as any).guardHtmlEmbed(
'page-1',
doc,
{ user: { id: 'u1', role, workspaceId: 'ws-1' } },
);
}
it('toggle ON + non-admin: strips a newly-added embed from the shared ydoc', async () => {
// Prior persisted content has NO embed; the live doc has one a non-admin
// just added.
const doc = buildYdoc({
type: 'doc',
content: [
{ type: 'paragraph', content: [{ type: 'text', text: 'hi' }] },
{ type: HTML_EMBED_NODE_NAME, attrs: { source: '<script>evil()</script>' } },
],
});
// Sanity check: the embed really is in the ydoc before the guard runs.
expect(hasHtmlEmbedNode(TiptapTransformer.fromYdoc(doc, 'default'))).toBe(
true,
);
await runGuard('member', doc, true, { type: 'doc', content: [] });
// The shared ydoc fragment no longer yields any htmlEmbed.
expect(hasHtmlEmbedNode(TiptapTransformer.fromYdoc(doc, 'default'))).toBe(
false,
);
});
it('toggle ON + non-admin: preserves a prior admin embed, strips the new one', async () => {
const ADMIN_SOURCE = '<script>adminAuthored()</script>';
const prior = {
type: 'doc',
content: [
{ type: 'paragraph', content: [{ type: 'text', text: 'intro' }] },
{ type: HTML_EMBED_NODE_NAME, attrs: { source: ADMIN_SOURCE } },
],
};
// Live doc keeps the admin embed AND adds a brand-new one.
const doc = buildYdoc({
type: 'doc',
content: [
{ type: 'paragraph', content: [{ type: 'text', text: 'intro' }] },
{ type: HTML_EMBED_NODE_NAME, attrs: { source: ADMIN_SOURCE } },
{ type: HTML_EMBED_NODE_NAME, attrs: { source: '<script>evil()</script>' } },
],
});
await runGuard('member', doc, true, prior);
// Only the admin-vetted source survives in the shared ydoc.
expect(
collectHtmlEmbedSources(TiptapTransformer.fromYdoc(doc, 'default')),
).toEqual(new Set([ADMIN_SOURCE]));
});
it('toggle OFF + non-admin: strips ALL embeds (allow-list is null)', async () => {
// Even an embed that matches the prior content is stripped when the toggle
// is OFF, because the OFF path passes allowed=null (strip everything) and
// never reads the prior content for an allow-list.
const SOURCE = '<script>any()</script>';
const doc = buildYdoc({
type: 'doc',
content: [
{ type: 'paragraph', content: [{ type: 'text', text: 'hi' }] },
{ type: HTML_EMBED_NODE_NAME, attrs: { source: SOURCE } },
],
});
await runGuard('member', doc, false, {
type: 'doc',
content: [{ type: HTML_EMBED_NODE_NAME, attrs: { source: SOURCE } }],
});
expect(hasHtmlEmbedNode(TiptapTransformer.fromYdoc(doc, 'default'))).toBe(
false,
);
});
it('admin role: guard is a defensive no-op (embed preserved)', async () => {
const doc = buildYdoc({
type: 'doc',
content: [
{ type: HTML_EMBED_NODE_NAME, attrs: { source: '<script>ok()</script>' } },
],
});
await runGuard('admin', doc, true, { type: 'doc', content: [] });
expect(hasHtmlEmbedNode(TiptapTransformer.fromYdoc(doc, 'default'))).toBe(
true,
);
});
it('no embed present: guard is a cheap no-op (loop-safe re-fire)', async () => {
const doc = buildYdoc({
type: 'doc',
content: [{ type: 'paragraph', content: [{ type: 'text', text: 'plain' }] }],
});
await runGuard('member', doc, true, { type: 'doc', content: [] });
expect(hasHtmlEmbedNode(TiptapTransformer.fromYdoc(doc, 'default'))).toBe(
false,
);
});
});

View File

@@ -39,16 +39,6 @@ import {
HISTORY_INTERVAL,
} from '../constants';
import { TransclusionService } from '../../core/page/transclusion/transclusion.service';
import {
canAuthorHtmlEmbed,
collectHtmlEmbedSources,
hasHtmlEmbedNode,
htmlEmbedAllowed,
isHtmlEmbedFeatureEnabled,
stripDisallowedHtmlEmbedNodes,
stripHtmlEmbedNodes,
} from '../../common/helpers/prosemirror/html-embed.util';
import { WorkspaceRepo } from '@docmost/db/repos/workspace/workspace.repo';
@Injectable()
export class PersistenceExtension implements Extension {
@@ -59,21 +49,6 @@ export class PersistenceExtension implements Extension {
// coalescing window" per document and OR it across all edits in the window,
// so the snapshot is marked 'agent' regardless of who wrote last.
private agentTouched: Map<string, boolean> = new Map();
// Per-document debounce timers for the early htmlEmbed guard (Gitea #26).
// onChange schedules a short (~300ms) debounced strip that converges the
// shared ydoc for all connected clients well before the 10s store debounce,
// shrinking the pre-persist broadcast window of a non-admin's transient embed.
private htmlEmbedGuardTimers: Map<string, NodeJS.Timeout> = new Map();
// Per-document cache of the workspace htmlEmbed toggle (Gitea #26). Populated
// in onLoadDocument (which already loads the page + has workspace context) and
// read in onChange to gate early-guard scheduling: when the toggle is OFF (the
// common default) we schedule NOTHING — no timer, no fromYdoc, no DB read — and
// rely on the onStoreDocument strip as the backstop (when OFF the embed does
// not execute in editable mode anyway). Cleared in afterUnloadDocument.
// STALENESS: if an admin flips the toggle ON mid-session this cache stays OFF
// until the document is reloaded, so the early guard won't schedule — accepted,
// the onStoreDocument backstop still strips on persist.
private htmlEmbedToggleByDoc: Map<string, boolean> = new Map();
constructor(
private readonly pageRepo: PageRepo,
@@ -84,7 +59,6 @@ export class PersistenceExtension implements Extension {
@InjectQueue(QueueName.NOTIFICATION_QUEUE) private notificationQueue: Queue,
private readonly collabHistory: CollabHistoryService,
private readonly transclusionService: TransclusionService,
private readonly workspaceRepo: WorkspaceRepo,
) {}
async onLoadDocument(data: onLoadDocumentPayload) {
@@ -105,23 +79,6 @@ export class PersistenceExtension implements Extension {
return;
}
// Cache the workspace htmlEmbed toggle for this document (Gitea #26). We
// already have the page (hence its workspaceId) here, so resolve the toggle
// once and cache it keyed by documentName. onChange reads this to decide
// whether to schedule the early guard at all — when OFF we skip the guard
// entirely (no timer, no fromYdoc, no DB read). Cleared in
// afterUnloadDocument. See htmlEmbedToggleByDoc for the staleness note.
try {
const enabled = isHtmlEmbedFeatureEnabled(
(await this.workspaceRepo.findById(page.workspaceId))?.settings,
);
this.htmlEmbedToggleByDoc.set(documentName, enabled);
} catch (err) {
// Fail OFF: if the toggle can't be resolved, never schedule the early
// guard; the onStoreDocument backstop still strips on persist.
this.htmlEmbedToggleByDoc.set(documentName, false);
}
if (page.ydoc) {
this.logger.debug(`ydoc loaded from db: ${pageId}`);
@@ -155,109 +112,7 @@ export class PersistenceExtension implements Extension {
const pageId = getPageId(documentName);
let tiptapJson = TiptapTransformer.fromYdoc(document, 'default');
// SECURITY (Variant C admin gate, collab WebSocket write path):
// The persisted snapshot is the merged ydoc, which may contain an htmlEmbed
// node inserted by ANY connected editor. htmlEmbed renders raw, unsanitized
// JS in every reader's browser, so only workspace admins/owners may author
// it. When the user whose store triggers this persist is not an admin, strip
// every htmlEmbed node before it is written to the page row AND before the
// ydoc state is re-encoded, so the node cannot be reintroduced by a
// non-admin via the collab socket.
// NOTE (defense-in-depth refinement, Gitea #29): the gate is keyed to the
// storing connection's user, but it no longer blindly strips EVERY embed on
// a non-admin store. We distinguish two cases inside the !allowed branch:
// - Feature toggle OFF => strip ALL embeds (the feature is disabled for
// everyone; existing embeds get cleaned up on the next save).
// - Toggle ON but the storer is a NON-admin => strip only NEWLY-introduced
// embeds and PRESERVE embeds already present in the currently-persisted
// page content (admin-authored, already vetted). So a non-admin still
// cannot ADD an embed, but an unrelated edit (e.g. a paragraph tweak) no
// longer destroys an admin's existing embed (the prior data-loss bug).
// The pre-existing-embed identity is the raw `attrs.source` (see
// collectHtmlEmbedSources). A non-admin who copies an existing admin embed's
// exact source elsewhere passes — acceptable, that HTML is already vetted.
//
// ACCEPTED RESIDUAL RISK (toggle-ON allow-list TOCTOU): the allow-list is a
// best-effort snapshot read OUTSIDE the locked transaction (the prior content
// is pre-read above, but inside executeTx the row is re-read withLock without
// recomputing the allow-list). A concurrent admin store that changes the
// persisted embeds between the pre-read and this write can make the preserve
// decision use a slightly stale snapshot — worst case one embed transiently
// kept or dropped; it converges on the next store, with no auth bypass or
// broader data loss. The race is accepted because it only affects concurrent
// authenticated editors on the (rare) toggle-ON non-admin path, converges on
// the next store, and the persisted row plus every share/readonly read path
// remain protected by the strip.
//
// RESIDUAL RISK (pre-persist broadcast window) — NOW MITIGATED (Gitea #26):
// this strip runs in the debounced onStoreDocument (up to 10s), but
// hocuspocus broadcasts each inbound Yjs update to connected clients
// immediately, so a non-admin's transient htmlEmbed can execute in OTHER open
// editors' browsers in the window before this persist strips it. The exposure
// is limited to concurrent AUTHENTICATED space members who have the doc open
// with Edit rights (semi-trusted) — anonymous public-share/readonly viewers do
// NOT open a collab socket (ReadonlyPageEditor renders fetched,
// already-stripped content; HocuspocusProvider is only used by the
// authenticated editable page-editor), and the PERSISTED page row plus every
// share/readonly read path are protected by this strip.
// The window is now SHRUNK to sub-second by an onChange-debounced early guard
// (~300ms) — see guardHtmlEmbed() — which runs the SAME preserve/strip gate as
// this block and re-encodes the cleaned ydoc, converging the doc for all
// clients long before this 10s store debounce fires. This onStoreDocument
// strip remains the authoritative backstop for persistence. The irreducible
// residual is only the VERY FIRST inbound broadcast before the ~300ms debounce
// fires: hocuspocus exposes no synchronous beforeBroadcast filter to drop the
// node before that first relay, so it cannot be eliminated entirely.
// Toggle-AND-admin gate: htmlEmbed survives only when the workspace feature
// toggle is ON and the storing user is an admin/owner. OFF (default) =>
// stripped for everyone (existing embeds get cleaned up on next save).
const htmlEmbedEnabled = isHtmlEmbedFeatureEnabled(
(await this.workspaceRepo.findById(context?.user?.workspaceId))?.settings,
);
if (!htmlEmbedAllowed(htmlEmbedEnabled, context?.user?.role)) {
if (hasHtmlEmbedNode(tiptapJson)) {
let strippedJson: typeof tiptapJson;
if (htmlEmbedEnabled === false) {
// Toggle OFF: feature disabled for everyone -> strip ALL embeds.
strippedJson = stripHtmlEmbedNodes(tiptapJson);
} else {
// Toggle ON, non-admin storer: preserve embeds already present in the
// currently-persisted (admin-vetted) page content; strip only the
// newly-introduced ones. Pre-read the prior content — a small extra
// query only on this rare non-admin + toggle-ON path.
const prior = await this.pageRepo.findById(pageId, {
includeContent: true,
});
const allowed = collectHtmlEmbedSources(prior?.content);
strippedJson = stripDisallowedHtmlEmbedNodes(tiptapJson, allowed);
}
// Only mutate the ydoc + log when the strip actually removed something;
// an unnecessary ydoc rewrite would churn the doc for all clients. With
// the toggle-ON branch a non-admin store that only touches admin-vetted
// embeds leaves the content unchanged here.
if (!isDeepStrictEqual(strippedJson, tiptapJson)) {
this.logger.warn(
`Stripping htmlEmbed node(s) from collab store by user ${context?.user?.id} on ${documentName}`,
);
tiptapJson = strippedJson;
// Reflect the stripped content back into the shared ydoc so the node
// is removed for all connected clients, not just the persisted row.
const fragment = document.getXmlFragment('default');
if (fragment.length > 0) {
fragment.delete(0, fragment.length);
}
const cleanDoc = TiptapTransformer.toYdoc(
tiptapJson,
'default',
tiptapExtensions,
);
Y.applyUpdate(document, Y.encodeStateAsUpdate(cleanDoc));
}
}
}
const tiptapJson = TiptapTransformer.fromYdoc(document, 'default');
const ydocState = Buffer.from(Y.encodeStateAsUpdate(document));
@@ -429,168 +284,12 @@ export class PersistenceExtension implements Extension {
if (data.context?.actor === 'agent') {
this.agentTouched.set(documentName, true);
}
// Early htmlEmbed guard scheduling (Gitea #26). Schedule the short debounced
// guard ONLY when (a) this document's workspace toggle is cached ON and
// (b) the changing connection's user is a NON-admin (cannot author
// htmlEmbed). When the toggle is OFF/unknown we schedule NOTHING — no timer,
// no fromYdoc, no DB read — killing the OFF-case overhead (the common
// default); the onStoreDocument strip is the backstop and an OFF embed does
// not execute in editable mode anyway. We do NO expensive work here — we only
// (re)schedule the timer; the debounce coalesces rapid edits into a single
// guard check.
if (
userId &&
this.htmlEmbedToggleByDoc.get(documentName) === true &&
!canAuthorHtmlEmbed(data.context?.user?.role)
) {
const existing = this.htmlEmbedGuardTimers.get(documentName);
if (existing) {
clearTimeout(existing);
}
const timer = setTimeout(() => {
this.htmlEmbedGuardTimers.delete(documentName);
void this.guardHtmlEmbed(documentName, data.document, data.context);
}, 300);
this.htmlEmbedGuardTimers.set(documentName, timer);
}
}
/**
* Early, onChange-debounced htmlEmbed strip (Gitea #26). Mirrors the
* onStoreDocument admin gate but runs ~300ms after a non-admin edit instead of
* waiting for the 10s store debounce, so a non-admin's transient embed is
* removed from the shared ydoc — and re-broadcast as cleaned state — for all
* connected clients in sub-second time. onStoreDocument remains the
* authoritative persistence backstop; this is an ADDITIONAL early pass.
*
* CONCURRENCY (the critical invariant): the Y.Doc mutation is a single
* SYNCHRONOUS block with NO `await` between the fromYdoc snapshot and the
* applyUpdate write. ALL async work (the workspace toggle lookup and the
* persisted-content read for the allow-list) happens FIRST, before that block.
* Because JS is single-threaded, a synchronous block cannot interleave with
* inbound Yjs update handlers, so a concurrent edit that lands while we await
* cannot be CLOBBERED: we re-snapshot the live doc only after all awaits, then
* delete + rebuild + applyUpdate without yielding. (An earlier version awaited
* DB reads BETWEEN the snapshot and the write, so a concurrent edit in that gap
* was lost — this restructure fixes that.)
*
* The allow-list is a best-effort snapshot read outside any lock (TOCTOU
* accepted, same as onStoreDocument): worst case one embed is transiently kept
* or dropped; it converges on the next guard/store, with no auth bypass.
*
* Loop-safety: the corrective applyUpdate has a null origin, so the re-fired
* onChange carries no userId and is not rescheduled; and after a strip no
* htmlEmbed remains, so a subsequent guard fire is a cheap no-op (the
* hasHtmlEmbedNode early-exit). NEVER throws — an unhandled rejection in a timer
* would crash the process — so the whole body is wrapped in try/catch.
*
* @param documentName document name; used as the key of the pending guard
*   timer, as the input to getPageId() for the persisted-content read, and in
*   log messages.
* @param document the live shared Y.Doc; its 'default' fragment is rewritten in
*   place when a disallowed embed is found.
* @param context connection context; only `user.role`, `user.workspaceId` and
*   `user.id` are read (all via optional chaining, so it may be undefined).
* @returns a promise that resolves with no value; failures are logged and
*   never rethrown.
*/
private async guardHtmlEmbed(
documentName: string,
document: Y.Doc,
context: any,
): Promise<void> {
// Defensive: ensure no stale timer entry survives for this document.
this.htmlEmbedGuardTimers.delete(documentName);
try {
// Re-check defensively: onChange only schedules for non-admins, but if an
// admin/owner somehow reaches here, the embed is authored content — do
// nothing (onStoreDocument's toggle-AND-admin gate handles persistence).
if (canAuthorHtmlEmbed(context?.user?.role)) {
return;
}
// ---- ASYNC PHASE: do ALL awaits up front, before touching the ydoc. ----
// Resolve the workspace toggle exactly as onStoreDocument does. When OFF we
// strip everything; when ON we use the preserve logic (keep admin-vetted
// embeds, strip only the non-admin's newly-introduced ones).
const enabled = isHtmlEmbedFeatureEnabled(
(await this.workspaceRepo.findById(context?.user?.workspaceId))
?.settings,
);
// The allow-list (admin-vetted sources already in the persisted content).
// null => strip ALL (toggle OFF). Read here, BEFORE the synchronous block,
// so no await sits between the doc snapshot and the doc write.
let allowed: Set<string> | null = null;
if (enabled !== false) {
const prior = await this.pageRepo.findById(getPageId(documentName), {
includeContent: true,
});
allowed = collectHtmlEmbedSources(prior?.content);
}
// The awaits above may have let the document be unloaded/destroyed. If so,
// bail — mutating a destroyed doc is pointless and could throw (the
// try/catch is the ultimate safety net regardless).
if ((document as { isDestroyed?: boolean }).isDestroyed) {
return;
}
// ---- SYNCHRONOUS PHASE: snapshot -> strip -> reflect, NO await here. ----
// Because there is no await between fromYdoc and applyUpdate, no inbound
// Yjs update can interleave, so a concurrent edit cannot be lost.
const json = TiptapTransformer.fromYdoc(document, 'default');
// Cheap exit: nothing to guard if the doc has no embed at all. This is also
// why a post-strip re-fire is a no-op (loop-safe).
if (!hasHtmlEmbedNode(json)) {
return;
}
const strippedJson =
allowed === null
? stripHtmlEmbedNodes(json)
: stripDisallowedHtmlEmbedNodes(json, allowed);
// Nothing was stripped (e.g. the only embed is an admin-vetted one) — do
// not churn the shared ydoc for all clients.
if (isDeepStrictEqual(strippedJson, json)) {
return;
}
// Reflect the stripped content back into the shared ydoc EXACTLY as
// onStoreDocument does, so the node is removed for all connected clients,
// not just on the eventual persist. This re-encode broadcasts the cleaned
// state; after it hasHtmlEmbedNode is false, so any later guard fire is a
// cheap no-op (loop-safe).
const fragment = document.getXmlFragment('default');
if (fragment.length > 0) {
fragment.delete(0, fragment.length);
}
const cleanDoc = TiptapTransformer.toYdoc(
strippedJson,
'default',
tiptapExtensions,
);
Y.applyUpdate(document, Y.encodeStateAsUpdate(cleanDoc));
this.logger.warn(
`Stripping htmlEmbed node(s) via early onChange guard by user ${context?.user?.id} on ${documentName}`,
);
} catch (err) {
// NEVER rethrow out of a timer-scheduled call.
this.logger.error(
`Early htmlEmbed guard failed on ${documentName}`,
err,
);
}
}
/**
 * Releases every piece of per-document state this extension keeps once a
 * document has been unloaded.
 *
 * @param data unload payload; only `documentName` is read.
 */
async afterUnloadDocument(data: afterUnloadDocumentPayload) {
  const { documentName } = data;

  // Cancel a still-pending early-guard timer so it cannot fire against a
  // document that is no longer loaded, and forget its map entry.
  const pendingGuard = this.htmlEmbedGuardTimers.get(documentName);
  if (pendingGuard) {
    clearTimeout(pendingGuard);
    this.htmlEmbedGuardTimers.delete(documentName);
  }

  // Forget the cached workspace toggle so the next load resolves it afresh
  // (which is also how a mid-session toggle flip gets picked up).
  this.htmlEmbedToggleByDoc.delete(documentName);

  // Drop the per-document contributor and agent-touch bookkeeping.
  this.agentTouched.delete(documentName);
  this.contributors.delete(documentName);
}
private consumeContributors(documentName: string): string[] {

View File

@@ -3,20 +3,17 @@ import { htmlToJson } from '../../../collaboration/collaboration.util';
import { hasHtmlEmbedNode, stripHtmlEmbedNodes } from './html-embed.util';
/**
* CONTRACT (security): an attacker who controls imported markdown/HTML could try
* to smuggle an htmlEmbed in the *serialized* DOM form —
* CONTRACT: imported markdown/HTML can carry an htmlEmbed in the *serialized*
* DOM form —
* <div data-type="htmlEmbed" data-source="...">
* — directly, bypassing the editor's `<!--html-embed:-->` comment marker.
*
* This exercises the REAL server import conversion path that ImportService uses
* The block renders inside a sandboxed iframe, so this is not an XSS surface;
* this exercises the REAL server import conversion path that ImportService uses
* (`markdownToHtml` then `htmlToJson`; `processHTML` adds only a cheerio
* link/iframe normalize pass which does not touch htmlEmbed divs) and asserts
* the ACTUAL behaviour so we know whether the strip gate can be bypassed.
*
* FINDING (documented): the raw embed div DOES round-trip through marked +
* htmlToJson into a real `htmlEmbed` node, so `hasHtmlEmbedNode` returns true and
* `stripHtmlEmbedNodes` removes it. The serialized-form bypass is therefore
* detectable and STRIPPABLE — the write-path gate covers it.
* that such a node is DETECTED and STRIPPABLE — so the share read path's
* master-toggle strip can remove it when the workspace toggle is OFF.
*/
describe('htmlEmbed smuggled via the raw serialized div in imported markdown/HTML', () => {
it('round-trips through markdownToHtml -> htmlToJson and is DETECTED (base64 data-source)', async () => {
@@ -38,7 +35,7 @@ describe('htmlEmbed smuggled via the raw serialized div in imported markdown/HTM
// The div parses into a real htmlEmbed node carrying the decoded source.
expect(hasHtmlEmbedNode(json)).toBe(true);
// Because it is detected, the write-path gate can strip it for non-admins.
// Because it is detected, the share master-toggle strip can remove it.
const stripped = stripHtmlEmbedNodes(json);
expect(hasHtmlEmbedNode(stripped)).toBe(false);
// Surrounding non-embed content is retained.

View File

@@ -1,11 +1,6 @@
import {
canAuthorHtmlEmbed,
collectHtmlEmbedSources,
hasHtmlEmbedNode,
htmlEmbedAllowed,
isHtmlEmbedFeatureEnabled,
stripDisallowedHtmlEmbedNodes,
stripHtmlEmbedIfNotAllowed,
stripHtmlEmbedNodes,
} from './html-embed.util';
import { htmlToJson, jsonToHtml } from '../../../collaboration/collaboration.util';
@@ -96,17 +91,6 @@ describe('stripHtmlEmbedNodes', () => {
expect(result).toEqual(doc);
});
it('neutralizes a root node that is itself an htmlEmbed', () => {
  // A ProseMirror root is always a `doc`, so a bare htmlEmbed root should not
  // occur in normal use; the helper is still required never to hand one back.
  const bareEmbed = {
    type: 'htmlEmbed',
    attrs: { source: '<script>alert(1)</script>' },
  };
  const neutralized = stripHtmlEmbedNodes(bareEmbed);
  expect(hasHtmlEmbedNode(neutralized)).toBe(false);
});
it('strips a deeply nested htmlEmbed (3+ levels: callout > column > paragraph-sibling)', () => {
// htmlEmbed sits as a sibling of a paragraph, nested four containers deep.
const doc = {
@@ -172,169 +156,6 @@ describe('stripHtmlEmbedNodes', () => {
});
});
// Suite for collectHtmlEmbedSources: checks that the `attrs.source` of every
// htmlEmbed node in the tree ends up in the returned Set, and that malformed
// embeds or non-object inputs contribute nothing.
describe('collectHtmlEmbedSources', () => {
it('collects the source of every htmlEmbed node, including nested ones', () => {
// One embed at the top level and one nested inside columns > column.
const doc = {
type: 'doc',
content: [
{ type: 'htmlEmbed', attrs: { source: '<b>top</b>' } },
{
type: 'columns',
content: [
{
type: 'column',
content: [
{ type: 'htmlEmbed', attrs: { source: '<i>nested</i>' } },
{ type: 'paragraph', content: [{ type: 'text', text: 'x' }] },
],
},
],
},
],
};
const sources = collectHtmlEmbedSources(doc);
expect(sources).toEqual(new Set(['<b>top</b>', '<i>nested</i>']));
});
it('returns an empty set for a doc with no embeds', () => {
const doc = {
type: 'doc',
content: [{ type: 'paragraph', content: [{ type: 'text', text: 'hi' }] }],
};
expect(collectHtmlEmbedSources(doc).size).toBe(0);
});
it('gracefully skips embeds with absent attrs or non-string source', () => {
// Only the last embed carries a string source, so only it is collected.
const doc = {
type: 'doc',
content: [
{ type: 'htmlEmbed' }, // no attrs
{ type: 'htmlEmbed', attrs: {} }, // no source
{ type: 'htmlEmbed', attrs: { source: 42 } }, // non-string
{ type: 'htmlEmbed', attrs: { source: '<ok/>' } },
],
};
expect(collectHtmlEmbedSources(doc)).toEqual(new Set(['<ok/>']));
});
it('returns an empty set for non-object input', () => {
expect(collectHtmlEmbedSources(null).size).toBe(0);
expect(collectHtmlEmbedSources(undefined).size).toBe(0);
expect(collectHtmlEmbedSources('x' as any).size).toBe(0);
});
});
// Suite for stripDisallowedHtmlEmbedNodes: an htmlEmbed survives only when its
// `attrs.source` is in the allow-list Set passed as the second argument; every
// other embed is removed while non-embed nodes are kept.
describe('stripDisallowedHtmlEmbedNodes', () => {
it('keeps an embed whose source is allowed and removes the rest', () => {
const doc = {
type: 'doc',
content: [
{ type: 'htmlEmbed', attrs: { source: '<vetted/>' } },
{ type: 'htmlEmbed', attrs: { source: '<new-evil/>' } },
{ type: 'paragraph', content: [{ type: 'text', text: 'keep' }] },
],
};
const result = stripDisallowedHtmlEmbedNodes(doc, new Set(['<vetted/>']));
expect(collectHtmlEmbedSources(result)).toEqual(new Set(['<vetted/>']));
// The allowed embed and the paragraph survive; the new embed is gone.
expect(result.content).toHaveLength(2);
expect(result.content[0].attrs.source).toBe('<vetted/>');
expect(result.content[1].type).toBe('paragraph');
});
it('keeps BOTH embeds when two nodes share the same allowed source', () => {
// Source-identity semantics: identity is the raw `attrs.source`, so a
// non-admin who duplicates an existing admin-vetted source keeps both copies.
// This is intended — the raw HTML is already vetted, so a duplicate is safe.
const doc = {
type: 'doc',
content: [
{ type: 'htmlEmbed', attrs: { source: '<vetted/>' } },
{ type: 'paragraph', content: [{ type: 'text', text: 'mid' }] },
{ type: 'htmlEmbed', attrs: { source: '<vetted/>' } },
],
};
const result = stripDisallowedHtmlEmbedNodes(doc, new Set(['<vetted/>']));
expect(hasHtmlEmbedNode(result)).toBe(true);
const embeds = result.content.filter(
(n: any) => n.type === 'htmlEmbed',
);
expect(embeds).toHaveLength(2);
expect(embeds.every((n: any) => n.attrs.source === '<vetted/>')).toBe(true);
});
it('removes a newly-introduced embed when nothing is allowed', () => {
// An empty allow-list means no embed can survive.
const doc = {
type: 'doc',
content: [{ type: 'htmlEmbed', attrs: { source: '<new/>' } }],
};
const result = stripDisallowedHtmlEmbedNodes(doc, new Set());
expect(hasHtmlEmbedNode(result)).toBe(false);
});
it('filters nested embeds by the allow-list (e.g. inside columns)', () => {
const doc = {
type: 'doc',
content: [
{
type: 'columns',
content: [
{
type: 'column',
content: [
{ type: 'htmlEmbed', attrs: { source: '<vetted/>' } },
{ type: 'htmlEmbed', attrs: { source: '<new/>' } },
],
},
],
},
],
};
const result = stripDisallowedHtmlEmbedNodes(doc, new Set(['<vetted/>']));
// NOTE(review): `findFirstChild` is a helper defined outside this suite —
// presumably it returns the first descendant of the given type; confirm.
const col = findFirstChild(result, 'column');
expect(col.content).toHaveLength(1);
expect(col.content[0].attrs.source).toBe('<vetted/>');
});
it('treats an embed with absent/non-string source as not allowed (stripped)', () => {
const doc = {
type: 'doc',
content: [
{ type: 'htmlEmbed' },
{ type: 'htmlEmbed', attrs: {} },
],
};
const result = stripDisallowedHtmlEmbedNodes(doc, new Set(['<vetted/>']));
expect(hasHtmlEmbedNode(result)).toBe(false);
});
it('does not mutate the input document', () => {
const doc = {
type: 'doc',
content: [{ type: 'htmlEmbed', attrs: { source: '<new/>' } }],
};
// The return value is deliberately ignored; only the input is inspected.
stripDisallowedHtmlEmbedNodes(doc, new Set());
expect(doc.content).toHaveLength(1);
expect(doc.content[0].type).toBe('htmlEmbed');
});
it('neutralizes a root node that is itself a disallowed htmlEmbed', () => {
const root = { type: 'htmlEmbed', attrs: { source: '<new/>' } };
const result = stripDisallowedHtmlEmbedNodes(root, new Set());
expect(hasHtmlEmbedNode(result)).toBe(false);
});
it('keeps a root node that is an allowed htmlEmbed (defensive branch)', () => {
const root = { type: 'htmlEmbed', attrs: { source: '<vetted/>' } };
const result = stripDisallowedHtmlEmbedNodes(root, new Set(['<vetted/>']));
expect(collectHtmlEmbedSources(result)).toEqual(new Set(['<vetted/>']));
});
it('returns non-object input unchanged', () => {
expect(stripDisallowedHtmlEmbedNodes(null as any, new Set())).toBeNull();
});
});
describe('hasHtmlEmbedNode (root/odd-shape detection)', () => {
it('returns true when the ROOT node itself is an htmlEmbed (not only a child)', () => {
const rootEmbed = { type: 'htmlEmbed', attrs: { source: '<script>r</script>' } };
@@ -367,19 +188,6 @@ describe('hasHtmlEmbedNode (root/odd-shape detection)', () => {
});
});
// Suite for canAuthorHtmlEmbed: only the 'owner' and 'admin' role strings are
// accepted; any other string, null, or undefined is rejected.
describe('canAuthorHtmlEmbed', () => {
it('allows owner and admin', () => {
expect(canAuthorHtmlEmbed('owner')).toBe(true);
expect(canAuthorHtmlEmbed('admin')).toBe(true);
});
it('denies member and unknown/empty roles', () => {
expect(canAuthorHtmlEmbed('member')).toBe(false);
expect(canAuthorHtmlEmbed(null)).toBe(false);
expect(canAuthorHtmlEmbed(undefined)).toBe(false);
expect(canAuthorHtmlEmbed('viewer')).toBe(false);
});
});
describe('isHtmlEmbedFeatureEnabled', () => {
it('is true only when settings.htmlEmbed === true', () => {
expect(isHtmlEmbedFeatureEnabled({ htmlEmbed: true })).toBe(true);
@@ -394,165 +202,22 @@ describe('isHtmlEmbedFeatureEnabled', () => {
});
});
// Suite for htmlEmbedAllowed(featureEnabled, role): walks the truth table of
// the two inputs and expects `true` only for toggle ON combined with an
// admin/owner role.
describe('htmlEmbedAllowed (toggle AND admin)', () => {
it('toggle OFF + admin/owner => not allowed (feature disabled for everyone)', () => {
expect(htmlEmbedAllowed(false, 'admin')).toBe(false);
expect(htmlEmbedAllowed(false, 'owner')).toBe(false);
});
it('toggle OFF + member => not allowed', () => {
expect(htmlEmbedAllowed(false, 'member')).toBe(false);
});
it('toggle ON + admin/owner => allowed', () => {
expect(htmlEmbedAllowed(true, 'admin')).toBe(true);
expect(htmlEmbedAllowed(true, 'owner')).toBe(true);
});
it('toggle ON + member/unknown => not allowed', () => {
expect(htmlEmbedAllowed(true, 'member')).toBe(false);
expect(htmlEmbedAllowed(true, null)).toBe(false);
expect(htmlEmbedAllowed(true, undefined)).toBe(false);
expect(htmlEmbedAllowed(true, 'viewer')).toBe(false);
});
});
// The shared write-path strip ritual extracted from the 5 plain call-sites
// (collab handler, page create/duplicate, import, file-import-task,
// transclusion-unsync). Tested here once instead of being re-verified in each
// call-site's spec.
// Suite for stripHtmlEmbedIfNotAllowed(json, { featureEnabled, role, onStrip }).
// Each case checks two observable effects: whether the returned doc still
// contains an htmlEmbed, and whether the optional `onStrip` callback fired.
describe('stripHtmlEmbedIfNotAllowed (shared write-path gate)', () => {
// Factories build a fresh document per call so cases do not share objects.
const docWithEmbed = () => ({
type: 'doc',
content: [
{ type: 'paragraph', content: [{ type: 'text', text: 'keep' }] },
{ type: 'htmlEmbed', attrs: { source: '<script>x()</script>' } },
],
});
const docWithoutEmbed = () => ({
type: 'doc',
content: [{ type: 'paragraph', content: [{ type: 'text', text: 'keep' }] }],
});
it('keeps the doc unchanged when feature is ON and role is admin (allowed)', () => {
const json = docWithEmbed();
const onStrip = jest.fn();
const result = stripHtmlEmbedIfNotAllowed(json, {
featureEnabled: true,
role: 'admin',
onStrip,
});
// Allowed => same reference returned, embed preserved, no side-effect.
expect(result).toBe(json);
expect(hasHtmlEmbedNode(result)).toBe(true);
expect(onStrip).not.toHaveBeenCalled();
});
it('keeps the doc unchanged for an owner when feature is ON (allowed)', () => {
const json = docWithEmbed();
const onStrip = jest.fn();
const result = stripHtmlEmbedIfNotAllowed(json, {
featureEnabled: true,
role: 'owner',
onStrip,
});
expect(result).toBe(json);
expect(hasHtmlEmbedNode(result)).toBe(true);
expect(onStrip).not.toHaveBeenCalled();
});
it('strips the embed when the feature is OFF (even for an admin)', () => {
const json = docWithEmbed();
const onStrip = jest.fn();
const result = stripHtmlEmbedIfNotAllowed(json, {
featureEnabled: false,
role: 'admin',
onStrip,
});
expect(hasHtmlEmbedNode(result)).toBe(false);
expect(onStrip).toHaveBeenCalledTimes(1);
});
it('strips the embed for a non-admin when the feature is ON', () => {
const json = docWithEmbed();
const onStrip = jest.fn();
const result = stripHtmlEmbedIfNotAllowed(json, {
featureEnabled: true,
role: 'member',
onStrip,
});
expect(hasHtmlEmbedNode(result)).toBe(false);
expect(onStrip).toHaveBeenCalledTimes(1);
});
it('strips the embed for a null/undefined role when the feature is ON', () => {
// A fresh mock and a fresh doc per role keep the call counts independent.
for (const role of [null, undefined]) {
const onStrip = jest.fn();
const result = stripHtmlEmbedIfNotAllowed(docWithEmbed(), {
featureEnabled: true,
role,
onStrip,
});
expect(hasHtmlEmbedNode(result)).toBe(false);
expect(onStrip).toHaveBeenCalledTimes(1);
}
});
it('returns input unchanged and does NOT call onStrip when no embed is present', () => {
const json = docWithoutEmbed();
const onStrip = jest.fn();
// Not allowed (feature OFF), but there is nothing to strip.
const result = stripHtmlEmbedIfNotAllowed(json, {
featureEnabled: false,
role: 'member',
onStrip,
});
expect(result).toBe(json);
expect(onStrip).not.toHaveBeenCalled();
});
it('calls onStrip exactly once per strip', () => {
const onStrip = jest.fn();
stripHtmlEmbedIfNotAllowed(docWithEmbed(), {
featureEnabled: false,
role: 'member',
onStrip,
});
expect(onStrip).toHaveBeenCalledTimes(1);
});
it('works without an onStrip callback (optional)', () => {
// Omitting `onStrip` must not throw; the strip itself still happens.
const result = stripHtmlEmbedIfNotAllowed(docWithEmbed(), {
featureEnabled: false,
role: 'member',
});
expect(hasHtmlEmbedNode(result)).toBe(false);
});
});
// NOTE: a previous revision of this file re-implemented the write-path admin
// gate as a local `applyAdminGate` stand-in and asserted against THAT. A
// deleted/misplaced real guard would have kept those green. The stand-in is
// removed. The collab store, REST/MCP update, and transclusion-unsync paths are
// now tested against their REAL code in:
// - collaboration/extensions/persistence.extension.html-embed.spec.ts
// - collaboration/collaboration.handler.html-embed.spec.ts
// - core/page/transclusion/spec/transclusion-unsync-html-embed.spec.ts
// - core/page/services/page-service-html-embed-identity.spec.ts (create/dup)
// - integrations/import/services/import-html-embed-identity.spec.ts (import)
// The htmlEmbed node renders inside a sandboxed iframe, so the per-write role
// gate has been removed. `stripHtmlEmbedNodes` + `isHtmlEmbedFeatureEnabled`
// remain ONLY to honor the workspace master toggle on the anonymous public-share
// read path — tested against the real share code in:
// - core/share/share-html-embed.spec.ts
//
// The case below stays here because it asserts a REAL parse path
// (htmlToJson, the markdown/html create format) feeding the REAL helpers — not a
// re-implemented gate.
describe('htmlEmbed smuggled via the markdown/html <!--html-embed--> form (real parse + real helpers)', () => {
it('the parsed node is detected and stripped by the real helpers', () => {
// The markdown/html create formats decode to the same htmlEmbed node, so the
// gate (run on the parsed JSON) covers them identically.
const source = '<script>steal()</script>';
// The case below asserts that the REAL parse path (htmlToJson, the markdown/html
// form) produces an htmlEmbed node the master-toggle strip can detect & remove.
describe('htmlEmbed via the markdown/html form (real parse + real strip helper)', () => {
it('the parsed node is detected and stripped by the real helper', () => {
const source = '<script>track()</script>';
const encoded = encodeHtmlEmbedSource(source);
const html = `<div data-type="htmlEmbed" data-source="${encoded}"></div>`;
const parsed = htmlToJson(html);
expect(hasHtmlEmbedNode(parsed)).toBe(true);
// A non-admin role gates to strip via the real helpers.
expect(canAuthorHtmlEmbed('member')).toBe(false);
const stripped = stripHtmlEmbedNodes(parsed);
expect(hasHtmlEmbedNode(stripped)).toBe(false);
});

View File

@@ -5,12 +5,12 @@ export const HTML_EMBED_NODE_NAME = 'htmlEmbed';
/**
* Recursively remove every `htmlEmbed` node from a ProseMirror JSON document.
*
* SECURITY: `htmlEmbed` renders raw, unsanitized HTML/CSS/JS in the wiki origin
* (stored-XSS by design, Variant C). Only workspace admins/owners are allowed to
* author it. This helper is the server-side enforcement primitive: every WRITE
* path that may persist content from a NON-admin caller must run the incoming
* document through this function so a non-admin cannot smuggle the node in via
* the collab socket, the REST/MCP/AI content-update path, paste, or import.
* The `htmlEmbed` node renders inside a SANDBOXED iframe (no `allow-same-origin`)
* on the client, so its content cannot touch the viewer's session/cookies/API —
* it is NOT a stored-XSS surface. This helper is retained ONLY to honor the
* workspace master toggle (`settings.htmlEmbed`) on the anonymous public-share
* read path: an anonymous viewer cannot read the workspace toggle, so the server
* strips the block when the toggle is OFF before serving shared content.
*
* Returns a NEW document; the input is not mutated. If the input is not a valid
* doc object it is returned unchanged (callers persist what they were given).
@@ -22,15 +22,6 @@ export function stripHtmlEmbedNodes<T = JSONContent>(pmJson: T): T {
const node = pmJson as unknown as JSONContent;
// Defensive root-type check: if the ROOT node is itself an htmlEmbed, the
// children-filtering below could never drop it, so a bare htmlEmbed would be
// returned as-is. This branch is unreachable in normal use (the PM document
// root is always a `doc`) and exists only to make the helper total — a bare
// htmlEmbed can never be returned by this function.
if (node.type === HTML_EMBED_NODE_NAME) {
return { type: 'doc', content: [] } as unknown as T;
}
if (Array.isArray(node.content)) {
const filtered: JSONContent[] = [];
for (const child of node.content) {
@@ -48,111 +39,12 @@ export function stripHtmlEmbedNodes<T = JSONContent>(pmJson: T): T {
return { ...node } as unknown as T;
}
/**
* Walk the document and collect a stable identity for every `htmlEmbed` node.
*
* The identity is the node's `attrs.source` string — the raw HTML the embed
* renders. Two embeds that render the exact same HTML are treated as the same
* identity. Used by the collab persist path to know which embeds are ALREADY
* present in the currently-persisted (admin-vetted) page content, so a later
* non-admin store can strip only NEWLY-introduced embeds while preserving the
* pre-existing admin-authored ones.
*
* Absent attrs or a non-string/absent `source` are skipped gracefully (such a
* node contributes no identity to the set).
*/
export function collectHtmlEmbedSources(pmJson: unknown): Set<string> {
  const found = new Set<string>();

  // Depth-first, parent-before-children visit of the node tree.
  (function visit(current: unknown): void {
    // Anything that is not a non-null object cannot be a node; ignore it.
    if (typeof current !== 'object' || current === null) {
      return;
    }

    const candidate = current as JSONContent;
    if (candidate.type === HTML_EMBED_NODE_NAME) {
      const attrs = candidate.attrs as Record<string, unknown> | undefined;
      const raw = attrs?.source;
      // Only a string source counts as an identity; anything else is skipped.
      if (typeof raw === 'string') {
        found.add(raw);
      }
    }

    const children = candidate.content;
    if (Array.isArray(children)) {
      for (const child of children) {
        visit(child);
      }
    }
  })(pmJson);

  return found;
}
/**
* Like {@link stripHtmlEmbedNodes}, but KEEP any `htmlEmbed` node whose
* `attrs.source` is in `allowedSources`; remove the rest.
*
* Used on the collab persist path when the feature toggle is ON but the storing
* user is a NON-admin: `allowedSources` is the set of embed sources already
* present in the currently-persisted page content (admin-authored, already
* vetted). A non-admin therefore cannot ADD a new embed, but their unrelated
* edit also cannot destroy an admin's existing one.
*
* NOTE: identity is the raw source string, so a non-admin who COPIES an existing
* admin embed's exact source into a NEW location passes this check. That is
* acceptable — the source is already admin-vetted content present in the doc; no
* new untrusted HTML is introduced.
*
* Returns a NEW document; the input is not mutated. Same defensive root-type
* check pattern as {@link stripHtmlEmbedNodes}.
*/
export function stripDisallowedHtmlEmbedNodes<T = JSONContent>(
  pmJson: T,
  allowedSources: Set<string>,
): T {
  // Non-object input is handed back untouched.
  if (typeof pmJson !== 'object' || pmJson === null) {
    return pmJson;
  }

  // An embed is allowed only when its source is a string in the allow-list.
  const sourceIsAllowed = (embed: JSONContent): boolean => {
    const raw = (embed.attrs as Record<string, unknown> | undefined)?.source;
    return typeof raw === 'string' && allowedSources.has(raw);
  };

  const root = pmJson as unknown as JSONContent;

  // Root-level embed: the child filter below can never drop the root itself,
  // so either shallow-copy it (allowed) or replace it with an empty doc.
  if (root.type === HTML_EMBED_NODE_NAME) {
    if (sourceIsAllowed(root)) {
      return { ...root } as unknown as T;
    }
    return { type: 'doc', content: [] } as unknown as T;
  }

  // Leaf node (no children array): return a shallow copy.
  if (!Array.isArray(root.content)) {
    return { ...root } as unknown as T;
  }

  const survivors: JSONContent[] = [];
  for (const child of root.content) {
    const isEmbed = Boolean(child) && child.type === HTML_EMBED_NODE_NAME;
    if (!isEmbed) {
      // Recurse so embeds nested deeper are filtered by the same allow-list.
      survivors.push(stripDisallowedHtmlEmbedNodes(child, allowedSources));
    } else if (sourceIsAllowed(child)) {
      // Allowed embed: keep a shallow copy. A disallowed one is simply dropped.
      survivors.push({ ...child });
    }
  }

  return { ...root, content: survivors } as unknown as T;
}
/**
* Returns true if the document contains at least one `htmlEmbed` node anywhere
* in its tree. Useful to decide whether a strip pass actually changed anything
* (e.g. for logging a rejected non-admin embed attempt).
* in its tree. Useful to decide whether a strip pass on the share read path
* actually changed anything. After the write-path role gate removal this is no
* longer called by production code; it is retained as a test-only assertion
* helper (and a detection primitive should a future read path need it).
*/
export function hasHtmlEmbedNode(pmJson: unknown): boolean {
if (!pmJson || typeof pmJson !== 'object') {
@@ -169,62 +61,9 @@ export function hasHtmlEmbedNode(pmJson: unknown): boolean {
}
/**
* Map the workspace user role to whether it may author `htmlEmbed` nodes.
* Owners and admins are trusted; everyone else (member, and any unknown role)
* is not. Kept here so every write path shares one definition of "trusted".
*/
export function canAuthorHtmlEmbed(role: string | null | undefined): boolean {
  // Exact, case-sensitive match on the two trusted role strings; everything
  // else (including null/undefined) falls through to the deny branch.
  switch (role) {
    case 'owner':
    case 'admin':
      return true;
    default:
      return false;
  }
}
/**
* Combined write-path gate for the htmlEmbed feature.
*
* htmlEmbed is allowed in a document only when the workspace feature toggle is
* ON and the authoring/saving user is a workspace admin/owner. OFF (default) =>
* stripped for EVERYONE, including admins (the feature is disabled).
*
* `featureEnabled` is read from the workspace settings for the relevant write
* (`workspace.settings?.htmlEmbed === true`). Every WRITE path that may persist
* htmlEmbed content must gate on this combined predicate, so that turning the
* toggle OFF strips existing embeds on the next save and prevents new ones from
* being persisted regardless of role.
*/
export function htmlEmbedAllowed(
  featureEnabled: boolean,
  role: string | null | undefined,
): boolean {
  // The toggle must be literally `true`; when it is not, the role is never
  // consulted.
  if (featureEnabled !== true) {
    return false;
  }
  return canAuthorHtmlEmbed(role);
}
/**
* Strip htmlEmbed nodes unless the (feature-enabled AND role-allowed) gate
* passes. Returns the possibly-stripped doc. The caller resolves featureEnabled
* (from workspace settings) and role (actor) itself — those legitimately differ
* per call-site (e.g. share path uses role=null) — this helper owns only the
* has-check + AND + strip + optional onStrip callback.
*
* Centralizes the 4-step write-path ritual (resolve role -> resolve
* featureEnabled -> htmlEmbedAllowed AND -> stripHtmlEmbedNodes) so the plain
* strip-all call-sites share one tested decision. Sites with CUSTOM strip logic
* (e.g. the collab persist path's preserve-admin variant) keep their own code.
*/
export function stripHtmlEmbedIfNotAllowed<T>(
  json: T,
  opts: { featureEnabled: boolean; role: string | null | undefined; onStrip?: () => void },
): T {
  const permitted = htmlEmbedAllowed(opts.featureEnabled, opts.role);

  // Nothing to do when the gate passes, or when there is no embed to strip.
  // The embed scan is skipped entirely when the gate passes.
  if (permitted || !hasHtmlEmbedNode(json)) {
    return json;
  }

  // Notify the caller (if it asked to be told) before producing the stripped doc.
  opts.onStrip?.();
  return stripHtmlEmbedNodes(json);
}
/**
* Read the workspace-level htmlEmbed feature toggle from a workspace's settings
* jsonb. ABSENT/non-true => OFF (the default). Kept here so every server write
* path resolves the toggle the same way.
* Read the workspace-level htmlEmbed master toggle from a workspace's settings
* jsonb. ABSENT/non-true => OFF (the default). Kept here so the share read path
* resolves the toggle the same way it is persisted.
*/
export function isHtmlEmbedFeatureEnabled(
settings: unknown | null | undefined,

View File

@@ -65,21 +65,19 @@ export const MAX_SHARE_MESSAGES = 30;
export const MAX_SHARE_MESSAGE_CHARS = 8000;
/**
* Default per-request output cap for the anonymous share assistant. Bounds the
* tokens a single anonymous request can generate; worst case = steps x this.
*/
export const SHARE_AI_MAX_OUTPUT_TOKENS = 512;
/**
* Read the per-request output cap from the environment (overridable seam),
* falling back to the sane default. A non-positive / unparseable value uses the
* default. Mirrors resolveShareAiWorkspaceMax().
* Per-request output-token ceiling for the anonymous assistant. `streamText`
* runs up to `stepCountIs(5)` steps, so the worst-case output of one accepted
* request is bounded by (steps × this). The per-workspace cap bounds the COUNT
* of calls; this bounds the SIZE of each, so a single anonymous call cannot run
* up the provider bill even if the per-IP throttle is evaded. Env-overridable
* seam; a non-positive or unparseable value falls back to the default.
*/
export const SHARE_AI_MAX_OUTPUT_TOKENS_DEFAULT = 512;
export function resolveShareAiMaxOutputTokens(): number {
const raw = Number(process.env.SHARE_AI_MAX_OUTPUT_TOKENS);
return Number.isFinite(raw) && raw > 0
? Math.floor(raw)
: SHARE_AI_MAX_OUTPUT_TOKENS;
: SHARE_AI_MAX_OUTPUT_TOKENS_DEFAULT;
}
/**
@@ -225,8 +223,8 @@ export class PublicShareChatService {
tools,
// Bound the agent loop for anonymous callers.
stopWhen: stepCountIs(5),
// Bounds per-request output so one anonymous request can't run up the
// provider bill; worst case = steps x this.
// Cap per-request output so one anonymous call cannot run up the provider
// bill even if the per-IP throttle is evaded; worst case = steps × this.
maxOutputTokens: resolveShareAiMaxOutputTokens(),
abortSignal: signal,
onError: ({ error }) => {

View File

@@ -5,6 +5,8 @@ import { buildShareSystemPrompt } from './public-share-chat.prompt';
import {
PublicShareChatService,
filterShareTranscript,
resolveShareAiMaxOutputTokens,
SHARE_AI_MAX_OUTPUT_TOKENS_DEFAULT,
} from './public-share-chat.service';
import { PublicShareChatToolsService } from './tools/public-share-chat-tools.service';
import {
@@ -400,6 +402,44 @@ describe('resolveShareAiWorkspaceMax (env-overridable per-workspace cap)', () =>
});
});
// Suite for resolveShareAiMaxOutputTokens: drives the function purely through
// the SHARE_AI_MAX_OUTPUT_TOKENS environment variable and checks the default
// fallback and the flooring of valid values.
describe('resolveShareAiMaxOutputTokens (env-overridable per-request output cap)', () => {
const ENV = 'SHARE_AI_MAX_OUTPUT_TOKENS';
// Snapshot taken once, when the suite is defined, so it can be restored.
const original = process.env[ENV];
afterEach(() => {
// Restore the pre-suite value (or its absence) after every case.
if (original === undefined) delete process.env[ENV];
else process.env[ENV] = original;
});
it('falls back to the default when unset', () => {
delete process.env[ENV];
expect(resolveShareAiMaxOutputTokens()).toBe(
SHARE_AI_MAX_OUTPUT_TOKENS_DEFAULT,
);
// Pins the numeric value of the default itself.
expect(SHARE_AI_MAX_OUTPUT_TOKENS_DEFAULT).toBe(512);
});
it('uses (and floors) a valid positive value from the env', () => {
process.env[ENV] = '1024.9';
expect(resolveShareAiMaxOutputTokens()).toBe(1024);
});
it('falls back to the default for zero, a negative, or a non-numeric value', () => {
process.env[ENV] = '0';
expect(resolveShareAiMaxOutputTokens()).toBe(
SHARE_AI_MAX_OUTPUT_TOKENS_DEFAULT,
);
process.env[ENV] = '-5';
expect(resolveShareAiMaxOutputTokens()).toBe(
SHARE_AI_MAX_OUTPUT_TOKENS_DEFAULT,
);
process.env[ENV] = 'not-a-number';
expect(resolveShareAiMaxOutputTokens()).toBe(
SHARE_AI_MAX_OUTPUT_TOKENS_DEFAULT,
);
});
});
describe('PublicShareWorkspaceLimiter (cluster-wide sliding-window per-workspace cap)', () => {
it('allows up to the cap within a window, then 429s (returns false)', async () => {
const limiter = makeLimiter(3, 60_000, () => 1_000);
@@ -482,9 +522,11 @@ describe('PublicShareWorkspaceLimiter (cluster-wide sliding-window per-workspace
});
it('FAILS CLOSED (returns false) when the Redis eval rejects', async () => {
// FAIL CLOSED (#62): if Redis is down we cannot prove the workspace is under
// its cap, so DENY (the controller 429s) rather than admit an unmetered,
// billable anonymous call. The feature is optional, so denial is harmless.
// The per-workspace cap is the COST backstop for an OPTIONAL anonymous
// assistant. If Redis is unavailable we cannot prove the workspace is under
// its cap, so we DENY (controller 429s) rather than admit an unmetered,
// billable call — a brief Redis blip disabling the assistant is safer than
// an unbounded provider bill.
const failingRedis = {
eval: () => Promise.reject(new Error('redis down')),
} as unknown as import('ioredis').Redis;

View File

@@ -99,11 +99,11 @@ export class PublicShareWorkspaceLimiter {
/**
* Account one call for `key`. Returns true if it is within the cap (allowed),
* false if the cap over the trailing window is exceeded (caller must 429).
* On a Redis failure we FAIL CLOSED (return false): if Redis is down we cannot
* prove the workspace is under its cap, so we DENY rather than admit an
* unmetered, billable anonymous call. The feature is optional, so the
* temporary denial is harmless. (Operators wanting a tighter steady-state cap
* can lower the default via SHARE_AI_WORKSPACE_MAX_PER_HOUR, e.g. =100.)
* On a Redis failure we FAIL CLOSED (return false): this cap is the COST
* backstop for an OPTIONAL anonymous assistant, so when Redis is unavailable we
* cannot prove the workspace is under its cap and therefore DENY rather than
* admit an unmetered, billable anonymous call. A transient Redis blip briefly
* disabling the assistant is preferable to an unbounded provider bill.
*/
async tryConsume(key: string): Promise<boolean> {
const t = this.now();
@@ -122,9 +122,11 @@ export class PublicShareWorkspaceLimiter {
);
return admitted === 1;
} catch (err) {
// FAIL CLOSED: if Redis is down we cannot prove the workspace is under its
// cap, so DENY (controller 429s) rather than admit an unmetered, billable
// anonymous call. The feature is optional, so denial is harmless.
// FAIL CLOSED: when Redis is unavailable we cannot prove the workspace is
// under its cap, so we DENY (the controller 429s) rather than admit an
// unmetered, billable anonymous call. The assistant is optional, so a
// transient Redis blip briefly disabling it is the safer failure mode than
// an unbounded provider bill.
this.logger.error(
`share-ai workspace limiter Redis failure for key "${key}"; failing closed`,
err as Error,

View File

@@ -10,7 +10,6 @@ describe('PageController', () => {
controller = new PageController(
{} as any, // pageService
{} as any, // pageRepo
{} as any, // workspaceRepo
{} as any, // pageHistoryService
{} as any, // spaceAbility
{} as any, // pageAccessService

View File

@@ -39,11 +39,6 @@ import {
} from '../casl/interfaces/space-ability.type';
import SpaceAbilityFactory from '../casl/abilities/space-ability.factory';
import { PageRepo } from '@docmost/db/repos/page/page.repo';
import { WorkspaceRepo } from '@docmost/db/repos/workspace/workspace.repo';
import {
isHtmlEmbedFeatureEnabled,
stripHtmlEmbedNodes,
} from '../../common/helpers/prosemirror/html-embed.util';
import { RecentPageDto } from './dto/recent-page.dto';
import { CreatedByUserDto } from './dto/created-by-user.dto';
import { DuplicatePageDto } from './dto/duplicate-page.dto';
@@ -68,7 +63,6 @@ export class PageController {
constructor(
private readonly pageService: PageService,
private readonly pageRepo: PageRepo,
private readonly workspaceRepo: WorkspaceRepo,
private readonly pageHistoryService: PageHistoryService,
private readonly spaceAbility: SpaceAbilityFactory,
private readonly pageAccessService: PageAccessService,
@@ -98,18 +92,6 @@ export class PageController {
const permissions = { canEdit, hasRestriction };
if (page.content) {
const workspace = await this.workspaceRepo.findById(page.workspaceId);
if (!isHtmlEmbedFeatureEnabled(workspace?.settings)) {
// Kill-switch: when the workspace feature is OFF, never serve raw
// htmlEmbed nodes on the read path (mirrors the public-share strip),
// so disabling the feature is an immediate, total kill-switch and not
// dependent on the page being re-saved. Admin-authored content only.
// Fail-closed: a missing workspace resolves to OFF and is stripped.
page.content = stripHtmlEmbedNodes(page.content) as any;
}
}
if (dto.format && dto.format !== 'json' && page.content) {
const contentOutput =
dto.format === 'markdown'
@@ -255,9 +237,6 @@ export class PageController {
user.id,
workspace.id,
createPageDto,
// Pass the caller's workspace role so create() can enforce the htmlEmbed
// admin gate (non-admins cannot author raw-JS embeds).
user.role,
provenance,
);
@@ -554,16 +533,6 @@ export class PageController {
await this.pageAccessService.validateCanView(page, user);
if (history.content) {
const workspace = await this.workspaceRepo.findById(page.workspaceId);
if (!isHtmlEmbedFeatureEnabled(workspace?.settings)) {
// Kill-switch: history snapshots are an authenticated read path too, so
// strip htmlEmbed when the workspace feature is OFF (same as /info and
// the public-share path). Fail-closed on a missing workspace.
history.content = stripHtmlEmbedNodes(history.content) as any;
}
}
return history;
}

View File

@@ -1,240 +0,0 @@
// Exercises the REAL PageService htmlEmbed admin gate on its two non-collab
// write paths: PageService.create() and PageService.duplicatePage(). Both build
// content/textContent/ydoc directly and persist, bypassing the collab
// onStoreDocument strip, so each must run the incoming document through the
// toggle-AND-admin gate (`htmlEmbedAllowed(featureEnabled, role)` -> if not
// allowed, `stripHtmlEmbedNodes`) BEFORE persisting.
//
// This spec constructs the REAL PageService with every constructor dep mocked,
// feeds content containing an `htmlEmbed`, calls the real method, and asserts on
// the PERSISTED content (captured at the repo insert / db insert boundary) that
// the embed was actually stripped (member/unknown role) or preserved
// (admin/owner + toggle ON). Mirrors the GOOD pattern in
// transclusion/spec/transclusion-unsync-html-embed.spec.ts.
//
// page.service.ts pulls in the collaboration gateway (a transitive ESM chain
// `lib0/decoding.js` that jest's transformIgnorePatterns does not transpile), so
// that single module is mocked away — it is never used on the create/duplicate
// gate paths.
jest.mock('../../../collaboration/collaboration.gateway', () => ({
CollaborationGateway: class {},
}));
import { PageService } from './page.service';
import { hasHtmlEmbedNode } from '../../../common/helpers/prosemirror/html-embed.util';
// Fixed identifiers shared by every test in this spec.
// Workspace id: returned by the mocked workspaceRepo and set on the source page / auth user.
const WS = 'ws-1';
// Space id used for the create DTO and for the duplicated source page.
const SPACE = 'space-1';
// Id of the acting user (first argument of create(), `id` of the duplicating user).
const USER = 'u1';
/**
 * Builds a fresh ProseMirror-JSON document on every call: one plain paragraph
 * (text "body") followed by one `htmlEmbed` node carrying a script payload.
 */
const docWithEmbed = () => {
  const paragraph = {
    type: 'paragraph',
    content: [{ type: 'text', text: 'body' }],
  };
  const embed = {
    type: 'htmlEmbed',
    attrs: { source: '<script>alert(1)</script>' },
  };
  return { type: 'doc', content: [paragraph, embed] };
};
// Tiny chainable stand-in for the kysely handle. `nextPagePosition` (used by
// create) and duplicatePage's bulk insert both go through `this.db`, so only the
// builder calls those paths issue need to work. Every row passed to
// `insertInto('pages').values(...)` is appended to `capturedInserts`, letting the
// tests assert on what would have been persisted.
function buildDb(capturedInserts: any[]) {
  // Read side: each builder step returns the same object so calls can be chained
  // in any order; the terminal calls resolve to "no rows".
  const readChain: any = {};
  for (const step of ['select', 'selectAll', 'where', 'orderBy', 'limit']) {
    readChain[step] = () => readChain;
  }
  readChain.execute = async () => [];
  readChain.executeTakeFirst = async () => undefined;

  // Accepts either a single row or an array of rows.
  const recordPageRows = (rows: any) => {
    const list = Array.isArray(rows) ? rows : [rows];
    list.forEach((row: any) => {
      capturedInserts.push(row);
    });
  };

  const db: any = {
    selectFrom: () => readChain,
    insertInto(table: string) {
      return {
        values(rows: any) {
          // Only inserts into the `pages` table are recorded.
          if (table === 'pages') {
            recordPageRows(rows);
          }
          return { execute: async () => undefined };
        },
      };
    },
    // executeTx -> db.transaction().execute(cb): the callback receives `db`
    // itself as the transaction, so inserts made inside a tx are captured too.
    transaction() {
      return { execute: async (cb: any) => cb(db) };
    },
  };
  return db;
}
// Constructs the REAL PageService with each of its 13 constructor dependencies
// replaced by a mock. `featureEnabled` is the value the mocked workspace reports
// at `settings.htmlEmbed` (read through workspaceRepo.findById); `rootPage`, when
// given, is the single source page offered to duplicatePage.
function buildService(opts: {
  featureEnabled: boolean;
  capturedInserts: any[];
  rootPage?: any; // for duplicatePage
}) {
  const { featureEnabled, capturedInserts, rootPage } = opts;
  // Factory for mocks whose resolved value is irrelevant to the tests.
  const resolvesUndefined = () => jest.fn(async () => undefined);

  const pageRepo: any = {
    // No parent page exists in the create tests.
    findById: jest.fn(async () => null),
    // create() persists through here: record the row, then echo it back with ids.
    insertPage: jest.fn(async (row: any) => {
      capturedInserts.push(row);
      return { id: 'new-page', slugId: 'slug-1', ...row };
    }),
    getPageAndDescendants: jest.fn(async () => (rootPage ? [rootPage] : [])),
  };

  const pagePermissionRepo: any = {
    // duplicatePage filters accessible pages; grant the root so it is copied.
    filterAccessiblePageIds: jest.fn(async () =>
      rootPage ? [rootPage.id] : [],
    ),
  };

  const workspaceRepo: any = {
    findById: jest.fn(async () => ({
      id: WS,
      settings: { htmlEmbed: featureEnabled },
    })),
  };

  const attachmentRepo: any = { findByIds: jest.fn(async () => []) };
  const storageService: any = { copy: resolvesUndefined() };
  // One queue mock stands in for the attachment, AI and general queues.
  const noopQueue: any = { add: resolvesUndefined() };
  const eventEmitter: any = { emit: jest.fn() };
  const collaborationGateway: any = {};
  const watcherService: any = {};

  // duplicatePage fires transclusion bulk inserts after persisting; they are
  // best-effort (wrapped in try/catch) and irrelevant to the gate.
  const transclusionService: any = {
    insertTransclusionsForPages: resolvesUndefined(),
    insertReferencesForPages: resolvesUndefined(),
    insertTemplateReferencesForPages: resolvesUndefined(),
  };

  const db = buildDb(capturedInserts);

  // Argument order must match the PageService constructor exactly.
  return new PageService(
    pageRepo,
    pagePermissionRepo,
    attachmentRepo,
    db,
    storageService,
    noopQueue, // attachmentQueue
    noopQueue, // aiQueue
    noopQueue, // generalQueue
    eventEmitter,
    collaborationGateway,
    watcherService,
    transclusionService,
    workspaceRepo,
  );
}
describe('PageService.create htmlEmbed admin gate (real code)', () => {
  // Calls the real create() with an embed-bearing JSON document and returns the
  // `content` of the single row captured at the persistence boundary.
  async function persistedContent(
    featureEnabled: boolean,
    callerRole: string | null | undefined,
  ) {
    const rows: any[] = [];
    const pageService = buildService({ featureEnabled, capturedInserts: rows });
    // 'json' format is used as-is by parseProsemirrorContent (passed to the
    // real jsonToNode schema validation), so hand it the PM-JSON object.
    const dto: any = {
      spaceId: SPACE,
      title: 'p',
      content: docWithEmbed(),
      format: 'json',
    };
    await pageService.create(USER, WS, dto, callerRole);
    expect(rows).toHaveLength(1);
    const [inserted] = rows;
    return inserted.content;
  }

  it('toggle ON + member: persisted content has htmlEmbed stripped', async () => {
    const stored = await persistedContent(true, 'member');
    expect(hasHtmlEmbedNode(stored)).toBe(false);
    // The ordinary paragraph text must still be present.
    expect(JSON.stringify(stored)).toContain('body');
  });

  it('toggle ON + admin: persisted content keeps the htmlEmbed', async () => {
    const stored = await persistedContent(true, 'admin');
    expect(hasHtmlEmbedNode(stored)).toBe(true);
  });

  it('toggle ON + owner: persisted content keeps the htmlEmbed', async () => {
    const stored = await persistedContent(true, 'owner');
    expect(hasHtmlEmbedNode(stored)).toBe(true);
  });

  it('toggle OFF + admin: stripped (feature disabled for everyone)', async () => {
    const stored = await persistedContent(false, 'admin');
    expect(hasHtmlEmbedNode(stored)).toBe(false);
  });

  it('unknown/empty role: fails closed (stripped)', async () => {
    const nonAdminRoles = [undefined, null, 'viewer'] as const;
    for (const role of nonAdminRoles) {
      const stored = await persistedContent(true, role);
      expect(hasHtmlEmbedNode(stored)).toBe(false);
    }
  });
});
describe('PageService.duplicatePage htmlEmbed admin gate (real code)', () => {
  // The one source page handed to duplicatePage; its content carries an embed.
  const buildSourcePage = (): any => ({
    id: 'src-page',
    slugId: 'src-slug',
    title: 'Source',
    icon: null,
    position: 'a0',
    spaceId: SPACE,
    workspaceId: WS,
    parentPageId: null,
    content: docWithEmbed(),
  });

  // Duplicates the source page as a user with the given role and returns the
  // content of the first captured row that has content (rows are captured at
  // db.insertInto('pages').values(...)).
  async function persistedContent(
    featureEnabled: boolean,
    role: string | null | undefined,
  ) {
    const rootPage = buildSourcePage();
    const capturedInserts: any[] = [];
    const pageService = buildService({
      featureEnabled,
      capturedInserts,
      rootPage,
    });
    const actingUser: any = { id: USER, workspaceId: WS, role };

    await pageService.duplicatePage(rootPage, undefined, actingUser);

    // The bulk insert is the page persist boundary; one source page -> one copy.
    const rowsWithContent = capturedInserts.filter((row) => row.content);
    expect(rowsWithContent.length).toBeGreaterThanOrEqual(1);
    const [firstCopy] = rowsWithContent;
    return firstCopy.content;
  }

  it('toggle ON + member: persisted copy has htmlEmbed stripped', async () => {
    const stored = await persistedContent(true, 'member');
    expect(hasHtmlEmbedNode(stored)).toBe(false);
    expect(JSON.stringify(stored)).toContain('body');
  });

  it('toggle ON + admin: persisted copy keeps the htmlEmbed', async () => {
    const stored = await persistedContent(true, 'admin');
    expect(hasHtmlEmbedNode(stored)).toBe(true);
  });

  it('toggle ON + owner: persisted copy keeps the htmlEmbed', async () => {
    const stored = await persistedContent(true, 'owner');
    expect(hasHtmlEmbedNode(stored)).toBe(true);
  });

  it('toggle OFF + admin: stripped (feature disabled for everyone)', async () => {
    const stored = await persistedContent(false, 'admin');
    expect(hasHtmlEmbedNode(stored)).toBe(false);
  });

  it('unknown/empty role: fails closed (stripped)', async () => {
    const nonAdminRoles = [undefined, null, 'viewer'] as const;
    for (const role of nonAdminRoles) {
      const stored = await persistedContent(true, role);
      expect(hasHtmlEmbedNode(stored)).toBe(false);
    }
  });
});

View File

@@ -20,7 +20,6 @@ describe('PageService', () => {
{} as any, // collaborationGateway
{} as any, // watcherService
{} as any, // transclusionService
{} as any, // workspaceRepo
);
});

View File

@@ -31,11 +31,6 @@ import {
isAttachmentNode,
removeMarkTypeFromDoc,
} from '../../../common/helpers/prosemirror/utils';
import {
isHtmlEmbedFeatureEnabled,
stripHtmlEmbedIfNotAllowed,
} from '../../../common/helpers/prosemirror/html-embed.util';
import { WorkspaceRepo } from '@docmost/db/repos/workspace/workspace.repo';
import {
htmlToJson,
jsonToNode,
@@ -81,7 +76,6 @@ export class PageService {
private collaborationGateway: CollaborationGateway,
private readonly watcherService: WatcherService,
private readonly transclusionService: TransclusionService,
private readonly workspaceRepo: WorkspaceRepo,
) {}
async findById(
@@ -101,10 +95,6 @@ export class PageService {
userId: string,
workspaceId: string,
createPageDto: CreatePageDto,
// Workspace role of the caller. Used to enforce the htmlEmbed admin gate on
// the create write path (see below). Optional/typed loosely so unknown or
// missing roles fall through to the non-admin (strip) branch by default.
callerRole?: string | null,
// Optional agent-edit provenance (from the signed access claim). When the
// actor is 'agent', stamp the page's source marker so a freshly created page
// shows it was created by the AI agent (§14 N2) — create goes through REST,
@@ -135,35 +125,11 @@ export class PageService {
let ydoc = undefined;
if (createPageDto?.content && createPageDto?.format) {
let prosemirrorJson = await this.parseProsemirrorContent(
const prosemirrorJson = await this.parseProsemirrorContent(
createPageDto.content,
createPageDto.format,
);
// SECURITY (Variant C admin gate, plain page-create write path):
// create() builds content/textContent/ydoc directly and persists them via
// insertPage, bypassing the collab onStoreDocument strip. htmlEmbed renders
// raw, unsanitized JS in readers' browsers, so only workspace admins/owners
// may author it. The create controller requires only space Edit, so a
// regular member could otherwise POST a doc (json, or the markdown/html
// <!--html-embed:BASE64--> forms that parse to the same node) containing an
// htmlEmbed and store XSS for every reader. Strip every htmlEmbed node when
// the caller is not an admin, BEFORE deriving textContent/ydoc/insert.
// The gate is toggle-AND-admin: htmlEmbed survives only when the workspace
// feature toggle is ON and the caller is an admin/owner. OFF (default) =>
// stripped for everyone. Cheap settings read keyed to the workspace.
const htmlEmbedEnabled = isHtmlEmbedFeatureEnabled(
(await this.workspaceRepo.findById(workspaceId))?.settings,
);
prosemirrorJson = stripHtmlEmbedIfNotAllowed(prosemirrorJson, {
featureEnabled: htmlEmbedEnabled,
role: callerRole,
onStrip: () =>
this.logger.warn(
`Stripping htmlEmbed node(s) from page creation by user ${userId} (space ${createPageDto.spaceId})`,
),
});
content = prosemirrorJson;
textContent = jsonToText(prosemirrorJson);
ydoc = createYdocFromJson(prosemirrorJson);
@@ -653,12 +619,6 @@ export class PageService {
const attachmentMap = new Map<string, ICopyPageAttachment>();
// Resolve the htmlEmbed toggle ONCE for the workspace; the per-page gate
// below is toggle-AND-admin (OFF default => stripped for everyone).
const htmlEmbedEnabled = isHtmlEmbedFeatureEnabled(
(await this.workspaceRepo.findById(rootPage.workspaceId))?.settings,
);
const insertablePages: InsertablePage[] = await Promise.all(
pages.map(async (page) => {
const pageContent = getProsemirrorContent(page.content);
@@ -769,24 +729,7 @@ export class PageService {
}
});
let prosemirrorJson = prosemirrorDoc.toJSON();
// SECURITY (Variant C admin gate, duplication write path):
// Duplication builds the ydoc directly and bypasses the collab
// onStoreDocument strip. htmlEmbed renders raw, unsanitized JS in
// readers' browsers, so only workspace admins/owners may author it. A
// non-admin with space Edit could otherwise duplicate an admin page
// that contains an embed into a new page authored by them. Strip every
// htmlEmbed node from each duplicated page when the duplicating user is
// not an admin, BEFORE computing textContent/ydoc/insert.
prosemirrorJson = stripHtmlEmbedIfNotAllowed(prosemirrorJson, {
featureEnabled: htmlEmbedEnabled,
role: authUser.role,
onStrip: () =>
this.logger.warn(
`Stripping htmlEmbed node(s) from page duplication by user ${authUser.id} (source page ${page.id})`,
),
});
const prosemirrorJson = prosemirrorDoc.toJSON();
// Add "Copy of " prefix to the root page title only for duplicates in same space
let title = page.title;

View File

@@ -68,7 +68,6 @@ describe('TransclusionService — template access core (real filter)', () => {
{} as any, // attachmentRepo
{} as any, // storageService
{} as any, // pageAccessService
{} as any, // workspaceRepo
);
return { service, db, pageRepo, spaceMemberRepo, pagePermissionRepo };
@@ -227,7 +226,6 @@ describe('TransclusionService.filterViewerAccessiblePageIds — AND ordering (co
{} as any, // attachmentRepo
{} as any, // storageService
{} as any, // pageAccessService
{} as any, // workspaceRepo
);
return { service, filterAccessiblePageIds };
@@ -324,7 +322,6 @@ describe('TransclusionService.syncPageTemplateReferences — workspace scoping',
{} as any, // attachmentRepo
{} as any, // storageService
{} as any, // pageAccessService
{} as any, // workspaceRepo
);
return {
@@ -471,7 +468,6 @@ describe('TransclusionService.insertTemplateReferencesForPages — per-workspace
{} as any, // attachmentRepo
{} as any, // storageService
{} as any, // pageAccessService
{} as any, // workspaceRepo
);
return { service, insertMany };
}

View File

@@ -41,7 +41,6 @@ describe('TransclusionService.lookupTemplate — anti-leak catch branch', () =>
{} as any, // attachmentRepo
{} as any, // storageService
{} as any, // pageAccessService
{} as any, // workspaceRepo
);
// Stub the access decision; we are testing the content-prep stage, not access.
@@ -158,7 +157,6 @@ describe('TransclusionService.lookupTemplate — soft-deleted source via real fi
{} as any,
{} as any,
{} as any,
{} as any,
);
const { items } = await service.lookupTemplate(['deleted-src'], 'u1', 'w1');

View File

@@ -35,7 +35,6 @@ describe('TransclusionService.lookupTemplate (access mapping)', () => {
{} as any, // attachmentRepo
{} as any, // storageService
{} as any, // pageAccessService
{} as any, // workspaceRepo
);
jest

View File

@@ -57,7 +57,6 @@ function buildService(opts: {
{} as any, // attachmentRepo
{} as any, // storageService
{} as any, // pageAccessService
{} as any, // workspaceRepo
);
}

View File

@@ -1,145 +0,0 @@
import { TransclusionService } from '../transclusion.service';
import { hasHtmlEmbedNode } from '../../../../common/helpers/prosemirror/html-embed.util';
// Exercises the REAL TransclusionService.unsyncReference htmlEmbed admin gate.
// unsync returns a source snapshot the client materializes into the reference
// page; a non-admin must never receive an embed payload to re-persist. The gate
// reads `user.role` and strips before returning. All repos / access checks are
// mocked so the REAL gate logic runs end-to-end. Complements the existing
// transclusion specs (rewriteAttachmentsForUnsync, controller).
// Fixed identifiers shared by every test in this spec.
// Workspace id: reported by the mocked repos and set on the requesting user.
const WS = 'ws-1';
// Reference page id (first argument of unsyncReference).
const REF_PAGE = 'ref-1';
// Source page id (second argument of unsyncReference).
const SRC_PAGE = 'src-1';
// Transclusion id (third argument of unsyncReference).
const TX_ID = 'tx-1';
/**
 * Builds a fresh source snapshot on every call: one paragraph with the text
 * "snapshot body" followed by one `htmlEmbed` node carrying a script payload.
 */
const sourceContentWithEmbed = () => {
  const paragraph = {
    type: 'paragraph',
    content: [{ type: 'text', text: 'snapshot body' }],
  };
  const embed = {
    type: 'htmlEmbed',
    attrs: { source: '<script>steal()</script>' },
  };
  return { type: 'doc', content: [paragraph, embed] };
};
// Constructs the REAL TransclusionService with mocked collaborators.
// `featureEnabled` is the value the mocked workspace reports at
// `settings.htmlEmbed`; it defaults to ON.
function buildService(featureEnabled = true) {
  // Factory for mocks whose resolved value does not matter here.
  const resolvesUndefined = () => jest.fn(async () => undefined);
  // Fresh empty placeholder for each dependency this path never touches.
  const unused = (): any => ({});

  // Any page id resolves to a live (not soft-deleted) page in the workspace.
  const pageRepo: any = {
    findById: jest.fn(async (id: string) => ({
      id,
      workspaceId: WS,
      spaceId: 'space-1',
      deletedAt: null,
    })),
  };

  // The stored transclusion snapshot always contains an htmlEmbed.
  const pageTransclusionsRepo: any = {
    findByPageAndTransclusion: jest.fn(async () => ({
      content: sourceContentWithEmbed(),
    })),
  };

  const pageTransclusionReferencesRepo: any = {
    deleteOne: resolvesUndefined(),
  };
  const attachmentRepo: any = { findByIds: jest.fn(async () => []) };
  const storageService: any = { copy: resolvesUndefined() };

  // Both access checks resolve without throwing.
  const pageAccessService: any = {
    validateCanEdit: resolvesUndefined(),
    validateCanView: resolvesUndefined(),
  };

  // Workspace settings read used by the toggle-AND-admin gate.
  const workspaceRepo: any = {
    findById: jest.fn(async () => ({
      id: WS,
      settings: { htmlEmbed: featureEnabled },
    })),
  };

  // Argument order must match the TransclusionService constructor exactly.
  return new TransclusionService(
    unused(), // db (unused on this path)
    pageTransclusionsRepo,
    pageTransclusionReferencesRepo,
    unused(), // pageTemplateReferencesRepo (unused on this path)
    pageRepo,
    unused(), // pagePermissionRepo (unused)
    unused(), // spaceMemberRepo (unused)
    attachmentRepo,
    storageService,
    pageAccessService,
    workspaceRepo,
  );
}
/** Returns a minimal user object (id, workspace id, the given role), typed loosely. */
function userWithRole(role: string | null | undefined) {
  const user: any = { id: 'u1', workspaceId: WS, role };
  return user;
}
describe('TransclusionService.unsyncReference htmlEmbed admin gate (real code)', () => {
  // Builds a fresh service, runs the real unsyncReference as a user with the
  // given role, and returns the `content` field of its result.
  async function unsyncedContent(
    role: string | null | undefined,
    featureEnabled = true,
  ) {
    const service = buildService(featureEnabled);
    const result = await service.unsyncReference(
      REF_PAGE,
      SRC_PAGE,
      TX_ID,
      userWithRole(role),
    );
    return result.content;
  }

  it('non-admin (member): returned content has htmlEmbed stripped', async () => {
    const content = await unsyncedContent('member');
    expect(hasHtmlEmbedNode(content)).toBe(false);
    // The ordinary paragraph text must still be present.
    expect(JSON.stringify(content)).toContain('snapshot body');
  });

  it('unknown/empty role: fails closed (stripped)', async () => {
    const nonAdminRoles = [undefined, null, 'viewer'] as const;
    for (const role of nonAdminRoles) {
      const content = await unsyncedContent(role);
      expect(hasHtmlEmbedNode(content)).toBe(false);
    }
  });

  it('toggle ON + admin: returned content keeps the htmlEmbed', async () => {
    const content = await unsyncedContent('admin', true);
    expect(hasHtmlEmbedNode(content)).toBe(true);
  });

  it('toggle ON + owner: returned content keeps the htmlEmbed', async () => {
    const content = await unsyncedContent('owner', true);
    expect(hasHtmlEmbedNode(content)).toBe(true);
  });

  it('toggle OFF + admin: stripped (feature disabled for everyone)', async () => {
    const content = await unsyncedContent('admin', false);
    expect(hasHtmlEmbedNode(content)).toBe(false);
  });

  it('toggle OFF + member: stripped', async () => {
    const content = await unsyncedContent('member', false);
    expect(hasHtmlEmbedNode(content)).toBe(false);
  });
});

View File

@@ -33,11 +33,6 @@ import {
import { jsonToNode } from '../../../collaboration/collaboration.util';
import { Page, User } from '@docmost/db/types/entity.types';
import { PageAccessService } from '../page-access/page-access.service';
import {
isHtmlEmbedFeatureEnabled,
stripHtmlEmbedIfNotAllowed,
} from '../../../common/helpers/prosemirror/html-embed.util';
import { WorkspaceRepo } from '@docmost/db/repos/workspace/workspace.repo';
type ReferencingPageInfo = {
id: string;
@@ -63,7 +58,6 @@ export class TransclusionService {
private readonly attachmentRepo: AttachmentRepo,
private readonly storageService: StorageService,
private readonly pageAccessService: PageAccessService,
private readonly workspaceRepo: WorkspaceRepo,
) {}
async syncPageTransclusions(
@@ -773,26 +767,6 @@ export class TransclusionService {
transclusionId,
);
// SECURITY (Variant C admin gate, transclusion unsync write path):
// The returned content is a source snapshot that the client materializes
// into the reference page via insertContentAt. The snapshot keeps any
// htmlEmbed verbatim, and unsync requires only space Edit/View. If the
// requesting user is not a workspace admin/owner, strip htmlEmbed nodes so a
// non-admin can never receive an embed payload to re-persist (the collab
// strip on the subsequent save is debounced/race-prone and must not be the
// only guard). Admin behavior is unchanged.
const htmlEmbedEnabled = isHtmlEmbedFeatureEnabled(
(await this.workspaceRepo.findById(user.workspaceId))?.settings,
);
content = stripHtmlEmbedIfNotAllowed(content, {
featureEnabled: htmlEmbedEnabled,
role: user.role,
onStrip: () =>
this.logger.warn(
`Stripping htmlEmbed node(s) from transclusion unsync by user ${user.id} (reference page ${referencePageId}, source page ${sourcePageId})`,
),
});
return { content };
}
}

View File

@@ -1,12 +1,14 @@
import { ShareService } from './share.service';
import { hasHtmlEmbedNode } from '../../common/helpers/prosemirror/html-embed.util';
// Exercises the REAL ShareService server-authoritative htmlEmbed kill-switch for
// shared content. An anonymous public-share viewer cannot read the per-workspace
// htmlEmbed toggle, so the SERVER must decide what to serve: when the toggle is
// OFF, htmlEmbed nodes are stripped from the shared doc; when ON they are kept so
// the read-only client executes them. All repos / token service are mocked so the
// real prepareContentForShare logic runs end-to-end via getSharedPage.
// Exercises the REAL ShareService server-authoritative htmlEmbed master toggle
// for shared content. The block renders inside a sandboxed iframe (harmless), so
// this is NOT an XSS guard — it is the master-toggle enforcement for anonymous
// shares: an anonymous public-share viewer cannot read the per-workspace
// htmlEmbed toggle, so the SERVER must decide what to serve. When the toggle is
// OFF, htmlEmbed nodes are stripped from the shared doc; when ON they are served
// and rendered in their sandboxed frame. All repos / token service are mocked so
// the real prepareContentForShare logic runs end-to-end via getSharedPage.
const WS = 'ws-1';
const PAGE = 'page-1';

View File

@@ -1,4 +1,4 @@
import { Controller, Get, Param, Req, Res } from '@nestjs/common';
import { Controller, Get, Logger, Param, Req, Res } from '@nestjs/common';
import { ShareService } from './share.service';
import { FastifyReply, FastifyRequest } from 'fastify';
import { join } from 'path';
@@ -11,6 +11,8 @@ import { htmlEscape } from '../../common/helpers/html-escaper';
@Controller('share')
export class ShareSeoController {
private readonly logger = new Logger(ShareSeoController.name);
constructor(
private readonly shareService: ShareService,
private workspaceRepo: WorkspaceRepo,
@@ -84,10 +86,34 @@ export class ShareSeoController {
.join('\n ');
const html = fs.readFileSync(indexFilePath, 'utf8');
const transformedHtml = html
let transformedHtml = html
.replace(/<title>[\s\S]*?<\/title>/i, `<title>${metaTitle}</title>`)
.replace(metaTagVar, metaTags);
// Deliberate same-origin tracker surface: this is the ONE place where an
// admin-authored analytics/tracker snippet (settings.trackerHead) is
// injected verbatim into the page origin. It is admin-only (writable only
// via the admin-gated workspace settings) and applies to PUBLIC SHARE
// pages only. It is trusted content, so it is NOT escaped. The htmlEmbed
// block itself is sandboxed and is the safe surface for everyone else.
const trackerHead = (workspace?.settings as any)?.trackerHead;
if (typeof trackerHead === 'string' && trackerHead.trim().length > 0) {
if (transformedHtml.includes('</head>')) {
// Function replacer: the snippet is admin-authored trusted content and
// must be injected verbatim. A string replacement would interpret `$&`,
// `$'`, `` $` `` and `$$` inside it as substitution patterns and mangle
// the tracker; a function return value is inserted literally.
transformedHtml = transformedHtml.replace(
'</head>',
() => `${trackerHead}\n</head>`,
);
} else {
this.logger.warn(
'trackerHead is configured but no </head> marker was found in the share index HTML; tracker snippet was not injected.',
);
}
}
res.type('text/html').send(transformedHtml);
}
}

View File

@@ -87,9 +87,16 @@ export class ShareController {
workspace.id,
);
// Resolve the identity name only when the assistant is enabled, so the
// anonymous widget can label messages with the configured persona name.
const aiAssistantName = aiAssistant
? await this.aiSettings.resolvePublicShareAssistantName(workspace.id)
: null;
return {
...shareData,
aiAssistant,
aiAssistantName,
features: this.licenseCheckService.resolveFeatures(
workspace.licenseKey,
workspace.plan,

View File

@@ -524,12 +524,14 @@ export class ShareService {
* not leak structure (existence, location, count, resolved state, or
* comment ids) to public viewers.
*
* 3. Strip `htmlEmbed` nodes when the workspace feature toggle is OFF. This
* makes the toggle a SERVER-AUTHORITATIVE kill-switch for shared content:
* when OFF the embed is never served to the anonymous viewer (who can't
* read the per-workspace toggle), when ON the embed is served so the
* read-only client executes it. `htmlEmbedEnabled` is resolved fail-closed
* by the callers (missing workspace => OFF => strip).
* 3. Strip `htmlEmbed` nodes when the workspace master toggle is OFF. The
* block renders inside a sandboxed iframe on the client (harmless, no
* same-origin access), so this is NOT an XSS guard — it is the
* SERVER-AUTHORITATIVE enforcement of the workspace master toggle for
* anonymous shares: an anonymous viewer cannot read the per-workspace
* toggle, so when OFF the block is never served, and when ON it is served
* and rendered in its sandboxed frame. `htmlEmbedEnabled` is resolved
* fail-closed by the callers (missing workspace => OFF => strip).
*
* Both share-content paths — the host page (`updatePublicAttachments`) and
* the share-scoped transclusion lookup (`lookupTransclusionForShare`) —
@@ -544,8 +546,9 @@ export class ShareService {
): Promise<Node | null> {
let pmJson = getProsemirrorContent(content);
// Kill-switch: when the workspace toggle is OFF, never serve htmlEmbed
// nodes to public viewers. Strip before tokenizing/serializing.
// Master-toggle enforcement: when the workspace toggle is OFF, never serve
// htmlEmbed nodes to anonymous public viewers (who cannot read the toggle).
// Strip before tokenizing/serializing.
if (!htmlEmbedEnabled) {
pmJson = stripHtmlEmbedNodes(pmJson);
}

View File

@@ -5,6 +5,8 @@ import {
IsBoolean,
IsInt,
IsOptional,
IsString,
MaxLength,
Min,
} from 'class-validator';
@@ -53,12 +55,22 @@ export class UpdateWorkspaceDto extends PartialType(CreateWorkspaceDto) {
@IsBoolean()
aiDictation: boolean;
// Workspace feature toggle for the admin-only HTML embed feature. Persisted at
// settings.htmlEmbed. ABSENT/false => OFF (default).
// Workspace master toggle that enables/disables the HTML embed block type.
// Persisted at settings.htmlEmbed. ABSENT/false => OFF (default). The block
// itself renders in a sandboxed iframe, so this is a feature switch, not a
// security gate.
@IsOptional()
@IsBoolean()
htmlEmbed: boolean;
// Admin-only analytics/tracker snippet (raw HTML/JS) injected verbatim into
// the <head> of PUBLIC SHARE pages only (same-origin). Persisted at
// settings.trackerHead. Admin-authored trusted content.
@IsOptional()
@IsString()
@MaxLength(20000)
trackerHead?: string;
@IsOptional()
@IsBoolean()
aiPublicShareAssistant: boolean;

View File

@@ -108,4 +108,38 @@ describe('WorkspaceService.update — htmlEmbed toggle persistence (real code)',
expect(logged.changes.before.htmlEmbed).toBe(false);
expect(logged.changes.after.htmlEmbed).toBe(true);
});
it('persists trackerHead via updateSetting with the trackerHead key', async () => {
  const snippet = '<script>ga()</script>';
  const { service, updateSetting } = buildService({});

  await service.update('w1', { trackerHead: snippet } as any);

  // Fourth argument is whatever update() forwards after the value; its
  // identity is not checked.
  expect(updateSetting).toHaveBeenCalledWith(
    'w1',
    'trackerHead',
    snippet,
    expect.anything(),
  );
});
it('does NOT call updateSetting when trackerHead is undefined in the dto', async () => {
  const { service, updateSetting } = buildService({});

  // The dto carries no settings key at all, only a name change.
  const dto: any = { name: 'New name' };
  await service.update('w1', dto);

  expect(updateSetting).not.toHaveBeenCalled();
});
it('audits the trackerHead change (before/after) when the value changes', async () => {
  const previous = '';
  const next = '<script>m()</script>';
  const { service, auditService } = buildService({
    settingsBefore: { trackerHead: previous },
  });

  await service.update('w1', { trackerHead: next } as any);

  expect(auditService.log).toHaveBeenCalledTimes(1);
  // First argument of the single audit call carries the before/after diff.
  const { before, after } = auditService.log.mock.calls[0][0].changes;
  expect(before.trackerHead).toBe(previous);
  expect(after.trackerHead).toBe(next);
});
});

View File

@@ -525,6 +525,22 @@ export class WorkspaceService {
);
}
if (typeof updateWorkspaceDto.trackerHead !== 'undefined') {
// Admin-only analytics/tracker snippet injected into the <head> of
// public share pages (same-origin). Persisted at settings.trackerHead.
const prev = (settingsBefore as any)?.trackerHead ?? '';
if (prev !== updateWorkspaceDto.trackerHead) {
before.trackerHead = prev;
after.trackerHead = updateWorkspaceDto.trackerHead;
}
await this.workspaceRepo.updateSetting(
workspaceId,
'trackerHead',
updateWorkspaceDto.trackerHead,
trx,
);
}
if (typeof updateWorkspaceDto.aiPublicShareAssistant !== 'undefined') {
const prev = settingsBefore?.ai?.publicShareAssistant ?? false;
if (prev !== updateWorkspaceDto.aiPublicShareAssistant) {
@@ -549,6 +565,7 @@ export class WorkspaceService {
delete updateWorkspaceDto.aiChat;
delete updateWorkspaceDto.aiDictation;
delete updateWorkspaceDto.htmlEmbed;
delete updateWorkspaceDto.trackerHead;
delete updateWorkspaceDto.aiPublicShareAssistant;
await this.workspaceRepo.updateWorkspace(

View File

@@ -3,6 +3,7 @@ import { InjectQueue } from '@nestjs/bullmq';
import { Queue } from 'bullmq';
import { QueueName, QueueJob } from '../queue/constants';
import { WorkspaceRepo } from '@docmost/db/repos/workspace/workspace.repo';
import { AiAgentRoleRepo } from '@docmost/db/repos/ai-agent-roles/ai-agent-roles.repo';
import { AiProviderCredentialsRepo } from '@docmost/db/repos/ai-chat/ai-provider-credentials.repo';
import { PageEmbeddingRepo } from '@docmost/db/repos/ai-chat/page-embedding.repo';
import { PageRepo } from '@docmost/db/repos/page/page.repo';
@@ -49,6 +50,7 @@ export interface UpdateAiSettingsInput {
export class AiSettingsService {
constructor(
private readonly workspaceRepo: WorkspaceRepo,
private readonly aiAgentRoleRepo: AiAgentRoleRepo,
private readonly aiProviderCredentialsRepo: AiProviderCredentialsRepo,
private readonly pageEmbeddingRepo: PageEmbeddingRepo,
private readonly pageRepo: PageRepo,
@@ -110,6 +112,26 @@ export class AiSettingsService {
return settings?.ai?.publicShareAssistant === true;
}
/**
 * Looks up the display name of the agent role configured as the public-share
 * assistant's identity, so the anonymous widget can label messages with the
 * persona name instead of the generic "AI agent".
 *
 * Mirrors the role resolution in PublicShareChatService.resolveShareRole.
 *
 * @param workspaceId - Workspace whose resolved AI settings and roles are read.
 * @returns The trimmed role name, or `null` when no role id is configured, the
 *   role is missing or disabled, or its name is empty after trimming (the
 *   client then falls back to "AI agent").
 */
async resolvePublicShareAssistantName(
  workspaceId: string,
): Promise<string | null> {
  const settings = await this.resolve(workspaceId);
  const roleId = settings?.publicShareAssistantRoleId;
  if (!roleId) {
    return null;
  }

  const role = await this.aiAgentRoleRepo.findById(roleId, workspaceId);
  // Only an existing, enabled role contributes a name.
  const displayName = role && role.enabled ? role.name?.trim() : undefined;
  return displayName || null;
}
/** Read the stored non-secret provider settings for a workspace. */
private async readProvider(
workspaceId: string,

View File

@@ -20,12 +20,6 @@ import { generateJitteredKeyBetween } from 'fractional-indexing-jittered';
import { FileTask, InsertablePage } from '@docmost/db/types/entity.types';
import { markdownToHtml } from '@docmost/editor-ext';
import { getProsemirrorContent } from '../../../common/helpers/prosemirror/utils';
import {
isHtmlEmbedFeatureEnabled,
stripHtmlEmbedIfNotAllowed,
} from '../../../common/helpers/prosemirror/html-embed.util';
import { UserRepo } from '@docmost/db/repos/user/user.repo';
import { WorkspaceRepo } from '@docmost/db/repos/workspace/workspace.repo';
import { formatImportHtml } from '../utils/import-formatter';
import {
buildAttachmentCandidates,
@@ -59,8 +53,6 @@ export class FileImportTaskService {
private readonly backlinkRepo: BacklinkRepo,
@InjectKysely() private readonly db: KyselyDB,
private readonly importAttachmentService: ImportAttachmentService,
private readonly userRepo: UserRepo,
private readonly workspaceRepo: WorkspaceRepo,
private eventEmitter: EventEmitter2,
@Inject(AUDIT_SERVICE) private readonly auditService: IAuditService,
) {}
@@ -157,25 +149,6 @@ export class FileImportTaskService {
.where('id', '=', fileTask.spaceId)
.executeTakeFirst();
// SECURITY (Variant C admin gate, zip/multi-file import write path):
// An imported .html/.md file can carry an htmlEmbed marker (the node's
// serialized form), which would execute raw, unsanitized JS in readers'
// browsers. Only workspace admins/owners may author it. Resolve the
// importer's role ONCE here; each page's prosemirror JSON is run through the
// strip below before textContent/ydoc/insert when the importer is not an
// admin, so a non-admin cannot smuggle the node in via a zip import (which
// requires only space Edit).
const importingUser = await this.userRepo.findById(
fileTask.creatorId,
fileTask.workspaceId,
);
// Toggle-AND-admin gate, resolved ONCE for the whole import: htmlEmbed
// survives only when the workspace feature toggle is ON and the importer is
// an admin/owner. OFF (default) => stripped for everyone.
const htmlEmbedEnabled = isHtmlEmbedFeatureEnabled(
(await this.workspaceRepo.findById(fileTask.workspaceId))?.settings,
);
const pagesMap = new Map<string, ImportPageNode>();
for (const absPath of allFiles) {
@@ -523,22 +496,9 @@ export class FileImportTaskService {
await this.importService.processHTML(html),
);
let { title, prosemirrorJson } =
const { title, prosemirrorJson } =
this.importService.extractTitleAndRemoveHeading(pmState);
// SECURITY (Variant C admin gate): strip htmlEmbed nodes from pages
// imported by a non-admin BEFORE computing textContent/ydoc/insert.
// Gate (featureEnabled AND admin) is resolved once above and recomputed
// by the helper from the same htmlEmbedEnabled + importer role.
prosemirrorJson = stripHtmlEmbedIfNotAllowed(prosemirrorJson, {
featureEnabled: htmlEmbedEnabled,
role: importingUser?.role,
onStrip: () =>
this.logger.warn(
`Stripping htmlEmbed node(s) from non-admin import by user ${fileTask.creatorId} (page ${page.id}, file ${filePath})`,
),
});
const insertablePage: InsertablePage = {
id: page.id,
slugId: page.slugId,

View File

@@ -1,266 +0,0 @@
// Exercises the REAL htmlEmbed admin gate on the two import write paths:
//
// (1) ImportService.importPage() — single .html/.md upload
// (2) FileImportTaskService.processGenericImport() — zip / multi-file import
//
// Both build content/textContent/ydoc directly and persist (bypassing the
// collab onStoreDocument strip), so each must run the imported document through
// the toggle-AND-admin gate: resolve the importer via userRepo.findById, read
// the workspace toggle, then `htmlEmbedAllowed(enabled, role)` -> if not allowed,
// `stripHtmlEmbedNodes` BEFORE persisting.
//
// This spec constructs the REAL services with deps mocked, feeds an imported
// HTML document that contains an `htmlEmbed` div (parsed into a real htmlEmbed
// node by the REAL htmlToJson), runs the real method, and asserts the PERSISTED
// content (captured at the insert boundary) is stripped for a non-admin /
// missing user and preserved for admin/owner + toggle ON. Mirrors the GOOD
// pattern in transclusion/spec/transclusion-unsync-html-embed.spec.ts.
//
// Three modules are mocked away because they pull transitive ESM deps that
// jest's transformIgnorePatterns does not transpile (`lib0/decoding.js` via the
// collab gateway, `@sindresorhus/slugify` via import-formatter, `p-limit` via
// import-attachment). None of them participate in the gate decision:
// - import-formatter: contextless HTML cleanup + link rewriting; replaced with
// faithful passthroughs (the embed div has no href/iframe, so the real
// normalizer would leave it untouched anyway).
// - import-attachment: attachment rewriting; passthrough returns html as-is.
jest.mock('../../../collaboration/collaboration.gateway', () => ({
CollaborationGateway: class {},
}));
jest.mock('../utils/import-formatter', () => ({
normalizeImportHtml: () => {},
formatImportHtml: async (opts: any) => ({
html: opts.html,
backlinks: [],
pageIcon: undefined,
}),
}));
jest.mock('./import-attachment.service', () => ({
ImportAttachmentService: class {},
}));
import { promises as fs } from 'node:fs';
import * as os from 'node:os';
import * as path from 'node:path';
import { ImportService } from './import.service';
import { FileImportTaskService } from './file-import-task.service';
import { hasHtmlEmbedNode } from '../../../common/helpers/prosemirror/html-embed.util';
const WS = 'ws-1';
const SPACE = 'space-1';
const USER = 'importer-1';
// HTML carrying the serialized htmlEmbed node. The REAL htmlToJson parses
// `<div data-type="htmlEmbed" data-source="BASE64">` into an htmlEmbed PM node
// (base64 below decodes to `<script>x</script>`).
const HTML_WITH_EMBED =
'<p>imported body</p>' +
'<div data-type="htmlEmbed" data-source="PHNjcmlwdD54PC9zY3JpcHQ+"></div>';
// Builds a stand-in workspace repo whose findById always resolves to a
// workspace record carrying the given htmlEmbed feature-toggle value.
function workspaceRepoFor(featureEnabled: boolean) {
  const findById = jest.fn(async () => {
    // A fresh record is produced on every call.
    return { id: WS, settings: { htmlEmbed: featureEnabled } };
  });
  return { findById };
}
// Stand-in user repo: findById resolves to the supplied importer record, or
// to undefined when simulating a missing user (the gate must then fail closed).
function userRepoFor(user: { role?: string } | undefined) {
  const findById = jest.fn(async () => user);
  return { findById };
}
// Suite for the single-file upload path: ImportService.importPage() is run
// against mocked dependencies and the row handed to pageRepo.insertPage is
// inspected for htmlEmbed nodes under each toggle/role combination.
describe('ImportService.importPage htmlEmbed admin gate (real code)', () => {
// Run importPage with a single uploaded .html and return the persisted content
// captured at pageRepo.insertPage.
async function persistedContent(
featureEnabled: boolean,
user: { role?: string } | undefined,
) {
// Rows passed to insertPage are recorded here so the test can assert on
// exactly what would have been persisted.
const captured: any[] = [];
const pageRepo: any = {
insertPage: jest.fn(async (row: any) => {
captured.push(row);
return { id: 'p1', slugId: 's1', ...row };
}),
};
// db is only used for getNewPagePosition (a select chain).
// Every builder method returns the same object so any chain order works;
// executeTakeFirst resolves to undefined (no existing sibling page found).
const selectChain: any = {
select: () => selectChain,
where: () => selectChain,
orderBy: () => selectChain,
limit: () => selectChain,
executeTakeFirst: async () => undefined,
};
const db: any = { selectFrom: () => selectChain };
// Constructor arguments are positional: pageRepo, userRepo, storageService,
// db, fileTaskQueue, workspaceRepo.
const service = new ImportService(
pageRepo,
userRepoFor(user) as any,
{ putBuffer: jest.fn() } as any, // storageService (unused on this path)
db,
{ add: jest.fn() } as any, // fileTaskQueue (unused)
workspaceRepoFor(featureEnabled) as any,
);
// Minimal multipart-file stand-in: only filename and toBuffer are provided.
const file: any = {
filename: 'doc.html',
toBuffer: async () => Buffer.from(HTML_WITH_EMBED, 'utf-8'),
};
await service.importPage(Promise.resolve(file), USER, SPACE, WS);
// Exactly one page insert is expected for a single uploaded file.
expect(captured).toHaveLength(1);
return captured[0].content;
}
// Non-admin importer: the embed must be removed while the rest of the body
// survives.
it('toggle ON + member: persisted content has htmlEmbed stripped', async () => {
const content = await persistedContent(true, { role: 'member' });
expect(hasHtmlEmbedNode(content)).toBe(false);
expect(JSON.stringify(content)).toContain('imported body');
});
// Unknown importer (no user record) must be treated like a non-admin.
it('toggle ON + missing user (findById -> undefined): fails closed (stripped)', async () => {
expect(hasHtmlEmbedNode(await persistedContent(true, undefined))).toBe(
false,
);
});
// Admin and owner importers keep the embed when the feature toggle is on.
it('toggle ON + admin: persisted content keeps the htmlEmbed', async () => {
expect(hasHtmlEmbedNode(await persistedContent(true, { role: 'admin' }))).toBe(
true,
);
});
it('toggle ON + owner: persisted content keeps the htmlEmbed', async () => {
expect(hasHtmlEmbedNode(await persistedContent(true, { role: 'owner' }))).toBe(
true,
);
});
// With the workspace toggle off, even an admin import is stripped.
it('toggle OFF + admin: stripped (feature disabled for everyone)', async () => {
expect(
hasHtmlEmbedNode(await persistedContent(false, { role: 'admin' })),
).toBe(false);
});
});
// Suite for the zip / multi-file path: processGenericImport() is run over a
// real temporary directory containing one .html page, and the row passed to
// the transaction's insertInto('pages').values(...) is inspected for
// htmlEmbed nodes under each toggle/role combination.
describe('FileImportTaskService.processGenericImport htmlEmbed admin gate (real code)', () => {
let extractDir: string;
beforeEach(async () => {
// Real temp dir holding a single .html page that carries the embed; the
// method reads it from disk via fs.readFile.
extractDir = await fs.mkdtemp(path.join(os.tmpdir(), 'html-embed-import-'));
await fs.writeFile(path.join(extractDir, 'page.html'), HTML_WITH_EMBED);
});
afterEach(async () => {
// Remove the temp dir after every test; force:true tolerates a missing dir.
await fs.rm(extractDir, { recursive: true, force: true });
});
// Run processGenericImport over the temp dir and return the content persisted
// for the imported page (captured at trx.insertInto('pages').values(...)).
async function persistedContent(
featureEnabled: boolean,
user: { role?: string } | undefined,
) {
const captured: any[] = [];
// Insert stub: records only rows destined for the 'pages' table; inserts
// into any other table are accepted and ignored.
const trxInsertChain = (table: string) => ({
values: (row: any) => {
if (table === 'pages') captured.push(row);
return { execute: async () => undefined };
},
});
const trx: any = { insertInto: trxInsertChain };
const db: any = {
// spaces lookup at the top of processGenericImport
selectFrom: () => ({
select: () => ({
where: () => ({ executeTakeFirst: async () => ({ slug: 'sp' }) }),
}),
}),
// executeTx -> db.transaction().execute(cb)
transaction: () => ({ execute: async (cb: any) => cb(trx) }),
};
// importService stub: only the real, gate-relevant helpers are used. We give
// it the REAL implementations by delegating to a real ImportService for
// processHTML/extractTitleAndRemoveHeading/createYdoc so the embed parse and
// strip path runs for real.
// All six constructor dependencies are empty placeholders here; only the
// three delegated methods below are invoked on this instance.
const realImport = new ImportService(
{} as any,
{} as any,
{} as any,
{} as any,
{} as any,
{} as any,
);
const importService: any = {
processHTML: (html: string) => realImport.processHTML(html),
extractTitleAndRemoveHeading: (s: any) =>
realImport.extractTitleAndRemoveHeading(s),
createYdoc: (j: any) => realImport.createYdoc(j),
};
const importAttachmentService: any = {
// passthrough: no attachment rewriting, return html unchanged
processAttachments: jest.fn(async (opts: any) => opts.html),
};
// Constructor arguments are positional; the trailing comments name the
// dependency each mock stands in for.
const service = new FileImportTaskService(
{ putBuffer: jest.fn() } as any, // storageService
importService,
{ nextPagePosition: jest.fn(async () => 'a0') } as any, // pageService (position only)
{ insertBacklink: jest.fn() } as any, // backlinkRepo
db,
importAttachmentService,
userRepoFor(user) as any,
workspaceRepoFor(featureEnabled) as any,
{ emit: jest.fn() } as any, // eventEmitter
{ logBatchWithContext: jest.fn() } as any, // auditService
);
// Minimal file-task record: creatorId identifies the importer whose role the
// gate resolves; workspaceId/spaceId scope the import.
const fileTask: any = {
id: 'task-1',
creatorId: USER,
workspaceId: WS,
spaceId: SPACE,
source: 'generic',
};
await service.processGenericImport({ extractDir, fileTask });
// At least one page row must have been inserted; the first one is inspected.
expect(captured.length).toBeGreaterThanOrEqual(1);
return captured[0].content;
}
// Non-admin importer: the embed must be removed while the rest of the body
// survives.
it('toggle ON + member: persisted page has htmlEmbed stripped', async () => {
const content = await persistedContent(true, { role: 'member' });
expect(hasHtmlEmbedNode(content)).toBe(false);
expect(JSON.stringify(content)).toContain('imported body');
});
// Unknown importer (no user record) must be treated like a non-admin.
it('toggle ON + missing user (creatorId resolves to undefined): fails closed', async () => {
expect(hasHtmlEmbedNode(await persistedContent(true, undefined))).toBe(
false,
);
});
// Admin and owner importers keep the embed when the feature toggle is on.
it('toggle ON + admin: persisted page keeps the htmlEmbed', async () => {
expect(hasHtmlEmbedNode(await persistedContent(true, { role: 'admin' }))).toBe(
true,
);
});
it('toggle ON + owner: persisted page keeps the htmlEmbed', async () => {
expect(hasHtmlEmbedNode(await persistedContent(true, { role: 'owner' }))).toBe(
true,
);
});
// With the workspace toggle off, even an admin import is stripped.
it('toggle OFF + admin: stripped (feature disabled for everyone)', async () => {
expect(
hasHtmlEmbedNode(await persistedContent(false, { role: 'admin' })),
).toBe(false);
});
});

View File

@@ -1,12 +1,5 @@
import { BadRequestException, Injectable, Logger } from '@nestjs/common';
import { PageRepo } from '@docmost/db/repos/page/page.repo';
import { UserRepo } from '@docmost/db/repos/user/user.repo';
import {
hasHtmlEmbedNode,
isHtmlEmbedFeatureEnabled,
stripHtmlEmbedIfNotAllowed,
} from '../../../common/helpers/prosemirror/html-embed.util';
import { WorkspaceRepo } from '@docmost/db/repos/workspace/workspace.repo';
import { MultipartFile } from '@fastify/multipart';
import * as path from 'path';
import {
@@ -44,12 +37,10 @@ export class ImportService {
constructor(
private readonly pageRepo: PageRepo,
private readonly userRepo: UserRepo,
private readonly storageService: StorageService,
@InjectKysely() private readonly db: KyselyDB,
@InjectQueue(QueueName.FILE_TASK_QUEUE)
private readonly fileTaskQueue: Queue,
private readonly workspaceRepo: WorkspaceRepo,
) {}
async importPage(
@@ -94,32 +85,7 @@ export class ImportService {
const extracted = this.extractTitleAndRemoveHeading(prosemirrorState);
const title = extracted.title;
let prosemirrorJson = extracted.prosemirrorJson;
// SECURITY (Variant C admin gate, import write path):
// An imported .html/.md file can carry an htmlEmbed marker (the node's
// serialized form), which would execute raw JS in readers' browsers. Only
// workspace admins/owners may author it, so strip htmlEmbed nodes from
// imports performed by a non-admin user.
// Outer has-check first so the user/workspace lookups below run only when an
// embed is actually present (the common case carries none).
if (prosemirrorJson && hasHtmlEmbedNode(prosemirrorJson)) {
const importingUser = await this.userRepo.findById(userId, workspaceId);
// Toggle-AND-admin gate: htmlEmbed survives only when the workspace
// feature toggle is ON and the importer is an admin/owner. OFF (default)
// => stripped for everyone.
const htmlEmbedEnabled = isHtmlEmbedFeatureEnabled(
(await this.workspaceRepo.findById(workspaceId))?.settings,
);
prosemirrorJson = stripHtmlEmbedIfNotAllowed(prosemirrorJson, {
featureEnabled: htmlEmbedEnabled,
role: importingUser?.role,
onStrip: () =>
this.logger.warn(
`Stripping htmlEmbed node(s) from import by user ${userId}`,
),
});
}
const prosemirrorJson = extracted.prosemirrorJson;
const pageTitle = title || fileName;

View File

@@ -60,6 +60,36 @@ agent-claim, `docmost-client.loader.ts:159` — `getCollabToken`; см. план
встроенный агент получал устаревшую подсказку. Это и есть материализованный
parity-баг.
## Расширение: дублируется не только описания инструментов — ещё и конвертер (PM ↔ Markdown)
Зафиксировано при планировании встраивания git-синка (`docmost-sync` → gitmost,
нативная in-process интеграция). Та же болезнь «несколько рукописных копий одного
кода» теперь касается слоя конвертации ProseMirror ↔ Markdown и его lib, а не
только метаданных инструментов.
- **Копия в gitmost** — `packages/mcp/src/lib/`: `markdown-converter.ts` (~885
строк), `markdown-document.ts` (~136), `node-ops.ts`, `diff.ts`,
`docmost-schema.ts`. Канонизатора (`canonicalize.ts`) здесь НЕТ.
- **Копия в docmost-sync** — `packages/docmost-client/src/lib/`: тот же набор +
`canonicalize.ts` (~11 КБ, держит идемпотентность round-trip, SPEC §11) +
`markdown-document.ts` с режимом «тело + якоря, без тредов комментов»
(`includeCommentThreads:false`, на ~20 строк больше).
- **Третья копия (планируется)** — план git-синка вендорит чистую часть
конвертера в новый `packages/git-sync` (collab-файл не нужен: запись идёт
нативно через `openDirectConnection` + `@docmost/editor-ext`).
Копии уже молча разъехались (docmost-sync vs `packages/mcp`): `collaboration.ts`
~329 изменённых строк, `node-ops.ts` ~53, `markdown-converter.ts` ~24,
`markdown-document.ts` ~20. Отдельно: `docmost-schema.ts` в lib дублирует
**реальную** схему сервера `@docmost/editor-ext` (её использует collab/persistence)
— расхождение схем = риск битой конвертации нод.
Вывод: тот же фикс-вектор (единый источник правды), что и для инструментов, стоит
распространить на конвертер — общий пакет конвертации, потребляемый `mcp`,
`git-sync` и (в идеале) сервером. До конвергенции git-sync держит вендоренную
копию валидированного конвертера с гейтом round-trip против схемы `editor-ext`
(осознанный долг «третья копия сейчас, объединяем позже»).
## Фикс
**Единый реестр спеков (полное устранение дублирования).** Вынести в

View File

@@ -1,121 +0,0 @@
# /pages/import отдаёт 400 «Error processing file content» (регресс)
Статус: **диагностируемость починена** (fix #1 применён); корневая причина **не
подтверждена** — на текущем коде локально баг воспроизвести не удалось.
Ниже — что удалось выяснить, главный подозреваемый и что проверить дальше.
## Симптом
На задеплоенном инстансе эндпоинт `POST /pages/import` отдаёт
`400 BadRequest` с телом «Error processing file content». Раньше работал —
похоже на регресс после редеплоя гитмоста.
Через этот эндпоинт грузит контент MCP-инструмент `create_page` (это
единственный эндпоинт, принимающий контент при создании страницы —
см. комментарий в `packages/mcp/src/client.ts:961`).
Что при этом **исправно** (важно для локализации):
- `POST /pages/create` — создание пустой страницы.
- `update_page_json` — запись контента через realtime-коллаборацию (Yjs).
## Где именно бросается ошибка
`apps/server/src/integrations/import/services/import.service.ts:93-97`
`try/catch` вокруг обработки контента:
```ts
} catch (err) {
const message = 'Error processing file content';
this.logger.error(message, err); // реальная причина логируется ТОЛЬКО в логи
throw new BadRequestException(message); // наружу уходит generic-строка
}
```
Реальный текст ошибки/стек **проглатывается** (наружу — generic-строка), что
нарушает конвенцию проекта (см. CLAUDE.md, «Errors must never be swallowed»).
Поэтому по ответу 400 причину не видно — её надо читать в логах сервера
(`logger.error(message, err)` пишет полный err) ИЛИ воспроизвести локально.
## Цепочка обработки для .md (что внутри try)
`importPage` → `processMarkdown(fileContent)`:
1. `markdownToHtml` (`packages/editor-ext/.../marked.utils.ts`) — marked, чистый JS, без DOM.
2. `processHTML`: cheerio `load` → `normalizeImportHtml` (`utils/import-formatter.ts`) — чистый JS.
3. `htmlToJson` (`apps/server/src/collaboration/collaboration.util.ts:118`) →
`generateJSON(html, tiptapExtensions)`.
## Ключевая зацепка: путь импорта зависит от happy-dom, рабочие пути — нет
`generateJSON` (`apps/server/src/common/helpers/prosemirror/html/generateJSON.ts`)
парсит HTML через **happy-dom**: `new Window()` + `new localWindow.DOMParser()` +
`parseFromString(...)`, затем `PMDOMParser.fromSchema(schema).parse(doc.body)`.
А исправные пути DOM-парсер НЕ используют:
- `/pages/create` — пустая страница, контент не парсится.
- `update_page_json` — пишет готовый ProseMirror-JSON в Yjs
(`TiptapTransformer.toYdoc`), без HTML→DOM.
То есть единственное, что есть в сломанном пути и отсутствует в рабочих, —
**серверный парсинг HTML через happy-dom**.
## Главный подозреваемый: бамп happy-dom (14 → 20)
- Изначально было `"happy-dom": "^14.12.3"`.
- Сейчас запинено `"happy-dom": "20.8.9"` в `apps/server/package.json:83`
(+ override в корневом `package.json`).
- Пин на `20.8.9` пришёл в коммите `17da7629 "overrides"`
(Philipinho, 2026-03-28), где `20.8.4` → `20.8.9`.
- Скачок 14 → 20 — это 6 мажоров; у happy-dom между мажорами ломающие
изменения в API `Window`/`DOMParser` и в поведении парсинга HTML. Очень
вероятно, что `generateJSON` ломается на новом happy-dom.
Версия в node_modules подтверждена: `happy-dom@20.8.9` (симлинк свежий).
## Второстепенный подозреваемый
`getSchema(tiptapExtensions)` / `PMDOMParser.parse(...)` могут спотыкаться на
`parseHTML`-правилах недавно добавленных нод (synced blocks/transclusion,
page break, indent, columns, status — все они в `tiptapExtensions`). Но
`getSchema` используется и в рабочем пути (`createYdoc`/`update_page_json`),
поэтому сам по себе билд схемы скорее всего цел — под подозрением именно
DOM-парс-ветка, уникальная для импорта.
## Направления фикса
1. **Диагностируемость — ✅ СДЕЛАНО (по конвенции проекта).** В catch-блоках
`import.service.ts` (обработка контента + вставка страницы) реальная
причина теперь прокидывается наружу: `BadRequestException` несёт
`${err.name}: ${err.message}`, а в лог пишется полный `err` со стеком.
Раньше наружу уходила generic-строка "Error processing file content".
Теперь при повторе 400 на проде реальный reason будет виден прямо в теле
ответа — без необходимости лезть в логи.
2. **Корневой фикс — ⏳ НЕ ПОДТВЕРЖДЁН.** Гипотеза happy-dom 14→20 **не
подтвердилась** при локальном воспроизведении на текущем коде (см. ниже).
Применять блайнд-даунгрейд happy-dom нельзя — нужен реальный stack из
логов/ответа после повторения.
## Локальное воспроизведение (выполнено)
На текущем `main` (happy-dom 20.8.9) вся цепочка импорта `.md` отработала
без ошибок через `tsx` (импорты прямо из source, не из dist):
- `markdownToHtml` → cheerio `load` → `normalizeImportHtml` → `generateJSON`
с полным набором из 44 `tiptapExtensions` — **OK** для:
- базового markdown (заголовки, bold/italic, списки, таблицы, code-block,
blockquote)
- edge-cases: пустой контент, whitespace, HTML-сущности, вложенные списки,
task-list, emoji, кириллица, спецсимволы в code, ссылки, изображения, hr
- API happy-dom 20.8.9, используемые в `generateJSON`, существуют и работают:
`new Window()`, `new localWindow.DOMParser()`, `parseFromString('…',
'text/html')`, `happyDOM.abort()` (async), `happyDOM.close()` (async).
- Блок `finally` в `generateJSON` вызывает `abort()/close()` без `await` и без
`try/catch`, но эти методы не бросают синхронно и не перезаписывают
результат — **не является** причиной 400 (проверено отдельным тестом).
- Все `parseHTML`-правила расширений (status, transclusion, page-break,
columns, subpages и т.д.) участвуют в успешном тесте — ни одно не падает.
Вывод: на текущем коде баг **не воспроизводится**. Вероятные объяснения —
контент-специфичный кейс, которого нет в тестах; разница между source и
собранным `dist`; либо временное состояние задеплоенного инстанса. После
применения fix #1 повторный 400 покажет реальный reason — по нему и искать
корень.

534
docs/git-sync-plan.md Normal file
View File

@@ -0,0 +1,534 @@
# Git-sync: спека реализации (встраивание docmost-sync в gitmost)
Статус: **спецификация, код не менялся.** Детальный план реализации фичи
«двусторонний синк страниц Docmost ↔ локальная git-папка Markdown», встроенной
прямо в gitmost.
Источник движка: `https://gitea.vvzvlad.xyz/vvzvlad/docmost-sync`
(ветка `main`, на момент спеки HEAD `b03eb35`). Все сигнатуры ниже сверены с этим
исходником и с текущим кодом gitmost.
Предыстория и обоснование архитектурных развилок — в бэклоге
[ai-chat-tool-definitions-duplicated.md](backlog/ai-chat-tool-definitions-duplicated.md)
(раздел про дублирование конвертера) и в исходном `SPEC.md` репозитория
docmost-sync (нумерация §-параграфов ниже ссылается на него).
---
## 0. Зафиксированные решения
Из обсуждения архитектуры (выбор пользователя) и трёх суб-решений:
1. **Нативная in-process интеграция.** Никаких REST-к-себе и сервис-юзера: чтение
через репозитории gitmost, запись тела — через collab `openDirectConnection`,
триггеры — через `EventEmitter2` вместо поллинга `/recent`.
2. **Встроенный NestJS-модуль** `GitSyncModule` в `apps/server/src/integrations/git-sync`
с `@Interval`/событиями и **leader-lock на Redis** (single-writer при нескольких
репликах).
3. **Настройка по спейсам в UI** — флаг в `space.settings.gitSync`, секреты
(git-remote) — через ENV/`EnvironmentService`.
4. **Конвертер** — вендорим *чистую* часть из docmost-sync в `packages/git-sync`,
гейт = round-trip-идемпотентность против схемы `@docmost/editor-ext`.
5. **Vault** — **репозиторий на спейс**; `move-to-space` = кросс-репо delete+create.
6. **Провенанс** — отдельное значение `lastUpdatedSource = 'git-sync'`.
Вне scope v1 (как и в SPEC): комментарии (только якоря, без тредов), права/ACL,
вложения как отдельный поток (едут ссылками внутри контента), realtime-подписка
на Hocuspocus (остаётся поллинг-страховка + события).
---
## 1. Архитектура верхнего уровня
```
gitmost server (NestJS, один процесс)
┌─────────────────────────────────────────────────────────────┐
│ GitSyncModule │
│ │
│ GitSyncOrchestrator ── @Interval + Redis leader-lock │
│ │ (per enabled space: pull-cycle / push-cycle) │
│ │ │
│ ├── engine (vendored docmost-sync, IO инжектируется) │
│ │ pull.ts / push.ts / reconcile / layout / stabilize │
│ │ │
│ ├── GitmostDataSource ── реализует подмножество │
│ │ DocmostClient НАТИВНО: │
│ │ reads → PageRepo / SpaceRepo (Kysely) │
│ │ writes → CollaborationGateway.openDirectConnection│
│ │ + PageService (create/move/delete/...) │
│ │ │
│ └── VaultGit ── shell-out в системный git (как есть) │
│ │
│ PageChangeListener ── подписка на EventName.PAGE_* → │
│ debounce → enqueue push-cycle │
└─────────────────────────────────────────────────────────────┘
▲ читает/пишет страницы ▼ git push/pull
PostgreSQL (pages/spaces) data/git-sync/<spaceId>/ (vault) → remote
```
Ключ интеграции: движок docmost-sync уже **полностью построен на dependency
injection** — весь внешний IO (REST-клиент, git, файловая система) передаётся
через узкие интерфейсы. Мы НЕ переписываем движок; мы подставляем нативные
реализации в его DI-швы.
---
## 2. Состав вендоринга из docmost-sync
В новый пакет `packages/git-sync` копируем (с сохранением истории смысла —
backport-friendly, как сделано с `packages/mcp`):
### 2.1. Движок (engine) — `src/engine/`
| Файл | Что несёт | IO | Берём |
| --- | --- | --- | --- |
| `pull.ts` | Docmost→FS: reconcile + write + commit + merge | client+git+fs (инжектируется) | да |
| `push.ts` | FS→Docmost: diff + classify + apply + refs | client+git+fs (инжектируется) | да |
| `git.ts` | `VaultGit` — обёртка git shell-out | системный `git` | да, как есть |
| `reconcile.ts` | чистый планировщик | нет | да |
| `layout.ts` | чистый маппер дерево→пути | нет | да |
| `sanitize.ts` | чистая санитизация имён | нет | да |
| `stabilize.ts` | fixpoint-нормализация md (SPEC §11) | нет (lib-вызовы) | да |
| `loop-guard.ts` | `bodyHash` (sha256) | нет | да |
| `settings.ts` | zod-конфиг | `.env` | **адаптируем** (см. §7) |
| `index.ts` | тонкий CLI-скаффолд | — | нет (заменяем на NestJS) |
### 2.2. Конвертер (чистая часть) — `src/lib/`
Из `packages/docmost-client/src/lib/` берём **только** чистый конвертер и формат
файла (collab/auth REST-части НЕ нужны — запись нативная):
| Файл | Экспорт |
| --- | --- |
| `markdown-converter.ts` | `convertProseMirrorToMarkdown(content): string` |
| `collaboration.ts` (только конвертер-функция) | `markdownToProseMirror(md): Promise<doc>` ⚠️ |
| `markdown-document.ts` | `serializeDocmostMarkdownBody`, `parseDocmostMarkdown`, `serializeDocmostMarkdown`, тип `DocmostMdMeta` |
| `canonicalize.ts` | `canonicalizeContent(node)`, `docsCanonicallyEqual(a,b)` |
| `docmost-schema.ts` | tiptap-схема для `markdownToProseMirror` |
| `node-ops.ts`, `diff.ts` | трансформации/диф (нужны транзитивно) |
⚠️ `markdownToProseMirror` физически лежит в `collaboration.ts` docmost-client
(строка 289) — это **чистая** функция (marked→HTML→generateJSON), не путать с
collab/websocket write-path из того же файла, который НЕ берём.
> **Долг (зафиксирован в бэклоге):** это третья копия конвертера (есть в
> docmost-sync, в `packages/mcp`, теперь в `packages/git-sync`). Конвергенция в
> общий пакет — отдельная задача; здесь сознательно вендорим валидированную
> копию ради сохранения идемпотентности.
### 2.3. НЕ берём
`pull`/`push` CLI-обёртки, `roundtrip.ts` (харнес переносим в тесты, см. §13),
`docmost-client` REST-клиент целиком, `lib/collaboration.ts` (websocket-write),
`lib/auth-utils.ts`, `Makefile`, Docker-обвязку docmost-sync.
---
## 3. Главный шов: `GitmostDataSource`
Движок дёргает Docmost через `Pick<DocmostClient, …>`. Мы реализуем класс,
**структурно совместимый** с этими сигнатурами, но нативный внутри. Это
единственный нетривиальный новый код.
### 3.1. Точный набор методов, которых требует движок
Из `pull.ts` (`ApplyPullActionsDeps.client`) и обхода дерева:
```ts
listSpaceTree(spaceId: string, rootPageId?: string): Promise<{ pages: PageNode[]; complete: boolean }>;
getPageJson(pageId: string): Promise<{ id; slugId; title; parentPageId; spaceId; updatedAt; content }>;
```
Из `push.ts` (`ApplyPushDeps.client`):
```ts
importPageMarkdown(pageId: string, fullMarkdown: string): Promise<{ updatedAt?: string; /* … */ }>;
createPage(title: string, content: string, spaceId: string, parentPageId?: string): Promise<{ data: { id: string }; updatedAt?: string }>;
deletePage(pageId: string): Promise<unknown>;
movePage(pageId: string, parentPageId: string | null, position?: string): Promise<unknown>;
renamePage(pageId: string, title: string): Promise<unknown>;
```
Для непрерывного режима/детекции удалений (фаза B+, SPEC §8):
```ts
listRecentSince(spaceId: string | undefined, sinceIso: string | null, hardPageCap?: number): Promise<any[]>;
listTrash(spaceId: string): Promise<any[]>;
restorePage(pageId: string): Promise<unknown>;
```
### 3.2. Маппинг на нативные сервисы gitmost
| Метод адаптера | Нативная реализация |
| --- | --- |
| `listSpaceTree(spaceId)` | `SpaceRepo.findById(spaceId, wsId)` + `PageRepo.getSpaceDescendants(spaceId, { includeContent: false })` → map в `PageNode { id, title, slugId, parentPageId, hasChildren }`. **`complete: true` всегда** (читаем БД, не пагинированный REST) → суппрессия `incomplete-fetch` из SPEC §8 нативно не срабатывает. |
| `getPageJson(pageId)` | `PageRepo.findById(pageId, { includeContent: true })` → `{ id, slugId, title, parentPageId, spaceId, updatedAt, content }`. `content` — ProseMirror JSON в схеме `editor-ext`. |
| `importPageMarkdown(pageId, fullMd)` | `parseDocmostMarkdown(fullMd)` → body; `await markdownToProseMirror(body)` → doc; **запись через collab** (см. §3.3). Вернуть `{ updatedAt }` свежей страницы. |
| `createPage(title, body, spaceId, parent?)` | `PageService.create(userId, wsId, { spaceId, title, parentPageId }, provenance)` → shell; затем тело через collab (§3.3). Вернуть `{ data: { id }, updatedAt }`. |
| `deletePage(pageId)` | `PageService.removePage(pageId, userId, wsId)` (soft-delete → Trash, обратимо). |
| `movePage(pageId, parent, pos?)` | `PageService.movePage({ pageId, parentPageId: parent, position }, movedPage, provenance)`. **`position` обязателен** для Docmost-move — вычисляем `fractional-indexing-jittered` ключ между соседями (соседей берём из `PageRepo`). |
| `renamePage(pageId, title)` | `PageService.update(page, { title }, user, provenance)`. |
| `listRecentSince` | `PageRepo.getRecentPagesInSpace(spaceId, { … })`, фильтр по `updatedAt > since`. |
| `listTrash(spaceId)` | `PageRepo` запрос с `deletedAt IS NOT NULL` по спейсу. |
| `restorePage(pageId)` | `PageService.restore(...)`. |
`userId`/`wsId` берём из конфигурации спейса (сервисный аккаунт воркспейса или
владелец спейса — см. §7). `provenance` всегда несёт `source: 'git-sync'` (§8).
### 3.3. Нативная запись тела (linchpin)
Подтверждено в коде: `CollaborationGateway.openDirectConnection(documentName, context)`
([collaboration.gateway.ts:148](../apps/server/src/collaboration/collaboration.gateway.ts#L148-L150))
+ паттерн `withYdocConnection`
([collaboration.handler.ts:118-133](../apps/server/src/collaboration/collaboration.handler.ts#L118-L133)).
Имя документа — `page.<pageId>` ([getPageId](../apps/server/src/collaboration/collaboration.util.ts#L163-L165)).
Схему берём из `tiptapExtensions` ([collaboration.util.ts](../apps/server/src/collaboration/collaboration.util.ts)).
```ts
/**
 * In-process body write — no loopback websocket, no service-user token.
 * Mirrors collaboration.handler.ts 'replace' operation exactly.
 *
 * Replaces the whole `default` XML fragment of the page's Yjs document with
 * the document built from `prosemirrorJson`.
 *
 * @param pageId - page id; the collaboration document name is `page.<pageId>`.
 * @param prosemirrorJson - new page body as ProseMirror JSON; it is converted
 *   to a Y.Doc using the `tiptapExtensions` schema.
 */
private async writeBody(pageId: string, prosemirrorJson: JSONContent): Promise<void> {
const conn = await this.collabGateway.openDirectConnection(
`page.${pageId}`,
{ actor: 'git-sync' }, // provenance flows into PersistenceExtension (see §8)
);
try {
await conn.transact((doc) => {
const fragment = doc.getXmlFragment('default');
// Clear the current body first so the new content replaces it instead of
// being merged next to the old nodes.
if (fragment.length > 0) fragment.delete(0, fragment.length);
// Build a separate Y.Doc from the JSON and apply its full state as an
// update onto the live document.
const next = TiptapTransformer.toYdoc(prosemirrorJson, 'default', tiptapExtensions);
Y.applyUpdate(doc, Y.encodeStateAsUpdate(next));
});
} finally {
// Always release the direct connection, even when the transaction throws.
await conn.disconnect();
}
// PersistenceExtension.onStoreDocument persists ydoc+content+textContent
// consistently, stamps lastUpdatedSource, broadcasts 'page.updated'.
}
```
**Схема-совместимость (критично).** `markdownToProseMirror` производит
ProseMirror JSON в схеме docmost-client, а `TiptapTransformer.toYdoc` валидирует
его в схеме `editor-ext`. Аналогично на чтении `convertProseMirrorToMarkdown`
получает `content` в схеме `editor-ext`. Эти две схемы **должны совпадать по
именам нод/марок/атрибутов**, иначе ноды потеряются. Это и есть гейт §13.1.
---
## 4. `VaultGit` и git-бинарь
`VaultGit` (engine/git.ts) оставляем как есть — он шеллит в системный `git` через
`execFile` (args-массив, без инъекций), всегда `cwd=<vaultPath>`. Константы:
`DEFAULT_BRANCH = "main"`, `BOT_AUTHOR_NAME = "Docmost Sync"`,
`BOT_AUTHOR_EMAIL = "docmost-sync@local"`; в push.ts: `DOCMOST_BRANCH = "docmost"`,
`LAST_PUSHED_REF = "refs/docmost/last-pushed"`, провенанс-трейлеры
`Docmost-Sync-Source: docmost|local`.
**Ops-требование:** в рантайм-образ gitmost добавить пакет `git`
([Dockerfile](../Dockerfile)) — сейчас его там может не быть. Без бинаря
`VaultGit.assertGitAvailable()` падает на старте цикла.
**Модель веток (пер-репо, SPEC §5):** `main` (правит человек/файлы) ↔ `docmost`
(зеркало Docmost, пишет только движок) ↔ `merge-base` как базлайн;
`refs/docmost/last-pushed` — что из `main` уже отражено в Docmost.
---
## 5. Топология vault: репозиторий на спейс
- Корень: `<DATA_DIR>/git-sync/<spaceId>/` — отдельный git-репо на каждый
включённый спейс. `layout.ts` уже спейс-скоупный (корень спейса → `segments: []`).
- Remote — пер-спейс (из конфигурации спейса/ENV). Изоляция конфликтов, блокировок
и blast-radius.
- `move-to-space` (страница меняет спейс) → **кросс-репо**: `delete` в исходном
репо + `create` в целевом. Ловим по событию `PAGE_MOVED_TO_SPACE`.
- Redis-lock ключ — `git-sync:lock:<spaceId>` (§9).
---
## 6. NestJS-модуль `GitSyncModule`
Структура (шаблон — `McpModule`):
```
apps/server/src/integrations/git-sync/
git-sync.module.ts
git-sync.constants.ts # QueueJob/event-имена, дефолты
services/
gitmost-datasource.service.ts # §3 адаптер
git-sync.orchestrator.ts # @Interval + leader-lock + цикл по спейсам
vault-registry.service.ts # путь vault на спейс, VaultGit-инстансы
fractional-index.util.ts # position для move (reuse server util)
listeners/
page-change.listener.ts # подписка на EventName.PAGE_* + debounce
git-sync.controller.ts # (опц.) ручной trigger/status для админа
```
```ts
/**
 * NestJS module for the git-sync integration.
 *
 * Imports the database and environment modules plus the scheduler, and
 * registers the data-source adapter, the orchestrator, the vault registry and
 * the page-change listener as providers.
 */
@Module({
// NOTE(review): ScheduleModule.forRoot() may already be registered by another
// module — confirm it is initialized only once.
imports: [DatabaseModule, EnvironmentModule, ScheduleModule.forRoot()],
providers: [
GitmostDataSourceService,
GitSyncOrchestrator,
VaultRegistryService,
PageChangeListener,
],
})
export class GitSyncModule {}
```
- Регистрируем в [app.module.ts](../apps/server/src/app.module.ts) рядом с `McpModule`.
- Зависимости: `PageRepo`/`SpaceRepo` (через `DatabaseModule`), `PageService`,
`CollaborationGateway` (экспортировать из `CollaborationModule`),
`EnvironmentService`, ioredis-клиент.
- `ScheduleModule.forRoot()` уже подключается в `TelemetryModule`; повторный вызов
безопасен, но лучше вынести в общий модуль или убедиться, что forRoot один раз.
---
## 7. Конфигурация
### 7.1. Per-space (UI) — `space.settings.gitSync`
Расширяем существующий паттерн `settings.sharing` / `settings.comments`.
Сервер:
- `UpdateSpaceDto` ([update-space.dto.ts](../apps/server/src/core/space/dto/update-space.dto.ts)):
добавить `@IsOptional() @IsBoolean() gitSyncEnabled?: boolean;` (+ опц.
`gitSyncRemote?: string`, если решим хранить remote в БД, а не только в ENV).
- `SpaceService.updateSpace(dto, wsId)`
([space.service.ts:120](../apps/server/src/core/space/services/space.service.ts#L120)):
обработать как `disablePublicSharing`/`allowViewerComments`.
- `SpaceRepo`: добавить `updateGitSyncSettings(spaceId, wsId, prefKey, prefValue, trx?)`
по образцу `updateSharingSettings`
([space.repo.ts:92](../apps/server/src/database/repos/space/space.repo.ts#L92)) —
jsonb-merge в `settings.gitSync.<key>`.
- Гард: CASL `SpaceCaslAction.Manage / SpaceCaslSubject.Settings` (как в
[space.controller.ts:147](../apps/server/src/core/space/space.controller.ts#L147)).
Клиент:
- Тоггл в форме настроек спейса
([edit-space-form.tsx](../apps/client/src/features/space/components/edit-space-form.tsx))
через `useUpdateSpaceMutation()` → `updateSpace({ spaceId, gitSyncEnabled })`.
Образец — `mcp-settings.tsx`. `readOnly` при отсутствии `Manage/Settings`.
Форма `space.settings.gitSync`:
```jsonc
{ "gitSync": { "enabled": true, "remote": "git@…", "branch": "main" } }
```
### 7.2. Секреты/тюнинг (ENV) — `EnvironmentService`
Движковый `settings.ts` (zod, читает `.env`) **заменяем** на чтение из gitmost
`EnvironmentService`: `parseSettings(env)` оставляем как чистую функцию для тестов,
но в проде собираем `Settings` из `EnvironmentService`-геттеров.
Новые переменные (объявить в
[environment.validation.ts](../apps/server/src/integrations/environment/environment.validation.ts)
class-validator-декораторами, геттеры — в
[environment.service.ts](../apps/server/src/integrations/environment/environment.service.ts)):
| ENV | Назначение | Обяз. |
| --- | --- | --- |
| `GIT_SYNC_ENABLED` | глобальный мастер-выключатель | нет (default false) |
| `GIT_SYNC_DATA_DIR` | корень vault'ов (default `<DATA_DIR>/git-sync`) | нет |
| `GIT_SYNC_REMOTE_TEMPLATE` | шаблон remote, напр. `git@host:vault-{spaceId}.git` | нет |
| `GIT_SYNC_SSH_KEY_PATH` / креды remote | доступ к git-remote (secret) | по ситуации |
| `GIT_SYNC_POLL_INTERVAL_MS` | страховочный поллинг (default 15000) | нет |
| `GIT_SYNC_DEBOUNCE_MS` | окно дебаунса событий (default 2000) | нет |
| `GIT_SYNC_SERVICE_USER_ID` | от чьего имени писать в Docmost | да (если синк включён) |
> git-remote = доступ ко всей вики спейса (SPEC §12): креды только в ENV/secret
> store, никогда в БД/коммиты. В UI — только `enabled` (+ опц. имя remote из
> заранее разрешённого списка).
---
## 8. Провенанс и loop-guard
### 8.1. Значение `'git-sync'`
Сегодня `lastUpdatedSource ∈ { 'user', 'agent' }`
([persistence.extension.ts:132-134](../apps/server/src/collaboration/extensions/persistence.extension.ts#L132-L134)).
Добавляем `'git-sync'`:
- `PersistenceExtension`: `context.actor === 'git-sync'` → `lastUpdatedSource = 'git-sync'`.
- Снапшот истории для `'git-sync'` — дебаунс (как у человека), а не немедленный
(немедленный — только для `'agent'`,
[persistence.extension.ts:321](../apps/server/src/collaboration/extensions/persistence.extension.ts#L321)).
- Для `create/move/rename/delete` через `PageService` передаём
`AuthProvenanceData` c `source: 'git-sync'` (тип уже используется для агента —
расширить допустимые значения; точную форму подтвердить на реализации).
- Клиент: в истории
([history-item.tsx:128](../apps/client/src/features/page-history/components/history-item.tsx#L128))
не показывать агентский бейдж/дип-линк для `'git-sync'`; добавить значение в
тип [page.types.ts:23-26](../apps/client/src/features/page-history/types/page.types.ts#L23-L26)
(опц. свой бейдж «sync»).
### 8.2. Подавление петли (SPEC §10)
На pull-стороне игнорируем страницу как «свою запись», если:
`page.lastUpdatedSource === 'git-sync'` **И** `bodyHash(exportedBody)` совпадает
с последним запушенным (`PushedPageRecord.bodyHash` из `push.ts`). После записи в
Docmost сохраняем `updatedAt` ответа, чтобы поллинг-страховка не утянул свою же
запись обратно.
---
## 9. Single-writer (Redis leader-lock)
В кодовой базе `@Interval`-задачи (`trash-cleanup`, `telemetry`, `session-cleanup`)
**не защищены** от мультиинстанса. Для синка добавляем явный лок.
- ioredis уже есть (`RedisModule` из `@nestjs-labs/nestjs-ioredis`,
[app.module.ts](../apps/server/src/app.module.ts); прямой `RedisClient`
используется в collab-gateway).
- Лок на спейс: `SET git-sync:lock:<spaceId> <instanceId> NX PX <ttl>`; держим
цикл только при успехе, продлеваем по heartbeat, освобождаем в `finally`
(Lua-CAS на удаление по `instanceId`, чтобы не снять чужой лок).
- TTL > максимальной длительности цикла; на краше лок истекает сам.
```ts
/**
 * Try to take leadership for one space.
 *
 * Issues a single `SET key value PX ttl NX`, so the lock is created only when
 * nobody holds it and expires on its own after LOCK_TTL_MS.
 *
 * @param spaceId - space whose sync cycle is about to run.
 * @returns true when this instance now holds the lock, false if another
 *   replica holds it.
 */
private async acquire(spaceId: string): Promise<boolean> {
  const lockKey = `git-sync:lock:${spaceId}`;
  const reply = await this.redis.set(lockKey, this.instanceId, 'PX', LOCK_TTL_MS, 'NX');
  return reply === 'OK';
}
```
---
## 10. Планировщик и событийные триггеры
- **События (основной триггер).** `PageChangeListener` подписывается на
`EventName.PAGE_CREATED | PAGE_UPDATED | PAGE_MOVED | PAGE_SOFT_DELETED |
PAGE_RESTORED | PAGE_MOVED_TO_SPACE` и job `PAGE_CONTENT_UPDATED`
([event.contants.ts](../apps/server/src/common/events/event.contants.ts)).
Фильтр по `spaceId` (только включённые спейсы) → дебаунс (`GIT_SYNC_DEBOUNCE_MS`)
→ ставит pull/push-цикл спейса в очередь оркестратора.
- Loop-guard: события от собственных записей (`source==='git-sync'` + совпавший
хэш) пропускаем (§8.2).
- **Поллинг-страховка.** `@Interval(GIT_SYNC_POLL_INTERVAL_MS)` в оркестраторе:
по каждому включённому спейсу (под локом) — реконсиляция (`listRecentSince` +
`listTrash`), ловит пропущенные события и стартовую сверку после простоя
(SPEC §12).
- Один цикл на спейс за раз (внутри-процессный мьютекс на `spaceId` поверх
Redis-лока).
---
## 11. Потоки данных (walkthroughs)
### 11.1. Первичный клон спейса (initial clone, SPEC §12)
1. `VaultGit.ensureRepo()` + `ensureBranch('docmost','main')` + `checkout('docmost')`.
2. `dataSource.listSpaceTree(spaceId)` → `{ pages, complete:true }`.
3. `readExisting({ listTracked: () => git.listTrackedFiles('*.md'), readFile })`.
4. `computePullActions({ pages, treeComplete:true, existing })` → план.
5. `applyPullActions(deps, actions, vaultRoot)`: на каждую страницу
`getPageJson` → `stabilizePageFile(content, meta)` (export→import→export
fixpoint, SPEC §11) → запись файла; затем `stageAll` + `commit` (трейлер
`docmost`) на `docmost`; `checkout('main')` + `merge('docmost')`.
6. Зафиксировать max `updatedAt` как стартовый `T_last`; `git push` в remote.
### 11.2. Docmost → FS (pull-цикл)
Триггер: событие/поллинг → (под локом) шаги §11.1 п.1–5 инкрементально. 3-way
merge `docmost→main` делает git: непересекающиеся правки сливаются, реальное
пересечение → conflict-маркеры в файле. **При конфликте push этой страницы в
Docmost блокируется** до ручного резолва (SPEC §9; фаза D).
### 11.3. FS → Docmost (push-цикл)
`runPush(deps, { dryRun })`:
1. `git.ensureRepo` / `isMergeInProgress` (abort при merge) / `checkout('main')`.
2. `stageAll` + `commit('local: working-tree changes')` (локально, в Docmost не шлёт).
3. База диффа: `readRef(LAST_PUSHED_REF)` ?? `docmost`; `revParse('main')` → `pushedCommit`.
4. `diffNameStatus(base, 'main')` → changes; префетч `metaAt(path, side)`.
5. `computePushActions({ changes, metaAt })` → creates/updates/deletes/renamesMoves/skipped.
6. `dryRun` → лог плана и выход (клиент НЕ создаётся).
7. `--apply`: `makeClient(settings)` → наш `GitmostDataSource`;
`applyPushActions`:
- update → `importPageMarkdown(pageId, fullMd)` (collab-write, §3.3);
- create → `createPage(...)` → записать присвоенный `pageId` обратно в meta;
- delete → `deletePage(pageId)` (Trash);
- rename/move → `classifyRenameMoves` → `movePage`/`renamePage`;
- при пустых failures: `updateRef(LAST_PUSHED_REF, pushedCommit)` +
`fastForwardBranch('docmost', pushedCommit)`.
8. Записать `bodyHash` + `updatedAt` (loop-guard, §8.2); `git push`.
---
## 12. Фазирование
- **A. Каркас + односторонний pull (нативно).** `packages/git-sync` (вендоринг
§2), `GitmostDataSource` (чтение через репозитории), `GitSyncModule`, конфиг из
`EnvironmentService`, ручной/однократный pull-цикл на один спейс. **Гейт §13.1.**
- **B. Push + непрерывность.** Нативная запись (§3.3), `runPush`, ветки/refs,
loop-guard (§8), Redis-лок (§9), `@Interval` + `PageChangeListener` (§10).
- **C. Per-space UI.** `space.settings.gitSync` (§7.1), DTO/сервис/репо/гард,
тоггл на клиенте, скоуп оркестратора по включённым спейсам.
- **D. Харднинг.** Conflict-gating (SPEC §9), удаления через Trash + git (§5),
стартовая реконсиляция и `move-to-space` кросс-репо, провенанс на клиенте,
Dockerfile `git`, полный набор тестов.
---
## 13. Тестирование
### 13.1. Гейт идемпотентности (блокирует фазу B)
Перенести round-trip-харнес docmost-sync (`roundtrip.ts` + `test/fixtures/corpus`)
в тесты `packages/git-sync`, но прогонять **против схемы `editor-ext`**:
`content (editor-ext) → convertProseMirrorToMarkdown → markdownToProseMirror →
TiptapTransformer.toYdoc(…, tiptapExtensions) → fromYdoc → canonicalizeContent`
должно давать `docsCanonicallyEqual === true`. Любая потеря нод/атрибутов =
расхождение схем → чинить `docmost-schema.ts` под `editor-ext`.
### 13.2. Юнит (чистая логика, переносится как есть)
`reconcile` (planReconciliation / decideAbsenceDeletions / mass-delete guards),
`layout` (коллизии/санитизация), `computePullActions`, `computePushActions`,
`classifyRenameMoves`, `bodyHash`.
### 13.3. Интеграция (нативный адаптер)
`GitmostDataSource` против тестовой БД: `listSpaceTree`/`getPageJson` корректно
маппят; `createPage`/`movePage`/`deletePage`/`importPageMarkdown` пишут через
collab и проставляют `lastUpdatedSource='git-sync'`; loop-guard не зацикливается
(write → poll → no-op).
### 13.4. e2e (под локом)
Полный pull→push round-trip на временном vault + временном спейсе: правка в
Docmost доезжает в файл и наоборот; конфликт даёт маркеры и блокирует push.
---
## 14. Риски и открытые пункты
1. **Схема-совместимость конвертера** (§3.3, §13.1) — главный риск; гейт
обязателен до фазы B.
2. **`AuthProvenanceData`** — точную форму типа подтвердить; возможно, потребует
расширения enum источника на сервере и в истории.
3. **Согласованность Yjs** — писать строго через `openDirectConnection`/`transact`;
не трогать `content`-колонку напрямую.
4. **`position` для move** — обязателен в Docmost-move; нужен
`fractional-indexing-jittered` между соседями (соседей брать сортировкой
`position COLLATE "C"`).
5. **`git` в рантайме** — добавить в Dockerfile.
6. **`ScheduleModule.forRoot()`** — не задублировать `forRoot`.
7. **Сервисный пользователь записи** (`GIT_SYNC_SERVICE_USER_ID`) — от чьего имени
идут create/move (влияет на `creatorId`/права); согласовать политику.
8. **Конфликты и удаления** — фаза D строго по SPEC §8/§9 (маркеры никогда не
уезжают в Docmost).
---
## 15. Чек-лист изменений по файлам
**Новый пакет**
- `packages/git-sync/**` — движок + чистый конвертер (§2), `package.json`
(`@docmost/git-sync`, `workspace:*`), `tsconfig.json`.
**Сервер (`apps/server/src`)**
- `integrations/git-sync/**` — модуль, оркестратор, адаптер, листенер (§6).
- `app.module.ts` — импорт `GitSyncModule`.
- `collaboration/collaboration.module.ts` — экспорт `CollaborationGateway`.
- `collaboration/extensions/persistence.extension.ts` — источник `'git-sync'` (§8.1).
- `core/space/dto/update-space.dto.ts` — `gitSyncEnabled?` (§7.1).
- `core/space/services/space.service.ts` — обработка флага.
- `database/repos/space/space.repo.ts` — `updateGitSyncSettings` (§7.1).
- `integrations/environment/environment.validation.ts` + `environment.service.ts` —
новые ENV (§7.2).
- `Dockerfile` — пакет `git`.
**Клиент (`apps/client/src`)**
- `features/space/components/edit-space-form.tsx` — тоггл git-sync.
- `features/space/types` — поле `settings.gitSync`.
- `features/page-history/types/page.types.ts` + `components/history-item.tsx` —
значение `'git-sync'` в `lastUpdatedSource`.
**Корень**
- `pnpm-workspace.yaml` уже покрывает `packages/*`; `apps/server/package.json` —
зависимость `@docmost/git-sync: workspace:*`.

View File

@@ -1,145 +0,0 @@
# Улучшение качества RAG-поиска агента — план по итерациям
> Статус: живой документ. Итерация 1 **реализована** (см. ниже). Остальное —
> бэклог на следующие итерации, отсортированный по «качество / усилие».
> Контекст: gitmost — форк Docmost. Семантический поиск агента: per-workspace
> эмбеддинги в `page_embeddings` (pgvector, dimension-agnostic колонка, seq-scan
> с `<=>`), индексация через BullMQ (`reindexPage` / `reindexWorkspace`).
> Активная embedding-модель деплоя: OpenAI `text-embedding-3-large` (3072d).
## Как сверялось с реальным кодом
Внешнее предложение по улучшению RAG было сверено с кодовой базой. Точные факты
на момент итерации 1:
- Хранилище: [page_embeddings](../apps/server/src/database/migrations/20260617T120000-page-embeddings.ts),
колонка `embedding` сделана dimension-agnostic в
[20260617T140000](../apps/server/src/database/migrations/20260617T140000-page-embeddings-dimension-agnostic.ts);
`model_name` / `model_dimensions` хранятся по строке.
- Полнотекстовые индексы **уже существуют** (предложение ошибочно утверждало
обратное): `pages_tsv_idx` на `pages.tsv` и `attachments_tsv_idx`. Конфигурация —
`to_tsvector('english', f_unaccent(...))` + `setweight`
([тут](../apps/server/src/database/migrations/20250729T213756-add-unaccent-pg_trm-update-tsvector..ts)).
- Чанкинг: `RecursiveCharacterTextSplitter` 1000/200, без префиксов.
- Префиксы `query:` / `passage:` **не нужны**: они требуются для e5/bge/gte/Qwen3,
а деплой на OpenAI `text-embedding-3-large` (этот пункт предложения неприменим).
- Вложения (`attachment_id` в схеме есть) **не индексируются** — индексатор всегда
пишет `attachmentId: null`.
---
## Итерация 1 — РЕАЛИЗОВАНО
Три «низковисящих фрукта»:
### 1. Хлебные крошки заголовков в чанках
Файл: [embedding-indexer.service.ts](../apps/server/src/core/ai-chat/embedding/embedding-indexer.service.ts).
Каждый чанк префиксуется путём заголовков `«Заголовок страницы > H1 > H2»` перед
эмбеддингом. Крошки строятся обходом **ProseMirror JSON** (`heading`-ноды с
`attrs.level`), а не markdown-текста — поэтому `#` внутри fenced-код-блока (типичный
bash-сниппет в WirenBoard-вики) **никогда** не принимается за заголовок. Деградация
к старому plain-text чанкингу при отсутствии/сбое `content`. Префикс попадает и в
эмбеддинг, и в `content` (а значит — в лексический индекс `fts` и в сниппет агента).
### 2. Гибридный поиск (RRF), слияние двух инструментов в один
- Миграция [20260618T150000-page-embeddings-fts.ts](../apps/server/src/database/migrations/20260618T150000-page-embeddings-fts.ts):
генерируемая колонка `fts tsvector GENERATED ALWAYS AS (to_tsvector('english',
f_unaccent(content))) STORED` + GIN-индекс. Конфиг совпадает с `pages.tsv` (та же
обработка unaccent/Cyrillic); `f_unaccent` IMMUTABLE → триггер не нужен.
- Репозиторий: метод `hybridSearch` в
[page-embedding.repo.ts](../apps/server/src/database/repos/ai-chat/page-embedding.repo.ts) —
один SQL-запрос, два CTE (cosine + `websearch_to_tsquery`), слияние Reciprocal Rank
Fusion через FULL OUTER JOIN на уровне чанков. `k=60` (дефолт Cormack 2009 /
ES / OpenSearch / Weaviate), равные веса 1.0/1.0. RRF сливает **ранги**, поэтому
несовместимость шкал BM25 и косинуса не требует нормализации. Dimension-фильтр —
только на семантической стороне.
- Инструменты: `semanticSearch` удалён, `searchPages` стал единым гибридным
инструментом ([ai-chat-tools.service.ts](../apps/server/src/core/ai-chat/tools/ai-chat-tools.service.ts)).
Контроль доступа сохранён 1-в-1 (scope по доступным спейсам + пост-фильтр прав
страниц). Если эмбеддинги не настроены / эмбеддинг упал / нет доступных спейсов /
гибрид пуст → graceful fallback на прежний REST-полнотекст (CASL-enforced).
### 3. Переписывание запроса + описания инструментов
- Описание `searchPages` теперь явно просит агента переформулировать вопрос в
сфокусированный поисковый запрос и переискивать при слабой выдаче (это переживает
кастомный admin-промпт, т.к. лежит в описании инструмента).
- Одна строка-подсказка добавлена в `DEFAULT_PROMPT`
([ai-chat.prompt.ts](../apps/server/src/core/ai-chat/ai-chat.prompt.ts)).
> ВАЖНО после деплоя: чтобы крошки и `fts` появились у существующих страниц, нужна
> **переиндексация корпуса** (кнопка «Reindex now» / `WORKSPACE_CREATE_EMBEDDINGS`).
> Миграция заполнит `fts` у текущих строк автоматически, но крошки добавляются только
> при переиндексации (она же перезапишет `content`).
### Известные нюансы текущей реализации (осознанные компромиссы)
- Гибрид покрывает только проиндексированные чанки. Свежесозданная страница
становится искомой после отработки её BullMQ-`reindexPage`. Пока эмбеддинги не
настроены — работает только REST-fallback (полнотекст уровня страницы по `pages.tsv`).
- Если **весь** пул кандидатов гибрида (до 200 чанков) оказался из закрытых для
пользователя страниц, инструмент вернёт пусто, а не уйдёт в keyword-fallback.
Узкий кейс; возможное улучшение — fallback и при пустом результате пост-фильтра.
- `fts` использует конфиг `english` (как и `pages.tsv`) — без русской стеммизации.
Для русской вики это консистентно с текущим поиском; переход на `simple`/`russian`
конфиг — отдельная задача с переиндексацией.
- `candidates` (=clamp(limit×5, 50, 200)) служит и per-CTE лимитом, и финальным
лимитом слияния; веса RRF равные. Тюнится после появления оценочного харнесса.
---
## Бэклог следующих итераций (по приоритету «качество / усилие»)
### A. Реранкер (cross-encoder) — наибольший ROI после гибрида
Вставить между over-fetch гибрида и дедупом: брать топ-50–100 кандидатов от
`hybridSearch`, реранкать, оставлять топ-5–10. Ожидаемый прирост precision/MRR
+10–25 %. Точка вставки уже готова — это шаг между `hybridSearch(... candidates)` и
циклом дедупа в `searchPages`.
- Хостовый старт (раз уже на OpenAI-инфраструктуре): **Cohere Rerank** или
**Voyage `rerank-2.5`** — провайдер по аналогии с текущим pluggable embedding-конфигом.
- Self-hosted (под Ollama-этос): **BGE-reranker-v2-m3** через HF Text Embeddings
Inference (`/rerank`), либо FlashRank (ONNX/CPU, ~15–30 мс).
- Диагностика: если реранк не двигает метрики — узкое место в recall (чанкинг/гибрид),
а не в ранжировании.
### B. Индексация вложений — закрыть пробел покрытия
Схема уже готова (`attachment_id`). Добавить в BullMQ-flow шаг извлечения текста из
PDF/документов (PyMuPDF для цифровых PDF; OCR для сканов; для таблиц — markdown через
LLM-парсер) и вливать его в тот же путь чанк→эмбеддинг→`fts`, помечая `attachment_id`.
Структура извлечённых данных важнее голой точности OCR.
### C. Тюнинг гибрида и оценочный харнесс
- Золотой датасет 30–100 примеров (вопрос → нужная страница/чанк) + Ragas/DeepEval
(Recall@k, MRR/nDCG, context precision/recall, faithfulness). Прогон до/после
каждого изменения. **Этот шаг пропущен в итерации 1 осознанно** — без него все
нижеследующие тюнинги делаются «на глаз».
- После харнесса: тюнить веса RRF (старт 1.0/1.0), `k` (старт 60), число `candidates`.
- Эксперимент: чанки ~512 симв. против 1000 (предложение указывает на рост precision).
### D. Contextual Retrieval (Anthropic), если крошек мало
Один LLM-вызов на чанк добавляет предложение-контекст. Снижение провалов выдачи
на 35–49 %. Ложится в BullMQ-`reindexPage`; на сотнях страниц с prompt caching — копейки.
Применять, только если хлебных крошек окажется недостаточно против потери контекста.
### E. ParadeDB `pg_search` (настоящий BM25), если лексика станет узким местом
Нативный `ts_rank` использует только TF и длину документа, без IDF. `pg_search`
(Rust/Tantivy) даёт честный BM25-индекс. Не drop-in (свои операторы вместо `@@`) —
это изменение кода, а не флаг. На сотнях страниц нативного `tsvector` хватает; брать
только если качество лексического ранжирования упрётся в потолок.
### F. Прочее
- **Префиксы query/passage** — НЕ нужны на OpenAI. Внедрять только при переходе на
e5/bge/gte/Qwen3 (тогда индексатор ставит `passage:`, запрос — `query:`; BGE-v1.5,
наоборот, префиксов НЕ должна получать). Зафиксировано как ловушка на будущее.
- **Апгрейд embedding-модели** — уже на `text-embedding-3-large` (топ среди закрытых).
Matryoshka (обрезка размерности) — запас на будущее; dimension-agnostic колонка
делает миграцию тривиальной (цена — переэмбеддинг корпуса).
- **HyDE и широкий multi-query/RAG-Fusion** — НЕ рекомендуются как дефолт: в свежих
бенчмарках уступали и добавляют задержку/галлюцинации.
## Оговорки
- Все внешние числа (62→84 % precision, +17 % Recall@5, −35…49 % провалов, +10–25 %
от реранка) получены на ДРУГИХ корпусах (SEC-отчёты, финтекст, право, медицина).
На этой вики величины будут иными — поэтому пункт C (свой датасет) обязателен перед
тонким тюнингом. Внешние числа — направление, не гарантия величины.
- Часть источников предложения — вендорский маркетинг (Cohere, Voyage, ParadeDB);
направление подтверждается независимыми (T2-RAGBench, оценка Anthropic), но величины
у вендоров могут быть завышены.

View File

@@ -7,8 +7,10 @@ export interface HtmlEmbedOptions {
}
/** Attributes of the `HtmlEmbed` node. */
export interface HtmlEmbedAttributes {
/** Raw HTML/CSS/JS string rendered inside a sandboxed iframe by the NodeView. */
source?: string;
/** Fixed iframe height in pixels. null/absent => auto-resize via postMessage. */
height?: number | null;
}
declare module "@tiptap/core" {
@@ -98,6 +100,21 @@ export const HtmlEmbed = Node.create<HtmlEmbedOptions>({
"data-source": encodeHtmlEmbedSource(attributes.source || ""),
}),
},
// Fixed iframe height in px. null/absent => auto-resize on the client.
height: {
default: null,
parseHTML: (el) => {
const v = el.getAttribute("data-height");
// Missing or empty attribute means "no fixed height".
if (!v) return null;
// parseInt reads a leading integer, so a value such as "300px" parses as 300.
const n = parseInt(v, 10);
// A non-numeric data-height (e.g. crafted/corrupted import) must not
// become NaN: NaN is typeof "number" and would disable auto-resize and
// yield an unclamped iframe height downstream. Treat it as auto (null).
return Number.isFinite(n) ? n : null;
},
// Truthiness check: null, undefined and 0 all emit no data-height attribute.
renderHTML: (attrs: HtmlEmbedAttributes) =>
attrs.height ? { "data-height": String(attrs.height) } : {},
},
};
},

View File

@@ -797,6 +797,60 @@ const Embed = Node.create({
},
});
/**
 * Docmost raw HTML embed. Block atom; the client renders `source` inside a
 * sandboxed iframe. The MCP server never renders it — it only needs the
 * schema to accept and carry the node so a fromYdoc -> transform -> toYdoc
 * round-trip does not throw "Unknown node type: htmlEmbed". Mirrors the
 * @docmost/editor-ext node name, attribute keys and flags; keep in sync when
 * the editor-ext htmlEmbed schema changes.
 *
 * NOTE: unlike the canonical editor-ext node, `data-source` here is mapped as
 * plain text rather than base64-encoded. That is intentional: the MCP write
 * path carries the node through Yjs (fromYdoc -> toYdoc) on its JSON `source`
 * attribute and never invokes parseHTML/renderHTML, and htmlEmbed is not
 * produced from the markdown/HTML (generateJSON) path. If a future HTML path
 * for htmlEmbed is added here, this mapping must adopt editor-ext's base64
 * encode/decode to avoid double-encoding `source`.
 *
 * The node declares no `content` and is an atom, i.e. a leaf. Its DOM output
 * spec therefore must NOT contain a content hole (`0`): ProseMirror's
 * DOMSerializer rejects a content hole in a leaf node spec, which would make
 * any HTML serialization of a document containing this node throw.
 */
const HtmlEmbed = Node.create({
  name: "htmlEmbed",
  group: "block",
  inline: false,
  isolating: true,
  atom: true,
  defining: true,
  draggable: true,

  addAttributes() {
    return {
      // Raw embed markup, stored as plain text (see NOTE above).
      source: {
        default: "",
        parseHTML: (el: HTMLElement) => el.getAttribute("data-source") ?? "",
        renderHTML: (attrs: Record<string, any>) => ({
          "data-source": attrs.source ?? "",
        }),
      },
      // Fixed iframe height in px; null means "auto".
      height: {
        default: null,
        parseHTML: (el: HTMLElement) => {
          const v = el.getAttribute("data-height");
          if (!v) return null;
          const n = parseInt(v, 10);
          // Non-numeric input must not turn into NaN; treat it as auto (null).
          return Number.isFinite(n) ? n : null;
        },
        renderHTML: (attrs: Record<string, any>) =>
          attrs.height != null ? { "data-height": String(attrs.height) } : {},
      },
    };
  },

  parseHTML() {
    return [{ tag: 'div[data-type="htmlEmbed"]' }];
  },

  renderHTML({ HTMLAttributes }) {
    // Leaf node: no content hole in the output spec.
    return ["div", { "data-type": "htmlEmbed", ...HTMLAttributes }];
  },
});
/** Shared attribute set for drawio/excalidraw diagram nodes. */
const diagramAttributes = () => ({
src: {
@@ -1158,6 +1212,7 @@ export const docmostExtensions = [
Video,
Youtube,
Embed,
HtmlEmbed,
Drawio,
Excalidraw,
Columns,