From 8842bc8bf3774218d2a69b6bca3d1b35d658dba0 Mon Sep 17 00:00:00 2001 From: claude_code Date: Sun, 28 Jun 2026 19:08:06 +0300 Subject: [PATCH] =?UTF-8?q?fix(sandbox):=20address=20PR=20#250=20follow-up?= =?UTF-8?q?=20review=20=E2=80=94=20XSS=20hardening,=20eviction=20reconcile?= =?UTF-8?q?,=20doc=20sync=20(#243)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Security (must-fix): - sandbox.controller: the anonymous GET /api/sb/:id response now sets X-Content-Type-Options: nosniff, a restrictive CSP, and Content-Disposition= attachment for any mime outside a raster-image allowlist (png/jpeg/gif/webp/ avif). entry.mime is attacker-controlled, so an evil.svg/evil.html could otherwise execute script inline on the Docmost origin (stored XSS). Mirrors the public attachment route's hardening. Stability: - client.stashPage: reconcile mirrors AFTER the final document put, not only before it. The doc blob is the newest entry and FIFO eviction drops the oldest = this stash's own images, so the stored doc could reference an evicted blob (consumer 404) and over-report images.mirrored. A bounded loop now reverts doc-put-evicted mirrors, drops the stale doc blob, and re-puts until stable. Regenerated packages/mcp/build/. - sandbox.controller: emit Cache-Control on the 304 branch too (ttlSeconds is computed before the conditional check). Docs: - Bump the MCP tool count 39 -> 40 across all READMEs and AGENTS.md (the registry now exposes exactly 40 tools). Refactor: - SandboxStore.asSink() centralizes the {put,has,evict} sink + uri<->id mapping; the embedded-MCP and in-app agent-tools wiring sites share it. Tests: - security headers (inline vs attachment, nosniff, CSP), 304 Cache-Control, putAndLink URL form, has()/remove(), asSink() round-trip, getSandboxPublicUrl (trailing-slash trim + APP_URL fallback), and a stash test where the doc put itself evicts a mirrored image. Co-Authored-By: Claude Opus 4.8 --- AGENTS.md | 2 +- README.md | 6 +- README.ru.md | 6 +- .../ai-chat/tools/ai-chat-tools.service.ts | 12 +-- .../environment/environment.service.spec.ts | 25 ++++++ .../src/integrations/mcp/mcp.service.ts | 11 +-- .../sandbox/sandbox.controller.spec.ts | 72 +++++++++++++++++ .../sandbox/sandbox.controller.ts | 54 ++++++++++--- .../sandbox/sandbox.store.spec.ts | 32 ++++++++ .../src/integrations/sandbox/sandbox.store.ts | 20 +++++ packages/mcp/README.md | 4 +- packages/mcp/README.ru.md | 4 +- packages/mcp/build/client.js | 81 +++++++++++++------ packages/mcp/src/client.ts | 81 +++++++++++++------ packages/mcp/test/mock/stash-page.test.mjs | 48 +++++++++++ 15 files changed, 371 insertions(+), 87 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index e8eed03d..3b84f0d0 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -254,7 +254,7 @@ The API server is a Fastify app with a global `/api` prefix (`main.ts` excludes - **Redis** backs caching, the BullMQ queues, the WebSocket Socket.IO adapter, and collaboration sync. ### The two AI subsystems (the main fork additions) -1. **Embedded MCP server** (`integrations/mcp/` + `packages/mcp`). The standalone `@docmost/mcp` server (39 agent-native tools: per-block patch/insert/delete by id, scripted `(doc)=>doc` transforms with dry-run diff, table editing, version diff/restore, comments, images, shares) is bundled and served over HTTP at `/mcp`. It writes through Docmost's real-time-collaboration layer so concurrent human edits aren't clobbered. Each request authenticates **per-user** via the `Authorization` header — either HTTP Basic (`base64(email:password)`, the user's own Docmost login, validated through `AuthService`) or a Bearer access JWT (the user's `authToken`) — and the session acts under that user's permissions. `MCP_DOCMOST_EMAIL` / `MCP_DOCMOST_PASSWORD` are an **optional service-account fallback**, used only when a request carries neither Basic nor Bearer credentials (back-compat for CI/scripts). An admin enables MCP with a workspace toggle (Workspace settings → AI). Optionally protected by a shared `MCP_TOKEN`: when set, every `/mcp` request must carry a matching `X-MCP-Token` header (its own header, separate from `Authorization`, which now carries the per-user Basic/Bearer credentials). Note: this changed from the older `Authorization: Bearer ` scheme — see `.env.example` and the CHANGELOG Breaking Changes entry. +1. **Embedded MCP server** (`integrations/mcp/` + `packages/mcp`). The standalone `@docmost/mcp` server (40 agent-native tools: per-block patch/insert/delete by id, scripted `(doc)=>doc` transforms with dry-run diff, table editing, version diff/restore, comments, images, shares) is bundled and served over HTTP at `/mcp`. It writes through Docmost's real-time-collaboration layer so concurrent human edits aren't clobbered. Each request authenticates **per-user** via the `Authorization` header — either HTTP Basic (`base64(email:password)`, the user's own Docmost login, validated through `AuthService`) or a Bearer access JWT (the user's `authToken`) — and the session acts under that user's permissions. `MCP_DOCMOST_EMAIL` / `MCP_DOCMOST_PASSWORD` are an **optional service-account fallback**, used only when a request carries neither Basic nor Bearer credentials (back-compat for CI/scripts). An admin enables MCP with a workspace toggle (Workspace settings → AI). Optionally protected by a shared `MCP_TOKEN`: when set, every `/mcp` request must carry a matching `X-MCP-Token` header (its own header, separate from `Authorization`, which now carries the per-user Basic/Bearer credentials). Note: this changed from the older `Authorization: Bearer ` scheme — see `.env.example` and the CHANGELOG Breaking Changes entry. 2. **AI agent chat** (`core/ai-chat/` server + `apps/client/src/features/ai-chat/` client). A built-in agent over the wiki using the Vercel **AI SDK** (`ai`, `@ai-sdk/*`) against any OpenAI-compatible provider configured per workspace (`integrations/ai/` — credentials encrypted at rest via `integrations/crypto`, stored in `ai_provider_credentials`). Key pieces: - `core/ai-chat/tools/` — the agent's ~40 read+write tools. Every tool runs under the **calling user's** CASL permissions via a per-user loopback access token (`docmost-client.loader.ts`), so the agent can never exceed what the user could do. Only **reversible** operations are exposed (page history + trash; no permanent delete). Agent edits get an "AI agent" provenance badge in page history (`20260616T130000-agent-provenance` migration). - `core/ai-chat/embedding/` — RAG indexer + a BullMQ consumer on `AI_QUEUE` that embeds pages into `page_embeddings` (vector search), complementing Postgres full-text search. Pages are (re)indexed on edit; `AI_EMBEDDING_TIMEOUT_MS` bounds a hung embeddings endpoint. diff --git a/README.md b/README.md index 8fd95cf5..5f0357f3 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ The goal of the fork is a **100% open, AGPL-only build with no Enterprise-Editio | --- | --- | | **EE code removed** | Stripped all client and server Enterprise-Edition code; ships as a clean community/AGPL build with no license checks. | | **Comment resolution** | Re-implemented from scratch as a community feature (resolve / re-open with Open/Resolved tabs). No EE code reused, available to anyone who can comment. | -| **Embedded MCP server** | A community MCP server (`@docmost/mcp`, 39 tools) is served over HTTP at `/mcp` — no enterprise license required. Replaces the removed license-gated EE MCP. | +| **Embedded MCP server** | A community MCP server (`@docmost/mcp`, 40 tools) is served over HTTP at `/mcp` — no enterprise license required. Replaces the removed license-gated EE MCP. | | **AI agent chat** | Built-in AI agent chat over your wiki, written from scratch as a community feature — no enterprise license. The agent reads and edits pages on your behalf (scoped to your permissions), with full-text + vector (RAG) search and optional web access via external MCP servers. | | **Rebranding** | App logo / name changed from *Docmost* to *Gitmost*. | | **Compact page tree** | Default page-tree indentation reduced from 16px to 8px per nesting level. | @@ -44,7 +44,7 @@ The goal of the fork is a **100% open, AGPL-only build with no Enterprise-Editio ### Embedded MCP server Gitmost has **our own MCP server** — [docmost-mcp](https://github.com/vvzvlad/docmost-mcp), -which we wrote — **built directly into the app** and served at `/mcp`. It exposes **39 +which we wrote — **built directly into the app** and served at `/mcp`. It exposes **40 agent-native tools**: surgical per-block edits (patch / insert / delete by id), structure-preserving find/replace, scripted `(doc) => doc` transforms with a dry-run diff, structured table editing, version history with diff / restore, comments, images and share @@ -60,7 +60,7 @@ every little fix. And it needs no enterprise license. | | **Gitmost `/mcp` (our docmost-mcp)** | Docmost's built-in MCP | | --- | :---: | :---: | | **Enterprise license** | Not required | Required | -| **Tools** | 39, agent-native | Coarse (read Markdown, page CRUD, replace whole page) | +| **Tools** | 40, agent-native | Coarse (read Markdown, page CRUD, replace whole page) | | **Per-block edits / find-replace / scripted transforms** | ✅ | — | | **Structured table editing, version diff / restore** | ✅ | — | | **Comments, images, share links** | ✅ | — | diff --git a/README.ru.md b/README.ru.md index ca980d31..0e9becde 100644 --- a/README.ru.md +++ b/README.ru.md @@ -33,7 +33,7 @@ | --- | --- | | **Удалён EE-код** | Вырезан весь код Enterprise-редакции на клиенте и сервере; это чистая community/AGPL-сборка без лицензионных проверок. | | **Резолв комментариев** | Переписан с нуля как community-функция (резолв / переоткрытие с вкладками «Открытые» / «Решённые»). EE-код не используется, доступно любому, кто может комментировать. | -| **Встроенный MCP-сервер** | Community MCP-сервер (`@docmost/mcp`, 39 инструментов) отдаётся по HTTP на `/mcp` — без enterprise-лицензии. Заменяет удалённый лицензируемый EE MCP. | +| **Встроенный MCP-сервер** | Community MCP-сервер (`@docmost/mcp`, 40 инструментов) отдаётся по HTTP на `/mcp` — без enterprise-лицензии. Заменяет удалённый лицензируемый EE MCP. | | **Чат с AI-агентом** | Встроенный чат с AI-агентом по содержимому вики, написанный с нуля как community-функция — без enterprise-лицензии. Агент читает и редактирует страницы от вашего имени (в рамках ваших прав), с полнотекстовым + векторным (RAG) поиском и опциональным доступом в интернет через внешние MCP-серверы. | | **Ребрендинг** | Логотип / название приложения изменены с *Docmost* на *Gitmost*. | | **Компактное дерево страниц** | Отступ дерева страниц по умолчанию уменьшен с 16px до 8px на уровень вложенности. | @@ -44,7 +44,7 @@ В Gitmost есть **наш собственный MCP-сервер** — [docmost-mcp](https://github.com/vvzvlad/docmost-mcp), который мы написали сами, — **встроенный прямо в приложение** и доступный на `/mcp`. Он даёт -**39 agent-native инструментов**: точечное редактирование по блокам (patch / insert / delete +**40 agent-native инструментов**: точечное редактирование по блокам (patch / insert / delete по id), find/replace с сохранением структуры, скриптовые трансформации `(doc) => doc` с предпросмотром диффа, структурное редактирование таблиц, история версий с диффом / восстановлением, комментарии, изображения и ссылки на шаринг — всё применяется через слой @@ -60,7 +60,7 @@ real-time-коллаборации Docmost, поэтому запись нико | | **`/mcp` в Gitmost (наш docmost-mcp)** | Родной MCP у Docmost | | --- | :---: | :---: | | **Enterprise-лицензия** | Не нужна | Нужна | -| **Инструменты** | 39, agent-native | Примитивные (Markdown, CRUD страниц, замена целиком) | +| **Инструменты** | 40, agent-native | Примитивные (Markdown, CRUD страниц, замена целиком) | | **Правки по блокам / find-replace / скриптовые трансформации** | ✅ | — | | **Структурное редактирование таблиц, дифф / восстановление версий** | ✅ | — | | **Комментарии, изображения, ссылки на шаринг** | ✅ | — | diff --git a/apps/server/src/core/ai-chat/tools/ai-chat-tools.service.ts b/apps/server/src/core/ai-chat/tools/ai-chat-tools.service.ts index 7e586d83..abe10219 100644 --- a/apps/server/src/core/ai-chat/tools/ai-chat-tools.service.ts +++ b/apps/server/src/core/ai-chat/tools/ai-chat-tools.service.ts @@ -92,20 +92,14 @@ export class AiChatToolsService { // Bind the stash tool to the shared in-RAM SandboxStore. The store owns the // anonymous-URL composition (putAndLink) and the live/evict probes the MCP // package needs to keep its mirror counts honest under FIFO eviction (the - // package never touches env or the store). The sink speaks `uri`s, so the - // probes map a uri back to its id (the last path segment). - const idOf = (uri: string) => uri.substring(uri.lastIndexOf('/') + 1); - + // package never touches env or the store). asSink() centralizes the uri↔id + // mapping next to putAndLink, shared with the embedded-MCP wiring site. const { DocmostClient, sharedToolSpecs } = await loadDocmostMcp(); const client: DocmostClientLike = new DocmostClient({ apiUrl, getToken, getCollabToken, - sandbox: { - put: (buf, mime) => this.sandboxStore.putAndLink(buf, mime), - has: (uri) => this.sandboxStore.has(idOf(uri)), - evict: (uri) => this.sandboxStore.remove(idOf(uri)), - }, + sandbox: this.sandboxStore.asSink(), }); // Build an ai-SDK tool from a shared, zod-agnostic spec. The spec owns the diff --git a/apps/server/src/integrations/environment/environment.service.spec.ts b/apps/server/src/integrations/environment/environment.service.spec.ts index 03970b1b..3e91e10a 100644 --- a/apps/server/src/integrations/environment/environment.service.spec.ts +++ b/apps/server/src/integrations/environment/environment.service.spec.ts @@ -40,4 +40,29 @@ describe('EnvironmentService', () => { expect(build(undefined).getSandboxTtlMs()).toBe(3_600_000); }); }); + + describe('getSandboxPublicUrl', () => { + // Stub that resolves BOTH keys the public-url logic consults. + const build = (vals: { sandboxUrl?: string; appUrl?: string }) => + new EnvironmentService({ + get: (key: string, def?: string) => + key === 'SANDBOX_PUBLIC_URL' + ? (vals.sandboxUrl ?? def) + : key === 'APP_URL' + ? (vals.appUrl ?? def) + : def, + } as any); + + it('uses SANDBOX_PUBLIC_URL and trims a trailing slash', () => { + expect( + build({ sandboxUrl: 'https://docs.example.com/' }).getSandboxPublicUrl(), + ).toBe('https://docs.example.com'); + }); + + it('falls back to APP_URL (origin) when SANDBOX_PUBLIC_URL is unset', () => { + expect( + build({ appUrl: 'https://app.example.com' }).getSandboxPublicUrl(), + ).toBe('https://app.example.com'); + }); + }); }); diff --git a/apps/server/src/integrations/mcp/mcp.service.ts b/apps/server/src/integrations/mcp/mcp.service.ts index 84a8d535..92ccf4c4 100644 --- a/apps/server/src/integrations/mcp/mcp.service.ts +++ b/apps/server/src/integrations/mcp/mcp.service.ts @@ -119,15 +119,10 @@ export class McpService implements OnModuleDestroy { // Bind the stash tool to the shared in-RAM SandboxStore. The store owns the // anonymous-URL composition (putAndLink) and the live/evict probes the MCP // package needs to keep its mirror counts honest under FIFO eviction; the - // package owns neither env nor the store. The sink speaks `uri`s, so the - // probes map a uri back to its id (the last path segment). + // package owns neither env nor the store. The uri↔id mapping now lives on the + // store (asSink), shared with the in-app agent-tools wiring site. private buildSandboxConfig(): DocmostMcpConfig['sandbox'] { - const idOf = (uri: string) => uri.substring(uri.lastIndexOf('/') + 1); - return { - put: (buf, mime) => this.sandboxStore.putAndLink(buf, mime), - has: (uri) => this.sandboxStore.has(idOf(uri)), - evict: (uri) => this.sandboxStore.remove(idOf(uri)), - }; + return this.sandboxStore.asSink(); } // Service account the embedded MCP uses to talk back to this Docmost diff --git a/apps/server/src/integrations/sandbox/sandbox.controller.spec.ts b/apps/server/src/integrations/sandbox/sandbox.controller.spec.ts index d5930b47..9021e070 100644 --- a/apps/server/src/integrations/sandbox/sandbox.controller.spec.ts +++ b/apps/server/src/integrations/sandbox/sandbox.controller.spec.ts @@ -187,4 +187,76 @@ describe('SandboxController', () => { expect(maxAge).toBeGreaterThanOrEqual(0); expect(maxAge).toBeLessThanOrEqual(60); }); + + it('emits Cache-Control alongside ETag on the 304 branch', async () => { + const sha = '3'.repeat(64); + const store = { + get: jest.fn().mockReturnValue(entry(Buffer.from('x'), 'application/json', sha)), + }; + const controller = new SandboxController(store as any); + const res = makeRes(); + + await controller.get(VALID_ID, makeReq({ 'if-none-match': `"${sha}"` }), res); + + expect(res._sent.status).toBe(304); + expect(res._sent.headers['cache-control']).toMatch( + /^private, max-age=\d+, immutable$/, + ); + }); + + it('sets nosniff + restrictive CSP and serves an allowlisted image inline', async () => { + const sha = '4'.repeat(64); + const store = { + get: jest.fn().mockReturnValue(entry(Buffer.from('x'), 'image/png', sha)), + }; + const controller = new SandboxController(store as any); + const res = makeRes(); + + await controller.get(VALID_ID, makeReq(), res); + + expect(res._sent.status).toBe(200); + expect(res._sent.headers['x-content-type-options']).toBe('nosniff'); + expect(res._sent.headers['content-security-policy']).toBe( + "base-uri 'none'; object-src 'self'; default-src 'self';", + ); + expect(res._sent.headers['content-disposition']).toBe('inline'); + }); + + it('forces an SVG to download (attachment) while keeping nosniff + CSP', async () => { + const sha = '5'.repeat(64); + const store = { + get: jest.fn().mockReturnValue(entry(Buffer.from(''), 'image/svg+xml', sha)), + }; + const controller = new SandboxController(store as any); + const res = makeRes(); + + await controller.get(VALID_ID, makeReq(), res); + + expect(res._sent.status).toBe(200); + expect(res._sent.headers['content-disposition']).toBe('attachment'); + expect(res._sent.headers['x-content-type-options']).toBe('nosniff'); + expect(res._sent.headers['content-security-policy']).toBe( + "base-uri 'none'; object-src 'self'; default-src 'self';", + ); + }); + + it('forces text/html to download (attachment) while keeping nosniff + CSP', async () => { + const sha = '6'.repeat(64); + const store = { + get: jest + .fn() + .mockReturnValue(entry(Buffer.from('

x

'), 'text/html', sha)), + }; + const controller = new SandboxController(store as any); + const res = makeRes(); + + await controller.get(VALID_ID, makeReq(), res); + + expect(res._sent.status).toBe(200); + expect(res._sent.headers['content-disposition']).toBe('attachment'); + expect(res._sent.headers['x-content-type-options']).toBe('nosniff'); + expect(res._sent.headers['content-security-policy']).toBe( + "base-uri 'none'; object-src 'self'; default-src 'self';", + ); + }); }); diff --git a/apps/server/src/integrations/sandbox/sandbox.controller.ts b/apps/server/src/integrations/sandbox/sandbox.controller.ts index c01f0cba..4cfd320a 100644 --- a/apps/server/src/integrations/sandbox/sandbox.controller.ts +++ b/apps/server/src/integrations/sandbox/sandbox.controller.ts @@ -9,6 +9,18 @@ import { SANDBOX_ROUTE_SEGMENT } from './sandbox.constants'; const UUID_RE = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/; +// MIME types safe to render inline in a browser. SVG is deliberately EXCLUDED +// (it can carry script), as are text/html and the JSON document blob — anything +// not on this list is served as an attachment so an attacker-controlled mime can +// never execute script on this origin (the route is anonymous + same-origin). +const INLINE_SAFE_MIME = new Set([ + 'image/png', + 'image/jpeg', + 'image/gif', + 'image/webp', + 'image/avif', +]); + /** * Anonymous read endpoint for the in-RAM blob sandbox. * @@ -22,6 +34,12 @@ const UUID_RE = * It only ever serves blobs looked up from the SandboxStore by a validated * UUID; `:id` is never used as a filesystem path, so there is no traversal * surface. Never returns tokens, never 401s. + * + * Anti-XSS hardening mirrors the public attachment route: every response sets + * `X-Content-Type-Options: nosniff` and a restrictive CSP, and serves any mime + * NOT on the inline-safe allowlist (svg/html/the JSON document blob) as an + * attachment, so an attacker-controlled `entry.mime` can never execute script + * on this same-origin anonymous route. */ @Controller(SANDBOX_ROUTE_SEGMENT) export class SandboxController { @@ -52,17 +70,32 @@ export class SandboxController { // body — the original bug this whole channel exists to fix. const etag = `"${entry.sha256}"`; - // Conditional request: an exact ETag match → 304 with no body. The blob is - // immutable, so the validator is stable for the blob's whole lifetime. - if (this.ifNoneMatchMatches(req.headers['if-none-match'], entry.sha256)) { - res.status(304).header('ETag', etag).send(); - return; - } - + // Compute freshness BEFORE the conditional check: a 304 conditional + // revalidation must not lose the Cache-Control freshness directives, or a + // revalidating client would forget how long the blob stays fresh. const ttlSeconds = Math.max( 0, Math.floor((entry.expiresAt - Date.now()) / 1000), ); + // Capability URL — keep it out of shared caches; immutable for its TTL. + const cacheControl = `private, max-age=${ttlSeconds}, immutable`; + + // Conditional request: an exact ETag match → 304 with no body. The blob is + // immutable, so the validator is stable for the blob's whole lifetime. + if (this.ifNoneMatchMatches(req.headers['if-none-match'], entry.sha256)) { + res + .status(304) + .header('ETag', etag) + .header('Cache-Control', cacheControl) + .send(); + return; + } + + // Non-allowlisted mimes (svg/html/the JSON blob) are forced to download so + // an attacker-controlled mime can never run script inline on this origin. + const disposition = INLINE_SAFE_MIME.has(entry.mime) + ? 'inline' + : 'attachment'; // Use @Res() + res.send(Buffer) with an explicit Content-Type so the binary // body bypasses the global JSON response transform/serializer. @@ -72,8 +105,11 @@ export class SandboxController { 'Content-Type': entry.mime, 'Content-Length': entry.buf.length, ETag: etag, - // Capability URL — keep it out of shared caches; immutable for its TTL. - 'Cache-Control': `private, max-age=${ttlSeconds}, immutable`, + 'Cache-Control': cacheControl, + 'X-Content-Type-Options': 'nosniff', + 'Content-Security-Policy': + "base-uri 'none'; object-src 'self'; default-src 'self';", + 'Content-Disposition': disposition, }) .send(entry.buf); } diff --git a/apps/server/src/integrations/sandbox/sandbox.store.spec.ts b/apps/server/src/integrations/sandbox/sandbox.store.spec.ts index d4e81706..8ef90774 100644 --- a/apps/server/src/integrations/sandbox/sandbox.store.spec.ts +++ b/apps/server/src/integrations/sandbox/sandbox.store.spec.ts @@ -130,4 +130,36 @@ describe('SandboxStore', () => { /total store cap/, ); }); + + it('putAndLink composes the anonymous /api/sb/ url with matching integrity', () => { + store = new SandboxStore(makeEnv()); + const buf = Buffer.from('hello link', 'utf8'); + const expected = createHash('sha256').update(buf).digest('hex'); + + const res = store.putAndLink(buf, 'image/png'); + expect(res.uri).toMatch(/^https:\/\/example\.test\/api\/sb\/[0-9a-f-]{36}$/); + expect(res.sha256).toBe(expected); + expect(res.size).toBe(buf.length); + }); + + it('has()/remove() report and free a blob by id', () => { + store = new SandboxStore(makeEnv()); + const { id } = store.put(Buffer.from('x'), 'text/plain'); + + expect(store.has(id)).toBe(true); + store.remove(id); + expect(store.has(id)).toBe(false); + expect(store.bytes).toBe(0); + }); + + it('asSink() round-trips put/has/evict through the anonymous uri', () => { + store = new SandboxStore(makeEnv()); + const sink = store.asSink(); + const buf = Buffer.from('sink bytes', 'utf8'); + + const r = sink.put(buf, 'image/png'); + expect(sink.has(r.uri)).toBe(true); + sink.evict(r.uri); + expect(sink.has(r.uri)).toBe(false); + }); }); diff --git a/apps/server/src/integrations/sandbox/sandbox.store.ts b/apps/server/src/integrations/sandbox/sandbox.store.ts index 7d604d32..e4511974 100644 --- a/apps/server/src/integrations/sandbox/sandbox.store.ts +++ b/apps/server/src/integrations/sandbox/sandbox.store.ts @@ -108,6 +108,26 @@ export class SandboxStore implements OnModuleDestroy { }; } + /** + * Adapter to the package's blob-sandbox sink contract `{ put, has, evict }`. + * The sink speaks anonymous `uri`s while the store is keyed by `id`, so this is + * the ONE place that maps a sandbox uri back to its id (the last path segment). + * Both wiring sites (embedded MCP + in-app agent tools) use this so the uri↔id + * mapping and URL composition live next to putAndLink, not copy-pasted. + */ + asSink(): { + put: (buf: Buffer, mime: string) => { uri: string; sha256: string; size: number }; + has: (uri: string) => boolean; + evict: (uri: string) => void; + } { + const idOf = (uri: string) => uri.substring(uri.lastIndexOf('/') + 1); + return { + put: (buf, mime) => this.putAndLink(buf, mime), + has: (uri) => this.has(idOf(uri)), + evict: (uri) => this.remove(idOf(uri)), + }; + } + /** True if the blob is still live (not evicted/expired). */ has(id: string): boolean { return this.get(id) !== undefined; diff --git a/packages/mcp/README.md b/packages/mcp/README.md index 57dd8d9d..be83177d 100644 --- a/packages/mcp/README.md +++ b/packages/mcp/README.md @@ -16,7 +16,7 @@ license. > that interface. Other Docmost MCPs are human-shaped — they expose "open the page" and > "replace the page"; this one exposes the editing primitives a model is good at. -It exposes **39 tools** built around three ideas that the other Docmost MCPs do not +It exposes **40 tools** built around three ideas that the other Docmost MCPs do not combine: 1. **Surgical, token-cheap edits.** Address a single block by id and patch it, or run @@ -106,7 +106,7 @@ There are several Docmost MCPs. Here is a capability-by-capability comparison. ## Tools -All 39 tools, grouped by what you'd reach for them. +All 40 tools, grouped by what you'd reach for them. ### Exploration & retrieval diff --git a/packages/mcp/README.ru.md b/packages/mcp/README.ru.md index 305f7013..e9f21569 100644 --- a/packages/mcp/README.ru.md +++ b/packages/mcp/README.ru.md @@ -17,7 +17,7 @@ > «открыть страницу» и «заменить страницу»; этот даёт примитивы редактирования, в которых > модель сильна. -Сервер предоставляет **39 инструментов**, построенных вокруг трёх идей, которые другие +Сервер предоставляет **40 инструментов**, построенных вокруг трёх идей, которые другие Docmost-MCP не сочетают: 1. **Точечные, экономичные по токенам правки.** Адресуйте отдельный блок по id и патчите @@ -109,7 +109,7 @@ Docmost-MCP не сочетают: ## Инструменты -Все 39 инструментов, сгруппированы по задачам, для которых вы их возьмёте. +Все 40 инструментов, сгруппированы по задачам, для которых вы их возьмёте. ### Чтение и поиск diff --git a/packages/mcp/build/client.js b/packages/mcp/build/client.js index 80da5454..db5240bf 100644 --- a/packages/mcp/build/client.js +++ b/packages/mcp/build/client.js @@ -723,37 +723,68 @@ export class DocmostClient { } })); } - // Reconcile against FIFO eviction: a heavy page can have a later image-put - // evict an EARLIER image stored in this SAME stash. The stored doc must not - // reference an evicted blob (consumer 404) and the counts must not lie, so - // for any mirror whose blob is gone, revert its nodes to their original - // internal srcs and re-count it as failed. + // Revert one mirror's nodes to their original internal srcs and re-count it + // as failed (its blob was FIFO-evicted before the doc could reference it + // safely). + const revertMirror = (mirror) => { + for (const entry of mirror.entries) + entry.node.attrs.src = entry.origSrc; + mirrored--; + failed++; + console.warn(`stash_page: mirrored blob ${mirror.uri} was evicted before the doc ` + + `could safely reference it; reverted its src and counted it as failed`); + }; + // Pre-put reconciliation: an image put earlier in THIS stash can FIFO-evict + // an even-earlier image of the same stash. Drop those from the live set + // first so the first serialized doc is already mostly correct. + let liveMirrors = mirrors; if (this.sandboxHas) { + liveMirrors = []; for (const mirror of mirrors) { - if (!this.sandboxHas(mirror.uri)) { - for (const entry of mirror.entries) { - entry.node.attrs.src = entry.origSrc; - } - mirrored--; - failed++; - console.warn(`stash_page: mirrored blob ${mirror.uri} was evicted before ` + - `the doc was stored; reverted its src and counted it as failed`); - } + if (this.sandboxHas(mirror.uri)) + liveMirrors.push(mirror); + else + revertMirror(mirror); } } - const docBuf = Buffer.from(JSON.stringify(cloned), "utf8"); + // Put the document, then reconcile against eviction caused by the doc put + // ITSELF (the doc is newest, FIFO drops oldest = this stash's images). Each + // iteration reverts >=1 mirror, so the loop terminates (worst case: all + // images reverted and the doc references no sandbox image URLs). let stored; - try { - stored = this.sandboxPut(docBuf, "application/json"); - } - catch (err) { - // The doc put failed (e.g. doc exceeds the cap). Free this op's image - // blobs instead of leaking them in RAM for the whole TTL, then re-throw. - if (this.sandboxEvict) { - for (const mirror of mirrors) - this.sandboxEvict(mirror.uri); + for (;;) { + const docBuf = Buffer.from(JSON.stringify(cloned), "utf8"); + let docStored; + try { + docStored = this.sandboxPut(docBuf, "application/json"); } - throw err; + catch (err) { + // The doc put failed (e.g. doc exceeds the cap). Free this op's image + // blobs instead of leaking them in RAM for the whole TTL, then + // re-throw. + if (this.sandboxEvict) { + for (const mirror of liveMirrors) + this.sandboxEvict(mirror.uri); + } + throw err; + } + if (!this.sandboxHas) { + stored = docStored; + break; + } + const evictedNow = liveMirrors.filter((m) => !this.sandboxHas(m.uri)); + if (evictedNow.length === 0) { + stored = docStored; + break; + } + // The doc we just stored references now-dead blobs. Revert those nodes, + // drop the stale doc blob, and loop to re-serialize + re-put the + // corrected doc. + for (const mirror of evictedNow) + revertMirror(mirror); + liveMirrors = liveMirrors.filter((m) => this.sandboxHas(m.uri)); + if (this.sandboxEvict) + this.sandboxEvict(docStored.uri); } return { uri: stored.uri, diff --git a/packages/mcp/src/client.ts b/packages/mcp/src/client.ts index f2afb716..f74a9af3 100644 --- a/packages/mcp/src/client.ts +++ b/packages/mcp/src/client.ts @@ -926,38 +926,69 @@ export class DocmostClient { ); } - // Reconcile against FIFO eviction: a heavy page can have a later image-put - // evict an EARLIER image stored in this SAME stash. The stored doc must not - // reference an evicted blob (consumer 404) and the counts must not lie, so - // for any mirror whose blob is gone, revert its nodes to their original - // internal srcs and re-count it as failed. + // Revert one mirror's nodes to their original internal srcs and re-count it + // as failed (its blob was FIFO-evicted before the doc could reference it + // safely). + const revertMirror = (mirror: { + uri: string; + entries: Array<{ node: any; origSrc: string }>; + }) => { + for (const entry of mirror.entries) entry.node.attrs.src = entry.origSrc; + mirrored--; + failed++; + console.warn( + `stash_page: mirrored blob ${mirror.uri} was evicted before the doc ` + + `could safely reference it; reverted its src and counted it as failed`, + ); + }; + + // Pre-put reconciliation: an image put earlier in THIS stash can FIFO-evict + // an even-earlier image of the same stash. Drop those from the live set + // first so the first serialized doc is already mostly correct. + let liveMirrors = mirrors; if (this.sandboxHas) { + liveMirrors = []; for (const mirror of mirrors) { - if (!this.sandboxHas(mirror.uri)) { - for (const entry of mirror.entries) { - entry.node.attrs.src = entry.origSrc; - } - mirrored--; - failed++; - console.warn( - `stash_page: mirrored blob ${mirror.uri} was evicted before ` + - `the doc was stored; reverted its src and counted it as failed`, - ); - } + if (this.sandboxHas(mirror.uri)) liveMirrors.push(mirror); + else revertMirror(mirror); } } - const docBuf = Buffer.from(JSON.stringify(cloned), "utf8"); + // Put the document, then reconcile against eviction caused by the doc put + // ITSELF (the doc is newest, FIFO drops oldest = this stash's images). Each + // iteration reverts >=1 mirror, so the loop terminates (worst case: all + // images reverted and the doc references no sandbox image URLs). let stored: { uri: string; sha256: string; size: number }; - try { - stored = this.sandboxPut(docBuf, "application/json"); - } catch (err) { - // The doc put failed (e.g. doc exceeds the cap). Free this op's image - // blobs instead of leaking them in RAM for the whole TTL, then re-throw. - if (this.sandboxEvict) { - for (const mirror of mirrors) this.sandboxEvict(mirror.uri); + for (;;) { + const docBuf = Buffer.from(JSON.stringify(cloned), "utf8"); + let docStored: { uri: string; sha256: string; size: number }; + try { + docStored = this.sandboxPut(docBuf, "application/json"); + } catch (err) { + // The doc put failed (e.g. doc exceeds the cap). Free this op's image + // blobs instead of leaking them in RAM for the whole TTL, then + // re-throw. + if (this.sandboxEvict) { + for (const mirror of liveMirrors) this.sandboxEvict(mirror.uri); + } + throw err; } - throw err; + + if (!this.sandboxHas) { + stored = docStored; + break; + } + const evictedNow = liveMirrors.filter((m) => !this.sandboxHas!(m.uri)); + if (evictedNow.length === 0) { + stored = docStored; + break; + } + // The doc we just stored references now-dead blobs. Revert those nodes, + // drop the stale doc blob, and loop to re-serialize + re-put the + // corrected doc. + for (const mirror of evictedNow) revertMirror(mirror); + liveMirrors = liveMirrors.filter((m) => this.sandboxHas!(m.uri)); + if (this.sandboxEvict) this.sandboxEvict(docStored.uri); } return { uri: stored.uri, diff --git a/packages/mcp/test/mock/stash-page.test.mjs b/packages/mcp/test/mock/stash-page.test.mjs index 647ea9d7..f53c077a 100644 --- a/packages/mcp/test/mock/stash-page.test.mjs +++ b/packages/mcp/test/mock/stash-page.test.mjs @@ -285,6 +285,54 @@ test("stashPage reverts a FIFO-evicted image and counts it as failed (B1)", asyn assert.equal(reverted, 2); }); +test("stashPage reverts an image evicted by the DOC put itself (after-put reconcile, B1)", async () => { + // Both images (1000 bytes each) survive the image phase: total 2000 <= cap + // 2500. The doc, however, serializes large (a node with a ~700-byte string + // attr), so putting it (newest) tips total over the cap and FIFO-evicts the + // OLDEST image (img0) — an eviction caused by the doc put itself, which only + // the after-put reconciliation can catch. The loop then reverts img0, drops + // the stale doc blob, and re-puts the corrected doc (now total = img1 + + // docSize <= cap, so img1 survives). + const BIG = Buffer.alloc(1000, 0x41); + const sandbox = makeSandbox({ maxTotal: 2500 }); + const doc = { + type: "doc", + content: [ + { type: "image", attrs: { src: "/api/files/att-0/pic.png", attachmentId: "att-0" } }, + { type: "image", attrs: { src: "/api/files/att-1/pic.png", attachmentId: "att-1" } }, + // Bulk the doc JSON up so the doc put crosses the cap on its own. Stays in + // the doc across reverts, so each re-serialization is similarly large. + { type: "paragraph", attrs: { filler: "x".repeat(700) }, content: [] }, + ], + }; + const client = await buildClient(sandbox, { doc, fileBytes: BIG }); + + const result = await client.stashPage("page-1"); + + // The doc put evicted exactly one image -> reverted + counted as failed. + assert.deepEqual(result.images, { mirrored: 1, failed: 1 }); + + // Use the LAST json put: the first (stale) doc referenced the now-dead blob + // and was itself evicted; the corrected re-put is the one that stands. + const docPut = sandbox.puts.filter((p) => p.mime === "application/json").at(-1); + const stashed = JSON.parse(docPut.buf.toString("utf8")); + const imgs = stashed.content.content.filter((n) => n.type === "image"); + let live = 0; + let reverted = 0; + for (const img of imgs) { + const src = img.attrs.src; + if (src.startsWith("https://sb.test/api/sb/")) { + assert.ok(sandbox.has(src), `final doc references evicted blob ${src}`); + live++; + } else { + assert.match(src, /^\/api\/files\/att-\d+\/pic\.png$/); + reverted++; + } + } + assert.equal(live, 1); + assert.equal(reverted, 1); +}); + test("stashPage frees image blobs when the doc put throws (B1)", async () => { // Two distinct images mirror fine; the final JSON doc put throws (doc exceeds // cap). stashPage must reject AND evict every image blob it stored this op.