From 6eb335d5e312e9932c8db350ee920d0f99c3cda5 Mon Sep 17 00:00:00 2001 From: claude_code Date: Sun, 28 Jun 2026 18:02:46 +0300 Subject: [PATCH] =?UTF-8?q?fix(sandbox):=20address=20PR=20#250=20review=20?= =?UTF-8?q?=E2=80=94=20SSRF=20guard,=20eviction=20safety,=20cleanup=20(#24?= =?UTF-8?q?3)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Security: - stash_page: reject path-traversal / percent-encoded srcs before the authed loopback fetch (resolveInternalFilePath), closing an SSRF/exfiltration hole where a crafted node.attrs.src could read an arbitrary internal GET endpoint into the anonymous sandbox. Stability: - stash_page: revert + recount mirrors FIFO-evicted by a later put in the same stash (no dangling sandbox refs, honest images.mirrored/failed); free image blobs if the final document put throws. - Reject/clamp non-positive SANDBOX_TTL_MS to the 1h default (warn once). - Log mirror failures unconditionally (console.warn, no blob bodies). Cleanup / architecture: - Remove dead expiresAt from SandboxPutResult. - Centralize the /api/sb route in SANDBOX_ROUTE_SEGMENT/SANDBOX_API_PATH and move URL composition into SandboxStore.putAndLink; drop the duplicated sink closures and the now-unused EnvironmentService injection from McpService and AiChatToolsService. - Un-export isInternalFileUrl; document the process-local (instance-bound) sandbox limitation in the tool description and .env.example. Docs/tests: - README/README.ru: 38 -> 39 tools + stash_page entry. - Add traversal/normalize/recursion unit tests, stash self-eviction + doc-put-throw + empty/octet-stream mock tests, controller If-None-Match (wildcard/weak/list) + Cache-Control tests, and SANDBOX_TTL_MS validation tests. Regenerate packages/mcp/build. Co-Authored-By: Claude Opus 4.8 --- .env.example | 4 + .../tools/ai-chat-tools.service.spec.ts | 20 +-- .../ai-chat/tools/ai-chat-tools.service.ts | 23 ++- .../ai-chat/tools/docmost-client.loader.ts | 4 + .../environment/environment.service.spec.ts | 26 +++ .../environment/environment.service.ts | 24 ++- .../src/integrations/mcp/mcp-auth.helpers.ts | 5 + .../mcp/mcp-basic-login-gate.spec.ts | 1 - .../src/integrations/mcp/mcp.service.ts | 23 +-- .../integrations/sandbox/sandbox.constants.ts | 6 + .../sandbox/sandbox.controller.spec.ts | 66 +++++++ .../sandbox/sandbox.controller.ts | 3 +- .../src/integrations/sandbox/sandbox.store.ts | 33 +++- apps/server/src/main.ts | 3 +- packages/mcp/README.md | 12 +- packages/mcp/README.ru.md | 13 +- packages/mcp/build/client.js | 83 +++++++-- packages/mcp/build/lib/internal-file-urls.js | 57 +++++- packages/mcp/build/tool-specs.js | 5 +- packages/mcp/src/client.ts | 100 +++++++++-- packages/mcp/src/lib/internal-file-urls.ts | 61 ++++++- packages/mcp/src/tool-specs.ts | 5 +- packages/mcp/test/mock/stash-page.test.mjs | 167 ++++++++++++++++-- .../mcp/test/unit/internal-file-urls.test.mjs | 61 +++++++ 24 files changed, 708 insertions(+), 97 deletions(-) create mode 100644 apps/server/src/integrations/sandbox/sandbox.constants.ts create mode 100644 packages/mcp/test/unit/internal-file-urls.test.mjs diff --git a/.env.example b/.env.example index 2c59018a..e75ccad3 100644 --- a/.env.example +++ b/.env.example @@ -134,6 +134,10 @@ MCP_DOCMOST_PASSWORD= # SANDBOX_PUBLIC_URL is the base used to build those URLs; it MUST be reachable # by the consumer (do NOT use a loopback address if the consumer is remote). # Defaults to APP_URL when unset. +# NOTE: the store is process-local — blobs live only on the instance that +# created them. Behind a multi-replica load balancer WITHOUT sticky sessions a +# consumer may hit a different instance and get a 404 (indistinguishable from an +# expired blob). Single-host deployments are unaffected. # SANDBOX_PUBLIC_URL=https://docs.example.com # SANDBOX_TTL_MS=3600000 # SANDBOX_MAX_BYTES=8388608 diff --git a/apps/server/src/core/ai-chat/tools/ai-chat-tools.service.spec.ts b/apps/server/src/core/ai-chat/tools/ai-chat-tools.service.spec.ts index 0e695905..ea820041 100644 --- a/apps/server/src/core/ai-chat/tools/ai-chat-tools.service.spec.ts +++ b/apps/server/src/core/ai-chat/tools/ai-chat-tools.service.spec.ts @@ -63,9 +63,8 @@ describe('AiChatToolsService deletePage guardrail (H4)', () => { {} as never, {} as never, {} as never, - // environmentService + sandboxStore (only used by the stash tool closure, - // which these tests do not execute). - {} as never, + // sandboxStore (only used by the stash tool closure, which these tests do + // not execute). {} as never, ); }); @@ -179,9 +178,8 @@ describe('AiChatToolsService expanded toolset guardrails', () => { {} as never, {} as never, {} as never, - // environmentService + sandboxStore (only used by the stash tool closure, - // which these tests do not execute). - {} as never, + // sandboxStore (only used by the stash tool closure, which these tests do + // not execute). {} as never, ); }); @@ -298,9 +296,8 @@ describe('AiChatToolsService node-arg JSON-string coercion', () => { {} as never, {} as never, {} as never, - // environmentService + sandboxStore (only used by the stash tool closure, - // which these tests do not execute). - {} as never, + // sandboxStore (only used by the stash tool closure, which these tests do + // not execute). {} as never, ); }); @@ -452,9 +449,8 @@ describe('AiChatToolsService model-friendly input validation (#190)', () => { {} as never, {} as never, {} as never, - // environmentService + sandboxStore (only used by the stash tool closure, - // which these tests do not execute). - {} as never, + // sandboxStore (only used by the stash tool closure, which these tests do + // not execute). {} as never, ); }); diff --git a/apps/server/src/core/ai-chat/tools/ai-chat-tools.service.ts b/apps/server/src/core/ai-chat/tools/ai-chat-tools.service.ts index ae17be4d..7e586d83 100644 --- a/apps/server/src/core/ai-chat/tools/ai-chat-tools.service.ts +++ b/apps/server/src/core/ai-chat/tools/ai-chat-tools.service.ts @@ -16,7 +16,6 @@ import { import { resolveCurrentPageResult } from './current-page.util'; import { parseNodeArg } from './parse-node-arg'; import { modelFriendlyInput } from './model-friendly-input'; -import { EnvironmentService } from '../../../integrations/environment/environment.service'; import { SandboxStore } from '../../../integrations/sandbox/sandbox.store'; /** @@ -43,7 +42,6 @@ export class AiChatToolsService { private readonly pageEmbeddingRepo: PageEmbeddingRepo, private readonly spaceMemberRepo: SpaceMemberRepo, private readonly pagePermissionRepo: PagePermissionRepo, - private readonly environmentService: EnvironmentService, // Shared singleton in-RAM blob store backing the stash tool. private readonly sandboxStore: SandboxStore, ) {} @@ -91,22 +89,23 @@ export class AiChatToolsService { aiChatId, }); - // Bind the stash tool to the shared in-RAM SandboxStore and compose the - // anonymous public URL here (the MCP package never touches env or the - // store). put() returns the read URL + sha256/size; sha256 is also the - // blob's ETag for integrity. - const sandboxPut = (buf: Buffer, mime: string) => { - const stored = this.sandboxStore.put(buf, mime); - const base = this.environmentService.getSandboxPublicUrl(); - return { uri: `${base}/api/sb/${stored.id}`, sha256: stored.sha256, size: stored.size }; - }; + // Bind the stash tool to the shared in-RAM SandboxStore. The store owns the + // anonymous-URL composition (putAndLink) and the live/evict probes the MCP + // package needs to keep its mirror counts honest under FIFO eviction (the + // package never touches env or the store). The sink speaks `uri`s, so the + // probes map a uri back to its id (the last path segment). + const idOf = (uri: string) => uri.substring(uri.lastIndexOf('/') + 1); const { DocmostClient, sharedToolSpecs } = await loadDocmostMcp(); const client: DocmostClientLike = new DocmostClient({ apiUrl, getToken, getCollabToken, - sandbox: { put: sandboxPut }, + sandbox: { + put: (buf, mime) => this.sandboxStore.putAndLink(buf, mime), + has: (uri) => this.sandboxStore.has(idOf(uri)), + evict: (uri) => this.sandboxStore.remove(idOf(uri)), + }, }); // Build an ai-SDK tool from a shared, zod-agnostic spec. The spec owns the diff --git a/apps/server/src/core/ai-chat/tools/docmost-client.loader.ts b/apps/server/src/core/ai-chat/tools/docmost-client.loader.ts index 17ec49c3..deef79f3 100644 --- a/apps/server/src/core/ai-chat/tools/docmost-client.loader.ts +++ b/apps/server/src/core/ai-chat/tools/docmost-client.loader.ts @@ -171,11 +171,15 @@ export type DocmostClientConfig = { getCollabToken?: () => Promise; // Optional blob-sandbox sink for the stash tool. `put` stores a blob in the // host's in-RAM SandboxStore and returns the anonymous read URL + integrity. + // The optional `has`/`evict` probes let stashPage keep its mirror counts + // honest under the store's FIFO eviction (mirror of the package's sink type). sandbox?: { put: ( buf: Buffer, mime: string, ) => { uri: string; sha256: string; size: number }; + has?: (uri: string) => boolean; + evict?: (uri: string) => void; }; }; diff --git a/apps/server/src/integrations/environment/environment.service.spec.ts b/apps/server/src/integrations/environment/environment.service.spec.ts index efef25b0..03970b1b 100644 --- a/apps/server/src/integrations/environment/environment.service.spec.ts +++ b/apps/server/src/integrations/environment/environment.service.spec.ts @@ -14,4 +14,30 @@ describe('EnvironmentService', () => { it('should be defined', () => { expect(service).toBeDefined(); }); + + describe('getSandboxTtlMs', () => { + // ConfigService stub: get(key, def) returns the configured value for the key + // (falling back to def), matching the @nestjs/config contract the service + // calls with (key, default). + const build = (sandboxTtl?: string) => + new EnvironmentService({ + get: (key: string, def?: string) => + key === 'SANDBOX_TTL_MS' ? (sandboxTtl ?? def) : def, + } as any); + + it.each(['0', '-5', 'abc'])( + 'falls back to the 3600000 default for invalid value %s', + (value) => { + expect(build(value).getSandboxTtlMs()).toBe(3_600_000); + }, + ); + + it('returns the parsed value for a valid positive integer', () => { + expect(build('120000').getSandboxTtlMs()).toBe(120_000); + }); + + it('uses the 3600000 default when SANDBOX_TTL_MS is unset', () => { + expect(build(undefined).getSandboxTtlMs()).toBe(3_600_000); + }); + }); }); diff --git a/apps/server/src/integrations/environment/environment.service.ts b/apps/server/src/integrations/environment/environment.service.ts index 51c94ec2..82798727 100644 --- a/apps/server/src/integrations/environment/environment.service.ts +++ b/apps/server/src/integrations/environment/environment.service.ts @@ -1,9 +1,14 @@ -import { Injectable } from '@nestjs/common'; +import { Injectable, Logger } from '@nestjs/common'; import { ConfigService } from '@nestjs/config'; import ms, { StringValue } from 'ms'; @Injectable() export class EnvironmentService { + private readonly logger = new Logger(EnvironmentService.name); + // One-shot guard so an invalid SANDBOX_TTL_MS is warned about once, not on + // every getSandboxTtlMs() call (which runs per blob put). + private sandboxTtlWarned = false; + constructor(private configService: ConfigService) {} getNodeEnv(): string { @@ -348,12 +353,25 @@ export class EnvironmentService { } // Blob time-to-live. Default 1h. The unguessable UUID + this short TTL + TLS - // are the whole capability model (no tokens). + // are the whole capability model (no tokens). A non-positive or non-integer + // value would make every blob expire instantly (silent 404s), so reject it and + // fall back to the 1h default (warned about once to avoid per-put log spam). getSandboxTtlMs(): number { - return parseInt( + const parsed = parseInt( this.configService.get('SANDBOX_TTL_MS', '3600000'), 10, ); + if (!Number.isInteger(parsed) || parsed <= 0) { + if (!this.sandboxTtlWarned) { + this.sandboxTtlWarned = true; + this.logger.warn( + `Invalid SANDBOX_TTL_MS (must be a positive integer); ` + + `falling back to the 3600000 ms default`, + ); + } + return 3_600_000; + } + return parsed; } // Per-blob cap for non-image blobs (the serialized document). Default 8 MiB. diff --git a/apps/server/src/integrations/mcp/mcp-auth.helpers.ts b/apps/server/src/integrations/mcp/mcp-auth.helpers.ts index 24e60206..80e34d4f 100644 --- a/apps/server/src/integrations/mcp/mcp-auth.helpers.ts +++ b/apps/server/src/integrations/mcp/mcp-auth.helpers.ts @@ -143,6 +143,11 @@ export type DocmostMcpConfig = ( buf: Buffer, mime: string, ) => { uri: string; sha256: string; size: number }; + // Optional live/evict probes the package uses to keep stash_page's mirror + // counts honest under the store's FIFO eviction (mirror of the package's + // sink type); older bindings omit them. + has?: (uri: string) => boolean; + evict?: (uri: string) => void; }; }; diff --git a/apps/server/src/integrations/mcp/mcp-basic-login-gate.spec.ts b/apps/server/src/integrations/mcp/mcp-basic-login-gate.spec.ts index 416470da..d55451cb 100644 --- a/apps/server/src/integrations/mcp/mcp-basic-login-gate.spec.ts +++ b/apps/server/src/integrations/mcp/mcp-basic-login-gate.spec.ts @@ -109,7 +109,6 @@ function makeService(opts: { }; const service = new McpService( - undefined as never, // environmentService undefined as never, // workspaceRepo undefined as never, // authService undefined as never, // tokenService diff --git a/apps/server/src/integrations/mcp/mcp.service.ts b/apps/server/src/integrations/mcp/mcp.service.ts index af8719ec..84a8d535 100644 --- a/apps/server/src/integrations/mcp/mcp.service.ts +++ b/apps/server/src/integrations/mcp/mcp.service.ts @@ -8,7 +8,6 @@ import { ModuleRef } from '@nestjs/core'; import { pathToFileURL } from 'node:url'; import { IncomingMessage } from 'node:http'; import { FastifyReply, FastifyRequest } from 'fastify'; -import { EnvironmentService } from '../environment/environment.service'; import { WorkspaceRepo } from '@docmost/db/repos/workspace/workspace.repo'; import { UserRepo } from '@docmost/db/repos/user/user.repo'; import { UserSessionRepo } from '@docmost/db/repos/session/user-session.repo'; @@ -93,7 +92,6 @@ export class McpService implements OnModuleDestroy { private readonly sweepTimer: NodeJS.Timeout; constructor( - private readonly environmentService: EnvironmentService, private readonly workspaceRepo: WorkspaceRepo, private readonly authService: AuthService, private readonly tokenService: TokenService, @@ -118,20 +116,17 @@ export class McpService implements OnModuleDestroy { clearInterval(this.sweepTimer); } - // Bind the stash tool to the shared in-RAM SandboxStore and compose the - // anonymous public URL (the MCP package owns neither env nor the store). - // put() returns the read URL + sha256/size; sha256 is also the blob ETag. + // Bind the stash tool to the shared in-RAM SandboxStore. The store owns the + // anonymous-URL composition (putAndLink) and the live/evict probes the MCP + // package needs to keep its mirror counts honest under FIFO eviction; the + // package owns neither env nor the store. The sink speaks `uri`s, so the + // probes map a uri back to its id (the last path segment). private buildSandboxConfig(): DocmostMcpConfig['sandbox'] { + const idOf = (uri: string) => uri.substring(uri.lastIndexOf('/') + 1); return { - put: (buf: Buffer, mime: string) => { - const stored = this.sandboxStore.put(buf, mime); - const base = this.environmentService.getSandboxPublicUrl(); - return { - uri: `${base}/api/sb/${stored.id}`, - sha256: stored.sha256, - size: stored.size, - }; - }, + put: (buf, mime) => this.sandboxStore.putAndLink(buf, mime), + has: (uri) => this.sandboxStore.has(idOf(uri)), + evict: (uri) => this.sandboxStore.remove(idOf(uri)), }; } diff --git a/apps/server/src/integrations/sandbox/sandbox.constants.ts b/apps/server/src/integrations/sandbox/sandbox.constants.ts new file mode 100644 index 00000000..2a768ee4 --- /dev/null +++ b/apps/server/src/integrations/sandbox/sandbox.constants.ts @@ -0,0 +1,6 @@ +// Single source of truth for the anonymous blob-sandbox route. The controller +// is mounted under the global `/api` prefix, so its decorator uses the bare +// segment while the public URL and the workspace-gate exclusion need the full +// path — derive the latter from the former so the two never drift. +export const SANDBOX_ROUTE_SEGMENT = 'sb'; +export const SANDBOX_API_PATH = `/api/${SANDBOX_ROUTE_SEGMENT}`; diff --git a/apps/server/src/integrations/sandbox/sandbox.controller.spec.ts b/apps/server/src/integrations/sandbox/sandbox.controller.spec.ts index fb18f555..d5930b47 100644 --- a/apps/server/src/integrations/sandbox/sandbox.controller.spec.ts +++ b/apps/server/src/integrations/sandbox/sandbox.controller.spec.ts @@ -121,4 +121,70 @@ describe('SandboxController', () => { expect(res._sent.status).toBe(200); }); + + it('returns 304 for a wildcard "*" If-None-Match', async () => { + const sha = 'e'.repeat(64); + const store = { + get: jest.fn().mockReturnValue(entry(Buffer.from('x'), 'application/json', sha)), + }; + const controller = new SandboxController(store as any); + const res = makeRes(); + + await controller.get(VALID_ID, makeReq({ 'if-none-match': '*' }), res); + + expect(res._sent.status).toBe(304); + }); + + it('returns 304 for a weak validator W/""', async () => { + const sha = 'f'.repeat(64); + const store = { + get: jest.fn().mockReturnValue(entry(Buffer.from('x'), 'application/json', sha)), + }; + const controller = new SandboxController(store as any); + const res = makeRes(); + + await controller.get(VALID_ID, makeReq({ 'if-none-match': `W/"${sha}"` }), res); + + expect(res._sent.status).toBe(304); + }); + + it('returns 304 when a comma-separated If-None-Match list contains the sha', async () => { + const sha = '1'.repeat(64); + const store = { + get: jest.fn().mockReturnValue(entry(Buffer.from('x'), 'application/json', sha)), + }; + const controller = new SandboxController(store as any); + const res = makeRes(); + + await controller.get( + VALID_ID, + makeReq({ 'if-none-match': `"other", "${sha}"` }), + res, + ); + + expect(res._sent.status).toBe(304); + }); + + it('sets a private, immutable Cache-Control with a max-age within the TTL on 200', async () => { + const sha = '2'.repeat(64); + // Known TTL: ~30s out, so the floored max-age must land within [0, 60]. + const e: SandboxEntry = { + buf: Buffer.from('x'), + mime: 'application/json', + sha256: sha, + expiresAt: Date.now() + 30_000, + }; + const store = { get: jest.fn().mockReturnValue(e) }; + const controller = new SandboxController(store as any); + const res = makeRes(); + + await controller.get(VALID_ID, makeReq(), res); + + expect(res._sent.status).toBe(200); + const cc = res._sent.headers['cache-control'] as string; + expect(cc).toMatch(/^private, max-age=\d+, immutable$/); + const maxAge = Number(cc.match(/max-age=(\d+)/)![1]); + expect(maxAge).toBeGreaterThanOrEqual(0); + expect(maxAge).toBeLessThanOrEqual(60); + }); }); diff --git a/apps/server/src/integrations/sandbox/sandbox.controller.ts b/apps/server/src/integrations/sandbox/sandbox.controller.ts index 615e2c60..c01f0cba 100644 --- a/apps/server/src/integrations/sandbox/sandbox.controller.ts +++ b/apps/server/src/integrations/sandbox/sandbox.controller.ts @@ -1,6 +1,7 @@ import { Controller, Get, Param, Req, Res } from '@nestjs/common'; import { FastifyReply, FastifyRequest } from 'fastify'; import { SandboxStore } from './sandbox.store'; +import { SANDBOX_ROUTE_SEGMENT } from './sandbox.constants'; // Strict UUID v-agnostic shape. This is anti-traversal / input hygiene (so `:id` // can never be a path like `../...`), NOT authorization — the capability is the @@ -22,7 +23,7 @@ const UUID_RE = * UUID; `:id` is never used as a filesystem path, so there is no traversal * surface. Never returns tokens, never 401s. */ -@Controller('sb') +@Controller(SANDBOX_ROUTE_SEGMENT) export class SandboxController { constructor(private readonly store: SandboxStore) {} diff --git a/apps/server/src/integrations/sandbox/sandbox.store.ts b/apps/server/src/integrations/sandbox/sandbox.store.ts index a833e8d7..7d604d32 100644 --- a/apps/server/src/integrations/sandbox/sandbox.store.ts +++ b/apps/server/src/integrations/sandbox/sandbox.store.ts @@ -1,6 +1,7 @@ import { Injectable, Logger, OnModuleDestroy } from '@nestjs/common'; import { createHash, randomUUID } from 'node:crypto'; import { EnvironmentService } from '../environment/environment.service'; +import { SANDBOX_API_PATH } from './sandbox.constants'; // In-RAM, process-local blob store. No disk, no DB. Ephemeral by design: a // restart empties it. A blob is addressed by an unguessable randomUUID() which @@ -17,7 +18,6 @@ export interface SandboxPutResult { id: string; sha256: string; size: number; - expiresAt: number; } @Injectable() @@ -86,7 +86,36 @@ export class SandboxStore implements OnModuleDestroy { const expiresAt = Date.now() + this.environmentService.getSandboxTtlMs(); this.map.set(id, { buf, mime, sha256, expiresAt }); this.totalBytes += buf.length; - return { id, sha256, size: buf.length, expiresAt }; + return { id, sha256, size: buf.length }; + } + + /** + * Store a blob and return its anonymous read URL plus integrity metadata. + * Owns the single sandbox-URL composition (`${publicBase}${SANDBOX_API_PATH}/ + * `) so callers never hand-build the route; the raw put() stays public for + * tests/low-level callers. sha256 is also the blob's strong ETag. + */ + putAndLink( + buf: Buffer, + mime: string, + ): { uri: string; sha256: string; size: number } { + const stored = this.put(buf, mime); + const base = this.environmentService.getSandboxPublicUrl(); + return { + uri: `${base}${SANDBOX_API_PATH}/${stored.id}`, + sha256: stored.sha256, + size: stored.size, + }; + } + + /** True if the blob is still live (not evicted/expired). */ + has(id: string): boolean { + return this.get(id) !== undefined; + } + + /** Drop a blob by id (public wrapper over the private FIFO evict). */ + remove(id: string): void { + this.evict(id); } /** Returns the entry, or undefined if missing OR expired (lazy expiry). */ diff --git a/apps/server/src/main.ts b/apps/server/src/main.ts index 5ae036fb..b5fa0027 100644 --- a/apps/server/src/main.ts +++ b/apps/server/src/main.ts @@ -13,6 +13,7 @@ import fastifyCookie from '@fastify/cookie'; import fastifyIp from 'fastify-ip'; import { InternalLogFilter } from './common/logger/internal-log-filter'; import { EnvironmentService } from './integrations/environment/environment.service'; +import { SANDBOX_API_PATH } from './integrations/sandbox/sandbox.constants'; import { resolveFrameHeader } from './common/helpers'; import { resolveTrustProxy } from './integrations/environment/trust-proxy.util'; @@ -129,7 +130,7 @@ async function bootstrap() { // Anonymous in-RAM blob sandbox: a remote consumer fetches blobs by an // unguessable UUID without any workspace host context, so the // workspace-resolution gate must not apply. - '/api/sb', + SANDBOX_API_PATH, ]; if ( diff --git a/packages/mcp/README.md b/packages/mcp/README.md index 2adb9024..57dd8d9d 100644 --- a/packages/mcp/README.md +++ b/packages/mcp/README.md @@ -16,7 +16,7 @@ license. > that interface. Other Docmost MCPs are human-shaped — they expose "open the page" and > "replace the page"; this one exposes the editing primitives a model is good at. -It exposes **38 tools** built around three ideas that the other Docmost MCPs do not +It exposes **39 tools** built around three ideas that the other Docmost MCPs do not combine: 1. **Surgical, token-cheap edits.** Address a single block by id and patch it, or run @@ -106,7 +106,7 @@ There are several Docmost MCPs. Here is a capability-by-capability comparison. ## Tools -All 38 tools, grouped by what you'd reach for them. +All 39 tools, grouped by what you'd reach for them. ### Exploration & retrieval @@ -203,6 +203,14 @@ All 38 tools, grouped by what you'd reach for them. node referencing the old attachment (recursively, including callouts/tables) via the live document, preserving comments, alignment and alt text. (In-place overwrite is deliberately avoided — some Docmost versions corrupt the attachment on overwrite.) +- **`stash_page`** — Serialize a whole page (its full ProseMirror JSON) into an ephemeral + in-RAM blob and return ONLY a short anonymous URL — the body never enters the model + context, so it is the way to hand a large page (and its images) to an external consumer + without truncation. Every internal file/image attachment is mirrored into the same + sandbox and its `src` rewritten to a sandbox URL; external http(s) images are left + untouched. Returns `{ uri, size, sha256, images:{ mirrored, failed } }` (`sha256` is also + the blob's ETag). Blobs are RAM-only, expire after a short TTL (~1h) and are bound to the + server instance that created them. ### Comments diff --git a/packages/mcp/README.ru.md b/packages/mcp/README.ru.md index ebf02f3d..305f7013 100644 --- a/packages/mcp/README.ru.md +++ b/packages/mcp/README.ru.md @@ -17,7 +17,7 @@ > «открыть страницу» и «заменить страницу»; этот даёт примитивы редактирования, в которых > модель сильна. -Сервер предоставляет **38 инструментов**, построенных вокруг трёх идей, которые другие +Сервер предоставляет **39 инструментов**, построенных вокруг трёх идей, которые другие Docmost-MCP не сочетают: 1. **Точечные, экономичные по токенам правки.** Адресуйте отдельный блок по id и патчите @@ -109,7 +109,7 @@ Docmost-MCP не сочетают: ## Инструменты -Все 38 инструментов, сгруппированы по задачам, для которых вы их возьмёте. +Все 39 инструментов, сгруппированы по задачам, для которых вы их возьмёте. ### Чтение и поиск @@ -209,6 +209,15 @@ Docmost-MCP не сочетают: коллауты/таблицы), через живой документ, сохраняя комментарии, выравнивание и alt-текст. (Перезапись «по месту» намеренно не используется — некоторые версии Docmost портят вложение при перезаписи.) +- **`stash_page`** — Сериализовать страницу целиком (её полный ProseMirror JSON) в + эфемерный blob в оперативной памяти и вернуть ТОЛЬКО короткий анонимный URL — тело + никогда не попадает в контекст модели, поэтому это способ передать большую страницу + (вместе с её изображениями) внешнему потребителю без усечения. Каждое внутреннее + файловое/графическое вложение зеркалируется в тот же sandbox, а его `src` переписывается + на URL sandbox; внешние http(s)-изображения остаются нетронутыми. Возвращает + `{ uri, size, sha256, images:{ mirrored, failed } }` (`sha256` — это также ETag blob'а). + Blob'ы хранятся только в оперативной памяти, истекают через короткий TTL (~1 ч) и + привязаны к тому экземпляру сервера, который их создал. ### Комментарии diff --git a/packages/mcp/build/client.js b/packages/mcp/build/client.js index 74e42c87..80da5454 100644 --- a/packages/mcp/build/client.js +++ b/packages/mcp/build/client.js @@ -7,7 +7,7 @@ import { TiptapTransformer } from "@hocuspocus/transformer"; import * as Y from "yjs"; import WebSocket from "ws"; import { convertProseMirrorToMarkdown } from "./lib/markdown-converter.js"; -import { collectInternalFileNodes, normalizeFileUrl, } from "./lib/internal-file-urls.js"; +import { collectInternalFileNodes, normalizeFileUrl, resolveInternalFilePath, } from "./lib/internal-file-urls.js"; import { updatePageContentRealtime, replacePageContent, markdownToProseMirror, markdownToProseMirrorCanonical, mutatePageContent, buildCollabWsUrl, assertYjsEncodable, applyDocToFragment, } from "./lib/collaboration.js"; import { footnoteWarningsField } from "./lib/footnote-analyze.js"; import { buildPageTree } from "./lib/tree.js"; @@ -54,6 +54,11 @@ export class DocmostClient { getCollabTokenFn = null; // Optional blob-sandbox sink for the stash tool. Null when not configured. sandboxPut = null; + // Optional probes paired with the sink. `has` lets stashPage detect a blob + // FIFO-evicted by a LATER put in the same stash; `evict` lets it free this + // op's image blobs if the final doc put throws. Null when the sink omits them. + sandboxHas = null; + sandboxEvict = null; // In-flight login dedup: when the token expires, the 401 interceptor, // ensureAuthenticated, getCollabTokenWithReauth and the two multipart retries // can all call login() at once. Memoizing a single promise collapses that @@ -82,6 +87,8 @@ export class DocmostClient { } if (config.sandbox) { this.sandboxPut = config.sandbox.put; + this.sandboxHas = config.sandbox.has ?? null; + this.sandboxEvict = config.sandbox.evict ?? null; } this.client = axios.create({ baseURL: this.apiUrl, @@ -619,12 +626,16 @@ export class DocmostClient { * and the response Content-Type (mime), defaulting to octet-stream. * * The fetch is size-bounded (hard 64 MiB ceiling) purely to protect memory; - * the authoritative per-blob cap is enforced by the sandbox `put`. The leading - * `/api` strip means this never escapes the Docmost API base. + * the authoritative per-blob cap is enforced by the sandbox `put`. The path is + * resolved via resolveInternalFilePath, which REJECTS (throws) any traversal + * or percent-encoded src that would let an attacker-controlled `attrs.src` + * escape `/api/files/` and reach another internal endpoint (SSRF). That throw + * happens before this.client.get, so a malicious src is counted as a failed + * mirror — it never reaches the network. */ async fetchInternalFile(src) { const HARD_CEILING = 64 * 1024 * 1024; // 64 MiB memory guard - const relPath = src.replace(/^\/api/, ""); + const relPath = resolveInternalFilePath(src); const response = await this.client.get(relPath, { responseType: "arraybuffer", timeout: 30000, @@ -668,42 +679,82 @@ export class DocmostClient { const cloned = structuredClone(pageJson); // Group internal-file nodes by normalized src so each unique resource is // fetched + stored ONCE (dedup), and every node sharing that src points at - // the one sandbox blob. + // the one sandbox blob. Capture each node's ORIGINAL raw src per-node: + // dedup groups nodes whose normalized src is equal even when their raw srcs + // differ (e.g. `/api/files/...` vs the bare `/files/...`), so on a revert we + // must restore each node's own original value, not the group key. const bySrc = new Map(); for (const node of collectInternalFileNodes(cloned.content)) { - const src = normalizeFileUrl(String(node.attrs.src)); + const origSrc = String(node.attrs.src); + const src = normalizeFileUrl(origSrc); + const entry = { node, origSrc }; const group = bySrc.get(src); if (group) - group.push(node); + group.push(entry); else - bySrc.set(src, [node]); + bySrc.set(src, [entry]); } let mirrored = 0; let failed = 0; + // Record every successful mirror so it can be (a) reverted if its blob gets + // FIFO-evicted by a LATER put in this same stash, and (b) freed if the final + // doc put throws. + const mirrors = []; const MAX_CONCURRENCY = 5; const groups = [...bySrc.entries()]; for (let i = 0; i < groups.length; i += MAX_CONCURRENCY) { const batch = groups.slice(i, i + MAX_CONCURRENCY); - await Promise.all(batch.map(async ([src, nodes]) => { + await Promise.all(batch.map(async ([src, entries]) => { try { const { buffer, mime } = await this.fetchInternalFile(src); // put may throw if the blob exceeds the per-blob/total caps. const stored = this.sandboxPut(buffer, mime); - for (const node of nodes) - node.attrs.src = stored.uri; + for (const entry of entries) + entry.node.attrs.src = stored.uri; + mirrors.push({ uri: stored.uri, entries }); mirrored++; } catch (err) { - // One bad/oversized image must not abort the document. + // One bad/oversized image (or a rejected traversal src) must not + // abort the document. Logged unconditionally (never the blob body), + // matching the package's ungated console.warn convention. failed++; - if (process.env.DEBUG) { - console.error(`stash_page: failed to mirror "${src}":`, err); - } + console.warn(`stash_page: failed to mirror "${src}": ${err instanceof Error ? err.message : String(err)}`); } })); } + // Reconcile against FIFO eviction: a heavy page can have a later image-put + // evict an EARLIER image stored in this SAME stash. The stored doc must not + // reference an evicted blob (consumer 404) and the counts must not lie, so + // for any mirror whose blob is gone, revert its nodes to their original + // internal srcs and re-count it as failed. + if (this.sandboxHas) { + for (const mirror of mirrors) { + if (!this.sandboxHas(mirror.uri)) { + for (const entry of mirror.entries) { + entry.node.attrs.src = entry.origSrc; + } + mirrored--; + failed++; + console.warn(`stash_page: mirrored blob ${mirror.uri} was evicted before ` + + `the doc was stored; reverted its src and counted it as failed`); + } + } + } const docBuf = Buffer.from(JSON.stringify(cloned), "utf8"); - const stored = this.sandboxPut(docBuf, "application/json"); + let stored; + try { + stored = this.sandboxPut(docBuf, "application/json"); + } + catch (err) { + // The doc put failed (e.g. doc exceeds the cap). Free this op's image + // blobs instead of leaking them in RAM for the whole TTL, then re-throw. + if (this.sandboxEvict) { + for (const mirror of mirrors) + this.sandboxEvict(mirror.uri); + } + throw err; + } return { uri: stored.uri, sha256: stored.sha256, diff --git a/packages/mcp/build/lib/internal-file-urls.js b/packages/mcp/build/lib/internal-file-urls.js index 4eab311d..3fc89bbe 100644 --- a/packages/mcp/build/lib/internal-file-urls.js +++ b/packages/mcp/build/lib/internal-file-urls.js @@ -11,7 +11,7 @@ // // Mirrors editor-ext's isInternalFileUrl / normalizeFileUrl (kept as a local // dup so the ESM mcp package does not depend on the editor-ext build). -export function isInternalFileUrl(url) { +function isInternalFileUrl(url) { if (typeof url !== "string") return false; const normalized = url.trim(); @@ -24,6 +24,61 @@ export function normalizeFileUrl(src) { return "/api" + trimmed; return trimmed; } +/** + * Resolve a page-content `src` into the safe, `/api`-relative path the stash + * tool may fetch over the authenticated loopback client — or THROW. + * + * SECURITY (SSRF / path-traversal): `src` comes from page content and is fully + * attacker-controllable. The mirroring fetch runs through the AUTHENTICATED + * loopback axios client whose baseURL ends in `/api`, so a naive + * `src.replace(/^\/api/, "")` lets a crafted value like + * `/api/files/../auth/whoami` collapse (via axios/WHATWG URL `..` resolution) + * into an ARBITRARY internal GET endpoint, whose authed response would then be + * stored in the anonymous sandbox (SSRF + data exfiltration). A prefix-only + * `startsWith("/api/files/")` check does NOT defend against this because the + * `..` segments are still present in the raw string and resolved later. + * + * This function defeats that by resolving the canonical pathname FIRST and only + * then asserting it still lives under `/api/files/`: + * - it rejects any percent-encoded dot/slash (`%2e` / `%2f`): the WHATWG URL + * parser collapses LITERAL `../` but does NOT decode `%2f` separators, so a + * content-controlled src must never be allowed to smuggle those past the + * canonicalization; + * - it resolves `new URL(trimmed, "http://internal.invalid").pathname`, which + * normalizes `..`/`.` segments (e.g. `/api/files/../auth/whoami` → + * `/api/auth/whoami`); + * - it then requires the canonical pathname to start with `/api/files/`, so a + * traversal that escaped that subtree is rejected. + * + * Returns the path RELATIVE to the `/api` base (e.g. `/files//`), + * ready to hand to the loopback client. The throw happens BEFORE any network + * call, so a rejected src is counted as a failed mirror and its original src is + * kept (the per-image try/catch in stashPage never aborts the whole document). + */ +export function resolveInternalFilePath(src) { + const trimmed = src.trim(); + // Percent-encoded dot/slash must never reach the URL canonicalizer: the + // WHATWG parser does NOT decode `%2f` into a path separator, so an encoded + // `..%2fauth` would survive canonicalization and still escape /api/files/. + if (/%2e|%2f/i.test(trimmed)) { + throw new Error(`Refusing internal file src with percent-encoded path segment: "${src}"`); + } + let pathname; + try { + // The base host is irrelevant (never contacted); it only lets the parser + // resolve a relative `src` and normalize `..`/`.` segments. + pathname = new URL(trimmed, "http://internal.invalid").pathname; + } + catch { + throw new Error(`Invalid internal file src: "${src}"`); + } + if (!pathname.startsWith("/api/files/")) { + throw new Error(`Refusing internal file src that escapes /api/files/: "${src}"`); + } + // Strip the `/api` base prefix; the loopback client's baseURL already ends + // in `/api`, so it expects the path relative to that (e.g. /files//). + return pathname.replace(/^\/api/, ""); +} /** * Recursively collect every node whose `attrs.src` is an internal file URL. * Returns references to the live nodes (so the caller can rewrite `attrs.src` diff --git a/packages/mcp/build/tool-specs.js b/packages/mcp/build/tool-specs.js index 7b9e2f19..2d3c8ab6 100644 --- a/packages/mcp/build/tool-specs.js +++ b/packages/mcp/build/tool-specs.js @@ -224,7 +224,10 @@ export const SHARED_TOOL_SPECS = { 'failed} }. Integrity: the blob is served with ETag = its sha256, so a ' + 'truncated/corrupted fetch is detectable. Blobs are RAM-only: they expire ' + 'after a short TTL (~1h) and are cleared on restart — consume the URL ' + - 'within the TTL and one uptime, or re-stash.', + 'within the TTL and one uptime, or re-stash. A blob is bound to the ' + + 'server instance that created it: in a multi-replica deployment without ' + + 'sticky sessions a blob stored on one instance is not retrievable via the ' + + 'sandbox URL on another (it 404s like an expired one).', buildShape: (z) => ({ pageId: z.string().min(1), }), diff --git a/packages/mcp/src/client.ts b/packages/mcp/src/client.ts index 1dfd58e4..f2afb716 100644 --- a/packages/mcp/src/client.ts +++ b/packages/mcp/src/client.ts @@ -16,6 +16,7 @@ import { convertProseMirrorToMarkdown } from "./lib/markdown-converter.js"; import { collectInternalFileNodes, normalizeFileUrl, + resolveInternalFilePath, } from "./lib/internal-file-urls.js"; import { updatePageContentRealtime, @@ -122,8 +123,14 @@ export type DocmostMcpConfig = { apiUrl: string } & ( // both branches; see the type doc above. getCollabToken?: () => Promise; // Optional blob sandbox sink. Present only where the stash tool is wired; - // when absent, stash_page throws a clear "not configured" error. - sandbox?: { put: SandboxPut }; + // when absent, stash_page throws a clear "not configured" error. The + // optional `has`/`evict` probes let stashPage keep its mirror counts honest + // under the store's FIFO eviction (see stashPage); older sinks omit them. + sandbox?: { + put: SandboxPut; + has?: (uri: string) => boolean; + evict?: (uri: string) => void; + }; }; export class DocmostClient { @@ -143,6 +150,11 @@ export class DocmostClient { private getCollabTokenFn: (() => Promise) | null = null; // Optional blob-sandbox sink for the stash tool. Null when not configured. private sandboxPut: SandboxPut | null = null; + // Optional probes paired with the sink. `has` lets stashPage detect a blob + // FIFO-evicted by a LATER put in the same stash; `evict` lets it free this + // op's image blobs if the final doc put throws. Null when the sink omits them. + private sandboxHas: ((uri: string) => boolean) | null = null; + private sandboxEvict: ((uri: string) => void) | null = null; // In-flight login dedup: when the token expires, the 401 interceptor, // ensureAuthenticated, getCollabTokenWithReauth and the two multipart retries // can all call login() at once. Memoizing a single promise collapses that @@ -184,6 +196,8 @@ export class DocmostClient { } if (config.sandbox) { this.sandboxPut = config.sandbox.put; + this.sandboxHas = config.sandbox.has ?? null; + this.sandboxEvict = config.sandbox.evict ?? null; } this.client = axios.create({ baseURL: this.apiUrl, @@ -795,14 +809,18 @@ export class DocmostClient { * and the response Content-Type (mime), defaulting to octet-stream. * * The fetch is size-bounded (hard 64 MiB ceiling) purely to protect memory; - * the authoritative per-blob cap is enforced by the sandbox `put`. The leading - * `/api` strip means this never escapes the Docmost API base. + * the authoritative per-blob cap is enforced by the sandbox `put`. The path is + * resolved via resolveInternalFilePath, which REJECTS (throws) any traversal + * or percent-encoded src that would let an attacker-controlled `attrs.src` + * escape `/api/files/` and reach another internal endpoint (SSRF). That throw + * happens before this.client.get, so a malicious src is counted as a failed + * mirror — it never reaches the network. */ private async fetchInternalFile( src: string, ): Promise<{ buffer: Buffer; mime: string }> { const HARD_CEILING = 64 * 1024 * 1024; // 64 MiB memory guard - const relPath = src.replace(/^\/api/, ""); + const relPath = resolveInternalFilePath(src); const response = await this.client.get(relPath, { responseType: "arraybuffer", timeout: 30000, @@ -857,42 +875,90 @@ export class DocmostClient { // Group internal-file nodes by normalized src so each unique resource is // fetched + stored ONCE (dedup), and every node sharing that src points at - // the one sandbox blob. - const bySrc = new Map(); + // the one sandbox blob. Capture each node's ORIGINAL raw src per-node: + // dedup groups nodes whose normalized src is equal even when their raw srcs + // differ (e.g. `/api/files/...` vs the bare `/files/...`), so on a revert we + // must restore each node's own original value, not the group key. + const bySrc = new Map>(); for (const node of collectInternalFileNodes(cloned.content)) { - const src = normalizeFileUrl(String(node.attrs.src)); + const origSrc = String(node.attrs.src); + const src = normalizeFileUrl(origSrc); + const entry = { node, origSrc }; const group = bySrc.get(src); - if (group) group.push(node); - else bySrc.set(src, [node]); + if (group) group.push(entry); + else bySrc.set(src, [entry]); } let mirrored = 0; let failed = 0; + // Record every successful mirror so it can be (a) reverted if its blob gets + // FIFO-evicted by a LATER put in this same stash, and (b) freed if the final + // doc put throws. + const mirrors: Array<{ + uri: string; + entries: Array<{ node: any; origSrc: string }>; + }> = []; const MAX_CONCURRENCY = 5; const groups = [...bySrc.entries()]; for (let i = 0; i < groups.length; i += MAX_CONCURRENCY) { const batch = groups.slice(i, i + MAX_CONCURRENCY); await Promise.all( - batch.map(async ([src, nodes]) => { + batch.map(async ([src, entries]) => { try { const { buffer, mime } = await this.fetchInternalFile(src); // put may throw if the blob exceeds the per-blob/total caps. const stored = this.sandboxPut!(buffer, mime); - for (const node of nodes) node.attrs.src = stored.uri; + for (const entry of entries) entry.node.attrs.src = stored.uri; + mirrors.push({ uri: stored.uri, entries }); mirrored++; } catch (err) { - // One bad/oversized image must not abort the document. + // One bad/oversized image (or a rejected traversal src) must not + // abort the document. Logged unconditionally (never the blob body), + // matching the package's ungated console.warn convention. failed++; - if (process.env.DEBUG) { - console.error(`stash_page: failed to mirror "${src}":`, err); - } + console.warn( + `stash_page: failed to mirror "${src}": ${ + err instanceof Error ? err.message : String(err) + }`, + ); } }), ); } + // Reconcile against FIFO eviction: a heavy page can have a later image-put + // evict an EARLIER image stored in this SAME stash. The stored doc must not + // reference an evicted blob (consumer 404) and the counts must not lie, so + // for any mirror whose blob is gone, revert its nodes to their original + // internal srcs and re-count it as failed. + if (this.sandboxHas) { + for (const mirror of mirrors) { + if (!this.sandboxHas(mirror.uri)) { + for (const entry of mirror.entries) { + entry.node.attrs.src = entry.origSrc; + } + mirrored--; + failed++; + console.warn( + `stash_page: mirrored blob ${mirror.uri} was evicted before ` + + `the doc was stored; reverted its src and counted it as failed`, + ); + } + } + } + const docBuf = Buffer.from(JSON.stringify(cloned), "utf8"); - const stored = this.sandboxPut(docBuf, "application/json"); + let stored: { uri: string; sha256: string; size: number }; + try { + stored = this.sandboxPut(docBuf, "application/json"); + } catch (err) { + // The doc put failed (e.g. doc exceeds the cap). Free this op's image + // blobs instead of leaking them in RAM for the whole TTL, then re-throw. + if (this.sandboxEvict) { + for (const mirror of mirrors) this.sandboxEvict(mirror.uri); + } + throw err; + } return { uri: stored.uri, sha256: stored.sha256, diff --git a/packages/mcp/src/lib/internal-file-urls.ts b/packages/mcp/src/lib/internal-file-urls.ts index e201e1e0..42d4b65f 100644 --- a/packages/mcp/src/lib/internal-file-urls.ts +++ b/packages/mcp/src/lib/internal-file-urls.ts @@ -12,7 +12,7 @@ // Mirrors editor-ext's isInternalFileUrl / normalizeFileUrl (kept as a local // dup so the ESM mcp package does not depend on the editor-ext build). -export function isInternalFileUrl(url: unknown): boolean { +function isInternalFileUrl(url: unknown): boolean { if (typeof url !== "string") return false; const normalized = url.trim(); return ( @@ -27,6 +27,65 @@ export function normalizeFileUrl(src: string): string { return trimmed; } +/** + * Resolve a page-content `src` into the safe, `/api`-relative path the stash + * tool may fetch over the authenticated loopback client — or THROW. + * + * SECURITY (SSRF / path-traversal): `src` comes from page content and is fully + * attacker-controllable. The mirroring fetch runs through the AUTHENTICATED + * loopback axios client whose baseURL ends in `/api`, so a naive + * `src.replace(/^\/api/, "")` lets a crafted value like + * `/api/files/../auth/whoami` collapse (via axios/WHATWG URL `..` resolution) + * into an ARBITRARY internal GET endpoint, whose authed response would then be + * stored in the anonymous sandbox (SSRF + data exfiltration). A prefix-only + * `startsWith("/api/files/")` check does NOT defend against this because the + * `..` segments are still present in the raw string and resolved later. + * + * This function defeats that by resolving the canonical pathname FIRST and only + * then asserting it still lives under `/api/files/`: + * - it rejects any percent-encoded dot/slash (`%2e` / `%2f`): the WHATWG URL + * parser collapses LITERAL `../` but does NOT decode `%2f` separators, so a + * content-controlled src must never be allowed to smuggle those past the + * canonicalization; + * - it resolves `new URL(trimmed, "http://internal.invalid").pathname`, which + * normalizes `..`/`.` segments (e.g. `/api/files/../auth/whoami` → + * `/api/auth/whoami`); + * - it then requires the canonical pathname to start with `/api/files/`, so a + * traversal that escaped that subtree is rejected. + * + * Returns the path RELATIVE to the `/api` base (e.g. `/files//`), + * ready to hand to the loopback client. The throw happens BEFORE any network + * call, so a rejected src is counted as a failed mirror and its original src is + * kept (the per-image try/catch in stashPage never aborts the whole document). + */ +export function resolveInternalFilePath(src: string): string { + const trimmed = src.trim(); + // Percent-encoded dot/slash must never reach the URL canonicalizer: the + // WHATWG parser does NOT decode `%2f` into a path separator, so an encoded + // `..%2fauth` would survive canonicalization and still escape /api/files/. + if (/%2e|%2f/i.test(trimmed)) { + throw new Error( + `Refusing internal file src with percent-encoded path segment: "${src}"`, + ); + } + let pathname: string; + try { + // The base host is irrelevant (never contacted); it only lets the parser + // resolve a relative `src` and normalize `..`/`.` segments. + pathname = new URL(trimmed, "http://internal.invalid").pathname; + } catch { + throw new Error(`Invalid internal file src: "${src}"`); + } + if (!pathname.startsWith("/api/files/")) { + throw new Error( + `Refusing internal file src that escapes /api/files/: "${src}"`, + ); + } + // Strip the `/api` base prefix; the loopback client's baseURL already ends + // in `/api`, so it expects the path relative to that (e.g. /files//). + return pathname.replace(/^\/api/, ""); +} + /** * Recursively collect every node whose `attrs.src` is an internal file URL. * Returns references to the live nodes (so the caller can rewrite `attrs.src` diff --git a/packages/mcp/src/tool-specs.ts b/packages/mcp/src/tool-specs.ts index 7a19cd66..4074d099 100644 --- a/packages/mcp/src/tool-specs.ts +++ b/packages/mcp/src/tool-specs.ts @@ -283,7 +283,10 @@ export const SHARED_TOOL_SPECS = { 'failed} }. Integrity: the blob is served with ETag = its sha256, so a ' + 'truncated/corrupted fetch is detectable. Blobs are RAM-only: they expire ' + 'after a short TTL (~1h) and are cleared on restart — consume the URL ' + - 'within the TTL and one uptime, or re-stash.', + 'within the TTL and one uptime, or re-stash. A blob is bound to the ' + + 'server instance that created it: in a multi-replica deployment without ' + + 'sticky sessions a blob stored on one instance is not retrievable via the ' + + 'sandbox URL on another (it 404s like an expired one).', buildShape: (z) => ({ pageId: z.string().min(1), }), diff --git a/packages/mcp/test/mock/stash-page.test.mjs b/packages/mcp/test/mock/stash-page.test.mjs index 126caab6..647ea9d7 100644 --- a/packages/mcp/test/mock/stash-page.test.mjs +++ b/packages/mcp/test/mock/stash-page.test.mjs @@ -41,17 +41,53 @@ after(async () => { // In-memory sandbox sink mirroring the host binding: store the blob, return a // uri + sha256 + size. Records every put so the test can inspect what was -// stashed (and verify the doc body never leaves via the return value). -function makeSandbox() { +// stashed (and verify the doc body never leaves via the return value). Models +// the real store's FIFO eviction + cap + the has/evict probes so B1 (self- +// eviction reconciliation and doc-put-throw cleanup) is testable. Default +// maxTotal is effectively unlimited so the happy-path tests behave as before. +// +// `throwOnJson` forces the final document put to throw, standing in for "doc +// exceeds the cap". +function makeSandbox({ maxTotal = Infinity, throwOnJson = false } = {}) { const puts = []; + const evicted = []; + // id -> size, in insertion order (Map preserves it) so the oldest is first. + const live = new Map(); + let total = 0; + const idOf = (uri) => uri.substring(uri.lastIndexOf("/") + 1); return { puts, + evicted, put(buf, mime) { + if (throwOnJson && mime === "application/json") { + throw new Error("doc blob exceeds the sandbox cap"); + } const sha256 = createHash("sha256").update(buf).digest("hex"); const id = `id-${puts.length}`; - puts.push({ buf, mime, sha256 }); + puts.push({ buf, mime, sha256, id }); + live.set(id, buf.length); + total += buf.length; + // FIFO-evict the oldest live blobs until this put fits under the cap. + while (total > maxTotal && live.size > 0) { + const oldest = live.keys().next().value; + if (oldest === id) break; // never evict the blob we just stored + total -= live.get(oldest); + live.delete(oldest); + evicted.push(oldest); + } return { uri: `https://sb.test/api/sb/${id}`, sha256, size: buf.length }; }, + has(uri) { + return live.has(idOf(uri)); + }, + evict(uri) { + const id = idOf(uri); + if (live.has(id)) { + total -= live.get(id); + live.delete(id); + } + evicted.push(id); + }, }; } @@ -80,8 +116,18 @@ function pageDoc() { } // Build a client wired to a server that logs in, serves the page, and serves the -// internal file bytes. `fileStatus` lets a test force the file fetch to fail. -async function buildClient(sandbox, { fileStatus = 200 } = {}) { +// internal file bytes. `fileStatus` lets a test force the file fetch to fail; +// `doc` overrides the served page; `fileBytes`/`fileHeaders` shape the file +// response (used by the empty-body / missing-Content-Type branch tests). +async function buildClient( + sandbox, + { + fileStatus = 200, + doc = pageDoc(), + fileBytes = IMAGE_BYTES, + fileHeaders = { "Content-Type": "image/png" }, + } = {}, +) { const baseURL = await spawn(async (req, res) => { await readBody(req); if (req.url === "/api/auth/login") { @@ -94,17 +140,17 @@ async function buildClient(sandbox, { fileStatus = 200 } = {}) { } if (req.url === "/api/pages/info") { res.writeHead(200, { "Content-Type": "application/json" }); - res.end(JSON.stringify({ data: { id: "page-1", title: "T", content: pageDoc() } })); + res.end(JSON.stringify({ data: { id: "page-1", title: "T", content: doc } })); return; } - if (req.url.startsWith("/api/files/att-1/")) { + if (req.url.startsWith("/api/files/")) { if (fileStatus !== 200) { res.writeHead(fileStatus); res.end(); return; } - res.writeHead(200, { "Content-Type": "image/png" }); - res.end(IMAGE_BYTES); + res.writeHead(200, fileHeaders); + res.end(fileBytes); return; } res.writeHead(404); @@ -114,10 +160,26 @@ async function buildClient(sandbox, { fileStatus = 200 } = {}) { apiUrl: baseURL, email: "u@example.com", password: "pw", - sandbox: { put: (buf, mime) => sandbox.put(buf, mime) }, + sandbox: { + put: (buf, mime) => sandbox.put(buf, mime), + has: (uri) => sandbox.has(uri), + evict: (uri) => sandbox.evict(uri), + }, }); } +// A page with several DISTINCT internal images (each a unique attachment id) so +// each is its own sandbox blob — needed to exercise FIFO self-eviction. +function multiImageDoc(n) { + return { + type: "doc", + content: Array.from({ length: n }, (_, i) => ({ + type: "image", + attrs: { src: `/api/files/att-${i}/pic.png`, attachmentId: `att-${i}` }, + })), + }; +} + test("stashPage stores the doc + mirrors/rewrites internal images, returns only a link", async () => { const sandbox = makeSandbox(); const client = await buildClient(sandbox); @@ -181,3 +243,88 @@ test("stashPage throws a clear error when no sandbox is configured", async () => }); await assert.rejects(() => client.stashPage("page-1"), /not configured/); }); + +test("stashPage reverts a FIFO-evicted image and counts it as failed (B1)", async () => { + // 3 distinct images of S=4000 bytes each; doc JSON is far smaller than one + // image. With a cap of 4500: storing img1 evicts img0, storing img2 evicts + // img1 — so only img2 survives the loop (img0 + img1 reverted). The doc + // (4000 + a few hundred bytes <= 4500) then fits alongside the survivor, so it + // does NOT trigger further eviction. The stored doc must therefore reference + // exactly one live blob and revert the other two to their internal srcs. + const BIG = Buffer.alloc(4000, 0x41); + const sandbox = makeSandbox({ maxTotal: 4500 }); + const client = await buildClient(sandbox, { + doc: multiImageDoc(3), + fileBytes: BIG, + }); + + const result = await client.stashPage("page-1"); + + // Two images were evicted before the doc was stored -> counted as failed. + assert.deepEqual(result.images, { mirrored: 1, failed: 2 }); + + // Inspect the stashed doc: no node may point at an evicted (now-dead) blob, + // and every reverted node carries its ORIGINAL internal src again. + const docPut = sandbox.puts.find((p) => p.mime === "application/json"); + const stashed = JSON.parse(docPut.buf.toString("utf8")); + const imgs = stashed.content.content.filter((n) => n.type === "image"); + let live = 0; + let reverted = 0; + for (const img of imgs) { + const src = img.attrs.src; + if (src.startsWith("https://sb.test/api/sb/")) { + assert.ok(sandbox.has(src), `doc references evicted blob ${src}`); + live++; + } else { + // Reverted to the original internal src. + assert.match(src, /^\/api\/files\/att-\d+\/pic\.png$/); + reverted++; + } + } + assert.equal(live, 1); + assert.equal(reverted, 2); +}); + +test("stashPage frees image blobs when the doc put throws (B1)", async () => { + // Two distinct images mirror fine; the final JSON doc put throws (doc exceeds + // cap). stashPage must reject AND evict every image blob it stored this op. + const sandbox = makeSandbox({ throwOnJson: true }); + const client = await buildClient(sandbox, { doc: multiImageDoc(2) }); + + await assert.rejects(() => client.stashPage("page-1")); + + // Both image blobs were stored, then evicted on the doc-put failure. + const imagePuts = sandbox.puts.filter((p) => p.mime === "image/png"); + assert.equal(imagePuts.length, 2); + for (const p of imagePuts) { + assert.ok(sandbox.evicted.includes(p.id), `image ${p.id} was not freed`); + } +}); + +test("stashPage counts an empty file response as failed (B1/fetchInternalFile)", async () => { + const sandbox = makeSandbox(); + const client = await buildClient(sandbox, { + fileBytes: Buffer.alloc(0), + fileHeaders: { "Content-Type": "image/png", "Content-Length": "0" }, + }); + + const result = await client.stashPage("page-1"); + + // The single internal image (deduped) yielded an empty body -> failed. + assert.deepEqual(result.images, { mirrored: 0, failed: 1 }); + // Only the doc blob was stored. + assert.equal(sandbox.puts.filter((p) => p.mime === "image/png").length, 0); +}); + +test("stashPage mirrors a file with no Content-Type as octet-stream (fetchInternalFile)", async () => { + const sandbox = makeSandbox(); + // No Content-Type header at all -> fetchInternalFile defaults to octet-stream. + const client = await buildClient(sandbox, { fileHeaders: {} }); + + const result = await client.stashPage("page-1"); + + assert.equal(result.images.mirrored, 1); + const imagePut = sandbox.puts.find((p) => p.mime !== "application/json"); + assert.ok(imagePut, "expected an image put"); + assert.equal(imagePut.mime, "application/octet-stream"); +}); diff --git a/packages/mcp/test/unit/internal-file-urls.test.mjs b/packages/mcp/test/unit/internal-file-urls.test.mjs new file mode 100644 index 00000000..0f3d8fd0 --- /dev/null +++ b/packages/mcp/test/unit/internal-file-urls.test.mjs @@ -0,0 +1,61 @@ +// Unit tests for the internal-file URL helpers the stash tool relies on. The +// critical case is resolveInternalFilePath, whose whole job is to REJECT a +// content-controlled `src` that tries to escape /api/files/ (SSRF / traversal) +// before it ever reaches the authenticated loopback client. +import { test } from "node:test"; +import assert from "node:assert/strict"; +import { + resolveInternalFilePath, + normalizeFileUrl, + collectInternalFileNodes, +} from "../../build/lib/internal-file-urls.js"; + +test("resolveInternalFilePath accepts a normal internal src", () => { + assert.equal( + resolveInternalFilePath("/api/files/att-1/pic.png"), + "/files/att-1/pic.png", + ); +}); + +test("resolveInternalFilePath rejects traversal / encoded variants (SSRF guard)", () => { + // `..` collapses to /api/auth/whoami -> outside /api/files/ -> rejected. + assert.throws(() => resolveInternalFilePath("/api/files/../auth/whoami")); + // Escapes the /api base entirely. + assert.throws(() => resolveInternalFilePath("/api/files/../../internal")); + // Percent-encoded dot -> rejected before canonicalization. + assert.throws(() => resolveInternalFilePath("/api/files/%2e%2e/x")); + // Percent-encoded slash separator -> rejected before canonicalization. + assert.throws(() => resolveInternalFilePath("/api/files/..%2fauth")); +}); + +test("normalizeFileUrl rewrites the bare /files/ branch and leaves /api/files/ alone", () => { + assert.equal( + normalizeFileUrl("/files/att-1/pic.png"), + "/api/files/att-1/pic.png", + ); + assert.equal( + normalizeFileUrl("/api/files/att-1/pic.png"), + "/api/files/att-1/pic.png", + ); +}); + +test("collectInternalFileNodes recurses into nested content containers", () => { + // The internal image is buried inside a callout's content array, so a + // regression on the recursion (e.g. a shallow .filter()) would miss it. + const nested = { + type: "image", + attrs: { src: "/api/files/att-9/deep.png", attachmentId: "att-9" }, + }; + const doc = { + type: "doc", + content: [ + { + type: "callout", + content: [{ type: "paragraph", content: [nested] }], + }, + ], + }; + const found = collectInternalFileNodes(doc); + assert.equal(found.length, 1); + assert.equal(found[0], nested); +});