From e431b33bb17f07bb1b7e0e848832ce8ac4f411c0 Mon Sep 17 00:00:00 2001 From: claude code agent 227 Date: Sat, 4 Jul 2026 19:33:26 +0300 Subject: [PATCH 1/2] feat(ai-chat): deferred tool loading (tiers + loadTools meta-tool) (#332) The in-app AI agent shipped all ~41 tool schemas on every model step. This adds a two-tier catalog: core tools (frequent or one-line) stay always-active; the rest are advertised as a compact catalog and their full schema is fetched on demand via the loadTools meta-tool, wired through ai@6 prepareStep's per-step activeTools. - tools/tool-tiers.ts: CORE_TOOL_KEYS, INLINE_TOOL_TIERS, applyLoadTools, catalog builders (+ tool-tiers.spec.ts, 13 cases). - ai-chat.service.ts prepareAgentStep: returns activeTools = [...CORE_TOOL_KEYS, loadTools, ...activatedTools]; per-turn activated Set. - ai-chat.prompt.ts: buildToolCatalogBlock renders the deferred catalog. - mcp/tool-specs.ts: tier + catalogLine metadata (external snake_case /mcp transport unchanged). - EnvironmentService.isAiChatDeferredToolsEnabled(): AI_CHAT_DEFERRED_TOOLS, default ON per issue intent (kill-switch =false restores old behavior). Gate: server ai-chat 631/631, tool-tiers 13/13, mcp 472/472, tsc clean. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../src/core/ai-chat/ai-chat.prompt.spec.ts | 65 +++- .../server/src/core/ai-chat/ai-chat.prompt.ts | 63 ++++ .../core/ai-chat/ai-chat.role-resolve.spec.ts | 1 + .../ai-chat/ai-chat.service.lifecycle.spec.ts | 1 + .../src/core/ai-chat/ai-chat.service.spec.ts | 69 +++- .../src/core/ai-chat/ai-chat.service.ts | 96 +++++- .../ai-chat/tools/ai-chat-tools.service.ts | 16 + .../ai-chat/tools/docmost-client.loader.ts | 5 + .../src/core/ai-chat/tools/tool-tiers.spec.ts | 159 +++++++++ .../src/core/ai-chat/tools/tool-tiers.ts | 309 ++++++++++++++++++ .../environment/environment.service.ts | 15 + .../integration/ai-chat-stream.int-spec.ts | 3 + packages/mcp/src/tool-specs.ts | 65 ++++ 13 files changed, 853 insertions(+), 14 deletions(-) create mode 100644 apps/server/src/core/ai-chat/tools/tool-tiers.spec.ts create mode 100644 apps/server/src/core/ai-chat/tools/tool-tiers.ts diff --git a/apps/server/src/core/ai-chat/ai-chat.prompt.spec.ts b/apps/server/src/core/ai-chat/ai-chat.prompt.spec.ts index 44d976b4..9abd7de1 100644 --- a/apps/server/src/core/ai-chat/ai-chat.prompt.spec.ts +++ b/apps/server/src/core/ai-chat/ai-chat.prompt.spec.ts @@ -1,4 +1,8 @@ -import { buildSystemPrompt, buildMcpToolingBlock } from './ai-chat.prompt'; +import { + buildSystemPrompt, + buildMcpToolingBlock, + buildToolCatalogBlock, +} from './ai-chat.prompt'; import { Workspace } from '@docmost/db/types/entity.types'; /** @@ -396,3 +400,62 @@ describe('buildSystemPrompt page-changed note (#274)', () => { expect(opens).toBe(1); }); }); + +/** + * #332 deferred tool loading — the block builder and its + * gating inside buildSystemPrompt. + */ +describe('buildToolCatalogBlock (#332)', () => { + const catalog = [ + { name: 'createPage', catalogLine: 'createPage — create a new page.' }, + { name: 'transformPage', catalogLine: 'transformPage — run a JS transform.' }, + ]; + + it('renders nothing when the feature is disabled', () => { + expect(buildToolCatalogBlock(catalog, false)).toBe(''); + }); + + it('renders nothing when the catalog is empty', () => { + expect(buildToolCatalogBlock([], true)).toBe(''); + expect(buildToolCatalogBlock(undefined, true)).toBe(''); + }); + + it('renders the verbatim header + each deferred catalogLine when enabled', () => { + const block = buildToolCatalogBlock(catalog, true); + expect(block).toContain('` block (#332): the compact list of DEFERRED tools + * the model can activate on demand via loadTools. Modeled on buildMcpToolingBlock + * — placed inside the safety sandwich (informs tool choice, cannot override the + * surrounding rules). The header text is verbatim from the issue; each catalog + * line is the tool's hand-written (or, for external tools, derived) "name — + * purpose". Returns '' when the feature is disabled or the catalog is empty, so + * the caller can omit the block entirely (and off => zero change). + */ +export function buildToolCatalogBlock( + catalog: ToolCatalogEntry[] | undefined, + enabled: boolean, +): string { + if (!enabled) return ''; + const lines = (catalog ?? []) + .filter((e) => e && typeof e.catalogLine === 'string' && e.catalogLine.trim()) + .map((e) => `- ${e.catalogLine.trim()}`); + if (lines.length === 0) return ''; + return [ + '', + 'The tools below EXIST and are available to you, but their full definitions are', + 'NOT loaded into this conversation yet. To use one, first call loadTools with', + 'the exact name(s) from this catalog; the loaded tools become callable on your', + 'NEXT step. Load several at once when the task clearly needs them.', + 'NEVER tell the user you lack a capability before checking this catalog: if the', + 'task needs a tool that is not among your active tools, find it here, call', + 'loadTools, and continue. Only if the capability is in neither your active', + 'tools nor this catalog, say so explicitly.', + 'Deferred tools (name — purpose):', + ...lines, + '', + ].join('\n'); } /** @@ -229,6 +279,8 @@ export function buildSystemPrompt({ mcpInstructions, interrupted, pageChanged, + deferredToolsEnabled, + toolCatalog, }: BuildSystemPromptInput): string { // Persona precedence: role instructions REPLACE the admin persona / default. // effectivePersona = roleInstructions || adminPrompt || DEFAULT_PROMPT. @@ -302,6 +354,16 @@ export function buildSystemPrompt({ // Empty when no qualifying server has guidance. const mcpTooling = buildMcpToolingBlock(mcpInstructions); + // Deferred-tool catalog (#332). Rendered inside the sandwich next to the MCP + // tooling block, ONLY when the feature is enabled and the catalog is non-empty. + // Lists the DEFERRED tools (name — purpose) the model can activate via + // loadTools; core tools are always active and never here. Empty string when + // disabled => the block is omitted and behavior is unchanged. + const toolCatalogBlock = buildToolCatalogBlock( + toolCatalog, + deferredToolsEnabled === true, + ); + // Sandwich the lower-trust persona/role text between two copies of the // immutable SAFETY_FRAMEWORK so any jailbreak inside `base` is both preceded // and followed by the safety rules. The persona is delimited with explicit @@ -316,6 +378,7 @@ export function buildSystemPrompt({ '', context, mcpTooling, + toolCatalogBlock, SAFETY_FRAMEWORK, ] .filter((part) => part !== '') diff --git a/apps/server/src/core/ai-chat/ai-chat.role-resolve.spec.ts b/apps/server/src/core/ai-chat/ai-chat.role-resolve.spec.ts index 3683d8c1..0b9e0d9c 100644 --- a/apps/server/src/core/ai-chat/ai-chat.role-resolve.spec.ts +++ b/apps/server/src/core/ai-chat/ai-chat.role-resolve.spec.ts @@ -53,6 +53,7 @@ describe('AiChatService.resolveRoleForRequest', () => { aiAgentRoleRepo as never, {} as never, // pageRepo {} as never, // pageAccess + {} as never, // environment ); return { service, aiChatRepo, aiAgentRoleRepo }; } diff --git a/apps/server/src/core/ai-chat/ai-chat.service.lifecycle.spec.ts b/apps/server/src/core/ai-chat/ai-chat.service.lifecycle.spec.ts index dc7cbdaf..ac1d510b 100644 --- a/apps/server/src/core/ai-chat/ai-chat.service.lifecycle.spec.ts +++ b/apps/server/src/core/ai-chat/ai-chat.service.lifecycle.spec.ts @@ -22,6 +22,7 @@ describe('AiChatService.onModuleInit (startup sweep)', () => { {} as never, // aiAgentRoleRepo {} as never, // pageRepo {} as never, // pageAccess + {} as never, // environment ); return { service, aiChatMessageRepo }; } diff --git a/apps/server/src/core/ai-chat/ai-chat.service.spec.ts b/apps/server/src/core/ai-chat/ai-chat.service.spec.ts index 0df4c5dc..c09c2a28 100644 --- a/apps/server/src/core/ai-chat/ai-chat.service.spec.ts +++ b/apps/server/src/core/ai-chat/ai-chat.service.spec.ts @@ -217,23 +217,78 @@ describe('rowToUiMessage', () => { * a text-only synthesis answer (toolChoice 'none') with the FINAL_STEP_INSTRUCTION * appended onto — not replacing — the original system prompt. */ +// Narrowing helpers for the prepareAgentStep union return type. +const asLockdown = (r: ReturnType) => + r as { toolChoice: 'none'; system: string }; +const asActive = (r: ReturnType) => + r as { activeTools: string[] }; + describe('prepareAgentStep', () => { - it('returns undefined for the first step', () => { + // --- toggle OFF (default): unchanged behavior --- + it('returns undefined for the first step (toggle off)', () => { expect(prepareAgentStep(0, 'SYS')).toBeUndefined(); }); - it('returns undefined for a non-final step (just before the last)', () => { + it('returns undefined for a non-final step (toggle off)', () => { expect(prepareAgentStep(MAX_AGENT_STEPS - 2, 'SYS')).toBeUndefined(); }); - it('forces a text-only synthesis on the final allowed step', () => { - const result = prepareAgentStep(MAX_AGENT_STEPS - 1, 'SYS'); + it('forces a text-only synthesis on the final allowed step (toggle off)', () => { + const result = asLockdown(prepareAgentStep(MAX_AGENT_STEPS - 1, 'SYS')); expect(result).toBeDefined(); - expect(result?.toolChoice).toBe('none'); + expect(result.toolChoice).toBe('none'); // The original persona is preserved (prefix), not replaced. - expect(result?.system.startsWith('SYS')).toBe(true); + expect(result.system.startsWith('SYS')).toBe(true); // The synthesis instruction is appended. - expect(result?.system).toContain(FINAL_STEP_INSTRUCTION); + expect(result.system).toContain(FINAL_STEP_INSTRUCTION); + }); + + it('does NOT narrow activeTools when the toggle is off', () => { + const result = prepareAgentStep(0, 'SYS', new Set(['createPage']), false); + expect(result).toBeUndefined(); + }); + + // --- toggle ON (#332): deferred tool visibility --- + it('a non-final step exposes CORE + loadTools + activatedTools', () => { + const activated = new Set(); + const result = asActive(prepareAgentStep(0, 'SYS', activated, true)); + expect(result.activeTools).toContain('searchPages'); // core + expect(result.activeTools).toContain('searchInPage'); // #330, core + expect(result.activeTools).toContain('editPageText'); // core + expect(result.activeTools).toContain('loadTools'); // meta-tool + // No deferred tool is active before it is loaded. + expect(result.activeTools).not.toContain('createPage'); + expect(result.activeTools).not.toContain('transformPage'); + }); + + it('adding a name to activatedTools makes it appear on the next step', () => { + const activated = new Set(); + // Before loading: createPage is not active. + expect( + asActive(prepareAgentStep(1, 'SYS', activated, true)).activeTools, + ).not.toContain('createPage'); + // loadTools grows the SAME set… + activated.add('createPage'); + // …so the next step sees it. + const next = asActive(prepareAgentStep(2, 'SYS', activated, true)); + expect(next.activeTools).toContain('createPage'); + expect(next.activeTools).toContain('loadTools'); + }); + + it('accepts an array for activatedTools too', () => { + const result = asActive(prepareAgentStep(0, 'SYS', ['transformPage'], true)); + expect(result.activeTools).toContain('transformPage'); + expect(result.activeTools).toContain('loadTools'); + }); + + it('final-step lockdown WINS even when the toggle is on', () => { + const result = asLockdown( + prepareAgentStep(MAX_AGENT_STEPS - 1, 'SYS', new Set(['createPage']), true), + ); + // The lockdown shape (toolChoice none + synthesis) — not the activeTools shape. + expect(result.toolChoice).toBe('none'); + expect(result.system).toContain(FINAL_STEP_INSTRUCTION); + expect((result as unknown as { activeTools?: string[] }).activeTools).toBeUndefined(); }); }); diff --git a/apps/server/src/core/ai-chat/ai-chat.service.ts b/apps/server/src/core/ai-chat/ai-chat.service.ts index 4586185a..017056ac 100644 --- a/apps/server/src/core/ai-chat/ai-chat.service.ts +++ b/apps/server/src/core/ai-chat/ai-chat.service.ts @@ -30,7 +30,15 @@ import { } from '@docmost/db/types/entity.types'; import { AiChatToolsService } from './tools/ai-chat-tools.service'; import { McpClientsService } from './external-mcp/mcp-clients.service'; +import { EnvironmentService } from '../../integrations/environment/environment.service'; import { buildSystemPrompt } from './ai-chat.prompt'; +import { + CORE_TOOL_KEYS, + CORE_TOOL_SET, + LOAD_TOOLS_NAME, + makeLoadToolsTool, + buildExternalToolCatalog, +} from './tools/tool-tiers'; import { computePageChange } from './page-change/page-change.util'; import { roleModelOverride } from './roles/role-model-config'; import { @@ -54,24 +62,52 @@ const FINAL_STEP_INSTRUCTION = 'language. If the information is incomplete, say so explicitly: summarize ' + 'what you found, what is still missing, and give your best partial conclusion.'; -// Pure, unit-testable: decide per-step overrides. Returns undefined for normal -// steps; on the final allowed step forces a text-only synthesis answer. +// Pure, unit-testable: decide per-step overrides. Two responsibilities: +// 1. Final-step lockdown (always): on the final allowed step force a text-only +// synthesis answer (toolChoice 'none' + FINAL_STEP_INSTRUCTION). This WINS — +// it takes precedence over the deferred-tool narrowing below. +// 2. Deferred tool visibility (#332): when `deferredEnabled` and NOT the final +// step, expose only the CORE tools + loadTools + whatever loadTools has +// activated so far this turn (`activatedTools`), via `activeTools`. Deferred +// tools stay in the until the model loads them. +// When `deferredEnabled` is false the behavior is unchanged: undefined on normal +// steps (all tools active), lockdown on the final step. +// // `system` is the in-scope system prompt; we CONCATENATE so the original // persona/context is preserved — a bare `system` override would REPLACE the -// whole system prompt for the step. +// whole system prompt for the step. `activatedTools` is PER-TURN mutable state +// owned by the streaming loop (a closure Set grown by loadTools); it is passed +// in (not module-global, not persisted) so this stays a pure function of its +// arguments. // // NOTE: at AI SDK v7 the per-step `system` field is renamed to `instructions`. // On v6 (`^6.0.134`) `system` is the correct field — adjust when bumping. export function prepareAgentStep( stepNumber: number, system: string, -): { toolChoice: 'none'; system: string } | undefined { + activatedTools: ReadonlySet | readonly string[] = [], + deferredEnabled = false, +): + | { toolChoice: 'none'; system: string } + | { activeTools: string[] } + | undefined { + // Final-step lockdown WINS (applies regardless of the deferred toggle). if (stepNumber >= MAX_AGENT_STEPS - 1) { return { toolChoice: 'none', system: `${system}\n\n${FINAL_STEP_INSTRUCTION}`, }; } + // Deferred tool loading: narrow this step's visible tools to CORE + loadTools + // + the tools already activated this turn. + if (deferredEnabled) { + const activated = Array.isArray(activatedTools) + ? activatedTools + : [...activatedTools]; + return { + activeTools: [...CORE_TOOL_KEYS, LOAD_TOOLS_NAME, ...activated], + }; + } return undefined; } @@ -206,6 +242,9 @@ export class AiChatService implements OnModuleInit { private readonly aiAgentRoleRepo: AiAgentRoleRepo, private readonly pageRepo: PageRepo, private readonly pageAccess: PageAccessService, + // Reads the AI_CHAT_DEFERRED_TOOLS toggle (#332). Injected last so existing + // positional constructor callers (tests) only append one stub. + private readonly environment: EnvironmentService, ) {} /** @@ -625,9 +664,25 @@ export class AiChatService implements OnModuleInit { // Build the system prompt + Docmost toolset. If either throws after the // external MCP lease was taken above, release the lease before rethrowing so // the leased transports are not leaked (#185 review). + // Deferred tool loading toggle (#332). When ON, the model sees a compact + // and only CORE tools + loadTools are active each step; other + // tools (fat/rare in-app tools + ALL external MCP tools) load on demand. When + // OFF, every tool is active and nothing below changes. + const deferredEnabled = this.environment.isAiChatDeferredToolsEnabled(); + let system: string; let docmostTools: Awaited>; try { + // Assemble the deferred catalog for the system prompt: hand-written lines + // for the in-app deferred tools + a derived line for each external MCP tool + // (also deferred by default). Only built when the feature is enabled. + const toolCatalog = deferredEnabled + ? [ + ...(await this.tools.getInAppDeferredCatalog()), + ...buildExternalToolCatalog(external.tools), + ] + : []; + system = buildSystemPrompt({ workspace, adminPrompt: resolved?.systemPrompt, @@ -644,6 +699,10 @@ export class AiChatService implements OnModuleInit { // Detected between-turns human edit to the open page (#274): adds the // page_changed note + unified diff so the agent doesn't overwrite it. pageChanged, + // Deferred tool loading (#332): renders the block (only + // when enabled + non-empty) so the model can activate deferred tools. + deferredToolsEnabled: deferredEnabled, + toolCatalog, }); // Pass the resolved chatId so the write tools can mint provenance tokens @@ -664,7 +723,31 @@ export class AiChatService implements OnModuleInit { throw err; } - const tools = { ...external.tools, ...docmostTools }; + // Base toolset: external MCP tools + Docmost in-app tools (Docmost wins on a + // name clash — external are namespaced, so no clash is expected). + const baseTools = { ...external.tools, ...docmostTools }; + + // Deferred tool loading state (#332), scoped to THIS streaming loop: + // - `activatedTools` is per-TURN mutable state — a fresh closure Set created + // per streamText call, NOT module-global and NOT persisted, so a new turn + // starts cold. loadTools.execute adds to it; prepareAgentStep reads it to + // widen `activeTools` on the NEXT step. + // - `validDeferredNames` = every tool that is NOT core (the in-app deferred + // tools + ALL external MCP tools), computed from the ACTUAL toolset so an + // external tool is loadable by its namespaced name. loadTools rejects any + // name outside this set. + const activatedTools = new Set(); + const validDeferredNames = new Set( + Object.keys(baseTools).filter((k) => !CORE_TOOL_SET.has(k)), + ); + // Add the loadTools meta-tool ONLY when the feature is enabled; when off the + // toolset and behavior are exactly as before. + const tools = deferredEnabled + ? { + ...baseTools, + [LOAD_TOOLS_NAME]: makeLoadToolsTool(activatedTools, validDeferredNames), + } + : baseTools; // Accumulate the turn's streamed output so a provider error / disconnect can // persist the PARTIAL answer the user already saw — the SDK's onError/onAbort @@ -799,7 +882,8 @@ export class AiChatService implements OnModuleInit { // ends with no assistant text (an empty turn). prepareAgentStep forbids // further tool calls and appends a synthesis instruction on that step, // concatenated onto the original `system` so the persona is preserved. - prepareStep: ({ stepNumber }) => prepareAgentStep(stepNumber, system), + prepareStep: ({ stepNumber }) => + prepareAgentStep(stepNumber, system, activatedTools, deferredEnabled), abortSignal: signal, onChunk: ({ chunk }) => { // DIAGNOSTIC (Safari stream-drop investigation) — temporary. Any model diff --git a/apps/server/src/core/ai-chat/tools/ai-chat-tools.service.ts b/apps/server/src/core/ai-chat/tools/ai-chat-tools.service.ts index cc0c48fa..76aa48e7 100644 --- a/apps/server/src/core/ai-chat/tools/ai-chat-tools.service.ts +++ b/apps/server/src/core/ai-chat/tools/ai-chat-tools.service.ts @@ -17,6 +17,10 @@ import { resolveCurrentPageResult } from './current-page.util'; import { parseNodeArg } from './parse-node-arg'; import { modelFriendlyInput } from './model-friendly-input'; import { SandboxStore } from '../../../integrations/sandbox/sandbox.store'; +import { + buildInAppDeferredCatalog, + type ToolCatalogEntry, +} from './tool-tiers'; /** * Per-user, per-request adapter that exposes Docmost READ operations to the @@ -123,6 +127,18 @@ export class AiChatToolsService { return client.exportPageMarkdown(pageId); } + /** + * Build the IN-APP deferred entries (#332): one "name — purpose" + * line per DEFERRED tool, merging the per-layer INLINE_TOOL_TIERS with the + * shared registry's own catalogLine. Loads @docmost/mcp for the shared specs + * (memoized). Core tools are always active and are NOT listed here. External + * MCP tools are catalogued separately by the caller (they are runtime-scoped). + */ + async getInAppDeferredCatalog(): Promise { + const { sharedToolSpecs } = await loadDocmostMcp(); + return buildInAppDeferredCatalog(sharedToolSpecs); + } + async forUser( user: User, sessionId: string, diff --git a/apps/server/src/core/ai-chat/tools/docmost-client.loader.ts b/apps/server/src/core/ai-chat/tools/docmost-client.loader.ts index 8e6ee0c2..07aa982f 100644 --- a/apps/server/src/core/ai-chat/tools/docmost-client.loader.ts +++ b/apps/server/src/core/ai-chat/tools/docmost-client.loader.ts @@ -241,6 +241,11 @@ export interface SharedToolSpec { mcpName: string; inAppKey: string; description: string; + // Deferred-tool metadata (#332). Optional in this mirror so an older/stale + // @docmost/mcp build (pre-#332) still type-checks; the in-app catalog builder + // reads them defensively. The external /mcp server ignores both fields. + tier?: 'core' | 'deferred'; + catalogLine?: string; // Loose `z` on purpose: the registry is zod-agnostic so the server can pass // its own zod (v4) and the MCP package its own (v3) into the same builder. buildShape?: (z: any) => Record; diff --git a/apps/server/src/core/ai-chat/tools/tool-tiers.spec.ts b/apps/server/src/core/ai-chat/tools/tool-tiers.spec.ts new file mode 100644 index 00000000..367f4f4b --- /dev/null +++ b/apps/server/src/core/ai-chat/tools/tool-tiers.spec.ts @@ -0,0 +1,159 @@ +import { + CORE_TOOL_KEYS, + CORE_TOOL_SET, + LOAD_TOOLS_NAME, + LOAD_TOOLS_DESCRIPTION, + INLINE_TOOL_TIERS, + buildInAppDeferredCatalog, + buildExternalToolCatalog, + shortenForCatalog, + applyLoadTools, +} from './tool-tiers'; +// The real shared registry, imported from source (same approach as the +// SHARED_TOOL_SPECS contract spec) so the tier metadata is checked against +// exactly what @docmost/mcp ships. +import { SHARED_TOOL_SPECS } from '../../../../../../packages/mcp/src/tool-specs'; + +/** + * #332 deferred tool loading — tier metadata, catalog assembly, and the + * loadTools meta-tool. Pure units; no Nest graph, no @docmost/mcp build (the + * registry is imported from TS source). + */ + +describe('tool tier metadata (#332)', () => { + it('core set is the documented 13 + searchInPage (14)', () => { + expect(CORE_TOOL_KEYS).toHaveLength(14); + expect(CORE_TOOL_SET.has('searchInPage')).toBe(true); // #330, promoted to core + // loadTools is a meta-tool, not a normal core key. + expect(CORE_TOOL_SET.has(LOAD_TOOLS_NAME)).toBe(false); + }); + + it('SHARED_TOOL_SPECS tier agrees with CORE_TOOL_SET for every shared tool', () => { + for (const [key, spec] of Object.entries(SHARED_TOOL_SPECS)) { + const isCoreByTier = spec.tier === 'core'; + const isCoreByList = CORE_TOOL_SET.has(key); + expect(isCoreByTier).toBe(isCoreByList); + // Every spec carries a non-empty catalogLine (core tools too). + expect(typeof spec.catalogLine).toBe('string'); + expect(spec.catalogLine.trim().length).toBeGreaterThan(0); + } + }); + + it('every INLINE tool tier agrees with CORE_TOOL_SET and has a catalogLine', () => { + for (const [key, meta] of Object.entries(INLINE_TOOL_TIERS)) { + expect(meta.tier === 'core').toBe(CORE_TOOL_SET.has(key)); + expect(meta.catalogLine.trim().length).toBeGreaterThan(0); + } + }); +}); + +describe('buildInAppDeferredCatalog (#332)', () => { + const catalog = buildInAppDeferredCatalog(SHARED_TOOL_SPECS as never); + const names = catalog.map((e) => e.name); + + it('includes deferred tools from BOTH the inline map and the shared registry', () => { + expect(names).toContain('transformPage'); // inline deferred + expect(names).toContain('getPageJson'); // shared deferred + expect(names).toContain('patchNode'); // shared deferred + expect(names).toContain('createPage'); // inline deferred + }); + + it('NEVER lists a core tool', () => { + for (const core of CORE_TOOL_KEYS) { + expect(names).not.toContain(core); + } + // spot-check a couple that are core in each source. + expect(names).not.toContain('searchInPage'); // shared core + expect(names).not.toContain('searchPages'); // inline core + expect(names).not.toContain('editPageText'); // shared core + }); + + it('lists all 28 deferred tools (16 inline + 12 shared)', () => { + expect(catalog).toHaveLength(28); + // Each entry is a "name — purpose" line. + for (const entry of catalog) { + expect(entry.catalogLine).toMatch(/ — /); + } + }); +}); + +describe('buildExternalToolCatalog + shortenForCatalog (#332)', () => { + it('derives a short "name — purpose" line from each external tool description', () => { + const catalog = buildExternalToolCatalog({ + tavily_search: { description: 'Search the web for fresh results. More detail here.' }, + tavily_extract: { description: '' }, + }); + expect(catalog).toEqual([ + { name: 'tavily_search', catalogLine: 'tavily_search — Search the web for fresh results.' }, + { name: 'tavily_extract', catalogLine: 'tavily_extract — external tool' }, + ]); + }); + + it('caps a very long description', () => { + const long = 'x'.repeat(500); + expect(shortenForCatalog(long).length).toBeLessThanOrEqual(140); + expect(shortenForCatalog(long).endsWith('…')).toBe(true); + }); +}); + +describe('applyLoadTools (#332)', () => { + const valid = new Set(['createPage', 'transformPage', 'tavily_search']); + + it('adds valid names to the activated set and returns { loaded }', () => { + const activated = new Set(); + const result = applyLoadTools(['createPage', 'tavily_search'], activated, valid); + expect(result).toEqual({ loaded: ['createPage', 'tavily_search'] }); + expect(activated.has('createPage')).toBe(true); + expect(activated.has('tavily_search')).toBe(true); + }); + + it('rejects an unknown name with an error listing the valid deferred names', () => { + const activated = new Set(); + expect(() => applyLoadTools(['nope'], activated, valid)).toThrow(/unknown tool name/i); + try { + applyLoadTools(['nope'], activated, valid); + } catch (e) { + const msg = (e as Error).message; + // Lists every valid name (sorted). + expect(msg).toContain('createPage'); + expect(msg).toContain('transformPage'); + expect(msg).toContain('tavily_search'); + } + // Nothing is activated on a rejected call. + expect(activated.size).toBe(0); + }); + + it('tolerates a non-array / empty input (loads nothing)', () => { + const activated = new Set(); + expect(applyLoadTools(undefined, activated, valid)).toEqual({ loaded: [] }); + expect(applyLoadTools([], activated, valid)).toEqual({ loaded: [] }); + expect(activated.size).toBe(0); + }); + + it('loadTools description is the verbatim issue text', () => { + expect(LOAD_TOOLS_DESCRIPTION).toContain('only ACTIVATES them'); + expect(LOAD_TOOLS_DESCRIPTION).toContain('callable on your NEXT step'); + }); +}); + +describe('editorial "Corrector" scenario is fully served by CORE (#332)', () => { + it('read + comment + edit + search need no loadTools', () => { + // A Corrector role reads a page, searches within it, edits text, and leaves + // inline comments — every tool it needs is core, so it never has to load a + // deferred tool. + const needed = [ + 'getCurrentPage', + 'getPage', + 'searchPages', + 'searchInPage', + 'editPageText', + 'createComment', + 'listComments', + 'getComment', + 'resolveComment', + ]; + for (const t of needed) { + expect(CORE_TOOL_SET.has(t)).toBe(true); + } + }); +}); diff --git a/apps/server/src/core/ai-chat/tools/tool-tiers.ts b/apps/server/src/core/ai-chat/tools/tool-tiers.ts new file mode 100644 index 00000000..4708416f --- /dev/null +++ b/apps/server/src/core/ai-chat/tools/tool-tiers.ts @@ -0,0 +1,309 @@ +import { tool, type Tool } from 'ai'; +import { z } from 'zod'; +import type { SharedToolSpec } from './docmost-client.loader'; + +/** + * Deferred tool loading for the in-app AI chat (#332). + * + * The agent otherwise sends ALL ~41 tool definitions on EVERY model call every + * step, bloating context. Instead we split the in-app tools into two tiers: + * + * - CORE (hot, always active): frequent OR tiny tools whose full schema is + * always visible, plus the `loadTools` meta-tool. Deferring a one-line tool is + * pure loss, so tiny tools stay core even if rare. + * - DEFERRED (loaded on demand): the fat/rare tools + ALL external MCP tools by + * default. The model sees only a compact (name — purpose) and + * calls `loadTools(names)` to ACTIVATE a tool's full schema for the NEXT step + * (one extra round-trip on first use). + * + * This module is the single source of truth for the IN-APP tiering: + * - CORE_TOOL_KEYS / CORE_TOOL_SET — the authoritative core list (used by + * prepareAgentStep to build per-step `activeTools`). + * - INLINE_TOOL_TIERS — tier + catalogLine for the per-layer INLINE tools (the + * ones NOT in @docmost/mcp's SHARED_TOOL_SPECS, which carry their own). + * - buildInAppDeferredCatalog / buildExternalToolCatalog — assemble the + * deferred lines. + * - applyLoadTools / makeLoadToolsTool — the loadTools meta-tool. + * + * The tier/catalogLine fields on SHARED_TOOL_SPECS are IN-APP metadata only; the + * external /mcp server ignores them and exposes every tool normally. + */ + +/** A single rendered line: the tool name + its "name — purpose". */ +export interface ToolCatalogEntry { + /** Exact tool name the model must pass to loadTools. */ + name: string; + /** Hand-written (in-app) or derived (external) "name — purpose" line. */ + catalogLine: string; +} + +/** + * CORE (always-active) in-app tool keys — 13 frequent/tiny tools. `searchInPage` + * (#330) is added to core on top of the issue's original tier list: it is + * frequent for the editorial roles this feature targets. `loadTools` is active + * too but is not a normal tool key (it is added to activeTools separately). + */ +export const CORE_TOOL_KEYS = [ + 'searchPages', + 'listPages', + 'listSpaces', + 'getWorkspace', + 'getCurrentPage', + 'getPage', + 'getOutline', + 'getNode', + 'createComment', + 'getComment', + 'listComments', + 'resolveComment', + 'editPageText', + // #330 search_in_page — frequent for editorial sweeps; core despite predating + // the issue's tier list. + 'searchInPage', +] as const; + +/** O(1) membership test for the core tier. */ +export const CORE_TOOL_SET: ReadonlySet = new Set(CORE_TOOL_KEYS); + +/** The meta-tool name (always active alongside the core tools when enabled). */ +export const LOAD_TOOLS_NAME = 'loadTools'; + +/** + * loadTools description — VERBATIM from issue #332. Tells the model that the + * catalog names EXIST, that loadTools only ACTIVATES them (callable next step), + * and to load several at once. + */ +export const LOAD_TOOLS_DESCRIPTION = + 'loadTools — Load the full definitions of deferred tools from the \n' + + 'block in your instructions. Pass the EXACT tool names from the catalog; this\n' + + 'call only ACTIVATES them and returns { loaded: [...] } — the tools become\n' + + 'callable on your NEXT step. Load several names in one call when the task clearly\n' + + 'needs them. Unknown names are rejected with the list of valid ones.'; + +/** + * Tier + catalogLine for the INLINE ai-chat tools — those defined per-layer in + * ai-chat-tools.service.ts and NOT present in @docmost/mcp's SHARED_TOOL_SPECS + * (which carries its own tier/catalogLine). Together with the shared registry + * this describes every in-app tool. catalogLine is present for core tools too + * (uniformity), but only DEFERRED tools are rendered into the catalog. + */ +export const INLINE_TOOL_TIERS: Record< + string, + { tier: 'core' | 'deferred'; catalogLine: string } +> = { + // --- core inline --- + searchPages: { + tier: 'core', + catalogLine: 'searchPages — hybrid semantic + keyword search across the wiki.', + }, + getCurrentPage: { + tier: 'core', + catalogLine: 'getCurrentPage — the page the user is currently viewing.', + }, + getPage: { + tier: 'core', + catalogLine: 'getPage — fetch a page as Markdown by its id.', + }, + listPages: { + tier: 'core', + catalogLine: "listPages — list recent pages, or a space's full page tree.", + }, + listComments: { + tier: 'core', + catalogLine: 'listComments — list all comments on a page (including resolved).', + }, + getComment: { + tier: 'core', + catalogLine: 'getComment — fetch a single comment by id.', + }, + createComment: { + tier: 'core', + catalogLine: + 'createComment — add an inline comment (optionally with a suggested edit).', + }, + resolveComment: { + tier: 'core', + catalogLine: 'resolveComment — resolve or reopen a comment thread.', + }, + + // --- deferred inline --- + createPage: { + tier: 'deferred', + catalogLine: 'createPage — create a new page with a Markdown body in a space.', + }, + updatePageContent: { + tier: 'deferred', + catalogLine: + "updatePageContent — replace a page's body (and optionally title) with new Markdown.", + }, + renamePage: { + tier: 'deferred', + catalogLine: "renamePage — change a page's title only (body untouched).", + }, + movePage: { + tier: 'deferred', + catalogLine: 'movePage — move a page under a new parent or to the space root.', + }, + deletePage: { + tier: 'deferred', + catalogLine: 'deletePage — move a page to trash (soft delete, reversible).', + }, + listSidebarPages: { + tier: 'deferred', + catalogLine: + "listSidebarPages — list a space's root pages or a page's direct children.", + }, + getTable: { + tier: 'deferred', + catalogLine: 'getTable — read a table as a matrix of cell texts and cell ids.', + }, + checkNewComments: { + tier: 'deferred', + catalogLine: + 'checkNewComments — find comments in a space created after a timestamp.', + }, + getPageHistory: { + tier: 'deferred', + catalogLine: + 'getPageHistory — fetch one page-history version with its ProseMirror content.', + }, + exportPageMarkdown: { + tier: 'deferred', + catalogLine: + 'exportPageMarkdown — export a page to self-contained Markdown (body + comments).', + }, + updatePageJson: { + tier: 'deferred', + catalogLine: + "updatePageJson — overwrite a page's body with a full ProseMirror document.", + }, + tableInsertRow: { + tier: 'deferred', + catalogLine: 'tableInsertRow — insert a row of plain-text cells into a table.', + }, + tableDeleteRow: { + tier: 'deferred', + catalogLine: 'tableDeleteRow — delete a table row at a 0-based index.', + }, + tableUpdateCell: { + tier: 'deferred', + catalogLine: 'tableUpdateCell — set the text of a table cell at [row, col].', + }, + sharePage: { + tier: 'deferred', + catalogLine: 'sharePage — make a page publicly accessible and return its URL.', + }, + transformPage: { + tier: 'deferred', + catalogLine: "transformPage — run a sandboxed JS transform over a page's document.", + }, +}; + +/** + * Build the deferred lines for the IN-APP tools by merging the + * two metadata sources: the per-layer INLINE_TOOL_TIERS and the shared registry + * (SHARED_TOOL_SPECS, loaded at runtime). Only DEFERRED tools are included; core + * tools are always active and never appear in the catalog. Pure — the caller + * passes the loaded specs so this stays unit-testable. + */ +export function buildInAppDeferredCatalog( + sharedToolSpecs: Record, +): ToolCatalogEntry[] { + const entries: ToolCatalogEntry[] = []; + // Inline deferred tools (hand-written lines). + for (const [name, meta] of Object.entries(INLINE_TOOL_TIERS)) { + if (meta.tier === 'deferred') { + entries.push({ name, catalogLine: meta.catalogLine }); + } + } + // Shared deferred tools (line comes from the registry's own catalogLine). + for (const [name, spec] of Object.entries(sharedToolSpecs)) { + if (spec.tier === 'deferred' && spec.catalogLine) { + entries.push({ name, catalogLine: spec.catalogLine }); + } + } + return entries; +} + +/** + * Cap an external tool's (untrusted) description into a short catalog purpose. + * External MCP tools have no hand-written catalogLine, so we derive one from the + * first sentence of the description, hard-capped. Whitespace is collapsed. + */ +export function shortenForCatalog(description: string, max = 140): string { + const flat = description.replace(/\s+/g, ' ').trim(); + if (!flat) return 'external tool'; + // Prefer the first sentence if it is reasonably short. + const firstSentence = flat.split(/(?<=[.!?])\s/)[0]; + const base = + firstSentence.length > 0 && firstSentence.length <= max + ? firstSentence + : flat; + return base.length > max ? `${base.slice(0, max - 1).trimEnd()}…` : base; +} + +/** + * Build catalog lines for the EXTERNAL MCP tools (all deferred by default, + * #332). Their names are the namespaced tool keys; the purpose is derived from + * each tool's own description (no hand-written line exists). Pure. + */ +export function buildExternalToolCatalog( + externalTools: Record, +): ToolCatalogEntry[] { + return Object.entries(externalTools).map(([name, t]) => ({ + name, + catalogLine: `${name} — ${shortenForCatalog(t?.description ?? '')}`, + })); +} + +/** + * Pure core of the loadTools meta-tool. Validates the requested names against + * the per-turn set of valid deferred names, ADDS the valid ones to the caller's + * mutable `activatedTools` set (so they become callable next step), and returns + * `{ loaded }`. An unknown name throws a clear error listing the valid deferred + * names — surfaced to the model as a tool error so it can retry. + */ +export function applyLoadTools( + names: unknown, + activatedTools: Set, + validDeferredNames: ReadonlySet, +): { loaded: string[] } { + const requested = Array.isArray(names) + ? names.filter((n): n is string => typeof n === 'string') + : []; + const unknown = requested.filter((n) => !validDeferredNames.has(n)); + if (unknown.length > 0) { + const valid = [...validDeferredNames].sort().join(', '); + throw new Error( + `loadTools: unknown tool name(s): ${unknown.join(', ')}. ` + + `Valid deferred tools are: ${valid || '(none)'}.`, + ); + } + for (const n of requested) activatedTools.add(n); + return { loaded: requested }; +} + +/** + * Build the loadTools AI-SDK tool bound to THIS turn's mutable state: the + * `activatedTools` set (grown by execute, read by prepareAgentStep next step) + * and the `validDeferredNames` set (every non-core tool in this turn's toolset, + * incl. external MCP). Created per streamText call — never module-global. + */ +export function makeLoadToolsTool( + activatedTools: Set, + validDeferredNames: ReadonlySet, +): Tool { + return tool({ + description: LOAD_TOOLS_DESCRIPTION, + inputSchema: z.object({ + names: z + .array(z.string()) + .describe( + 'EXACT deferred tool names from the to activate for ' + + 'your next step.', + ), + }), + execute: async ({ names }) => + applyLoadTools(names, activatedTools, validDeferredNames), + }); +} diff --git a/apps/server/src/integrations/environment/environment.service.ts b/apps/server/src/integrations/environment/environment.service.ts index 2f525b8f..15169152 100644 --- a/apps/server/src/integrations/environment/environment.service.ts +++ b/apps/server/src/integrations/environment/environment.service.ts @@ -261,6 +261,21 @@ export class EnvironmentService { return disable === 'true'; } + /** + * Deferred tool loading for the in-app AI chat (#332). When enabled, the agent + * sees a compact and only CORE tools + the loadTools meta-tool + * are active each step; deferred tools (the fat/rare ones + all external MCP + * tools) load on demand. Defaults to ENABLED — the issue treats deferred + * loading as the new behavior; set AI_CHAT_DEFERRED_TOOLS=false to restore the + * old "all tools always active" behavior. + */ + isAiChatDeferredToolsEnabled(): boolean { + const enabled = this.configService + .get('AI_CHAT_DEFERRED_TOOLS', 'true') + .toLowerCase(); + return enabled === 'true'; + } + getPostHogHost(): string { return this.configService.get('POSTHOG_HOST'); } diff --git a/apps/server/test/integration/ai-chat-stream.int-spec.ts b/apps/server/test/integration/ai-chat-stream.int-spec.ts index 103f4334..17deac63 100644 --- a/apps/server/test/integration/ai-chat-stream.int-spec.ts +++ b/apps/server/test/integration/ai-chat-stream.int-spec.ts @@ -146,6 +146,9 @@ describe('AiChatService.stream [integration]', () => { {} as any, // aiAgentRoleRepo (role is pre-resolved + passed in) {} as any, // pageRepo (only used when body.openPage is set) {} as any, // pageAccess (idem) + // environment (#332): keep deferred tool loading OFF for this lifecycle + // harness so the toolset/behavior is exactly as before. + { isAiChatDeferredToolsEnabled: () => false } as any, ); } diff --git a/packages/mcp/src/tool-specs.ts b/packages/mcp/src/tool-specs.ts index 79a2c066..762e292b 100644 --- a/packages/mcp/src/tool-specs.ts +++ b/packages/mcp/src/tool-specs.ts @@ -31,6 +31,22 @@ export interface SharedToolSpec { inAppKey: string; /** Single canonical model-facing description used by both layers. */ description: string; + /** + * Deferred-tool tier for the IN-APP agent (#332). 'core' tools are always + * active; 'deferred' tools are hidden behind the and loaded on + * demand via the loadTools meta-tool. This is an IN-APP concern only: the + * standalone /mcp server ignores this field and registers every tool normally + * (registerShared in index.ts reads mcpName/description/buildShape only). + */ + tier: 'core' | 'deferred'; + /** + * Hand-written one-liner "name — purpose" shown in the in-app agent's + * for a DEFERRED tool (#332). Deliberately NOT derived from the + * description's first sentence — a concise, accurate purpose line. Present on + * every spec (core tools too) for uniformity; only deferred ones are rendered. + * Inert for the external /mcp server. + */ + catalogLine: string; /** * Builds the tool's input schema as a plain object of zod fields (a * ZodRawShape). Called with the consumer's own zod namespace. Omitted for @@ -47,6 +63,8 @@ export const SHARED_TOOL_SPECS = { mcpName: 'get_workspace', inAppKey: 'getWorkspace', description: 'Fetch metadata about the current workspace (name, settings).', + tier: 'core', + catalogLine: 'getWorkspace — fetch current workspace metadata (name, settings).', }, listSpaces: { @@ -55,6 +73,8 @@ export const SHARED_TOOL_SPECS = { description: 'List the spaces the current user can access. Returns the array of ' + 'spaces (id, name, slug, ...).', + tier: 'core', + catalogLine: 'listSpaces — list the spaces the user can access (id, name, slug).', }, listShares: { @@ -62,6 +82,8 @@ export const SHARED_TOOL_SPECS = { inAppKey: 'listShares', description: 'List all public shares in the workspace with page titles and public URLs.', + tier: 'deferred', + catalogLine: 'listShares — list all public shares in the workspace with their URLs.', }, // --- single-pageId read tools --- @@ -74,6 +96,9 @@ export const SHARED_TOOL_SPECS = { 'includes block ids, callouts, tables, link/image attributes) plus the ' + 'slugId used in URLs. Use the block ids it returns to make precise ' + 'structural edits or surgical text edits without resending the page.', + tier: 'deferred', + catalogLine: + "getPageJson — get a page's raw ProseMirror JSON (lossless, with block ids).", buildShape: (z) => ({ pageId: z.string().min(1), }), @@ -88,6 +113,9 @@ export const SHARED_TOOL_SPECS = { 'count) WITHOUT the full document body. Use it to locate sections/tables ' + 'and grab block ids cheaply before fetching, patching or inserting ' + 'individual blocks.', + tier: 'core', + catalogLine: + "getOutline — compact outline of a page's top-level blocks with their ids.", buildShape: (z) => ({ pageId: z.string().min(1), }), @@ -104,6 +132,9 @@ export const SHARED_TOOL_SPECS = { 'outline or page-JSON view (works for headings/paragraphs/callouts/images), OR ' + '`#` to fetch a top-level block by its outline index — use the ' + '`#` form for tables/rows/cells, which carry no id.', + tier: 'core', + catalogLine: + "getNode — fetch one block's ProseMirror subtree by block id or #index.", buildShape: (z) => ({ pageId: z.string().min(1), nodeId: z.string().min(1), @@ -137,6 +168,9 @@ export const SHARED_TOOL_SPECS = { 'caseSensitive:true to match case. Ideal for systematic ' + 'editorial sweeps (unquoted "ё", straight quotes, "т.е.", stray units). An ' + 'invalid regex or an empty query returns a clear error to fix.', + tier: 'core', + catalogLine: + 'searchInPage — find every occurrence of a string/regex inside one page, with locations.', buildShape: (z) => ({ pageId: z.string().min(1).describe('ID of the page to search'), query: z @@ -172,6 +206,8 @@ export const SHARED_TOOL_SPECS = { description: 'Remove a single block by its attrs.id (from the page outline or ' + 'page-JSON view) WITHOUT resending the whole document.', + tier: 'deferred', + catalogLine: 'deleteNode — remove a single content block by its block id.', buildShape: (z) => ({ pageId: z.string().min(1), nodeId: z.string().min(1), @@ -203,6 +239,9 @@ export const SHARED_TOOL_SPECS = { 'JSON object or a JSON string (both accepted). Cheaper and safer than ' + 'replacing the whole document for one-block structural edits. Reversible: ' + 'the previous version is kept in page history.', + tier: 'deferred', + catalogLine: + 'patchNode — replace one block with a new ProseMirror node, keeping its id.', buildShape: (z) => ({ pageId: z.string().min(1).describe('ID of the page containing the block'), nodeId: z @@ -245,6 +284,9 @@ export const SHARED_TOOL_SPECS = { '[{"type":"text","text":"Title"}]}. Bold is a mark: ' + '{"type":"text","text":"x","marks":[{"type":"bold"}]}. The node may be a ' + 'JSON object or a JSON string (both accepted). Reversible via page history.', + tier: 'deferred', + catalogLine: + 'insertNode — insert a block before/after an anchor, or append at the end.', buildShape: (z) => ({ pageId: z.string().min(1), node: z @@ -278,6 +320,8 @@ export const SHARED_TOOL_SPECS = { mcpName: 'unshare_page', inAppKey: 'unsharePage', description: 'Remove the public share of a page (revokes the public URL).', + tier: 'deferred', + catalogLine: "unsharePage — revoke a page's public share (removes the public URL).", buildShape: (z) => ({ pageId: z.string().min(1).describe('ID of the page to unshare'), }), @@ -295,6 +339,9 @@ export const SHARED_TOOL_SPECS = { "`from`/`to` each accept a historyId, or null/'current' for the page's " + 'current content (defaults: from=current, to=current — pass a historyId ' + 'from the page-history list to compare against the live page).', + tier: 'deferred', + catalogLine: + 'diffPageVersions — diff two page versions and return the change set + summary.', buildShape: (z) => ({ pageId: z.string().min(1), from: z @@ -315,6 +362,9 @@ export const SHARED_TOOL_SPECS = { "List a page's saved versions (Docmost auto-snapshots on every save), " + 'newest first, cursor-paginated. Returns { items, nextCursor }; each ' + "item's id is the historyId to pass to the page diff or restore tools.", + tier: 'deferred', + catalogLine: + "listPageHistory — list a page's saved versions (newest first, paginated).", buildShape: (z) => ({ pageId: z.string().min(1), cursor: z @@ -332,6 +382,9 @@ export const SHARED_TOOL_SPECS = { 'as the page\'s current content (Docmost has no restore endpoint, so ' + 'this creates a NEW history snapshot — the restore is itself revertible). ' + 'Get the historyId from the page-history list.', + tier: 'deferred', + catalogLine: + 'restorePageVersion — restore a page to a saved history version (revertible).', buildShape: (z) => ({ historyId: z.string().min(1), }), @@ -349,6 +402,9 @@ export const SHARED_TOOL_SPECS = { 'thread records are NOT created/updated/deleted on the server by this ' + 'tool — only the page body + inline comment marks are written; manage ' + 'comment threads via the comment tools/UI.', + tier: 'deferred', + catalogLine: + "importPageMarkdown — replace a page's content from exported Docmost Markdown.", buildShape: (z) => ({ pageId: z.string().min(1), markdown: z.string().min(1), @@ -365,6 +421,9 @@ export const SHARED_TOOL_SPECS = { 'entirely server-side — the document is NOT sent through the model. The ' + 'target keeps its own title and slug; only its body is replaced. Ideal ' + "for 'make page A's content equal to B' or 'replace A with B but keep A's URL'.", + tier: 'deferred', + catalogLine: + "copyPageContent — replace one page's body with a copy of another page's body.", buildShape: (z) => ({ sourcePageId: z.string().min(1).describe('Page to copy content FROM'), targetPageId: z @@ -402,6 +461,9 @@ export const SHARED_TOOL_SPECS = { 'page JSON and use a structural node patch/update to set its marks. ' + 'Examples: edits:[{find:"teh",replace:"the"}]; edits:[{find:"Hello ' + 'world",replace:"Hello there"}] (crosses a bold boundary).', + tier: 'core', + catalogLine: + "editPageText — surgical find/replace of plain text in a page, preserving ids/marks.", buildShape: (z) => ({ pageId: z.string().describe('ID of the page to edit'), edits: z @@ -440,6 +502,9 @@ export const SHARED_TOOL_SPECS = { 'server instance that created it: in a multi-replica deployment without ' + 'sticky sessions a blob stored on one instance is not retrievable via the ' + 'sandbox URL on another (it 404s like an expired one).', + tier: 'deferred', + catalogLine: + 'stashPage — serialize a whole page to a short anonymous URL without loading its body.', buildShape: (z) => ({ pageId: z.string().min(1), }), -- 2.52.0 From 68caf8157ab97824619667737e3d0c61e4ffeb4e Mon Sep 17 00:00:00 2001 From: agent_coder Date: Sat, 4 Jul 2026 20:34:42 +0300 Subject: [PATCH 2/2] test(ai-chat): document AI_CHAT_DEFERRED_TOOLS + pin ON-path & catalog completeness (#341 review F1-F3) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - F1: document AI_CHAT_DEFERRED_TOOLS in .env.example (AI_* section) — default ON = deferred loading (compact catalog + loadTools), =false restores the old "all tools always active" behavior. - F2: integration test of the ON path in ai-chat-stream.int-spec.ts — a deferred tool activated via loadTools is active on the SAME turn's next step but a fresh turn starts cold (CORE + loadTools only), proving the per-turn activatedTools Set does not leak across turns/chats. Drives the real streamText loop with a MockLanguageModelV3 and inspects recorded per-step activeTools-filtered tools. - F3: replace the magic toHaveLength(28) in tool-tiers.spec.ts with a two-way partition against the LIVE in-app toolset (AiChatToolsService.forUser keys): every non-core tool must appear in buildInAppDeferredCatalog and every catalog entry must map to a real non-core tool — so a future tool forgotten in INLINE_TOOL_TIERS fails the suite instead of silently vanishing from the agent. No production logic change (mechanism was already reviewed correct). Co-Authored-By: Claude Opus 4.8 (1M context) --- .env.example | 7 + .../src/core/ai-chat/tools/tool-tiers.spec.ts | 91 ++++++++- .../integration/ai-chat-stream.int-spec.ts | 172 ++++++++++++++++++ 3 files changed, 267 insertions(+), 3 deletions(-) diff --git a/.env.example b/.env.example index d32b67c7..d3d63309 100644 --- a/.env.example +++ b/.env.example @@ -202,6 +202,13 @@ MCP_DOCMOST_PASSWORD= # Default 900000 (15 min). # AI_MCP_CALL_TIMEOUT_MS=900000 +# Deferred tool loading for the in-app AI chat (#332). Default ON: the agent sees +# a compact and only CORE tools + a loadTools meta-tool are active +# each step; deferred tools (the fat/rare ones + all external MCP tools) load on +# demand. Set AI_CHAT_DEFERRED_TOOLS=false to restore the old "all tools always +# active" behavior. +# AI_CHAT_DEFERRED_TOOLS=true + # --- Anonymous public-share AI assistant --- # Opt-in per workspace (AI settings -> "public share assistant"; off by default). # When enabled, anonymous visitors of a published share can ask an AI about that diff --git a/apps/server/src/core/ai-chat/tools/tool-tiers.spec.ts b/apps/server/src/core/ai-chat/tools/tool-tiers.spec.ts index 367f4f4b..8aa039bd 100644 --- a/apps/server/src/core/ai-chat/tools/tool-tiers.spec.ts +++ b/apps/server/src/core/ai-chat/tools/tool-tiers.spec.ts @@ -13,6 +13,12 @@ import { // SHARED_TOOL_SPECS contract spec) so the tier metadata is checked against // exactly what @docmost/mcp ships. import { SHARED_TOOL_SPECS } from '../../../../../../packages/mcp/src/tool-specs'; +// For the live-toolset partition test (F3): the REAL adapter, so the catalog is +// checked against the tools AiChatToolsService.forUser() actually builds — not a +// static list that could drift from it. +import { AiChatToolsService } from './ai-chat-tools.service'; +import * as loader from './docmost-client.loader'; +import type { DocmostClientLike } from './docmost-client.loader'; /** * #332 deferred tool loading — tier metadata, catalog assembly, and the @@ -68,15 +74,94 @@ describe('buildInAppDeferredCatalog (#332)', () => { expect(names).not.toContain('editPageText'); // shared core }); - it('lists all 28 deferred tools (16 inline + 12 shared)', () => { - expect(catalog).toHaveLength(28); - // Each entry is a "name — purpose" line. + it('renders every entry as a "name — purpose" line', () => { + // Non-empty catalog (the length is pinned structurally by the live-toolset + // partition test below, not by a magic constant that rots on every new tool). + expect(catalog.length).toBeGreaterThan(0); for (const entry of catalog) { expect(entry.catalogLine).toMatch(/ — /); } }); }); +/** + * F3 — the deferred is built from STATIC metadata (INLINE_TOOL_TIERS + * + SHARED_TOOL_SPECS), but the loadable-by-name set is derived at RUNTIME from the + * actual toolset (`Object.keys(baseTools)` in ai-chat.service.ts). Those two must + * agree or a tool becomes loadable-but-invisible (agent thinks it doesn't exist) or + * catalogued-but-phantom. INLINE_TOOL_TIERS is a plain hand-maintained Record with + * no compile-time link to the tools AiChatToolsService.forUser() builds, so nothing + * else catches that drift. This test uses forUser()'s LIVE keys as the source of + * truth (mirroring ai-chat-tools.service.spec.ts's loader mock) and asserts a + * two-way partition against buildInAppDeferredCatalog — replacing the old magic + * toHaveLength(28), so a tool added to forUser() without a catalog line (or a + * catalog line without a real tool) fails the suite instead of silently vanishing. + */ +describe('deferred catalog ↔ live forUser() toolset partition (#332, F3)', () => { + let toolKeys: string[]; + const catalogNames = buildInAppDeferredCatalog(SHARED_TOOL_SPECS as never).map( + (e) => e.name, + ); + + beforeAll(async () => { + // Intercept the ESM loader so forUser() builds against the TS-source shared + // specs (no @docmost/mcp build) and never touches the network. + jest.spyOn(loader, 'loadDocmostMcp').mockResolvedValue({ + DocmostClient: function () { + return {} as DocmostClientLike; + } as unknown as loader.DocmostClientCtor, + sharedToolSpecs: SHARED_TOOL_SPECS as Record, + }); + const service = new AiChatToolsService( + { + generateAccessToken: jest.fn().mockResolvedValue('access-token'), + generateCollabToken: jest.fn().mockResolvedValue('collab-token'), + } as never, + {} as never, // aiService — not exercised while merely BUILDING the tools + {} as never, // pageEmbeddingRepo + {} as never, // spaceMemberRepo + {} as never, // pagePermissionRepo + // sandboxStore: forUser() eagerly calls asSink() to wire the stash tool. + { + asSink: () => ({ put: jest.fn(), has: jest.fn(), evict: jest.fn() }), + } as never, + ); + const tools = await service.forUser( + { id: 'user-1', email: 'u@example.com', workspaceId: 'ws-1' } as never, + 'session-1', + 'ws-1', + 'chat-1', + ); + toolKeys = Object.keys(tools); + }); + + afterAll(() => { + jest.restoreAllMocks(); + }); + + it('exposes a non-trivial toolset (sanity: the mock actually built tools)', () => { + expect(toolKeys.length).toBeGreaterThan(20); + }); + + it('every non-core live tool is present in the catalog (no capability silently hidden)', () => { + // forUser() does not itself add loadTools (ai-chat.service does), but guard + // anyway. Every remaining non-core key MUST have a catalog line. + const catalogSet = new Set(catalogNames); + const missing = toolKeys.filter( + (k) => !CORE_TOOL_SET.has(k) && k !== LOAD_TOOLS_NAME && !catalogSet.has(k), + ); + expect(missing).toEqual([]); + }); + + it('every catalog entry corresponds to a real, non-core live tool (no phantom)', () => { + const liveSet = new Set(toolKeys); + const phantom = catalogNames.filter( + (n) => !liveSet.has(n) || CORE_TOOL_SET.has(n), + ); + expect(phantom).toEqual([]); + }); +}); + describe('buildExternalToolCatalog + shortenForCatalog (#332)', () => { it('derives a short "name — purpose" line from each external tool description', () => { const catalog = buildExternalToolCatalog({ diff --git a/apps/server/test/integration/ai-chat-stream.int-spec.ts b/apps/server/test/integration/ai-chat-stream.int-spec.ts index 17deac63..a226e864 100644 --- a/apps/server/test/integration/ai-chat-stream.int-spec.ts +++ b/apps/server/test/integration/ai-chat-stream.int-spec.ts @@ -1,5 +1,7 @@ import * as http from 'node:http'; import { Kysely } from 'kysely'; +import { tool } from 'ai'; +import { z } from 'zod'; import { MockLanguageModelV3, convertArrayToReadableStream } from 'ai/test'; import { AiChatRepo } from '@docmost/db/repos/ai-chat/ai-chat.repo'; import { AiChatMessageRepo } from '@docmost/db/repos/ai-chat/ai-chat-message.repo'; @@ -318,4 +320,174 @@ describe('AiChatService.stream [integration]', () => { true, ); }); + + /** + * #332 deferred tool loading, the ON path. The riskiest property is that the + * per-turn `activatedTools` Set is created FRESH inside each stream() call, so a + * tool a previous turn activated via loadTools is NOT still active when the next + * turn starts — the new turn begins "cold" (CORE + loadTools only). The unit + * tests only exercise pure prepareAgentStep with hand-fed Sets; this pins the + * real wiring end-to-end (loadTools.execute -> activatedTools -> prepareStep -> + * per-step activeTools) against the real streamText loop, and proves there is no + * cross-turn leak. We drive a MockLanguageModelV3 whose step 1 calls + * loadTools(['createPage']) and assert, via the model's recorded per-step + * CallOptions.tools (the AI SDK filters the provider tool list by activeTools), + * that the deferred tool becomes active on the SAME turn's next step but NOT on a + * fresh turn's first step. + */ + describe('deferred tool loading ON — per-turn activation, no leak (#332)', () => { + // A stub deferred (non-core) tool the agent can activate. Its execute is never + // called — the model only needs to SEE it become active — but it must be a + // valid AI-SDK tool so the SDK includes it in a step's tool list once active. + const createPageStub = tool({ + description: 'create a new page', + inputSchema: z.object({ title: z.string() }), + execute: async () => ({ id: 'p-stub' }), + }); + + // A CORE tool in the toolset, so a cold step shows CORE tools ARE active while + // the deferred createPage is not. `searchPages` is in CORE_TOOL_SET. + const searchPagesStub = tool({ + description: 'search the wiki', + inputSchema: z.object({ query: z.string() }), + execute: async () => [], + }); + + // Same lifecycle harness as buildService() above, but with deferred loading ON + // and a toolset that exposes exactly one deferred tool (createPage) so it is + // catalogued + loadable-by-name. Kept separate so the OFF scenarios are + // untouched. + function buildDeferredService(): AiChatService { + return new AiChatService( + { getChatModel: async () => null } as any, + aiChatRepo, + msgRepo, + {} as any, + { resolve: async () => null } as any, + { + forUser: async () => ({ + searchPages: searchPagesStub, + createPage: createPageStub, + }), + getInAppDeferredCatalog: async () => [ + { name: 'createPage', catalogLine: 'createPage — create a new page.' }, + ], + } as any, + mcpClients as any, + {} as any, + {} as any, + {} as any, + // #332: deferred tool loading ON — the property under test. + { isAiChatDeferredToolsEnabled: () => true } as any, + ); + } + + // Drive ONE stream() turn against `model` and wait for the assistant row to + // settle (mirrors runStream, but builds the deferred-ON service). + async function runDeferredTurn( + model: MockLanguageModelV3, + chatId: string, + body: any, + ): Promise { + closeCalls = 0; + const service = buildDeferredService(); + const { res, cleanup } = await makeRealResponse(); + try { + await service.stream({ + user: { id: userId, workspaceId } as any, + workspace: { id: workspaceId, name: 'WS' } as any, + sessionId: 'sess-1', + body, + res: { raw: res } as any, + signal: new AbortController().signal, + model: model as any, + role: null, + } as any); + await waitFor(async () => { + const rows = await msgRepo.findAllByChat(chatId, workspaceId); + return rows.some( + (r) => + r.role === 'assistant' && + ['completed', 'error', 'aborted'].includes(r.status as string), + ); + }); + await waitFor(() => closeCalls > 0, { timeoutMs: 5_000 }); + } finally { + await cleanup(); + } + } + + // Tool names the provider actually received for a recorded step (activeTools + // filters this list, so it reflects what was active that step). + const toolNames = (call: any): string[] => + ((call?.tools ?? []) as any[]).map((t) => t?.name).filter(Boolean); + + // A model that, on step 1, calls loadTools(['createPage']); on step 2, answers. + function loadThenAnswerModel(): MockLanguageModelV3 { + let step = 0; + return new MockLanguageModelV3({ + doStream: async () => { + const n = step++; + if (n === 0) { + return { + stream: convertArrayToReadableStream([ + { type: 'stream-start', warnings: [] }, + { + type: 'tool-call', + toolCallId: 'lt1', + toolName: 'loadTools', + input: JSON.stringify({ names: ['createPage'] }), + }, + { + type: 'finish', + finishReason: 'tool-calls', + usage: { inputTokens: 5, outputTokens: 3, totalTokens: 8 }, + }, + ] as any), + }; + } + return { stream: successStream() }; + }, + } as any); + } + + it('activates a deferred tool for the SAME turn, and a NEW turn starts cold (no leak)', async () => { + const chatId = (await createChat(db, { workspaceId, creatorId: userId })).id; + + // --- Turn 1: loadTools(createPage) on step 1, then answer on step 2. --- + const model1 = loadThenAnswerModel(); + await runDeferredTurn(model1, chatId, { + chatId, + messages: [userUiMessage('Make me a page')], + }); + + // The turn ran at least two steps (the load round-trip + the answer). + expect(model1.doStreamCalls.length).toBeGreaterThanOrEqual(2); + const step1Tools = toolNames(model1.doStreamCalls[0]); + const step2Tools = toolNames(model1.doStreamCalls[1]); + + // Step 1 starts cold: CORE tools + the loadTools meta-tool are active, but + // the deferred createPage is NOT yet. + expect(step1Tools).toContain('loadTools'); + expect(step1Tools).toContain('searchPages'); // a CORE tool, always active + expect(step1Tools).not.toContain('createPage'); + // Step 2 of the SAME turn sees the just-activated deferred tool. + expect(step2Tools).toContain('createPage'); + + // --- Turn 2 on the SAME chat: must start cold again. --- + const model2 = new MockLanguageModelV3({ + doStream: async () => ({ stream: successStream() }), + } as any); + await runDeferredTurn(model2, chatId, { + chatId, + messages: [userUiMessage('And another thing')], + }); + + const nextTurnFirstStep = toolNames(model2.doStreamCalls[0]); + expect(nextTurnFirstStep).toContain('loadTools'); + // The activated set is per-turn: the prior turn's createPage did NOT leak, + // so the fresh turn's first step sees it deferred again. + expect(nextTurnFirstStep).not.toContain('createPage'); + }); + }); }); -- 2.52.0