feat(ai-chat): per-MCP-server instructions in the agent system prompt (#180)

Admins can now give each EXTERNAL MCP server a free-text instruction ("how/when
to use this server's tools") that the agent receives in its SYSTEM PROMPT next
to the tool descriptions — porting the built-in SERVER_INSTRUCTIONS idea to
admin-configured servers. The text is trusted and admin-authored (like a system
prompt) and is NON-secret, so unlike headersEnc it IS returned in views/forms.

- Migration: nullable `instructions text` on ai_mcp_servers (old rows = null =
  no guidance). Table type + repo insert/update (blank/whitespace -> null via
  blankToNull). DTO `@MaxLength(4000)`. Service threads it through
  McpServerView/toView.
- mcp-clients: `McpServerInstruction { serverName, toolPrefix, instructions }`
  threaded through the toolset/cache/lease. Guidance is built ONLY for a server
  that actually connected AND contributed >=1 callable tool (the allowlist may
  filter all of them out) AND has non-blank text — so a guide never appears for
  tools the agent cannot call. Cached with the toolset, so an edit is picked up
  next turn via the existing CRUD cache invalidation.
- System prompt: `buildMcpToolingBlock` renders an <mcp_tooling> block INSIDE
  the safety sandwich (after context, before the trailing SAFETY_FRAMEWORK) so
  it informs tool choice but cannot override the rules; each section is headed
  by the server's `prefix_*` namespace. Empty/blank -> block omitted. The
  caller (ai-chat.service) now builds the external toolset BEFORE the prompt and
  passes external.instructions; client-handle lifecycle (close-once) unchanged.
- Client: instructions field in types + a Textarea (autosize, maxLength 4000)
  in the MCP-server form with a namespace-prefix hint; i18n (en/ru).

Tests across every layer (prompt block placement + both SAFETY copies; view
blank->null; buildEntry includes guidance only for connected+>=1-tool+non-blank;
DTO MaxLength; repo + integration round-trip; service wiring). The delegated
implementation was reviewed (APPROVE), and the import-type follow-up was
applied.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
claude code agent 227
2026-06-25 04:52:05 +03:00
parent e536c6f9a9
commit 77ccc596ea
20 changed files with 1011 additions and 211 deletions

View File

@@ -60,7 +60,10 @@ export function prepareAgentStep(
system: string,
): { toolChoice: 'none'; system: string } | undefined {
if (stepNumber >= MAX_AGENT_STEPS - 1) {
return { toolChoice: 'none', system: `${system}\n\n${FINAL_STEP_INSTRUCTION}` };
return {
toolChoice: 'none',
system: `${system}\n\n${FINAL_STEP_INSTRUCTION}`,
};
}
return undefined;
}
@@ -259,9 +262,7 @@ export class AiChatService {
content: incomingText,
// jsonb column: UIMessage parts are JSON-serializable at runtime but not
// structurally `JsonValue`, so cast through unknown.
metadata: (incoming?.parts
? { parts: incoming.parts }
: null) as never,
metadata: (incoming?.parts ? { parts: incoming.parts } : null) as never,
});
// Rebuild the conversation from persisted history (not the client payload),
@@ -280,6 +281,33 @@ export class AiChatService {
// The model is resolved by the controller before hijack (clean 503 path).
// Here we only need the admin-configured system prompt.
const resolved = await this.aiSettings.resolve(workspace.id);
// Build the external MCP toolset FIRST so the system prompt can carry each
// connected server's admin-authored guidance (#180). These admin-configured
// external MCP tools (web search, etc.; §6.8) are merged into the turn's tool
// map further below. A down/slow external server never crashes the turn —
// toolsFor skips it and records the outcome. The returned client handles MUST
// be closed in the streamText lifecycle (onFinish/onError/onAbort) — leaking
// them is a bug. Docmost tools take precedence on a name clash (external are
// namespaced, so a clash is not expected; the spread order in that merge makes
// intent explicit).
let external: Awaited<ReturnType<McpClientsService['toolsFor']>> = {
tools: {},
clients: [],
outcomes: [],
instructions: [],
};
try {
external = await this.mcpClients.toolsFor(workspace.id);
} catch (err) {
// Building the external toolset must never break the turn; proceed with
// Docmost-only tools. Never log URLs/headers — short message only.
this.logger.warn(
`External MCP toolset unavailable: ${
err instanceof Error ? err.message : 'unknown error'
}`,
);
}
const system = buildSystemPrompt({
workspace,
adminPrompt: resolved?.systemPrompt,
@@ -287,6 +315,8 @@ export class AiChatService {
// the safety framework is still appended by buildSystemPrompt.
roleInstructions: role?.instructions,
openedPage: body.openPage,
// Guidance only for servers that connected and yielded ≥1 callable tool.
mcpInstructions: external.instructions,
});
// Pass the resolved chatId so the write tools can mint provenance tokens
@@ -302,28 +332,6 @@ export class AiChatService {
body.openPage,
);
// Merge in admin-configured external MCP tools (web search, etc.; §6.8).
// A down/slow external server never crashes the turn — toolsFor skips it and
// records the outcome. The returned client handles MUST be closed in the
// streamText lifecycle (onFinish/onError/onAbort) — leaking them is a bug.
// Docmost tools take precedence on a name clash (external are namespaced, so
// a clash is not expected; the spread order makes intent explicit).
let external: Awaited<ReturnType<McpClientsService['toolsFor']>> = {
tools: {},
clients: [],
outcomes: [],
};
try {
external = await this.mcpClients.toolsFor(workspace.id);
} catch (err) {
// Building the external toolset must never break the turn; proceed with
// Docmost-only tools. Never log URLs/headers — short message only.
this.logger.warn(
`External MCP toolset unavailable: ${
err instanceof Error ? err.message : 'unknown error'
}`,
);
}
const tools = { ...external.tools, ...docmostTools };
// Close every external client EXACTLY ONCE across the turn's terminal
@@ -395,144 +403,150 @@ export class AiChatService {
let result: ReturnType<typeof streamText>;
try {
result = streamText({
model,
system,
messages,
tools,
// No maxOutputTokens cap on the agent: tool-call arguments (e.g. a full
// page body for the write tools) are emitted as OUTPUT tokens, so a fixed
// cap would truncate complex tool calls mid-argument. Let the model use its
// natural per-step budget. (Cost/credit limits are an account concern, not
// something to enforce by silently breaking the agent.)
stopWhen: stepCountIs(MAX_AGENT_STEPS),
// Forced finalization: reserve the LAST allowed step for a text-only
// answer. Without this, a turn that spends all its steps on tool calls
// ends with no assistant text (an empty turn). prepareAgentStep forbids
// further tool calls and appends a synthesis instruction on that step,
// concatenated onto the original `system` so the persona is preserved.
prepareStep: ({ stepNumber }) => prepareAgentStep(stepNumber, system),
abortSignal: signal,
onChunk: ({ chunk }) => {
// DIAGNOSTIC (Safari stream-drop investigation) — temporary. Any model
// output chunk means the stream is actively emitting bytes; track first
// + most-recent activity timestamps.
const now = Date.now();
firstModelChunkAt ??= now;
lastModelChunkAt = now;
// 'text-delta' is the assistant's prose; tool-call args are separate chunk
// types — so this mirrors exactly what streams to the client.
if (chunk.type === 'text-delta') inProgressText += chunk.text;
},
onStepFinish: (step) => {
// The finished step's full text is now in `step.text`; fold it in and reset
// the in-progress accumulator for the next step.
capturedSteps.push(step as StepLike);
inProgressText = '';
},
onFinish: async ({ text, finishReason, totalUsage, usage, steps }) => {
// DIAGNOSTIC (Safari stream-drop investigation) — temporary: success
// baseline for Safari comparison.
const diagNow = Date.now();
this.logger.log(
`AI chat stream DIAGNOSTIC (finish): elapsed=${diagNow - streamStartedAt}ms ` +
`firstChunkLatency=${firstModelChunkAt ? firstModelChunkAt - streamStartedAt : 'none'}ms ` +
`heartbeatsSent=${heartbeatsSent} steps=${steps.length}`,
);
await persistAssistant({
text,
toolCalls: serializeSteps(steps),
metadata: {
finishReason,
// Persist the turn's cumulative usage WITH reasoning tokens resolved
// from either the new `outputTokenDetails` or the deprecated top-level
// field, so reopened history / the Markdown export show the thinking
// token cost too.
usage: normalizeStreamUsage(totalUsage as StreamUsage) ?? totalUsage,
// Final-step usage = the context actually fed to the model on the last LLM
// call (full history + tool results) plus the answer it just generated.
// input+output of the FINAL step ≈ the conversation's CURRENT context size,
// distinct from totalUsage which sums every step (cumulative tokens spent).
contextTokens:
(usage?.inputTokens ?? 0) + (usage?.outputTokens ?? 0) || undefined,
// Persist the FULL set of UIMessage parts for the turn (text +
// tool-call/result), so the rebuilt history replays prior tool
// context to the model on later turns.
parts: assistantParts(steps, text),
},
});
// Lifecycle: release the external MCP clients leased for this turn.
await closeExternalClients();
// Generate the chat title for a freshly created chat AFTER the stream's
// provider call has completed — NOT concurrently with it. The z.ai coding
// endpoint stalls one of two concurrent requests to the same plan, which
// black-holed the chat stream (~300s headers timeout) when title
// generation raced it. Running it here (solo, fire-and-forget) avoids the
// race; never block the turn on it, swallow any error.
if (isNewChat && incomingText) {
void this.generateTitle(chatId, workspace.id, incomingText).catch(
(err) => {
this.logger.warn(
`Title generation failed: ${(err as Error)?.message ?? err}`,
);
},
model,
system,
messages,
tools,
// No maxOutputTokens cap on the agent: tool-call arguments (e.g. a full
// page body for the write tools) are emitted as OUTPUT tokens, so a fixed
// cap would truncate complex tool calls mid-argument. Let the model use its
// natural per-step budget. (Cost/credit limits are an account concern, not
// something to enforce by silently breaking the agent.)
stopWhen: stepCountIs(MAX_AGENT_STEPS),
// Forced finalization: reserve the LAST allowed step for a text-only
// answer. Without this, a turn that spends all its steps on tool calls
// ends with no assistant text (an empty turn). prepareAgentStep forbids
// further tool calls and appends a synthesis instruction on that step,
// concatenated onto the original `system` so the persona is preserved.
prepareStep: ({ stepNumber }) => prepareAgentStep(stepNumber, system),
abortSignal: signal,
onChunk: ({ chunk }) => {
// DIAGNOSTIC (Safari stream-drop investigation) — temporary. Any model
// output chunk means the stream is actively emitting bytes; track first
// + most-recent activity timestamps.
const now = Date.now();
firstModelChunkAt ??= now;
lastModelChunkAt = now;
// 'text-delta' is the assistant's prose; tool-call args are separate chunk
// types — so this mirrors exactly what streams to the client.
if (chunk.type === 'text-delta') inProgressText += chunk.text;
},
onStepFinish: (step) => {
// The finished step's full text is now in `step.text`; fold it in and reset
// the in-progress accumulator for the next step.
capturedSteps.push(step as StepLike);
inProgressText = '';
},
onFinish: async ({ text, finishReason, totalUsage, usage, steps }) => {
// DIAGNOSTIC (Safari stream-drop investigation) — temporary: success
// baseline for Safari comparison.
const diagNow = Date.now();
this.logger.log(
`AI chat stream DIAGNOSTIC (finish): elapsed=${diagNow - streamStartedAt}ms ` +
`firstChunkLatency=${firstModelChunkAt ? firstModelChunkAt - streamStartedAt : 'none'}ms ` +
`heartbeatsSent=${heartbeatsSent} steps=${steps.length}`,
);
}
},
onError: async ({ error }) => {
// NestJS Logger.error(message, stack?, context?): pass the real message
// (with statusCode when present) + the stack string, not the Error
// object, so the actual provider cause is clearly logged. Reuse the
// shared formatter so provider error formatting stays unified.
const e = error as { stack?: string };
const errorText = describeProviderError(error, String(error));
this.logger.error(`AI chat stream error: ${errorText}`, e?.stack);
// DIAGNOSTIC (Safari stream-drop investigation) — temporary: timing of
// an error-terminated stream.
const diagNow = Date.now();
this.logger.warn(
`AI chat stream DIAGNOSTIC (error): elapsed=${diagNow - streamStartedAt}ms ` +
`firstChunkLatency=${firstModelChunkAt ? firstModelChunkAt - streamStartedAt : 'none'}ms ` +
`silentGapBeforeDrop=${diagNow - lastModelChunkAt}ms heartbeatsSent=${heartbeatsSent}`,
);
// Persist the PARTIAL answer streamed before the failure (text + any
// finished tool steps) WITH the error in metadata, so the turn shows what
// the user already saw plus the cause — not just a bare error.
await persistAssistant(
buildPartialAssistantRecord(
capturedSteps,
inProgressText,
'error',
errorText,
),
);
await closeExternalClients();
},
onAbort: async ({ steps }) => {
const partialChars =
capturedSteps.reduce((n, s) => n + (s.text?.length ?? 0), 0) +
inProgressText.length;
// Unlike onError/onFinish, this terminal path otherwise writes nothing, so
// an aborted turn (client disconnect / proxy drop / stop()) would be
// invisible in the logs. Log it (warn) so the abort is traceable.
this.logger.warn(
`AI chat stream aborted (chat ${chatId}) after ${steps.length} ` +
`step(s), ${partialChars} chars partial text; persisting partial turn.`,
);
// DIAGNOSTIC (Safari stream-drop investigation) — temporary: THE key
// line — classifies the Safari drop.
const diagNow = Date.now();
this.logger.warn(
`AI chat stream DIAGNOSTIC (abort/disconnect): elapsed=${diagNow - streamStartedAt}ms ` +
`firstChunkLatency=${firstModelChunkAt ? firstModelChunkAt - streamStartedAt : 'none'}ms ` +
`silentGapBeforeDrop=${diagNow - lastModelChunkAt}ms heartbeatsSent=${heartbeatsSent} ` +
`steps=${steps.length}`,
);
await persistAssistant(
buildPartialAssistantRecord(capturedSteps, inProgressText, 'aborted'),
);
await closeExternalClients();
},
await persistAssistant({
text,
toolCalls: serializeSteps(steps),
metadata: {
finishReason,
// Persist the turn's cumulative usage WITH reasoning tokens resolved
// from either the new `outputTokenDetails` or the deprecated top-level
// field, so reopened history / the Markdown export show the thinking
// token cost too.
usage:
normalizeStreamUsage(totalUsage as StreamUsage) ?? totalUsage,
// Final-step usage = the context actually fed to the model on the last LLM
// call (full history + tool results) plus the answer it just generated.
// input+output of the FINAL step ≈ the conversation's CURRENT context size,
// distinct from totalUsage which sums every step (cumulative tokens spent).
contextTokens:
(usage?.inputTokens ?? 0) + (usage?.outputTokens ?? 0) ||
undefined,
// Persist the FULL set of UIMessage parts for the turn (text +
// tool-call/result), so the rebuilt history replays prior tool
// context to the model on later turns.
parts: assistantParts(steps, text),
},
});
// Lifecycle: release the external MCP clients leased for this turn.
await closeExternalClients();
// Generate the chat title for a freshly created chat AFTER the stream's
// provider call has completed — NOT concurrently with it. The z.ai coding
// endpoint stalls one of two concurrent requests to the same plan, which
// black-holed the chat stream (~300s headers timeout) when title
// generation raced it. Running it here (solo, fire-and-forget) avoids the
// race; never block the turn on it, swallow any error.
if (isNewChat && incomingText) {
void this.generateTitle(chatId, workspace.id, incomingText).catch(
(err) => {
this.logger.warn(
`Title generation failed: ${(err as Error)?.message ?? err}`,
);
},
);
}
},
onError: async ({ error }) => {
// NestJS Logger.error(message, stack?, context?): pass the real message
// (with statusCode when present) + the stack string, not the Error
// object, so the actual provider cause is clearly logged. Reuse the
// shared formatter so provider error formatting stays unified.
const e = error as { stack?: string };
const errorText = describeProviderError(error, String(error));
this.logger.error(`AI chat stream error: ${errorText}`, e?.stack);
// DIAGNOSTIC (Safari stream-drop investigation) — temporary: timing of
// an error-terminated stream.
const diagNow = Date.now();
this.logger.warn(
`AI chat stream DIAGNOSTIC (error): elapsed=${diagNow - streamStartedAt}ms ` +
`firstChunkLatency=${firstModelChunkAt ? firstModelChunkAt - streamStartedAt : 'none'}ms ` +
`silentGapBeforeDrop=${diagNow - lastModelChunkAt}ms heartbeatsSent=${heartbeatsSent}`,
);
// Persist the PARTIAL answer streamed before the failure (text + any
// finished tool steps) WITH the error in metadata, so the turn shows what
// the user already saw plus the cause — not just a bare error.
await persistAssistant(
buildPartialAssistantRecord(
capturedSteps,
inProgressText,
'error',
errorText,
),
);
await closeExternalClients();
},
onAbort: async ({ steps }) => {
const partialChars =
capturedSteps.reduce((n, s) => n + (s.text?.length ?? 0), 0) +
inProgressText.length;
// Unlike onError/onFinish, this terminal path otherwise writes nothing, so
// an aborted turn (client disconnect / proxy drop / stop()) would be
// invisible in the logs. Log it (warn) so the abort is traceable.
this.logger.warn(
`AI chat stream aborted (chat ${chatId}) after ${steps.length} ` +
`step(s), ${partialChars} chars partial text; persisting partial turn.`,
);
// DIAGNOSTIC (Safari stream-drop investigation) — temporary: THE key
// line — classifies the Safari drop.
const diagNow = Date.now();
this.logger.warn(
`AI chat stream DIAGNOSTIC (abort/disconnect): elapsed=${diagNow - streamStartedAt}ms ` +
`firstChunkLatency=${firstModelChunkAt ? firstModelChunkAt - streamStartedAt : 'none'}ms ` +
`silentGapBeforeDrop=${diagNow - lastModelChunkAt}ms heartbeatsSent=${heartbeatsSent} ` +
`steps=${steps.length}`,
);
await persistAssistant(
buildPartialAssistantRecord(
capturedSteps,
inProgressText,
'aborted',
),
);
await closeExternalClients();
},
});
// Drain the stream independently of the client socket so the turn always
@@ -652,7 +666,10 @@ export class AiChatService {
'punctuation at the end.',
prompt: firstMessage.slice(0, 2000),
});
const title = text.trim().replace(/^["']|["']$/g, '').slice(0, 120);
const title = text
.trim()
.replace(/^["']|["']$/g, '')
.slice(0, 120);
if (title) {
await this.aiChatRepo.update(chatId, { title }, workspaceId);
}