Merge pull request 'refactor(agent-roles-catalog): YAML catalog with block-scalar instructions (#229)' (#231) from feat/229-catalog-yaml into develop
Reviewed-on: #231
This commit was merged in pull request #231.
This commit is contained in:
@@ -125,6 +125,7 @@
|
||||
"typesense": "^3.0.5",
|
||||
"undici": "7.24.0",
|
||||
"ws": "^8.20.1",
|
||||
"yaml": "^2.8.3",
|
||||
"yauzl": "^3.2.1",
|
||||
"zod": "^4.3.6"
|
||||
},
|
||||
|
||||
@@ -187,7 +187,7 @@ export class AiAgentRolesService {
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// Catalog (admin-only). The catalog is curated, untrusted JSON fetched +
|
||||
// Catalog (admin-only). The catalog is curated, untrusted YAML fetched +
|
||||
// validated by AiAgentRolesCatalogProvider; this layer resolves localized
|
||||
// text and reconciles a bundle against the workspace's existing roles.
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
@@ -1,12 +1,23 @@
|
||||
import { BadGatewayException, BadRequestException } from '@nestjs/common';
|
||||
import { AiAgentRolesCatalogProvider } from './ai-agent-roles-catalog.provider';
|
||||
import { readFileSync } from 'node:fs';
|
||||
import { join } from 'node:path';
|
||||
import { parse as parseYaml, stringify as stringifyYaml } from 'yaml';
|
||||
import {
|
||||
AiAgentRolesCatalogProvider,
|
||||
isCatalogBundleFile,
|
||||
isCatalogIndex,
|
||||
isCatalogRole,
|
||||
} from './ai-agent-roles-catalog.provider';
|
||||
|
||||
/**
|
||||
* Provider tests against a mocked remote source (no network). They cover the
|
||||
* happy read path (fetchIndex / fetchBundle), the malformed-shape rejection,
|
||||
* rejection of non-http(s) sources (local sources are gone), and — most
|
||||
* importantly — the `^[a-z0-9-]+$` path-traversal guard that runs BEFORE any
|
||||
* path/URL is built.
|
||||
* happy read path (fetchIndex / fetchBundle) over the YAML catalog format, the
|
||||
* block-scalar `instructions` round-trip, the malformed-shape rejection, the
|
||||
* malformed-YAML rejection, rejection of non-http(s) sources (local sources are
|
||||
* gone), and — most importantly — the `^[a-z0-9-]+$` path-traversal guard that
|
||||
* runs BEFORE any path/URL is built. Fixtures are serialized with the same
|
||||
* `yaml` library the provider parses with (`stringifyYaml`), so the tests
|
||||
* exercise real YAML, not the JSON subset.
|
||||
*/
|
||||
describe('AiAgentRolesCatalogProvider', () => {
|
||||
function makeProvider(source: string) {
|
||||
@@ -71,7 +82,7 @@ describe('AiAgentRolesCatalogProvider', () => {
|
||||
}
|
||||
|
||||
it('fetchBundle remote happy path => parses + validates', async () => {
|
||||
const json = JSON.stringify({
|
||||
const yaml = stringifyYaml({
|
||||
schemaVersion: 1,
|
||||
language: 'en',
|
||||
roles: [
|
||||
@@ -82,7 +93,7 @@ describe('AiAgentRolesCatalogProvider', () => {
|
||||
},
|
||||
],
|
||||
});
|
||||
const body = streamOf([new TextEncoder().encode(json)]);
|
||||
const body = streamOf([new TextEncoder().encode(yaml)]);
|
||||
global.fetch = jest
|
||||
.fn()
|
||||
.mockResolvedValue(mockResponse({ body })) as never;
|
||||
@@ -92,12 +103,12 @@ describe('AiAgentRolesCatalogProvider', () => {
|
||||
});
|
||||
|
||||
it('fetchBundle remote malformed (role missing instructions) => BadGateway', async () => {
|
||||
const json = JSON.stringify({
|
||||
const yaml = stringifyYaml({
|
||||
schemaVersion: 1,
|
||||
language: 'fr',
|
||||
roles: [{ slug: 'researcher', name: 'Chercheur' }],
|
||||
});
|
||||
const body = streamOf([new TextEncoder().encode(json)]);
|
||||
const body = streamOf([new TextEncoder().encode(yaml)]);
|
||||
global.fetch = jest
|
||||
.fn()
|
||||
.mockResolvedValue(mockResponse({ body })) as never;
|
||||
@@ -153,8 +164,9 @@ describe('AiAgentRolesCatalogProvider', () => {
|
||||
);
|
||||
global.fetch = fetchMock as never;
|
||||
const provider = makeProvider('https://catalog.example.com');
|
||||
// Body shape is irrelevant; an empty stream parses to invalid JSON and
|
||||
// throws, but the fetch call (with its init) still happened.
|
||||
// Body shape is irrelevant; an empty stream parses to an empty YAML doc
|
||||
// (null), fails the shape guard and throws, but the fetch call (with its
|
||||
// init) still happened.
|
||||
await expect(provider.fetchIndex()).rejects.toBeDefined();
|
||||
expect(fetchMock).toHaveBeenCalledWith(
|
||||
expect.any(String),
|
||||
@@ -190,7 +202,7 @@ describe('AiAgentRolesCatalogProvider', () => {
|
||||
});
|
||||
|
||||
it('small streamed body parses normally (cap not hit)', async () => {
|
||||
const json = JSON.stringify({
|
||||
const yaml = stringifyYaml({
|
||||
schemaVersion: 1,
|
||||
bundles: [
|
||||
{
|
||||
@@ -201,7 +213,7 @@ describe('AiAgentRolesCatalogProvider', () => {
|
||||
},
|
||||
],
|
||||
});
|
||||
const body = streamOf([new TextEncoder().encode(json)]);
|
||||
const body = streamOf([new TextEncoder().encode(yaml)]);
|
||||
global.fetch = jest
|
||||
.fn()
|
||||
.mockResolvedValue(mockResponse({ body })) as never;
|
||||
@@ -227,7 +239,7 @@ describe('AiAgentRolesCatalogProvider', () => {
|
||||
});
|
||||
|
||||
it('null body (no readable stream) => response.text() fallback parses', async () => {
|
||||
const json = JSON.stringify({
|
||||
const yaml = stringifyYaml({
|
||||
schemaVersion: 1,
|
||||
bundles: [
|
||||
{
|
||||
@@ -240,7 +252,7 @@ describe('AiAgentRolesCatalogProvider', () => {
|
||||
});
|
||||
global.fetch = jest
|
||||
.fn()
|
||||
.mockResolvedValue(mockResponse({ body: null, text: json })) as never;
|
||||
.mockResolvedValue(mockResponse({ body: null, text: yaml })) as never;
|
||||
const provider = makeProvider('https://catalog.example.com');
|
||||
const index = await provider.fetchIndex();
|
||||
expect(index.bundles[0].id).toBe('general');
|
||||
@@ -259,8 +271,12 @@ describe('AiAgentRolesCatalogProvider', () => {
|
||||
);
|
||||
});
|
||||
|
||||
it('invalid JSON body => BadGateway (parse failure)', async () => {
|
||||
const body = streamOf([new TextEncoder().encode('{not valid json')]);
|
||||
it('invalid YAML body => BadGateway (parse failure)', async () => {
|
||||
// An unterminated flow mapping is not valid YAML, so YAML.parse throws and
|
||||
// the provider maps it to BadGateway (not a generic 500).
|
||||
const body = streamOf([
|
||||
new TextEncoder().encode('schemaVersion: {not: closed'),
|
||||
]);
|
||||
global.fetch = jest
|
||||
.fn()
|
||||
.mockResolvedValue(mockResponse({ body })) as never;
|
||||
@@ -270,11 +286,28 @@ describe('AiAgentRolesCatalogProvider', () => {
|
||||
);
|
||||
});
|
||||
|
||||
it('malformed index.json (valid JSON, wrong shape) => BadGateway', async () => {
|
||||
// Parses as JSON but fails isCatalogIndex (schemaVersion not a number).
|
||||
it('YAML with a duplicate key (strict) => BadGateway (parse failure)', async () => {
|
||||
// strict:true rejects duplicate mapping keys rather than last-wins coercing
|
||||
// them — a defensive parse on untrusted input.
|
||||
const body = streamOf([
|
||||
new TextEncoder().encode(
|
||||
JSON.stringify({ schemaVersion: 'x', bundles: [] }),
|
||||
'schemaVersion: 1\nbundles: []\nschemaVersion: 2\n',
|
||||
),
|
||||
]);
|
||||
global.fetch = jest
|
||||
.fn()
|
||||
.mockResolvedValue(mockResponse({ body })) as never;
|
||||
const provider = makeProvider('https://catalog.example.com');
|
||||
await expect(provider.fetchIndex()).rejects.toBeInstanceOf(
|
||||
BadGatewayException,
|
||||
);
|
||||
});
|
||||
|
||||
it('malformed index.yaml (valid YAML, wrong shape) => BadGateway', async () => {
|
||||
// Parses as YAML but fails isCatalogIndex (schemaVersion not a number).
|
||||
const body = streamOf([
|
||||
new TextEncoder().encode(
|
||||
stringifyYaml({ schemaVersion: 'x', bundles: [] }),
|
||||
),
|
||||
]);
|
||||
global.fetch = jest
|
||||
@@ -283,6 +316,36 @@ describe('AiAgentRolesCatalogProvider', () => {
|
||||
const provider = makeProvider('https://catalog.example.com');
|
||||
await expect(provider.fetchIndex()).rejects.toThrow(/malformed/i);
|
||||
});
|
||||
|
||||
it('block-scalar instructions round-trips to the exact multi-line string', async () => {
|
||||
// The whole point of the YAML migration: a long `instructions` prompt is
|
||||
// stored as a literal block scalar (|-) for line-by-line diffs, and must
|
||||
// resolve byte-for-byte to the original multi-line string.
|
||||
const instructions = [
|
||||
'Line one of the prompt.',
|
||||
'',
|
||||
' Indented bullet that must survive.',
|
||||
'Final line, no trailing newline.',
|
||||
].join('\n');
|
||||
const yaml = stringifyYaml(
|
||||
{
|
||||
schemaVersion: 1,
|
||||
language: 'en',
|
||||
roles: [{ slug: 'researcher', name: 'Researcher', instructions }],
|
||||
},
|
||||
{ lineWidth: 0 },
|
||||
);
|
||||
// Sanity: the fixture really uses a literal block scalar (|, optionally
|
||||
// with an indentation indicator), not a flow/quoted string.
|
||||
expect(yaml).toMatch(/instructions: \|/);
|
||||
const body = streamOf([new TextEncoder().encode(yaml)]);
|
||||
global.fetch = jest
|
||||
.fn()
|
||||
.mockResolvedValue(mockResponse({ body })) as never;
|
||||
const provider = makeProvider('https://catalog.example.com');
|
||||
const bundle = await provider.fetchBundle('research', 'en');
|
||||
expect(bundle.roles[0].instructions).toBe(instructions);
|
||||
});
|
||||
});
|
||||
|
||||
describe('path-traversal / SSRF guard (^[a-z0-9-]+$)', () => {
|
||||
@@ -304,4 +367,93 @@ describe('AiAgentRolesCatalogProvider', () => {
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Pin the REAL shipped catalog files (not synthetic fixtures). The JSON->YAML
|
||||
// migration was a hand conversion, so the realistic failure is a hand-edit
|
||||
// error in one of the 5 content YAML files (the index + the four per-bundle/
|
||||
// lang files: index.yaml plus bundles/{editorial,research}/{en,ru}.yaml) — a
|
||||
// quote/colon in a description, a broken
|
||||
// emoji/arrow, a block-scalar indent slip that silently changes or drops
|
||||
// instructions). Nothing else in CI parses these files — `scripts/check.mjs`
|
||||
// is not wired into any turbo/husky/CI step — so this is the only automated
|
||||
// guard over the shipped content. We read them straight off disk, parse with
|
||||
// the SAME options the provider uses (strict + maxAliasCount, see parseYaml in
|
||||
// the provider), and run them through the provider's own type guards. A future
|
||||
// edit that breaks a real file fails here.
|
||||
// ---------------------------------------------------------------------------
|
||||
describe('real shipped catalog files (the YAML migration must not break them)', () => {
|
||||
// Spec lives at apps/server/src/core/ai-chat/roles/catalog/; the catalog
|
||||
// ships at the repo root (agent-roles-catalog/) — seven levels up.
|
||||
const CATALOG_DIR = join(
|
||||
__dirname,
|
||||
'../../../../../../../agent-roles-catalog',
|
||||
);
|
||||
// Match the provider's parseYaml exactly (untrusted-input parse options).
|
||||
const PARSE_OPTS = { strict: true, maxAliasCount: 100 } as const;
|
||||
|
||||
function readCatalogYaml(rel: string): unknown {
|
||||
return parseYaml(readFileSync(join(CATALOG_DIR, rel), 'utf8'), PARSE_OPTS);
|
||||
}
|
||||
|
||||
// Load + validate the real index lazily (only when a test runs), so a broken
|
||||
// real file fails ONLY these catalog tests — not collection of the entire
|
||||
// spec, which also holds the unrelated mocked-remote provider tests above.
|
||||
function loadRealIndex() {
|
||||
const parsed = readCatalogYaml('index.yaml');
|
||||
if (!isCatalogIndex(parsed)) {
|
||||
throw new Error('Real index.yaml is not a valid catalog index');
|
||||
}
|
||||
return parsed;
|
||||
}
|
||||
|
||||
it('index.yaml parses + validates with the provider guard', () => {
|
||||
expect(isCatalogIndex(readCatalogYaml('index.yaml'))).toBe(true);
|
||||
});
|
||||
|
||||
it('editorial bundle still ships the fact-checker role', () => {
|
||||
const editorial = loadRealIndex().bundles.find((b) => b.id === 'editorial');
|
||||
expect(editorial).toBeDefined();
|
||||
expect(editorial?.roles.map((r) => r.slug)).toContain('fact-checker');
|
||||
});
|
||||
|
||||
// Driven by the real index (read inside the test, so it's lazy): every
|
||||
// declared bundle + language file must parse, validate, and be in EXACT slug
|
||||
// correspondence with the index — every declared role present AND no
|
||||
// undeclared extras — mirroring scripts/check.mjs, which requires both
|
||||
// directions. A bundle or language added later is covered automatically.
|
||||
it('every declared bundle/language file is valid and in exact slug correspondence', () => {
|
||||
const index = loadRealIndex();
|
||||
// Guard against an empty index silently passing the loops below.
|
||||
expect(index.bundles.length).toBeGreaterThan(0);
|
||||
for (const bundle of index.bundles) {
|
||||
const declaredSlugs = bundle.roles.map((r) => r.slug);
|
||||
expect(bundle.languages.length).toBeGreaterThan(0);
|
||||
for (const lang of bundle.languages) {
|
||||
const rel = `bundles/${bundle.id}/${lang}.yaml`;
|
||||
const file = readCatalogYaml(rel);
|
||||
expect(isCatalogBundleFile(file)).toBe(true);
|
||||
// Narrow for TS and access fields safely.
|
||||
if (!isCatalogBundleFile(file)) continue;
|
||||
expect(file.language).toBe(lang);
|
||||
const fileSlugs = file.roles.map((r) => r.slug);
|
||||
// Existing direction: every declared role is present in the file.
|
||||
for (const slug of declaredSlugs) {
|
||||
expect(fileSlugs).toContain(slug);
|
||||
}
|
||||
// Symmetric direction: the file carries NO undeclared/extra roles, so
|
||||
// file slugs and declared slugs must be the SAME set (exact match).
|
||||
// Catches a hand-edit that copies a stray role into a bundle file.
|
||||
expect([...fileSlugs].sort()).toEqual([...declaredSlugs].sort());
|
||||
expect(file.roles.length).toBeGreaterThan(0);
|
||||
for (const role of file.roles) {
|
||||
expect(isCatalogRole(role)).toBe(true);
|
||||
expect(typeof role.instructions).toBe('string');
|
||||
expect(role.instructions.trim().length).toBeGreaterThan(0);
|
||||
expect(role.name.trim().length).toBeGreaterThan(0);
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
@@ -4,6 +4,7 @@ import {
|
||||
Injectable,
|
||||
Logger,
|
||||
} from '@nestjs/common';
|
||||
import { parse as parseYamlDoc } from 'yaml';
|
||||
import { EnvironmentService } from '../../../../integrations/environment/environment.service';
|
||||
import {
|
||||
CatalogBundleFile,
|
||||
@@ -28,9 +29,11 @@ const MAX_BYTES = 1_000_000;
|
||||
* base URL — REMOTE only; local-filesystem sources are no longer supported. The
|
||||
* value is baked into the Docker image at build time (set per-branch in CI).
|
||||
*
|
||||
* The catalog is UNTRUSTED input: every file is JSON-parsed and run through a
|
||||
* hand-written type guard before any field is exposed, and every dynamic path
|
||||
* segment is validated against SEGMENT_RE up front (path-traversal + SSRF).
|
||||
* The catalog is UNTRUSTED input: every file is YAML-parsed with a SAFE schema
|
||||
* (standard JSON-compatible tags only — no custom `!!` tags / no code execution)
|
||||
* and run through a hand-written type guard before any field is exposed, and
|
||||
* every dynamic path segment is validated against SEGMENT_RE up front
|
||||
* (path-traversal + SSRF).
|
||||
*/
|
||||
@Injectable()
|
||||
export class AiAgentRolesCatalogProvider {
|
||||
@@ -38,19 +41,19 @@ export class AiAgentRolesCatalogProvider {
|
||||
|
||||
constructor(private readonly environmentService: EnvironmentService) {}
|
||||
|
||||
/** Read + validate the top-level index (`index.json`). */
|
||||
/** Read + validate the top-level index (`index.yaml`). */
|
||||
async fetchIndex(): Promise<CatalogIndex> {
|
||||
const raw = await this.readRelative('index.json');
|
||||
const parsed = this.parseJson(raw, 'index.json');
|
||||
const raw = await this.readRelative('index.yaml');
|
||||
const parsed = this.parseYaml(raw, 'index.yaml');
|
||||
if (!isCatalogIndex(parsed)) {
|
||||
throw new BadGatewayException(
|
||||
'Agent roles catalog index is malformed (index.json)',
|
||||
'Agent roles catalog index is malformed (index.yaml)',
|
||||
);
|
||||
}
|
||||
return parsed;
|
||||
}
|
||||
|
||||
/** Read + validate one language file (`bundles/<bundleId>/<language>.json`). */
|
||||
/** Read + validate one language file (`bundles/<bundleId>/<language>.yaml`). */
|
||||
async fetchBundle(
|
||||
bundleId: string,
|
||||
language: string,
|
||||
@@ -58,9 +61,9 @@ export class AiAgentRolesCatalogProvider {
|
||||
// SECURITY: validate BEFORE building any path/URL (path-traversal + SSRF).
|
||||
this.assertSegment(bundleId, 'bundleId');
|
||||
this.assertSegment(language, 'language');
|
||||
const rel = `bundles/${bundleId}/${language}.json`;
|
||||
const rel = `bundles/${bundleId}/${language}.yaml`;
|
||||
const raw = await this.readRelative(rel);
|
||||
const parsed = this.parseJson(raw, rel);
|
||||
const parsed = this.parseYaml(raw, rel);
|
||||
if (!isCatalogBundleFile(parsed)) {
|
||||
throw new BadGatewayException(
|
||||
`Agent roles catalog bundle is malformed (${rel})`,
|
||||
@@ -76,15 +79,29 @@ export class AiAgentRolesCatalogProvider {
|
||||
}
|
||||
}
|
||||
|
||||
/** JSON.parse with a clear BadGateway on malformed content. */
|
||||
private parseJson(raw: string, rel: string): unknown {
|
||||
/**
|
||||
* Safe YAML parse with a clear BadGateway on malformed content. The catalog is
|
||||
* untrusted, so we lean on the `yaml` library's default `core` schema, which
|
||||
* only produces JSON-compatible values (objects/arrays/strings/numbers/
|
||||
* booleans/null) and NEVER constructs arbitrary types or runs code — there is
|
||||
* no `!!js`-style tag handling. `strict: true` rejects duplicate keys instead
|
||||
* of silently coercing them. (Note: in yaml@2.8.x an unknown custom tag does
|
||||
* NOT throw even under `strict` — the parser logs a warning and resolves the
|
||||
* node to a plain scalar; the catalog stays safe because the default schema
|
||||
* never builds arbitrary types from a tag and our hand-written type guards
|
||||
* reject any value of the wrong shape.) The alias-expansion guard
|
||||
* (`maxAliasCount`) bounds billion-laughs blow-ups (the 1 MB streaming
|
||||
* cap already limits the input itself). JSON is a YAML subset, so a leftover
|
||||
* `.json`-style body still parses here too.
|
||||
*/
|
||||
private parseYaml(raw: string, rel: string): unknown {
|
||||
try {
|
||||
return JSON.parse(raw);
|
||||
return parseYamlDoc(raw, { strict: true, maxAliasCount: 100 });
|
||||
} catch (err) {
|
||||
const reason = shortError(err);
|
||||
this.logger.error(`Agent roles catalog JSON parse failed (${rel}): ${reason}`);
|
||||
this.logger.error(`Agent roles catalog YAML parse failed (${rel}): ${reason}`);
|
||||
throw new BadGatewayException(
|
||||
`Agent roles catalog file is not valid JSON (${rel}): ${reason}`,
|
||||
`Agent roles catalog file is not valid YAML (${rel}): ${reason}`,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
/**
|
||||
* Catalog wire shapes. The catalog is curated, untrusted JSON (a GitHub repo or
|
||||
* Catalog wire shapes. The catalog is curated, untrusted YAML (a GitHub repo or
|
||||
* a local folder), so every shape is validated by a hand-written type guard in
|
||||
* the provider before any field is used — no zod / new deps on the server.
|
||||
* the provider before any field is used — no zod on the server (YAML is parsed
|
||||
* with the `yaml` library's safe, JSON-compatible schema).
|
||||
*
|
||||
* Localized fields (`name` / `description` at the bundle level) are
|
||||
* `Record<language, string>` so one bundle serves many UI languages; per-role
|
||||
@@ -22,7 +23,7 @@ export interface CatalogRole {
|
||||
modelConfig?: Record<string, unknown> | null;
|
||||
}
|
||||
|
||||
/** A single language file: `bundles/<id>/<language>.json`. */
|
||||
/** A single language file: `bundles/<id>/<language>.yaml`. */
|
||||
export interface CatalogBundleFile {
|
||||
schemaVersion: number;
|
||||
language: string;
|
||||
@@ -40,7 +41,7 @@ export interface CatalogBundleMeta {
|
||||
roles: { slug: string; version: number }[];
|
||||
}
|
||||
|
||||
/** Top-level catalog index: `index.json`. */
|
||||
/** Top-level catalog index: `index.yaml`. */
|
||||
export interface CatalogIndex {
|
||||
schemaVersion: number;
|
||||
bundles: CatalogBundleMeta[];
|
||||
|
||||
Reference in New Issue
Block a user