Files
gitmost/packages/mcp/test/unit/page-search.test.mjs
T
claude code agent 227 77b245461f fix(mcp): search_in_page regex via re2 (ReDoS-safe) + review DO F1-F4 (#330 review)
Maintainer decision on the escalated ReDoS fork: use re2. The regex path
compiled agent-supplied patterns with `new RegExp` and ran them synchronously in
the shared event-loop; a catastrophic-backtracking pattern (e.g. `(a+)+$`) hung
the whole Node backend for all users (the tool is in both transports incl. the
in-app apps/server agent), and size caps do NOT bound backtracking.

Switch the regex engine to re2 (Google RE2, linear-time, no backtracking):
- `new RE2(query, caseSensitive?'g':'gi')`. RE2 extends RegExp, so eachMatch and
  the zero-length-match lastIndex guard are unchanged.
- Unsupported patterns are now a CLEAN error, not a hang: RE2 throws on invalid
  syntax AND on the backtracking-only features it can't do (lookaround
  (?=…)/(?<=…), backreferences \1) — caught at compile and returned as a clear
  tool error telling the agent to rewrite without them.
- Removed MAX_CONTAINER_TEXT + the per-container slice (re2 is linear, so it's no
  longer a ReDoS defense, and truncating risked silently dropping real matches in
  a long container); kept MAX_PATTERN_LENGTH as a cheap query sanity cap.
- Verified: `(a+)+$` over 50k `a` completes in ~4ms; lookaround/backref throw.
- Added re2 (^1.21.0) to packages/mcp; lockfile updated.

Reviewer DO items:
- F1 [doc]: removed the false "pass nodeId as a comment anchor" claim
  (create_comment has no nodeId param — it needs a text `selection`). Fixed in
  tool-specs.ts + page-search.ts (module + SearchMatch JSDoc) + client.ts; the ref
  is for get_node/patch_node, and for a comment you build a unique text selection
  from before+match+after.
- F2 [doc]: clarified `#<index>` refs (id-less table/cell) are accepted by get_node
  but NOT patch_node (id-only).
- F3 [test]: round-trip — each match's nodeId fed to the real getNodeByRef
  (attrs.id node + `#<index>` table-cell) to prove the ref format is consumable.
- F4 [test]: before/after edge-pinning (match in first 40 chars of a long
  container; index 0 → before==""; container end → after=="").
- New re2 tests: catastrophic patterns complete fast; lookaround/backref → error.

mcp: tsc clean; node --test 472 passed (+5). apps/server: tsc --noEmit clean.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-07-04 17:45:49 +03:00

321 lines
12 KiB
JavaScript

import { test } from "node:test";
import assert from "node:assert/strict";
import { searchInDoc } from "../../build/lib/page-search.js";
import { getNodeByRef } from "../../build/lib/node-ops.js";
// ---------------------------------------------------------------------------
// Document builders. Mirror the Docmost ProseMirror shape: paragraphs/headings
// carry an attrs.id and hold text nodes; a text node may carry marks, and
// adjacent runs with different marks are GLUED by blockPlainText so a match can
// straddle a mark boundary. Table cells hold id-less paragraphs.
// ---------------------------------------------------------------------------
/**
 * Build a text node. `marks` is attached only when a truthy value is passed,
 * so an unmarked run has no `marks` key at all.
 */
const text = (t, marks) => {
  const node = { type: "text", text: t };
  if (marks) {
    node.marks = marks;
  }
  return node;
};
/**
 * Build a paragraph node carrying `attrs.id`; every argument after the id
 * becomes a child in `content`.
 */
const para = (id, ...children) => {
  const attrs = { id };
  return { type: "paragraph", attrs, content: children };
};
/**
 * Build a heading node with `attrs.id` / `attrs.level` whose content is a
 * single unmarked text node made by the `text` builder.
 */
const heading = (id, level, t) => {
  const attrs = { id, level };
  const content = [text(t)];
  return { type: "heading", attrs, content };
};
/** Wrap the given top-level blocks in a `doc` root node. */
function doc(...content) {
  const root = { type: "doc" };
  root.content = content;
  return root;
}
test("literal substring: finds every occurrence with total/truncated and refs", () => {
  // Two occurrences in the paragraph, one more in the heading.
  const fixture = doc(
    para("p1", text("The cat sat on the cat mat.")),
    heading("h1", 2, "Another cat here"),
  );

  const { total, truncated, matches } = searchInDoc(fixture, "cat");
  assert.equal(total, 3);
  assert.equal(truncated, false);
  assert.equal(matches.length, 3);

  // Hit #0 belongs to paragraph p1, the first top-level block.
  const first = matches[0];
  assert.equal(first.nodeId, "p1");
  assert.equal(first.blockIndex, 0);
  assert.equal(first.type, "paragraph");
  assert.equal(first.match, "cat");

  // Hit #2 belongs to the heading, the second top-level block.
  const third = matches[2];
  assert.equal(third.nodeId, "h1");
  assert.equal(third.blockIndex, 1);
  assert.equal(third.type, "heading");
});
test("context windows: before/after are drawn from the SAME container", () => {
  const fixture = doc(para("p1", text("alpha beta gamma delta")));

  const { matches } = searchInDoc(fixture, "gamma");
  assert.equal(matches.length, 1);

  // The text on both sides of the hit comes from the one paragraph.
  const hit = matches[0];
  assert.equal(hit.before, "alpha beta ");
  assert.equal(hit.match, "gamma");
  assert.equal(hit.after, " delta");
});
test("context windows are bounded to ~40 chars each side", () => {
  // 100 filler characters on each side of the needle, far more than a window.
  const filler = "x".repeat(100);
  const fixture = doc(para("p1", text(`${filler}NEEDLE${filler}`)));

  const { matches } = searchInDoc(fixture, "NEEDLE");
  assert.equal(matches.length, 1);

  const hit = matches[0];
  assert.equal(hit.before.length, 40);
  assert.equal(hit.after.length, 40);
});
test("case-insensitive by default; caseSensitive:true narrows", () => {
  const fixture = doc(para("p1", text("Cat CAT cat")));

  const foldedTotal = searchInDoc(fixture, "cat").total;
  assert.equal(foldedTotal, 3);

  const exactTotal = searchInDoc(fixture, "cat", { caseSensitive: true }).total;
  assert.equal(exactTotal, 1);

  // A folded search must still report each hit in its ORIGINAL casing.
  const reported = searchInDoc(fixture, "cat").matches.map(({ match }) => match);
  assert.deepEqual(reported, ["Cat", "CAT", "cat"]);
});
test("match survives an inline mark boundary (glued runs)", () => {
  // Fresh marks array per run, so no two text nodes share one instance.
  const markOf = (type) => [{ type }];

  // "т.е." is split over three differently-marked text nodes.
  const runs = [
    text("вводное слово, "),
    text("т", markOf("bold")),
    text(".", markOf("italic")),
    text("е", markOf("bold")),
    text(". дальше"),
  ];
  const fixture = doc(para("p1", ...runs));

  const { total, matches } = searchInDoc(fixture, "т.е.");
  assert.equal(total, 1);
  assert.equal(matches[0].match, "т.е.");
  assert.equal(matches[0].nodeId, "p1");
});
test("regex engine: character classes and word boundaries", () => {
  const fixture = doc(para("p1", text("v1 v22 version v3")));

  const { total, matches } = searchInDoc(fixture, "\\bv\\d+\\b", { regex: true });

  const found = matches.map(({ match }) => match);
  assert.deepEqual(found, ["v1", "v22", "v3"]);

  // "version" has no digits after the "v", so \bv\d+\b skips it.
  assert.equal(total, 3);
});
test("regex is case-insensitive by default and respects caseSensitive", () => {
  const fixture = doc(para("p1", text("Foo foo FOO")));

  // Default: all three casings match.
  const folded = searchInDoc(fixture, "foo", { regex: true });
  assert.equal(folded.total, 3);

  // caseSensitive: only the all-lowercase occurrence matches.
  const exact = searchInDoc(fixture, "foo", { regex: true, caseSensitive: true });
  assert.equal(exact.total, 1);
});
test("regex empty/zero-length matches are skipped, not flooded", () => {
  const fixture = doc(para("p1", text("abc")));

  // `a*` also matches the empty string at every position; only the
  // non-empty "a" should be reported.
  const { total, matches } = searchInDoc(fixture, "a*", { regex: true });
  assert.equal(total, 1);
  assert.equal(matches[0].match, "a");
});
test("nodeId for a table cell paragraph WITHOUT an id falls back to #<topLevelIndex>", () => {
  // Cell paragraphs deliberately have no attrs.id.
  const cellPara = (t) => ({ type: "paragraph", content: [text(t)] });
  const cell = (type, t) => ({ type, content: [cellPara(t)] });

  // The table is the second top-level block (index 1).
  const row = {
    type: "tableRow",
    content: [
      cell("tableCell", "needle in a cell"),
      cell("tableHeader", "another needle"),
    ],
  };
  const table = { type: "table", content: [row] };
  const fixture = doc(para("intro", text("before the table")), table);

  const { total, matches } = searchInDoc(fixture, "needle");
  assert.equal(total, 2);

  // With no id on the cell paragraphs, both hits carry the table's
  // top-level "#<index>" ref.
  matches.forEach((hit) => {
    assert.equal(hit.nodeId, "#1");
    assert.equal(hit.blockIndex, 1);
  });

  // Context is taken from the individual cell, not the whole table's text.
  assert.equal(matches[0].after, " in a cell");
  assert.equal(matches[1].before, "another ");
});
test("nodeId uses attrs.id when the container has one (paragraph & heading)", () => {
  const fixture = doc(
    heading("h9", 1, "heading needle"),
    para("p9", text("para needle")),
  );

  const { matches } = searchInDoc(fixture, "needle");

  // Hits come back in document order: heading first, then paragraph.
  assert.equal(matches[0].nodeId, "h9");
  assert.equal(matches[1].nodeId, "p9");
});
test("limit caps the returned matches but total and truncated stay honest", () => {
  // Ten space-separated 'x' characters in one paragraph.
  const tenXs = Array(10).fill("x").join(" ");
  const fixture = doc(para("p1", text(tenXs)));

  const { total, matches, truncated } = searchInDoc(fixture, "x", { limit: 3 });

  // total counts every occurrence; only the returned list is capped.
  assert.equal(total, 10);
  assert.equal(matches.length, 3);
  assert.equal(truncated, true);
});
test("limit is clamped to the [1, 200] range", () => {
  const d = doc(para("p1", text("a".repeat(5))));
  // A limit above the ceiling still returns all 5 (< 200) without truncation.
  const hi = searchInDoc(d, "a", { limit: 9999 });
  assert.equal(hi.matches.length, 5);
  assert.equal(hi.truncated, false);
  // With MORE hits than the ceiling, an oversized limit is clamped down to 200.
  // The 5-hit case above cannot tell "clamped to 200" from "not clamped at
  // all", so this pins the upper bound named in the test title.
  const many = doc(para("p1", text("a".repeat(250))));
  const capped = searchInDoc(many, "a", { limit: 9999 });
  assert.equal(capped.matches.length, 200);
  assert.equal(capped.total, 250);
  assert.equal(capped.truncated, true);
  // A non-positive limit clamps up to 1.
  const lo = searchInDoc(d, "a", { limit: 0 });
  assert.equal(lo.matches.length, 1);
  assert.equal(lo.total, 5);
  assert.equal(lo.truncated, true);
});
test("invalid regex throws a clear tool error", () => {
  const fixture = doc(para("p1", text("hi")));

  // A lone "(" is an unterminated group, i.e. not a valid pattern.
  const compileBroken = () => searchInDoc(fixture, "(", { regex: true });
  assert.throws(compileBroken, /invalid or unsupported regular expression/i);
});
test("RE2: a catastrophic-backtracking pattern completes FAST and correctly (no ReDoS)", () => {
  // (a+)+$ against a long run of 'a' followed by a non-'a' is the classic
  // catastrophic-backtracking case that wedges the JS RegExp engine for
  // seconds/forever. Under RE2 (linear time) it returns effectively instantly.
  const d = doc(para("p1", text("a".repeat(50_000) + "b")));
  // Time with the monotonic performance.now() clock rather than Date.now():
  // a wall-clock adjustment mid-test must not be able to skew the elapsed
  // figure and make the ceiling check flaky.
  const t0 = performance.now();
  const res = searchInDoc(d, "(a+)+$", { regex: true });
  const elapsed = performance.now() - t0;
  // No '$'-anchored all-'a' run exists (there's a trailing 'b'), so no match.
  assert.equal(res.total, 0);
  assert.equal(res.matches.length, 0);
  // Generous ceiling: the JS engine would take orders of magnitude longer.
  assert.ok(elapsed < 1000, `expected fast completion, took ${elapsed.toFixed(1)}ms`);
});
test("RE2: catastrophic pattern that DOES match still completes fast and finds it", () => {
  // (a+)+b matches the whole "aaa…b"; RE2 finds it in linear time.
  const d = doc(para("p1", text("a".repeat(20_000) + "b")));
  // Monotonic clock: unlike Date.now(), performance.now() is unaffected by
  // wall-clock adjustments, so the elapsed figure cannot jump spuriously.
  const t0 = performance.now();
  const res = searchInDoc(d, "(a+)+b", { regex: true });
  const elapsed = performance.now() - t0;
  assert.equal(res.total, 1);
  assert.equal(res.matches[0].match, "a".repeat(20_000) + "b");
  assert.ok(elapsed < 1000, `expected fast completion, took ${elapsed.toFixed(1)}ms`);
});
test("RE2: unsupported lookaround/backreference patterns yield the clear unsupported-regex error", () => {
  const fixture = doc(para("p1", text("hello")));

  // One pattern per rejected feature: lookahead, lookbehind, backreference.
  // Each must produce the tool's error rather than hang.
  const rejected = ["foo(?=bar)", "(?<=foo)bar", "(a)\\1"];

  for (const pattern of rejected) {
    assert.throws(
      () => searchInDoc(fixture, pattern, { regex: true }),
      /invalid or unsupported regular expression/i,
    );
  }
});
test("F3 round-trip: every match's nodeId resolves through the REAL getNodeByRef consumer", () => {
  // One paragraph with an attrs.id plus one id-less table-cell paragraph, so
  // both ref flavours (block id and "#<index>") go through getNodeByRef.
  const cellPara = (t) => ({ type: "paragraph", content: [text(t)] });
  const table = {
    type: "table",
    content: [
      {
        type: "tableRow",
        // id-less cell paragraph -> expected ref "#1"
        content: [{ type: "tableCell", content: [cellPara("cell needle")] }],
      },
    ],
  };
  // attrs.id paragraph -> expected ref "intro"
  const fixture = doc(para("intro", text("find needle here")), table);

  const { total, matches } = searchInDoc(fixture, "needle");
  assert.equal(total, 2);

  // Hit 0: the attrs.id ref has to resolve to that very paragraph.
  const idRef = matches[0].nodeId;
  assert.equal(idRef, "intro");
  const resolvedById = getNodeByRef(fixture, idRef);
  assert.ok(resolvedById, "attrs.id ref must resolve via getNodeByRef");
  assert.equal(resolvedById.type, "paragraph");
  assert.equal(resolvedById.node.attrs.id, "intro");

  // Hit 1: the id-less cell reports the table's "#<index>", which
  // getNodeByRef resolves to the top-level table block.
  const indexRef = matches[1].nodeId;
  assert.equal(indexRef, "#1");
  const resolvedByIndex = getNodeByRef(fixture, indexRef);
  assert.ok(resolvedByIndex, "#<index> ref must resolve via getNodeByRef");
  assert.equal(resolvedByIndex.type, "table");
});
test("F4: before/after are pinned correctly at string edges (clamp not dropped)", () => {
  // Search a single-paragraph doc holding `body` and return the hits.
  const hitsIn = (body) => searchInDoc(doc(para("p1", text(body))), "NEEDLE").matches;

  // Needle near the start of a long container: `before` holds only the
  // characters that actually precede it.
  const nearStart = hitsIn("ab NEEDLE" + "x".repeat(100));
  assert.equal(nearStart.length, 1);
  assert.equal(nearStart[0].before, "ab ");
  assert.equal(nearStart[0].after.length, 40); // plenty of trailing 'x'

  // Needle at index 0: nothing precedes it.
  const atStart = hitsIn("NEEDLE tail");
  assert.equal(atStart[0].before, "");
  assert.equal(atStart[0].after, " tail");

  // Needle at the very end of the container: nothing follows it.
  const atEnd = hitsIn("lead NEEDLE");
  assert.equal(atEnd[0].before, "lead ");
  assert.equal(atEnd[0].after, "");
});
test("empty or whitespace-only query is rejected", () => {
  const fixture = doc(para("p1", text("hi")));

  // Empty string, whitespace-only string, and a missing query must all throw.
  const blankQueries = ["", " ", undefined];

  for (const query of blankQueries) {
    assert.throws(() => searchInDoc(fixture, query), /query is empty/i);
  }
});
test("an over-long pattern is rejected (anti-ReDoS pattern cap)", () => {
  const fixture = doc(para("p1", text("hi")));

  // A 1001-character query is rejected. No `regex` option is passed, so this
  // exercises the length cap on a plain literal search.
  const oversized = "a".repeat(1001);
  const attempt = () => searchInDoc(fixture, oversized);
  assert.throws(attempt, /too long/i);
});
test("no matches yields an empty, non-truncated result", () => {
  const fixture = doc(para("p1", text("nothing to see")));

  // A miss returns exactly this shape.
  const expected = { total: 0, truncated: false, matches: [] };
  const actual = searchInDoc(fixture, "zebra");
  assert.deepEqual(actual, expected);
});
test("null-safe on a missing/empty doc", () => {
  // Both degenerate inputs must produce the ordinary "nothing found" result.
  const nothingFound = { total: 0, truncated: false, matches: [] };

  // No document at all.
  const fromNull = searchInDoc(null, "x");
  assert.deepEqual(fromNull, nothingFound);

  // A doc root with no `content` array.
  const fromBareDoc = searchInDoc({ type: "doc" }, "x");
  assert.deepEqual(fromBareDoc, nothingFound);
});