77b245461f
Maintainer decision on the escalated ReDoS fork: use re2.

The regex path compiled agent-supplied patterns with `new RegExp` and ran them synchronously in the shared event-loop; a catastrophic-backtracking pattern (e.g. `(a+)+$`) hung the whole Node backend for all users (the tool is in both transports incl. the in-app apps/server agent), and size caps do NOT bound backtracking.

Switch the regex engine to re2 (Google RE2, linear-time, no backtracking):
- `new RE2(query, caseSensitive?'g':'gi')`. RE2 extends RegExp, so eachMatch and the zero-length-match lastIndex guard are unchanged.
- Unsupported patterns are now a CLEAN error, not a hang: RE2 throws on invalid syntax AND on the backtracking-only features it can't do (lookaround (?=…)/(?<=…), backreferences \1) — caught at compile and returned as a clear tool error telling the agent to rewrite without them.
- Removed MAX_CONTAINER_TEXT + the per-container slice (re2 is linear, so it's no longer a ReDoS defense, and truncating risked silently dropping real matches in a long container); kept MAX_PATTERN_LENGTH as a cheap query sanity cap.
- Verified: `(a+)+$` over 50k `a` completes in ~4ms; lookaround/backref throw.
- Added re2 (^1.21.0) to packages/mcp; lockfile updated.

Reviewer DO items:
- F1 [doc]: removed the false "pass nodeId as a comment anchor" claim (create_comment has no nodeId param — it needs a text `selection`). Fixed in tool-specs.ts + page-search.ts (module + SearchMatch JSDoc) + client.ts; the ref is for get_node/patch_node, and for a comment you build a unique text selection from before+match+after.
- F2 [doc]: clarified `#<index>` refs (id-less table/cell) are accepted by get_node but NOT patch_node (id-only).
- F3 [test]: round-trip — each match's nodeId fed to the real getNodeByRef (attrs.id node + `#<index>` table-cell) to prove the ref format is consumable.
- F4 [test]: before/after edge-pinning (match in first 40 chars of a long container; index 0 → before==""; container end → after=="").
- New re2 tests: catastrophic patterns complete fast; lookaround/backref → error.

mcp: tsc clean; node --test 472 passed (+5). apps/server: tsc --noEmit clean.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
321 lines
12 KiB
JavaScript
321 lines
12 KiB
JavaScript
import { test } from "node:test";
|
|
import assert from "node:assert/strict";
|
|
|
|
import { searchInDoc } from "../../build/lib/page-search.js";
|
|
import { getNodeByRef } from "../../build/lib/node-ops.js";
|
|
|
|
// ---------------------------------------------------------------------------
// Document builders. Mirror the Docmost ProseMirror shape: paragraphs/headings
// carry an attrs.id and hold text nodes; a text node may carry marks, and
// adjacent runs with different marks are GLUED by blockPlainText so a match can
// straddle a mark boundary. Table cells hold id-less paragraphs.
// ---------------------------------------------------------------------------
|
|
|
|
// Build a ProseMirror-style text node. The `marks` key is attached only when a
// truthy `marks` value is supplied, so unmarked nodes carry no `marks` key.
const text = (t, marks) => {
  const node = { type: "text", text: t };
  if (marks) {
    node.marks = marks;
  }
  return node;
};
|
|
// Build a paragraph block with the given attrs.id; every remaining argument
// becomes a child of the paragraph, in order.
const para = (id, ...children) => {
  const attrs = { id };
  return { type: "paragraph", attrs, content: children };
};
|
|
// Build a heading block with the given attrs.id and level, holding a single
// unmarked text node for `t`.
const heading = (id, level, t) => ({
  type: "heading",
  attrs: { id, level },
  content: [text(t)],
});
|
|
|
|
// Wrap the given top-level blocks, in order, in a { type: "doc" } root node.
function doc(...content) {
  return { type: "doc", content };
}
|
|
|
|
// Literal search over a paragraph and a heading: all three occurrences are
// counted, and each match reports its block's id, top-level index and type.
test("literal substring: finds every occurrence with total/truncated and refs", () => {
  const d = doc(
    para("p1", text("The cat sat on the cat mat.")),
    heading("h1", 2, "Another cat here"),
  );
  const res = searchInDoc(d, "cat");
  assert.equal(res.total, 3);
  assert.equal(res.truncated, false);
  assert.equal(res.matches.length, 3);
  // First hit: paragraph p1, block index 0.
  assert.equal(res.matches[0].nodeId, "p1");
  assert.equal(res.matches[0].blockIndex, 0);
  assert.equal(res.matches[0].type, "paragraph");
  assert.equal(res.matches[0].match, "cat");
  // Third hit is in the heading (block index 1).
  assert.equal(res.matches[2].nodeId, "h1");
  assert.equal(res.matches[2].blockIndex, 1);
  assert.equal(res.matches[2].type, "heading");
});
|
|
|
|
// A short container: the context on each side is the full text of the same
// paragraph up to / after the match.
test("context windows: before/after are drawn from the SAME container", () => {
  const d = doc(para("p1", text("alpha beta gamma delta")));
  const res = searchInDoc(d, "gamma");
  assert.equal(res.matches.length, 1);
  assert.equal(res.matches[0].before, "alpha beta ");
  assert.equal(res.matches[0].match, "gamma");
  assert.equal(res.matches[0].after, " delta");
});
|
|
|
|
// With 100 characters of padding on both sides of the match, each context
// window is expected to be cut to exactly 40 characters.
test("context windows are bounded to ~40 chars each side", () => {
  const long = "x".repeat(100);
  const d = doc(para("p1", text(long + "NEEDLE" + long)));
  const res = searchInDoc(d, "NEEDLE");
  assert.equal(res.matches.length, 1);
  assert.equal(res.matches[0].before.length, 40);
  assert.equal(res.matches[0].after.length, 40);
});
|
|
|
|
// Literal search folds case unless caseSensitive:true is passed; the reported
// match text is the document's own casing, not the query's.
test("case-insensitive by default; caseSensitive:true narrows", () => {
  const d = doc(para("p1", text("Cat CAT cat")));
  assert.equal(searchInDoc(d, "cat").total, 3);
  assert.equal(searchInDoc(d, "cat", { caseSensitive: true }).total, 1);
  // Reported match preserves the ORIGINAL casing even under a folded search.
  const res = searchInDoc(d, "cat");
  assert.deepEqual(
    res.matches.map((m) => m.match),
    ["Cat", "CAT", "cat"],
  );
});
|
|
|
|
// The query spans several differently-marked text nodes of one paragraph; the
// search must still find it as a single match attributed to that paragraph.
test("match survives an inline mark boundary (glued runs)", () => {
  // "т.е." is fractured across three text nodes by bold/italic marks.
  const d = doc(
    para(
      "p1",
      text("вводное слово, "),
      text("т", [{ type: "bold" }]),
      text(".", [{ type: "italic" }]),
      text("е", [{ type: "bold" }]),
      text(". дальше"),
    ),
  );
  const res = searchInDoc(d, "т.е.");
  assert.equal(res.total, 1);
  assert.equal(res.matches[0].match, "т.е.");
  assert.equal(res.matches[0].nodeId, "p1");
});
|
|
|
|
// Regex mode: \b and \d behave as expected, and matches come back in
// document order.
test("regex engine: character classes and word boundaries", () => {
  const d = doc(para("p1", text("v1 v22 version v3")));
  const res = searchInDoc(d, "\\bv\\d+\\b", { regex: true });
  assert.deepEqual(
    res.matches.map((m) => m.match),
    ["v1", "v22", "v3"],
  );
  // "version" is not matched by \bv\d+\b.
  assert.equal(res.total, 3);
});
|
|
|
|
// Regex mode follows the same case rule as literal mode: folded by default,
// exact when caseSensitive:true.
test("regex is case-insensitive by default and respects caseSensitive", () => {
  const d = doc(para("p1", text("Foo foo FOO")));
  assert.equal(searchInDoc(d, "foo", { regex: true }).total, 3);
  assert.equal(
    searchInDoc(d, "foo", { regex: true, caseSensitive: true }).total,
    1,
  );
});
|
|
|
|
// A pattern that can match the empty string must yield only its non-empty
// matches.
test("regex empty/zero-length matches are skipped, not flooded", () => {
  const d = doc(para("p1", text("abc")));
  // `a*` can match the empty string at every position; we must not emit those.
  const res = searchInDoc(d, "a*", { regex: true });
  assert.equal(res.total, 1);
  assert.equal(res.matches[0].match, "a");
});
|
|
|
|
// Matches inside id-less table-cell paragraphs are expected to be referenced
// by the enclosing top-level block's "#<index>", with context taken from the
// individual cell.
test("nodeId for a table cell paragraph WITHOUT an id falls back to #<topLevelIndex>", () => {
  // A table at top-level block index 1; its cell paragraphs carry no attrs.id.
  const cellPara = (t) => ({ type: "paragraph", content: [text(t)] });
  const d = doc(
    para("intro", text("before the table")),
    {
      type: "table",
      content: [
        {
          type: "tableRow",
          content: [
            { type: "tableCell", content: [cellPara("needle in a cell")] },
            { type: "tableHeader", content: [cellPara("another needle")] },
          ],
        },
      ],
    },
  );
  const res = searchInDoc(d, "needle");
  assert.equal(res.total, 2);
  // Both cell hits report the table's top-level #<index> (block 1) since the
  // cell paragraphs have no id.
  for (const m of res.matches) {
    assert.equal(m.nodeId, "#1");
    assert.equal(m.blockIndex, 1);
  }
  // Context is scoped to the specific cell, not the whole table's glued text.
  assert.equal(res.matches[0].after, " in a cell");
  assert.equal(res.matches[1].before, "another ");
});
|
|
|
|
// Containers that carry an attrs.id are referenced by that id, for headings
// and paragraphs alike.
test("nodeId uses attrs.id when the container has one (paragraph & heading)", () => {
  const d = doc(heading("h9", 1, "heading needle"), para("p9", text("para needle")));
  const res = searchInDoc(d, "needle");
  assert.equal(res.matches[0].nodeId, "h9");
  assert.equal(res.matches[1].nodeId, "p9");
});
|
|
|
|
// `limit` shortens the returned list only: `total` still counts every hit and
// `truncated` reports that some were left out.
test("limit caps the returned matches but total and truncated stay honest", () => {
  const d = doc(para("p1", text("x ".repeat(10).trim()))); // 10 'x'
  const res = searchInDoc(d, "x", { limit: 3 });
  assert.equal(res.total, 10);
  assert.equal(res.matches.length, 3);
  assert.equal(res.truncated, true);
});
|
|
|
|
// Out-of-range limits are clamped rather than rejected. With only 5 matches in
// the document, this pins the lower bound (0 -> 1) and checks that an oversized
// limit does not truncate; it does not exercise the 200 ceiling itself.
test("limit is clamped to the [1, 200] range", () => {
  const d = doc(para("p1", text("a".repeat(5))));
  // A limit above the ceiling still returns all 5 (< 200) without truncation.
  const hi = searchInDoc(d, "a", { limit: 9999 });
  assert.equal(hi.matches.length, 5);
  assert.equal(hi.truncated, false);
  // A non-positive limit clamps up to 1.
  const lo = searchInDoc(d, "a", { limit: 0 });
  assert.equal(lo.matches.length, 1);
  assert.equal(lo.total, 5);
  assert.equal(lo.truncated, true);
});
|
|
|
|
// A syntactically broken pattern must surface as the tool's own error message.
test("invalid regex throws a clear tool error", () => {
  const d = doc(para("p1", text("hi")));
  assert.throws(
    () => searchInDoc(d, "(", { regex: true }),
    /invalid or unsupported regular expression/i,
  );
});
|
|
|
|
// Guards against ReDoS: a nested-quantifier pattern over a 50k-char input must
// return (with no match) well inside the time ceiling.
test("RE2: a catastrophic-backtracking pattern completes FAST and correctly (no ReDoS)", () => {
  // (a+)+$ against a long run of 'a' followed by a non-'a' is the classic
  // catastrophic-backtracking case that wedges the JS RegExp engine for
  // seconds/forever. Under RE2 (linear time) it returns effectively instantly.
  const d = doc(para("p1", text("a".repeat(50_000) + "b")));
  // performance.now() is monotonic, so a wall-clock adjustment during the run
  // cannot skew the measured duration (Date.now() can jump either way).
  const t0 = performance.now();
  const res = searchInDoc(d, "(a+)+$", { regex: true });
  const elapsed = performance.now() - t0;
  // No '$'-anchored all-'a' run exists (there's a trailing 'b'), so no match.
  assert.equal(res.total, 0);
  assert.equal(res.matches.length, 0);
  // Generous ceiling: the JS engine would take orders of magnitude longer.
  assert.ok(elapsed < 1000, `expected fast completion, took ${elapsed.toFixed(1)}ms`);
});
|
|
|
|
// Same nested-quantifier shape, but on input it does match: the single match
// must be found, and found quickly.
test("RE2: catastrophic pattern that DOES match still completes fast and finds it", () => {
  // (a+)+b matches the whole "aaa…b"; RE2 finds it in linear time.
  const d = doc(para("p1", text("a".repeat(20_000) + "b")));
  // Monotonic clock: immune to wall-clock adjustments during the measurement.
  const t0 = performance.now();
  const res = searchInDoc(d, "(a+)+b", { regex: true });
  const elapsed = performance.now() - t0;
  assert.equal(res.total, 1);
  assert.equal(res.matches[0].match, "a".repeat(20_000) + "b");
  assert.ok(elapsed < 1000, `expected fast completion, took ${elapsed.toFixed(1)}ms`);
});
|
|
|
|
// Patterns using features the engine does not support must fail with the same
// clear error as a syntactically invalid pattern.
test("RE2: unsupported lookaround/backreference patterns yield the clear unsupported-regex error", () => {
  const d = doc(para("p1", text("hello")));
  // Lookahead / lookbehind / backreference are backtracking-only features RE2
  // rejects at compile time — a clean tool error, never a hang.
  const unsupportedPatterns = ["foo(?=bar)", "(?<=foo)bar", "(a)\\1"];
  for (const pattern of unsupportedPatterns) {
    assert.throws(
      () => searchInDoc(d, pattern, { regex: true }),
      /invalid or unsupported regular expression/i,
      // Name the offending pattern so a failure points at the exact case.
      `pattern ${JSON.stringify(pattern)} should be rejected as unsupported`,
    );
  }
});
|
|
|
|
// Feeds the refs produced by searchInDoc straight into getNodeByRef, so the two
// modules are checked against each other rather than against a hard-coded
// format.
test("F3 round-trip: every match's nodeId resolves through the REAL getNodeByRef consumer", () => {
  // A doc mixing an attrs.id paragraph and an id-less table-cell paragraph, so
  // both ref formats (block id and "#<index>") are exercised end-to-end.
  const cellPara = (t) => ({ type: "paragraph", content: [text(t)] });
  const d = doc(
    para("intro", text("find needle here")), // attrs.id ref -> "intro"
    {
      type: "table",
      content: [
        {
          type: "tableRow",
          content: [
            { type: "tableCell", content: [cellPara("cell needle")] }, // id-less -> "#1"
          ],
        },
      ],
    },
  );
  const res = searchInDoc(d, "needle");
  assert.equal(res.total, 2);

  // Match 0: an attrs.id ref must resolve to that exact paragraph.
  assert.equal(res.matches[0].nodeId, "intro");
  const byId = getNodeByRef(d, res.matches[0].nodeId);
  assert.ok(byId, "attrs.id ref must resolve via getNodeByRef");
  assert.equal(byId.type, "paragraph");
  assert.equal(byId.node.attrs.id, "intro");

  // Match 1: an id-less table cell falls back to the table's "#<index>", which
  // getNodeByRef resolves to the TOP-LEVEL block (the table) by index.
  assert.equal(res.matches[1].nodeId, "#1");
  const byIndex = getNodeByRef(d, res.matches[1].nodeId);
  assert.ok(byIndex, "#<index> ref must resolve via getNodeByRef");
  assert.equal(byIndex.type, "table");
});
|
|
|
|
// Pins the context windows when the match sits near or at either end of its
// container: the window shrinks to what exists and is empty at the very edge.
test("F4: before/after are pinned correctly at string edges (clamp not dropped)", () => {
  // Match within the first CONTEXT (40) chars of a container LONGER than
  // CONTEXT: before is only the chars that exist, never a negative-index slice.
  const head = doc(para("p1", text("ab NEEDLE" + "x".repeat(100))));
  const r1 = searchInDoc(head, "NEEDLE");
  assert.equal(r1.matches.length, 1);
  assert.equal(r1.matches[0].before, "ab ");
  assert.equal(r1.matches[0].after.length, 40); // plenty of trailing 'x'

  // Match at index 0: before is empty.
  const atStart = doc(para("p1", text("NEEDLE tail")));
  const r2 = searchInDoc(atStart, "NEEDLE");
  assert.equal(r2.matches[0].before, "");
  assert.equal(r2.matches[0].after, " tail");

  // Match at the container END: after is empty.
  const atEnd = doc(para("p1", text("lead NEEDLE")));
  const r3 = searchInDoc(atEnd, "NEEDLE");
  assert.equal(r3.matches[0].before, "lead ");
  assert.equal(r3.matches[0].after, "");
});
|
|
|
|
// An empty string, a blank string and a missing query are all expected to be
// rejected with the same "query is empty" error.
test("empty or whitespace-only query is rejected", () => {
  const d = doc(para("p1", text("hi")));
  assert.throws(() => searchInDoc(d, ""), /query is empty/i);
  assert.throws(() => searchInDoc(d, " "), /query is empty/i);
  assert.throws(() => searchInDoc(d, undefined), /query is empty/i);
});
|
|
|
|
// A 1001-character query (literal mode here) must be refused with a "too long"
// error. Per the change description the length cap is kept as a query sanity
// check; it is not what protects against ReDoS.
test("an over-long pattern is rejected (query sanity cap)", () => {
  const d = doc(para("p1", text("hi")));
  assert.throws(() => searchInDoc(d, "a".repeat(1001)), /too long/i);
});
|
|
|
|
// A query with no hits returns exactly the empty result shape, with no extra
// keys.
test("no matches yields an empty, non-truncated result", () => {
  const d = doc(para("p1", text("nothing to see")));
  const res = searchInDoc(d, "zebra");
  assert.deepEqual(res, { total: 0, truncated: false, matches: [] });
});
|
|
|
|
// A null document and a doc node without `content` must both produce the empty
// result instead of throwing.
test("null-safe on a missing/empty doc", () => {
  // One shared expectation keeps the two cases from drifting apart.
  const emptyResult = { total: 0, truncated: false, matches: [] };
  assert.deepEqual(searchInDoc(null, "x"), emptyResult);
  assert.deepEqual(searchInDoc({ type: "doc" }, "x"), emptyResult);
});
|