From 2c2d60a5dcc0fc308ffd0ee2bb664757c7c2199a Mon Sep 17 00:00:00 2001 From: agent_coder Date: Sun, 5 Jul 2026 03:39:01 +0300 Subject: [PATCH] fix(#345): protect inline-code refs and escape footnote-body brackets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The foreign-markdown import normalizer rewrote GFM reference footnotes (`[^id]` + `[^id]: def`) into canonical inline `^[def]` footnotes, but two edge cases corrupted content: 1. A `[^id]` inside an inline-code span (backticks) was rewritten like prose text — only fenced code blocks were protected. Now the rewrite pass splits each line on inline-code spans and only touches the text outside them. 2. An unbalanced `]` in a definition body truncated the resulting `^[...]` footnote at the canonical tokenizer, leaking the tail as literal text. The body's square brackets are now backslash-escaped before wrapping. Adds golden cases for both. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../import/utils/foreign-markdown.spec.ts | 21 +++++++ .../import/utils/foreign-markdown.ts | 58 ++++++++++++++++--- 2 files changed, 72 insertions(+), 7 deletions(-) diff --git a/apps/server/src/integrations/import/utils/foreign-markdown.spec.ts b/apps/server/src/integrations/import/utils/foreign-markdown.spec.ts index fb22f1dd..94318075 100644 --- a/apps/server/src/integrations/import/utils/foreign-markdown.spec.ts +++ b/apps/server/src/integrations/import/utils/foreign-markdown.spec.ts @@ -48,6 +48,27 @@ describe('normalizeForeignMarkdown — GFM reference footnotes', () => { expect(out).not.toContain('[^1]: def.'); }); + it('never rewrites a reference inside an INLINE-code span (backticks)', () => { + // The `[^1]` inside backticks is literal code and must survive verbatim; + // the one outside is rewritten. (Bug #1: only fenced blocks were protected.) 
+ const out = normalizeForeignMarkdown( + 'Use `arr[^1]` in code but note[^1] in prose.\n\n[^1]: def.', + ); + expect(out).toBe('Use `arr[^1]` in code but note^[def.] in prose.\n'); + }); + + it('escapes brackets in a body so an unbalanced ] cannot truncate the footnote', () => { + // A foreign definition body with a stray `]` would, unescaped, close the + // canonical `^[...]` early and leak the tail as text (bug #2). The body's + // brackets are backslash-escaped so the footnote stays whole. + const out = normalizeForeignMarkdown( + 'Ref[^1] here.\n\n[^1]: see item ] and [more] later', + ); + expect(out).toBe('Ref^[see item \\] and \\[more\\] later] here.\n'); + // The tokenizer must see exactly one unescaped closing bracket (our own). + expect(out.match(/(? { const out = normalizeForeignMarkdown('Dangling[^x] ref.'); expect(out).toBe('Dangling[^x] ref.'); diff --git a/apps/server/src/integrations/import/utils/foreign-markdown.ts b/apps/server/src/integrations/import/utils/foreign-markdown.ts index 6c8e99f6..531171fe 100644 --- a/apps/server/src/integrations/import/utils/foreign-markdown.ts +++ b/apps/server/src/integrations/import/utils/foreign-markdown.ts @@ -52,6 +52,43 @@ function escapeRegExp(value: string): string { return value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); } +/** + * Backslash-escape any square bracket in a footnote body before it is wrapped in + * `^[...]`. The canonical inline-footnote tokenizer scans the body with bracket + * balancing and closes on the first UNMATCHED `]`, so an unbalanced bracket in a + * foreign definition (e.g. `[^1]: see item ] later`) would otherwise truncate the + * footnote and leak the tail as literal text. Escaping every `[`/`]` makes the + * body an inert run of characters — the tokenizer then closes only on our own + * closing `]`. 
(A balanced `[link](url)` inside a body is escaped too, so it + * renders as literal text instead of a live link, which is the safe reading for a + * footnote body; the alternative — brittle balance tracking — risks worse.) + */ +function escapeFootnoteBody(body: string): string { + return body.replace(/[[\]]/g, '\\$&'); +} + +/** + * Rewrite every `[^id]` reference on a line to its `^[body]` form, but ONLY in the + * text OUTSIDE inline-code spans. A `[^id]` inside backticks is literal code + * content and must be preserved verbatim (a footnote ref never lives inside code). + * We split the line on inline-code spans (paired backtick runs) and rewrite only + * the non-code segments. + */ +function rewriteRefsOutsideInlineCode( + line: string, + replace: (text: string) => string, +): string { + // Alternation: an inline-code span (one or more backticks, then anything up to + // the SAME run of backticks) OR a run of non-backtick text OR a stray backtick + // run. An unterminated run matches only that last branch and the text after it + // is still rewritten, so a stray backtick never swallows the rest of the line. + const parts = line.match(/(`+)(?:(?!\1)[\s\S])*\1|[^`]+|`+/g); + if (!parts) return line; + return parts + .map((seg) => (seg.startsWith('`') ? seg : replace(seg))) + .join(''); +} + /** * Convert GFM reference footnotes (`[^id]` + `[^id]: def`) into canonical inline * footnotes (`^[def]`). @@ -62,9 +99,12 @@ function escapeRegExp(value: string): string { * - Each in-text reference `[^id]` for which a definition was found is replaced by * `^[def]`. References with no matching definition are left literal (there is no * body to inline; the parser fails them open the same way). - * - Code fences are respected on both passes: `[^id]` inside a ``` / ~~~ block is - * never rewritten, and a `[^id]:` line inside a fence is never treated as a - * definition. 
+ * - Code is respected on both passes: `[^id]` inside a fenced ``` / ~~~ block is + * never rewritten and a `[^id]:` line inside a fence is never a definition; and + * on the rewrite pass a `[^id]` inside an INLINE-code span (backticks) is left + * literal too. + * - The inlined body is bracket-escaped so an unbalanced `[`/`]` in a foreign + * definition cannot truncate the resulting `^[...]` footnote. * * Deduplication / reference-ordering / orphan-dropping of the resulting footnotes * is handled downstream by the canonical parser (`assembleFootnotes`); this pass @@ -145,10 +185,14 @@ function convertReferenceFootnotes(markdown: string): string { continue; } - for (const [id, body] of defs) { - const ref = new RegExp('\\[\\^' + escapeRegExp(id) + '\\]', 'g'); - line = line.replace(ref, `^[${body}]`); - } + line = rewriteRefsOutsideInlineCode(line, (segment) => { + let s = segment; + for (const [id, body] of defs) { + const ref = new RegExp('\\[\\^' + escapeRegExp(id) + '\\]', 'g'); + s = s.replace(ref, `^[${escapeFootnoteBody(body)}]`); + } + return s; + }); out.push(line); }