fix(#345): protect inline-code refs and escape footnote-body brackets

The foreign-markdown import normalizer rewrote GFM reference footnotes (`[^id]` + `[^id]: def`) into canonical inline `^[def]` footnotes, but two edge cases corrupted content:

1. A `[^id]` inside an inline-code span (backticks) was rewritten like prose text — only fenced code blocks were protected. Now the rewrite pass splits each line on inline-code spans and only touches the text outside them.
2. An unbalanced `]` in a definition body truncated the resulting `^[...]` footnote at the canonical tokenizer, leaking the tail as literal text. The body's square brackets are now backslash-escaped before wrapping.

Adds golden cases for both.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-07-05 03:39:01 +03:00
parent 1417209915
commit 2c2d60a5dc
2 changed files with 72 additions and 7 deletions
@@ -48,6 +48,27 @@ describe('normalizeForeignMarkdown — GFM reference footnotes', () => {
    expect(out).not.toContain('[^1]: def.');
  });

+  it('never rewrites a reference inside an INLINE-code span (backticks)', () => {
+    // Same id twice on one line: the `[^1]` inside backticks must survive verbatim,
+    // the one in prose must become `^[def.]`, and the definition line must vanish.
+    const out = normalizeForeignMarkdown(
+      'Use `arr[^1]` in code but note[^1] in prose.\n\n[^1]: def.',
+    );
+    expect(out).toBe('Use `arr[^1]` in code but note^[def.] in prose.\n');
+  });
+
+  it('escapes brackets in a body so an unbalanced ] cannot truncate the footnote', () => {
+    // The definition body holds a stray `]` plus a balanced `[more]`. Unescaped, the
+    // stray `]` would close the canonical `^[...]` early and leak the tail as text
+    // (bug #2), so every bracket of the body is expected to come out backslash-escaped.
+    const out = normalizeForeignMarkdown(
+      'Ref[^1] here.\n\n[^1]: see item ] and [more] later',
+    );
+    expect(out).toBe('Ref^[see item \\] and \\[more\\] later] here.\n');
+    // Count every `]` NOT directly preceded by a backslash: only the wrapper's own.
+    expect(out.match(/(?<!\\)\]/g)).toHaveLength(1);
+  });
+
  it('leaves a reference with no matching definition literal (no body to inline)', () => {
    const out = normalizeForeignMarkdown('Dangling[^x] ref.');
    expect(out).toBe('Dangling[^x] ref.');
@@ -52,6 +52,43 @@ function escapeRegExp(value: string): string {
  return value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
 }

+/**
+ * Backslash-escape every `[` and `]` in a footnote body before it is wrapped in
+ * `^[...]`. The canonical inline-footnote tokenizer reportedly balances brackets
+ * and closes on the first UNMATCHED `]`, so a stray bracket in a foreign
+ * definition (e.g. `[^1]: see item ] later`) would otherwise truncate the footnote
+ * and leak the tail as literal text. Escaping is unconditional: `[link](url)` comes
+ * out as `\[link\](url)`, i.e. literal brackets — presumably no longer a link.
+ * NOTE(review): an already-escaped `\]` becomes `\\]` (escaped backslash + bare
+ * `]`); confirm the tokenizer does not close the footnote on it.
+ * @returns `body` with a backslash inserted before each square bracket.
+ */
+function escapeFootnoteBody(body: string): string {
+  return body.replace(/[[\]]/g, '\\$&');
+}
+
+/**
+ * Apply `replace` to the parts of `line` that lie OUTSIDE inline-code spans and
+ * stitch the line back together. Segments that start with a backtick (code spans
+ * and bare backtick runs) are returned verbatim, so a `[^id]` inside backticks is
+ * never handed to `replace`. Each non-backtick run is passed to `replace`
+ * separately. Returns `line` untouched when nothing matches (the empty string).
+ */
+function rewriteRefsOutsideInlineCode(
+  line: string,
+  replace: (text: string) => string,
+): string {
+  // Alternation, tried in order: (1) a backtick run, any text, then the captured
+  // run again; (2) a run of non-backtick text; (3) a leftover backtick run that
+  // found no closer. NOTE(review): `(`+)` may backtrack to a shorter run, so
+  // "``` x ``" pairs as a span although the delimiter lengths differ — confirm.
+  const parts = line.match(/(`+)(?:(?!\1)[\s\S])*\1|[^`]+|`+/g);
+  if (!parts) return line;
+  return parts
+    .map((seg) => (seg.startsWith('`') ? seg : replace(seg)))
+    .join('');
+}
+
 /**
 * Convert GFM reference footnotes (`[^id]` + `[^id]: def`) into canonical inline
 * footnotes (`^[def]`).
@@ -62,9 +99,12 @@ function escapeRegExp(value: string): string {
 * - Each in-text reference `[^id]` for which a definition was found is replaced by
 *   `^[def]`. References with no matching definition are left literal (there is no
 *   body to inline; the parser fails them open the same way).
- * - Code fences are respected on both passes: `[^id]` inside a ``` / ~~~ block is
- *   never rewritten, and a `[^id]:` line inside a fence is never treated as a
- *   definition.
+ * - Code is respected on both passes: `[^id]` inside a fenced ``` / ~~~ block is
+ *   never rewritten and a `[^id]:` line inside a fence is never a definition; and
+ *   on the rewrite pass a `[^id]` inside an INLINE-code span (backticks) is left
+ *   literal too.
+ * - The inlined body is bracket-escaped so an unbalanced `[`/`]` in a foreign
+ *   definition cannot truncate the resulting `^[...]` footnote.
 *
 * Deduplication / reference-ordering / orphan-dropping of the resulting footnotes
 * is handled downstream by the canonical parser (`assembleFootnotes`); this pass
@@ -145,10 +185,14 @@ function convertReferenceFootnotes(markdown: string): string {
      continue;
    }

-    for (const [id, body] of defs) {
-      const ref = new RegExp('\\[\\^' + escapeRegExp(id) + '\\]', 'g');
-      line = line.replace(ref, `^[${body}]`);
-    }
+    line = rewriteRefsOutsideInlineCode(line, (segment) => {
+      let s = segment;
+      for (const [id, body] of defs) {
+        const ref = new RegExp('\\[\\^' + escapeRegExp(id) + '\\]', 'g');
+        s = s.replace(ref, `^[${escapeFootnoteBody(body)}]`);
+      }
+      return s;
+    });
    out.push(line);
  }