test(integrations/client/packages): batch 2-4 unit coverage + zip-slip guard extraction

Batch 2-4 of the test-strategy rollout. Test-only except one minimal, behaviour-preserving extraction in file.utils.ts. All suites green: server 82 suites/836+1todo, editor-ext 86, mcp 270, client (new files) 86. integrations (server): - file.utils.ts: extract pure `isEntryPathSafe(entryName, targetDir)` from extractZipInternal so the zip-slip/path-traversal guard is unit-testable; call site rerouted, accept/reject decisions identical (the three skip paths now share one warn message, so the yauzl validation reason is no longer logged and `__MACOSX/` entries, previously skipped silently, now log a warning). - file.utils.zip-safety.spec.ts: traversal/strip/__MACOSX/prefix-confusion cases (mutation-resistant: fails if containment loses the path.sep). - import-formatter / import.utils / table-utils / export utils / import.service extractTitleAndRemoveHeading: pure import/export transforms, Notion/XWiki formatting, table colspan widths (idempotent), slug/link rewriting. client: - safeRedirectPath: open-redirect guard, every reject branch independently. - buildChatMarkdown (fence anti-breakout), label-colors, normalize-label, share tree build, page URL builders, notification time-grouping (fake clock). packages: - editor-ext: deriveFootnoteId golden table, parseHtmlEmbedHeight crafted values, orphan footnote extraction. - mcp: deriveFootnoteId parity (drift guard vs editor-ext), applyTextEdits idempotency + cross-block replaceAll, diffDocs/summarizeChange on reorder. Reviewed (APPROVE): extraction behaviour-preserving, assertions mutation-resistant. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-21 18:22:15 +03:00
parent f8e8ada581
commit 0b2af34029
20 changed files with 2495 additions and 17 deletions
--- a/apps/server/src/integrations/export/utils.spec.ts
+++ b/apps/server/src/integrations/export/utils.spec.ts
@@ -0,0 +1,158 @@
+import {
+  buildTree,
+  computeLocalPath,
+  getExportExtension,
+  extractPageSlugId,
+  getInternalLinkPageName,
+  INTERNAL_LINK_REGEX,
+  PageExportTree,
+} from './utils';
+import { ExportFormat } from './dto/export-dto';
+import { Page } from '@docmost/db/types/entity.types';
+
+/**
+ * Unit tests for export/utils.ts pure helpers:
+ *  - buildTree: groups pages by parentPageId and de-duplicates sibling titles.
+ *  - computeLocalPath / getExportExtension: builds the slugId -> file path map.
+ *  - extractPageSlugId / INTERNAL_LINK_REGEX: parse the trailing slugId.
+ *  - getInternalLinkPageName: derive a page name from a relative file path.
+ */
+
+/**
+ * Fixture helper: presents a partial set of Page fields as a full `Page`.
+ * This is a type assertion only; fields that are not supplied stay absent
+ * on the returned object at runtime.
+ */
+function page(partial: Partial<Page>): Page {
+  const fixture = partial as Page;
+  return fixture;
+}
+
+describe('buildTree', () => {
+  it('groups pages by their parentPageId', () => {
+    const pages = [
+      page({ id: 'a', parentPageId: 'root', title: 'A', slugId: 'sa' }),
+      page({ id: 'b', parentPageId: 'root', title: 'B', slugId: 'sb' }),
+      page({ id: 'c', parentPageId: 'a', title: 'C', slugId: 'sc' }),
+    ];
+
+    const tree = buildTree(pages);
+
+    expect(Object.keys(tree).sort()).toEqual(['a', 'root']);
+    expect(tree['root'].map((p) => p.id)).toEqual(['a', 'b']);
+    expect(tree['a'].map((p) => p.id)).toEqual(['c']);
+  });
+
+  it('suffixes duplicate sibling titles with " (1)", " (2)"', () => {
+    const pages = [
+      page({ id: '1', parentPageId: 'root', title: 'Doc', slugId: 's1' }),
+      page({ id: '2', parentPageId: 'root', title: 'Doc', slugId: 's2' }),
+      page({ id: '3', parentPageId: 'root', title: 'Doc', slugId: 's3' }),
+    ];
+
+    const tree = buildTree(pages);
+
+    expect(tree['root'].map((p) => p.title)).toEqual([
+      'Doc',
+      'Doc (1)',
+      'Doc (2)',
+    ]);
+  });
+
+  it('does not collide identical titles across different parents', () => {
+    const pages = [
+      page({ id: '1', parentPageId: 'p1', title: 'Same', slugId: 's1' }),
+      page({ id: '2', parentPageId: 'p2', title: 'Same', slugId: 's2' }),
+    ];
+
+    const tree = buildTree(pages);
+
+    expect(tree['p1'][0].title).toBe('Same');
+    expect(tree['p2'][0].title).toBe('Same');
+  });
+
+  it('falls back to "untitled" for empty titles', () => {
+    const pages = [
+      page({ id: '1', parentPageId: 'root', title: '', slugId: 's1' }),
+    ];
+
+    const tree = buildTree(pages);
+
+    expect(tree['root'][0].title).toBe('untitled');
+  });
+
+  it('returns an empty object for empty input', () => {
+    expect(buildTree([])).toEqual({});
+  });
+});
+
+describe('computeLocalPath + getExportExtension', () => {
+  it('builds nested parent/child paths with the markdown extension', () => {
+    const tree: PageExportTree = {
+      // root level uses the literal string 'null' as key only when parentPageId
+      // is null; here we use an explicit top-level key.
+      top: [page({ id: 'parent', title: 'Parent', slugId: 'sp' })],
+      parent: [page({ id: 'child', title: 'Child', slugId: 'sc' })],
+    };
+    const slugIdToPath: Record<string, string> = {};
+
+    computeLocalPath(tree, ExportFormat.Markdown, 'top', '', slugIdToPath);
+
+    expect(slugIdToPath['sp']).toBe('Parent.md');
+    expect(slugIdToPath['sc']).toBe('Parent/Child.md');
+  });
+
+  it('uses the html extension when the format is html', () => {
+    const tree: PageExportTree = {
+      top: [page({ id: 'parent', title: 'Parent', slugId: 'sp' })],
+    };
+    const slugIdToPath: Record<string, string> = {};
+
+    computeLocalPath(tree, ExportFormat.HTML, 'top', '', slugIdToPath);
+
+    expect(slugIdToPath['sp']).toBe('Parent.html');
+  });
+
+  it('getExportExtension returns the right extension and undefined for unknown', () => {
+    expect(getExportExtension(ExportFormat.HTML)).toBe('.html');
+    expect(getExportExtension(ExportFormat.Markdown)).toBe('.md');
+    expect(getExportExtension('pdf')).toBeUndefined();
+  });
+});
+
+describe('extractPageSlugId', () => {
+  // Each case feeds one raw link segment and checks the slugId that comes back.
+  it('returns the trailing segment after the last dash', () => {
+    const slugId = extractPageSlugId('slug-with-dashes-abc123');
+
+    expect(slugId).toBe('abc123');
+  });
+
+  it('returns the input unchanged when there is no dash (bare slugId)', () => {
+    const slugId = extractPageSlugId('abc123');
+
+    expect(slugId).toBe('abc123');
+  });
+
+  it('returns undefined for empty input', () => {
+    const slugId = extractPageSlugId('');
+
+    expect(slugId).toBeUndefined();
+  });
+});
+
+describe('INTERNAL_LINK_REGEX', () => {
+  it('matches a /s/{space}/p/{slug} url and captures the slug in group 5', () => {
+    const internalUrl = '/s/space/p/page-abc123';
+
+    const groups = internalUrl.match(INTERNAL_LINK_REGEX);
+
+    expect(groups).not.toBeNull();
+    // For this URL, group 5 is the path segment that follows "/p/".
+    const slugSegment = groups![5];
+    expect(slugSegment).toBe('page-abc123');
+    expect(extractPageSlugId(slugSegment)).toBe('abc123');
+  });
+
+  it('does not match a non-internal url', () => {
+    const externalUrl = 'https://example.com/foo/bar';
+
+    expect(externalUrl.match(INTERNAL_LINK_REGEX)).toBeNull();
+  });
+});
+
+describe('getInternalLinkPageName', () => {
+  it('strips the file extension and decodes the name', () => {
+    const pageName = getInternalLinkPageName('Parent/My%20Page.md');
+
+    expect(pageName).toBe('My Page');
+  });
+
+  it('falls back to the raw name without throwing on malformed encoding', () => {
+    // "%E0%A4" is a truncated percent-escape sequence, so decodeURIComponent
+    // throws on it; the expectation is that the still-encoded name comes back.
+    let pageName: string | undefined;
+    const derive = () => {
+      pageName = getInternalLinkPageName('dir/%E0%A4.md', 'current.md');
+    };
+
+    expect(derive).not.toThrow();
+    expect(pageName).toBe('%E0%A4');
+  });
+});
--- a/apps/server/src/integrations/import/services/import.service.extract-title.spec.ts
+++ b/apps/server/src/integrations/import/services/import.service.extract-title.spec.ts
@@ -0,0 +1,141 @@
+// Importing ImportService transitively loads import-formatter.ts, which imports
+// the ESM-only @sindresorhus/slugify package (not in jest's transform
+// allowlist). slugify is irrelevant to the method under test, so it is mocked
+// out to keep the module graph loadable under ts-jest.
+jest.mock('@sindresorhus/slugify', () => ({
+  __esModule: true,
+  default: (input: string) => String(input),
+}));
+
+import { ImportService } from './import.service';
+
+/**
+ * Unit tests for ImportService.extractTitleAndRemoveHeading — a pure method
+ * (no `this`, no I/O). It pulls a leading level-1 heading out of a ProseMirror
+ * document, returning its text as the title and the remaining content, and
+ * guarantees at least one paragraph remains.
+ *
+ * The method does not touch the injected deps, so the service is constructed
+ * with placeholder dependencies.
+ */
+
+/**
+ * Builds an ImportService whose four constructor arguments are separate empty
+ * placeholder objects. That is enough for this spec, which only calls
+ * extractTitleAndRemoveHeading.
+ */
+function makeService(): ImportService {
+  // The method under test never references `this`/injected deps.
+  const placeholderDep = (): any => ({});
+  const service = new ImportService(
+    placeholderDep(),
+    placeholderDep(),
+    placeholderDep(),
+    placeholderDep(),
+  );
+  return service;
+}
+
+describe('ImportService.extractTitleAndRemoveHeading', () => {
+  const service = makeService();
+
+  it('extracts a leading H1 as the title and removes the heading from content', () => {
+    const state = {
+      type: 'doc',
+      content: [
+        {
+          type: 'heading',
+          attrs: { level: 1 },
+          content: [{ type: 'text', text: 'My Title' }],
+        },
+        { type: 'paragraph', content: [{ type: 'text', text: 'body' }] },
+      ],
+    };
+
+    const result = service.extractTitleAndRemoveHeading(state);
+
+    expect(result.title).toBe('My Title');
+    // heading removed, only the paragraph remains
+    expect(result.prosemirrorJson.content).toHaveLength(1);
+    expect(result.prosemirrorJson.content[0].type).toBe('paragraph');
+    expect(result.prosemirrorJson.content[0].content[0].text).toBe('body');
+    // doc type preserved via spread
+    expect(result.prosemirrorJson.type).toBe('doc');
+  });
+
+  it('returns a null title and keeps content when there is no leading H1', () => {
+    const state = {
+      type: 'doc',
+      content: [
+        { type: 'paragraph', content: [{ type: 'text', text: 'first' }] },
+        {
+          type: 'heading',
+          attrs: { level: 1 },
+          content: [{ type: 'text', text: 'Later Heading' }],
+        },
+      ],
+    };
+
+    const result = service.extractTitleAndRemoveHeading(state);
+
+    expect(result.title).toBeNull();
+    // nothing removed
+    expect(result.prosemirrorJson.content).toHaveLength(2);
+    expect(result.prosemirrorJson.content[0].type).toBe('paragraph');
+  });
+
+  it('does not treat a level-2 heading as a title', () => {
+    const state = {
+      type: 'doc',
+      content: [
+        {
+          type: 'heading',
+          attrs: { level: 2 },
+          content: [{ type: 'text', text: 'Subheading' }],
+        },
+      ],
+    };
+
+    const result = service.extractTitleAndRemoveHeading(state);
+
+    expect(result.title).toBeNull();
+    expect(result.prosemirrorJson.content).toHaveLength(1);
+    expect(result.prosemirrorJson.content[0].type).toBe('heading');
+  });
+
+  it('injects one empty paragraph when the content becomes empty', () => {
+    // A document that is just a single H1 -> after removal, content is empty
+    // and one empty paragraph is injected.
+    const state = {
+      type: 'doc',
+      content: [
+        {
+          type: 'heading',
+          attrs: { level: 1 },
+          content: [{ type: 'text', text: 'Only Title' }],
+        },
+      ],
+    };
+
+    const result = service.extractTitleAndRemoveHeading(state);
+
+    expect(result.title).toBe('Only Title');
+    expect(result.prosemirrorJson.content).toEqual([
+      { type: 'paragraph', content: [] },
+    ]);
+  });
+
+  it('injects an empty paragraph for an already-empty document', () => {
+    const state = { type: 'doc', content: [] };
+
+    const result = service.extractTitleAndRemoveHeading(state);
+
+    expect(result.title).toBeNull();
+    expect(result.prosemirrorJson.content).toEqual([
+      { type: 'paragraph', content: [] },
+    ]);
+  });
+
+  it('yields a null title when an H1 has no text node', () => {
+    const state = {
+      type: 'doc',
+      content: [{ type: 'heading', attrs: { level: 1 }, content: [] }],
+    };
+
+    const result = service.extractTitleAndRemoveHeading(state);
+
+    expect(result.title).toBeNull();
+    // heading removed, empty paragraph injected
+    expect(result.prosemirrorJson.content).toEqual([
+      { type: 'paragraph', content: [] },
+    ]);
+  });
+});
--- a/apps/server/src/integrations/import/utils/file.utils.ts
+++ b/apps/server/src/integrations/import/utils/file.utils.ts
@@ -30,6 +30,52 @@ export function getFileTaskFolderPath(
  }
 }

+/**
+ * Pure path-safety decision for a single ZIP entry (zip-slip / path-traversal guard).
+ *
+ * Applies, in order, the checks that used to be inlined in `extractZipInternal`:
+ *  1. Strip any leading slashes from the entry name.
+ *  2. Reject names that fail `yauzl.validateFileName` (e.g. backslashes,
+ *     relative `..` segments, drive letters).
+ *  3. Reject `__MACOSX/` metadata entries.
+ *  4. Resolve the entry against the target directory and require it to stay
+ *     strictly inside `targetDir` using a `targetResolved + path.sep` prefix check
+ *     (the trailing separator prevents sibling-directory prefix confusion, e.g.
+ *     `/tmp/x` must not match `/tmp/x-evil`). An entry that resolves to the
+ *     target directory itself is rejected as well.
+ *
+ * The result is a discriminated union on `safe`: once a caller has checked
+ * `safe === true`, `resolved` is typed as a plain `string` and needs no
+ * non-null assertion. The shape stays assignable to
+ * `{ safe: boolean; resolved?: string }`.
+ *
+ * @param entryName  The decoded (UTF-8) entry file name from the archive.
+ * @param targetDir  Directory the archive is being extracted into.
+ * @returns `{ safe: true, resolved }` with the resolved absolute path when the
+ *          entry passes every check, otherwise `{ safe: false }`.
+ */
+export function isEntryPathSafe(
+  entryName: string,
+  targetDir: string,
+): { safe: true; resolved: string } | { safe: false; resolved?: undefined } {
+  // Strip leading slashes so absolute-looking entries cannot escape the target.
+  const stripped = entryName.replace(/^\/+/, '');
+
+  // A truthy result from validateFileName is the validation error.
+  if (yauzl.validateFileName(stripped)) {
+    return { safe: false };
+  }
+
+  // Skip macOS resource-fork metadata entries.
+  if (stripped.startsWith('__MACOSX/')) {
+    return { safe: false };
+  }
+
+  const resolved = path.resolve(path.join(targetDir, stripped));
+  const targetPrefix = path.resolve(targetDir) + path.sep;
+
+  // Containment check: resolved path must live strictly inside the target dir.
+  if (!resolved.startsWith(targetPrefix)) {
+    return { safe: false };
+  }
+
+  return { safe: true, resolved };
+}
+
 /**
 * Extracts a ZIP archive.
 */
@@ -103,29 +149,15 @@ function extractZipInternal(
          const name = entry.fileName.toString('utf8');
          const safe = name.replace(/^\/+/, '');

-          const validationError = yauzl.validateFileName(safe);
-          if (validationError) {
-            console.warn(`Skipping invalid entry (${validationError})`);
-            zipfile.readEntry();
-            return;
-          }
-
-          if (safe.startsWith('__MACOSX/')) {
+          // Zip-slip / path-traversal guard (see isEntryPathSafe).
+          if (!isEntryPathSafe(name, target).safe) {
+            console.warn(`Skipping unsafe entry: ${safe}`);
            zipfile.readEntry();
            return;
          }

          const fullPath = path.join(target, safe);

-          const resolved = path.resolve(fullPath);
-          const targetResolved = path.resolve(target);
-
-          if (!resolved.startsWith(targetResolved + path.sep)) {
-            console.warn(`Skipping entry (path outside target): ${safe}`);
-            zipfile.readEntry();
-            return;
-          }
-
          // Handle directories
          if (/\/$/.test(name)) {
            try {
--- a/apps/server/src/integrations/import/utils/file.utils.zip-safety.spec.ts
+++ b/apps/server/src/integrations/import/utils/file.utils.zip-safety.spec.ts
@@ -0,0 +1,105 @@
+import * as path from 'path';
+import { isEntryPathSafe } from './file.utils';
+
+/**
+ * Unit tests for isEntryPathSafe: the pure zip-slip / path-traversal guard
+ * extracted from extractZipInternal. The contract reproduced from the
+ * production inline check is, in order:
+ *   1. strip leading slashes from the entry name;
+ *   2. reject names that fail yauzl.validateFileName (relative `..` segments,
+ *      backslashes, drive letters, etc.);
+ *   3. reject `__MACOSX/` metadata entries;
+ *   4. resolve the (stripped) entry under the target dir and require it to stay
+ *      strictly inside the target via a `targetResolved + path.sep` prefix check.
+ *
+ * The separator in step 4 is the load-bearing detail: it prevents sibling-dir
+ * prefix confusion (e.g. target `/tmp/x` vs `/tmp/x-evil`). The tests below are
+ * written so that weakening that check to a bare `startsWith(targetResolved)`
+ * makes at least one test fail.
+ */
+describe('isEntryPathSafe', () => {
+  // Use an absolute target; on the test platform path.sep is '/'.
+  const target = path.resolve('/tmp/x');
+
+  it('accepts a normal nested entry and resolves it inside the target', () => {
+    const result = isEntryPathSafe('a/b/c.png', target);
+    expect(result.safe).toBe(true);
+    expect(result.resolved).toBe(path.join(target, 'a/b/c.png'));
+    // Resolved path must live strictly under the target directory.
+    expect(result.resolved!.startsWith(target + path.sep)).toBe(true);
+  });
+
+  it('strips a single leading slash and then treats the entry as safe', () => {
+    const result = isEntryPathSafe('/a/b/c.png', target);
+    expect(result.safe).toBe(true);
+    expect(result.resolved).toBe(path.join(target, 'a/b/c.png'));
+  });
+
+  it('strips multiple leading slashes and then treats the entry as safe', () => {
+    const result = isEntryPathSafe('///a/b.png', target);
+    expect(result.safe).toBe(true);
+    expect(result.resolved).toBe(path.join(target, 'a/b.png'));
+  });
+
+  it('skips (marks unsafe) __MACOSX metadata entries', () => {
+    const result = isEntryPathSafe('__MACOSX/foo', target);
+    expect(result.safe).toBe(false);
+    expect(result.resolved).toBeUndefined();
+  });
+
+  it('rejects a relative ../../ traversal entry', () => {
+    // yauzl.validateFileName flags this as an "invalid relative path", so it is
+    // rejected before the containment check ever runs. Either way: unsafe.
+    const result = isEntryPathSafe('../../etc/passwd', target);
+    expect(result.safe).toBe(false);
+    expect(result.resolved).toBeUndefined();
+  });
+
+  it('rejects an entry whose resolved path would land in a sibling directory (prefix confusion)', () => {
+    // The classic off-by-one: target `/tmp/x` must NOT contain `/tmp/x-evil`.
+    // Such an escape can only be expressed with a `..` segment, which the guard
+    // rejects. This asserts the guard holds for the sibling-escape attempt.
+    const result = isEntryPathSafe('../x-evil/p', target);
+    expect(result.safe).toBe(false);
+    expect(result.resolved).toBeUndefined();
+  });
+
+  it('rejects an entry that resolves to exactly the target dir (no trailing separator)', () => {
+    // `.` resolves to the target itself. The strict `targetResolved + path.sep`
+    // prefix check rejects it; a weakened `startsWith(targetResolved)` (without
+    // the separator) would WRONGLY accept it. This test is the mutation killer
+    // for the separator: if the separator is dropped, this assertion fails.
+    const result = isEntryPathSafe('.', target);
+    expect(result.safe).toBe(false);
+    expect(result.resolved).toBeUndefined();
+  });
+
+  it('keeps the target/sibling boundary: a bare-prefix sibling is not inside the target', () => {
+    // Direct statement of the invariant the separator protects. The resolved
+    // sibling path shares the target's basename as a prefix but is a different
+    // directory; only the `+ path.sep` form correctly classifies it as outside.
+    const target2 = path.resolve('/tmp/x');
+    const siblingResolved = path.resolve(path.join(target2, '..', 'x-evil', 'p'));
+    expect(siblingResolved.startsWith(target2)).toBe(true); // weak (buggy) check matches
+    expect(siblingResolved.startsWith(target2 + path.sep)).toBe(false); // strict check rejects
+  });
+
+  it('rejects an entry containing a backslash via yauzl.validateFileName', () => {
+    // Backslashes are flagged by yauzl.validateFileName as invalid characters,
+    // so such entries are unsafe regardless of where they would resolve.
+    const result = isEntryPathSafe('a\\b.png', target);
+    expect(result.safe).toBe(false);
+    expect(result.resolved).toBeUndefined();
+  });
+
+  it('accepts a stripped absolute path that lands inside the target', () => {
+    // Documented ACTUAL behaviour: an entry like `/etc/passwd` has its leading
+    // slash stripped to `etc/passwd`, which resolves to <target>/etc/passwd —
+    // strictly inside the target, hence safe. (This is the point of the strip:
+    // an absolute-looking entry is re-anchored under the target rather than
+    // escaping to the filesystem root.)
+    const result = isEntryPathSafe('/etc/passwd', target);
+    expect(result.safe).toBe(true);
+    expect(result.resolved).toBe(path.join(target, 'etc/passwd'));
+  });
+});
--- a/apps/server/src/integrations/import/utils/import-formatter.spec.ts
+++ b/apps/server/src/integrations/import/utils/import-formatter.spec.ts
@@ -0,0 +1,403 @@
+// @sindresorhus/slugify ships as ESM and is not in jest's transform allowlist,
+// so it cannot be imported under ts-jest here. Mock it with a deterministic
+// lowercase/dash slugifier that matches the real output for the simple ASCII
+// titles used in these tests (e.g. "Real Title" -> "real-title"). This keeps
+// the test focused on the formatter's own slug-composition logic.
+jest.mock('@sindresorhus/slugify', () => ({
+  __esModule: true,
+  default: (input: string) =>
+    String(input)
+      .trim()
+      .toLowerCase()
+      .replace(/[^a-z0-9]+/g, '-')
+      .replace(/^-+|-+$/g, ''),
+}));
+
+import { load, CheerioAPI, Cheerio } from 'cheerio';
+import {
+  rewriteInternalLinksToMentionHtml,
+  notionFormatter,
+  xwikiFormatter,
+  defaultHtmlFormatter,
+  unwrapFromParagraph,
+} from './import-formatter';
+
+/**
+ * Unit tests for import-formatter.ts. These are pure DOM transforms driven by
+ * cheerio. Each test loads a snippet, runs the target function against the
+ * cheerio root, and asserts the mutated markup / return value. Assertions are
+ * written to fail if the corresponding branch were silently removed.
+ */
+
+type PageMeta = { id: string; title: string; slugId: string };
+
+/**
+ * Parses an HTML snippet with cheerio and returns the loaded API (`$`)
+ * together with its document root (`$root`) for use by the tests below.
+ */
+function makeRoot(html: string): { $: CheerioAPI; $root: Cheerio<any> } {
+  const api = load(html);
+  const documentRoot = api.root();
+  return { $: api, $root: documentRoot };
+}
+
+describe('rewriteInternalLinksToMentionHtml', () => {
+  const creatorId = 'creator-1';
+  const sourcePageId = 'source-page-1';
+  const workspaceId = 'workspace-1';
+
+  it('replaces an internal link whose text equals the page title with a mention span', async () => {
+    const meta: PageMeta = {
+      id: 'target-id-1',
+      title: 'Design Doc',
+      slugId: 'slugABC',
+    };
+    // currentFilePath dir is "docs"; href "./target.md" resolves to "docs/target.md"
+    const map = new Map<string, PageMeta>([['docs/target.md', meta]]);
+    const { $, $root } = makeRoot(
+      '<a href="./target.md">Design Doc</a>',
+    );
+
+    const backlinks = await rewriteInternalLinksToMentionHtml(
+      $,
+      $root,
+      'docs/index.md',
+      map,
+      creatorId,
+      sourcePageId,
+      workspaceId,
+    );
+
+    const $mention = $root.find('span[data-type="mention"]');
+    expect($mention.length).toBe(1);
+    expect($mention.attr('data-entity-type')).toBe('page');
+    expect($mention.attr('data-entity-id')).toBe('target-id-1');
+    expect($mention.attr('data-label')).toBe('Design Doc');
+    expect($mention.attr('data-slug-id')).toBe('slugABC');
+    expect($mention.attr('data-creator-id')).toBe(creatorId);
+    expect($mention.attr('data-id')).toBeTruthy();
+    expect($mention.text()).toBe('Design Doc');
+    // original anchor must be gone
+    expect($root.find('a').length).toBe(0);
+
+    expect(backlinks).toEqual([
+      { sourcePageId, targetPageId: 'target-id-1', workspaceId },
+    ]);
+  });
+
+  it('rewrites href to /s/{space}/p/{slug} when text differs from the title', async () => {
+    const meta: PageMeta = {
+      id: 'target-id-2',
+      title: 'Real Title',
+      slugId: 'slug999',
+    };
+    const map = new Map<string, PageMeta>([['docs/target.md', meta]]);
+    const { $, $root } = makeRoot(
+      '<a href="./target.md">click here</a>',
+    );
+
+    const backlinks = await rewriteInternalLinksToMentionHtml(
+      $,
+      $root,
+      'docs/index.md',
+      map,
+      creatorId,
+      sourcePageId,
+      workspaceId,
+      'myspace',
+    );
+
+    // still an anchor, no mention span
+    expect($root.find('span[data-type="mention"]').length).toBe(0);
+    const $a = $root.find('a');
+    expect($a.length).toBe(1);
+    // slugify('Real Title') => 'real-title'
+    expect($a.attr('href')).toBe('/s/myspace/p/real-title-slug999');
+    expect($a.attr('data-internal')).toBe('true');
+    expect($a.text()).toBe('click here');
+
+    expect(backlinks).toEqual([
+      { sourcePageId, targetPageId: 'target-id-2', workspaceId },
+    ]);
+  });
+
+  it('uses /p/{slug} when no spaceSlug is provided', async () => {
+    const meta: PageMeta = {
+      id: 'target-id-3',
+      title: 'Other Page',
+      slugId: 'slug777',
+    };
+    const map = new Map<string, PageMeta>([['docs/target.md', meta]]);
+    const { $, $root } = makeRoot('<a href="./target.md">label</a>');
+
+    await rewriteInternalLinksToMentionHtml(
+      $,
+      $root,
+      'docs/index.md',
+      map,
+      creatorId,
+      sourcePageId,
+      workspaceId,
+    );
+
+    expect($root.find('a').attr('href')).toBe('/p/other-page-slug777');
+  });
+
+  it('leaves external http and /api/ hrefs untouched and records no backlink', async () => {
+    const map = new Map<string, PageMeta>();
+    const { $, $root } = makeRoot(
+      '<a href="https://example.com/page">ext</a><a href="/api/files/x">api</a>',
+    );
+
+    const backlinks = await rewriteInternalLinksToMentionHtml(
+      $,
+      $root,
+      'docs/index.md',
+      map,
+      creatorId,
+      sourcePageId,
+      workspaceId,
+    );
+
+    const hrefs = $root
+      .find('a')
+      .map((_, el) => $(el).attr('href'))
+      .get();
+    expect(hrefs).toEqual(['https://example.com/page', '/api/files/x']);
+    expect($root.find('a').first().attr('data-internal')).toBeUndefined();
+    expect(backlinks).toEqual([]);
+  });
+
+  it('falls back without throwing on a malformed decodeURIComponent href', async () => {
+    const meta: PageMeta = {
+      id: 'target-id-4',
+      title: 'Broken',
+      slugId: 'slug000',
+    };
+    // The raw (un-decodable) href is what gets joined: "docs/%E0%A4%A.md".
+    const map = new Map<string, PageMeta>([['docs/%E0%A4%A.md', meta]]);
+    const { $, $root } = makeRoot('<a href="%E0%A4%A.md">Broken</a>');
+
+    // Awaiting the call directly is the "does not throw" assertion: a rejected
+    // promise fails the test, and the result keeps its declared type instead
+    // of being captured through an `any` variable.
+    const backlinks = await rewriteInternalLinksToMentionHtml(
+      $,
+      $root,
+      'docs/index.md',
+      map,
+      creatorId,
+      sourcePageId,
+      workspaceId,
+    );
+
+    // Because the raw path matched the map, it still produced a mention.
+    expect($root.find('span[data-type="mention"]').length).toBe(1);
+    expect(backlinks).toEqual([
+      { sourcePageId, targetPageId: 'target-id-4', workspaceId },
+    ]);
+  });
+
+  it('accumulates one backlink per resolved link', async () => {
+    const a: PageMeta = { id: 'id-a', title: 'A', slugId: 's-a' };
+    const b: PageMeta = { id: 'id-b', title: 'B', slugId: 's-b' };
+    const map = new Map<string, PageMeta>([
+      ['docs/a.md', a],
+      ['docs/b.md', b],
+    ]);
+    const { $, $root } = makeRoot(
+      '<a href="./a.md">A</a><a href="./b.md">B</a>',
+    );
+
+    const backlinks = await rewriteInternalLinksToMentionHtml(
+      $,
+      $root,
+      'docs/index.md',
+      map,
+      creatorId,
+      sourcePageId,
+      workspaceId,
+    );
+
+    expect(backlinks).toEqual([
+      { sourcePageId, targetPageId: 'id-a', workspaceId },
+      { sourcePageId, targetPageId: 'id-b', workspaceId },
+    ]);
+  });
+});
+
+describe('notionFormatter', () => {
+  it('converts a multi-column column-list to data-type="columns" with the right layout', () => {
+    const html =
+      '<div class="column-list">' +
+      '<div class="column"><p>one</p></div>' +
+      '<div class="column"><p>two</p></div>' +
+      '<div class="column"><p>three</p></div>' +
+      '</div>';
+    const { $, $root } = makeRoot(html);
+
+    notionFormatter($, $root);
+
+    const $cols = $root.find('div[data-type="columns"]');
+    expect($cols.length).toBe(1);
+    // 3 columns => COLUMN_LAYOUTS[3] === 'three_equal'
+    expect($cols.attr('data-layout')).toBe('three_equal');
+    expect($root.find('div[data-type="column"]').length).toBe(3);
+    // original column-list wrapper is gone
+    expect($root.find('div.column-list').length).toBe(0);
+  });
+
+  it('uses two_equal layout for exactly two columns', () => {
+    const html =
+      '<div class="column-list">' +
+      '<div class="column"><p>one</p></div>' +
+      '<div class="column"><p>two</p></div>' +
+      '</div>';
+    const { $, $root } = makeRoot(html);
+
+    notionFormatter($, $root);
+
+    expect($root.find('div[data-type="columns"]').attr('data-layout')).toBe(
+      'two_equal',
+    );
+  });
+
+  it('converts figure.equation into a mathBlock with the tex text', () => {
+    const html =
+      '<figure class="equation">' +
+      '<annotation encoding="application/x-tex">E = mc^2</annotation>' +
+      '</figure>';
+    const { $, $root } = makeRoot(html);
+
+    notionFormatter($, $root);
+
+    const $math = $root.find('div[data-type="mathBlock"]');
+    expect($math.length).toBe(1);
+    expect($math.attr('data-katex')).toBe('true');
+    expect($math.text()).toBe('E = mc^2');
+    expect($root.find('figure.equation').length).toBe(0);
+  });
+
+  it('converts ul.to-do-list items to a taskList with data-checked reflecting checkbox-on', () => {
+    const html =
+      '<ul class="to-do-list">' +
+      '<li><div class="checkbox checkbox-on"></div>' +
+      '<span class="to-do-children-checked">done item</span></li>' +
+      '<li><div class="checkbox checkbox-off"></div>' +
+      '<span class="to-do-children-unchecked">open item</span></li>' +
+      '</ul>';
+    const { $, $root } = makeRoot(html);
+
+    notionFormatter($, $root);
+
+    const $list = $root.find('ul[data-type="taskList"]');
+    expect($list.length).toBe(1);
+    const $items = $list.find('li[data-type="taskItem"]');
+    expect($items.length).toBe(2);
+    expect($items.eq(0).attr('data-checked')).toBe('true');
+    expect($items.eq(1).attr('data-checked')).toBe('false');
+    // checked item has a checked input; unchecked does not
+    expect($items.eq(0).find('input[checked]').length).toBe(1);
+    expect($items.eq(1).find('input[checked]').length).toBe(0);
+    // text is carried over
+    expect($items.eq(0).find('p').text()).toBe('done item');
+    expect($items.eq(1).find('p').text()).toBe('open item');
+  });
+});
+
+describe('xwikiFormatter', () => {
+  it('replaces the root with the contents of #xwikicontent when present', () => {
+    const headerHtml = '<div id="header">junk</div>';
+    const contentHtml =
+      '<div id="xwikicontent"><p>real body</p><h2>heading</h2></div>';
+    const { $, $root } = makeRoot(headerHtml + contentHtml);
+
+    xwikiFormatter($, $root);
+
+    // Neither the header nor the #xwikicontent wrapper may remain.
+    for (const selector of ['#header', '#xwikicontent']) {
+      expect($root.find(selector).length).toBe(0);
+    }
+    // The wrapper's children are still present.
+    expect($root.find('p').text()).toBe('real body');
+    expect($root.find('h2').text()).toBe('heading');
+  });
+
+  it('leaves HTML without #xwikicontent unchanged', () => {
+    const { $, $root } = makeRoot('<div id="header">junk</div><p>body</p>');
+    const snapshot = $root.html();
+
+    xwikiFormatter($, $root);
+
+    expect($root.html()).toBe(snapshot);
+  });
+});
+
+describe('defaultHtmlFormatter', () => {
+  it('replaces a recognized provider anchor with a data-type="embed" div', () => {
+    const url = 'https://www.youtube.com/watch?v=dQw4w9WgXcQ';
+    const { $, $root } = makeRoot(`<a href="${url}">video</a>`);
+
+    defaultHtmlFormatter($, $root);
+
+    const $embed = $root.find('div[data-type="embed"]');
+    expect($embed.length).toBe(1);
+    expect($embed.attr('data-provider')).toBe('youtube');
+    expect($embed.attr('data-src')).toBe(url);
+    // the anchor is gone
+    expect($root.find('a').length).toBe(0);
+  });
+
+  it('leaves an anchor as a link when provider resolves to iframe', () => {
+    // A plain non-provider URL falls through to the default iframe provider,
+    // which the formatter explicitly skips.
+    const url = 'https://example.com/some/page';
+    const { $, $root } = makeRoot(`<a href="${url}">site</a>`);
+
+    defaultHtmlFormatter($, $root);
+
+    expect($root.find('div[data-type="embed"]').length).toBe(0);
+    const $a = $root.find('a');
+    expect($a.length).toBe(1);
+    expect($a.attr('href')).toBe(url);
+  });
+});
+
+describe('unwrapFromParagraph', () => {
+  it('replaces the wrapper entirely when the node is the only child of a <p>', () => {
+    const { $, $root } = makeRoot('<p><img src="x.png"></p>');
+    const $node = $root.find('img');
+
+    unwrapFromParagraph($, $node);
+
+    // the <p> wrapper is gone, the img is hoisted to the root
+    expect($root.find('p').length).toBe(0);
+    expect($root.find('img').length).toBe(1);
+  });
+
+  it('moves the node before the wrapper when there are sibling contents', () => {
+    const { $, $root } = makeRoot('<p>text before <img src="x.png"></p>');
+    const $node = $root.find('img');
+
+    unwrapFromParagraph($, $node);
+
+    // img moved out; the paragraph still holds the sibling text
+    const html = $root.html() || '';
+    // img must appear before the paragraph in document order
+    const imgIndex = html.indexOf('<img');
+    const pIndex = html.indexOf('<p');
+    expect(imgIndex).toBeGreaterThanOrEqual(0);
+    expect(pIndex).toBeGreaterThanOrEqual(0);
+    expect(imgIndex).toBeLessThan(pIndex);
+    expect($root.find('p').text()).toContain('text before');
+  });
+
+  it('returns (does not infinite-loop) on adversarial nesting', () => {
+    // Node wrapped in nested <a> and <p> wrappers.
+    const { $, $root } = makeRoot(
+      '<p><a href="#"><img src="x.png"></a></p>',
+    );
+    const $node = $root.find('img');
+
+    // If unwrapFromParagraph looped forever this call would hang the test.
+    expect(() => unwrapFromParagraph($, $node)).not.toThrow();
+    // It fully unwrapped: no surrounding p/a left around the img.
+    expect($node.closest('p, a').length).toBe(0);
+    expect($root.find('img').length).toBe(1);
+  });
+});
--- a/apps/server/src/integrations/import/utils/import.utils.spec.ts
+++ b/apps/server/src/integrations/import/utils/import.utils.spec.ts
@@ -0,0 +1,137 @@
+import {
+  stripNotionID,
+  extractNotionPartialId,
+  resolveRelativeAttachmentPath,
+} from './import.utils';
+
+/**
+ * Unit tests for the pure helpers in import.utils.ts:
+ *  - stripNotionID / extractNotionPartialId: filename suffix parsing.
+ *  - resolveRelativeAttachmentPath: maps an HTML-relative attachment href onto
+ *    a key that exists in the extracted-archive candidate map.
+ */
+
+describe('stripNotionID', () => {
+  // One 32-character hexadecimal ID, reused by every full-ID case below so
+  // that only the separator in front of it varies.
+  const hexId = 'a1b2c3d4e5f60718293a4b5c6d7e8f90';
+
+  it('strips a 32-hex suffix preceded by a space separator', () => {
+    expect(stripNotionID(`My Page ${hexId}`)).toBe('My Page');
+  });
+
+  it('strips a 32-hex suffix preceded by a dash separator', () => {
+    expect(stripNotionID(`My-Page-${hexId}`)).toBe('My-Page');
+  });
+
+  it('strips a 32-hex suffix with no separator', () => {
+    expect(stripNotionID(`MyPage${hexId}`)).toBe('MyPage');
+  });
+
+  it('strips a partial UUID suffix "{4}-{4}"', () => {
+    expect(stripNotionID('Cool 324d-35ab')).toBe('Cool');
+  });
+
+  it('leaves a name without an ID unchanged', () => {
+    expect(stripNotionID('Just A Title')).toBe('Just A Title');
+  });
+});
+
+describe('extractNotionPartialId', () => {
+  it('returns prefix/suffix (lowercased) for a partial UUID folder name', () => {
+    // Upper-case hex goes in; both halves are expected back in lower case.
+    const parts = extractNotionPartialId('Cool 324D-35AB');
+
+    expect(parts).toEqual({ prefix: '324d', suffix: '35ab' });
+  });
+
+  it('returns null when there is no partial UUID suffix', () => {
+    expect(extractNotionPartialId('No Id Here')).toBeNull();
+  });
+
+  it('returns null when the suffix lacks the leading space', () => {
+    // Same "{4}-{4}" tail as above, but glued to the name without a space.
+    expect(extractNotionPartialId('Name324d-35ab')).toBeNull();
+  });
+});
+
+describe('resolveRelativeAttachmentPath', () => {
+  it('returns the direct candidate when it exists', () => {
+    const archive = new Map<string, string>();
+    archive.set('attachments/file.png', '/abs/attachments/file.png');
+
+    const resolved = resolveRelativeAttachmentPath(
+      './attachments/file.png',
+      'pages',
+      archive,
+    );
+
+    expect(resolved).toBe('attachments/file.png');
+  });
+
+  it('strips the Confluence "download/attachments/" prefix to match the archive layout', () => {
+    const archive = new Map<string, string>();
+    archive.set(
+      'attachments/123/diagram.png',
+      '/abs/attachments/123/diagram.png',
+    );
+    const href = 'download/attachments/123/diagram.png';
+
+    const resolved = resolveRelativeAttachmentPath(href, 'pages', archive);
+
+    expect(resolved).toBe('attachments/123/diagram.png');
+  });
+
+  it('decodes a percent-encoded name before matching', () => {
+    const archive = new Map<string, string>();
+    archive.set('attachments/my file.png', '/abs/attachments/my file.png');
+
+    const resolved = resolveRelativeAttachmentPath(
+      'attachments/my%20file.png',
+      'pages',
+      archive,
+    );
+
+    expect(resolved).toBe('attachments/my file.png');
+  });
+
+  it('falls back to the raw (still-encoded) value on a malformed escape without throwing', () => {
+    // "%E0%A4" is a truncated UTF-8 escape sequence, so it cannot be
+    // percent-decoded; the archive key uses the same undecoded spelling.
+    const rawName = 'attachments/%E0%A4.png';
+    const archive = new Map<string, string>();
+    archive.set(rawName, '/abs/attachments/%E0%A4.png');
+
+    let resolved: string | null = null;
+    const attempt = () => {
+      resolved = resolveRelativeAttachmentPath(rawName, 'pages', archive);
+    };
+
+    // The call has to return normally and hand back the still-encoded key.
+    expect(attempt).not.toThrow();
+    expect(resolved).toBe(rawName);
+  });
+
+  it('returns null when nothing matches', () => {
+    const archive = new Map<string, string>();
+    archive.set('attachments/other.png', '/abs/attachments/other.png');
+
+    const resolved = resolveRelativeAttachmentPath(
+      './attachments/missing.png',
+      'pages',
+      archive,
+    );
+
+    expect(resolved).toBeNull();
+  });
+
+  it('matches via the pageDir-joined fallback path', () => {
+    // The only archive key is the href placed underneath the page directory.
+    const archive = new Map<string, string>();
+    archive.set('pages/sub/img.png', '/abs/pages/sub/img.png');
+    const href = 'sub/img.png';
+
+    const resolved = resolveRelativeAttachmentPath(href, 'pages', archive);
+    expect(resolved).toBe('pages/sub/img.png');
+  });
+});
--- a/apps/server/src/integrations/import/utils/table-utils.spec.ts
+++ b/apps/server/src/integrations/import/utils/table-utils.spec.ts
@@ -0,0 +1,105 @@
+import { load, CheerioAPI, Cheerio } from 'cheerio';
+import { normalizeTableColumnWidths } from './table-utils';
+
+/**
+ * Unit tests for normalizeTableColumnWidths: it writes a `colwidth` attribute
+ * onto the first-row cells of every <table>, deriving widths from a <colgroup>
+ * or the first row, accounting for colspan, and falling back to a default
+ * per-column width (150px) when no pixel widths are present. Re-running the
+ * transform on its own output must be a no-op (idempotent).
+ */
+
+const DEFAULT = 150; // colwidth expected below when a cell has no pixel width; presumably mirrors the default in table-utils
+
+function run(html: string): { $: CheerioAPI; $root: Cheerio<any> } {
+  const api = load(html);
+  const documentRoot = api.root();
+  normalizeTableColumnWidths(api, documentRoot); // one pass over the parsed tree
+  return { $: api, $root: documentRoot };
+}
+
+function firstRowColwidths($root: Cheerio<any>): (string | undefined)[] {
+  // Locate the first <table> and its first row, whether that row sits in a
+  // <tbody>, a <thead>, or directly under the table.
+  const $table = $root.find('table').first();
+  const $row = $table.find('> tbody > tr, > thead > tr, > tr').first();
+  const $cells = $row.children('td, th');
+  // Read the raw attribute off each cell node (cheerio's map skips cells
+  // whose callback result is null or undefined).
+  return $cells.map((_, cell) => (cell as any).attribs?.colwidth).get();
+}
+
+describe('normalizeTableColumnWidths', () => {
+  it('applies colgroup <col width> to the first-row cells', () => {
+    const markup = [
+      '<table>',
+      '<colgroup><col width="120"><col width="80"></colgroup>',
+      '<tbody><tr><td>a</td><td>b</td></tr></tbody>',
+      '</table>',
+    ].join('');
+
+    expect(firstRowColwidths(run(markup).$root)).toEqual(['120', '80']);
+  });
+
+  it('falls back to first-row cell widths when there is no colgroup', () => {
+    const markup = [
+      '<table><tbody>',
+      '<tr><td style="width: 200px">a</td><td width="90">b</td></tr>',
+      '</tbody></table>',
+    ].join('');
+
+    expect(firstRowColwidths(run(markup).$root)).toEqual(['200', '90']);
+  });
+
+  it('splits a colspan width across the spanned columns', () => {
+    // A single cell spans two columns and declares width 100; the attribute
+    // expected on it is the comma-joined pair "50,50".
+    const markup = [
+      '<table><tbody>',
+      '<tr><td colspan="2" width="100">merged</td></tr>',
+      '</tbody></table>',
+    ].join('');
+
+    expect(firstRowColwidths(run(markup).$root)).toEqual(['50,50']);
+  });
+
+  it('ignores em/% widths (treated as no width) and applies the default', () => {
+    const markup = [
+      '<table><tbody>',
+      '<tr><td style="width: 10em">a</td><td style="width: 50%">b</td></tr>',
+      '</tbody></table>',
+    ].join('');
+    const fallback = String(DEFAULT);
+    expect(firstRowColwidths(run(markup).$root)).toEqual([fallback, fallback]);
+  });
+
+  it('applies the default per-column width to a markdown-style table with no widths', () => {
+    // Two rows of three plain cells, none carrying any width information.
+    const markup = [
+      '<table><tbody>',
+      '<tr><td>a</td><td>b</td><td>c</td></tr>',
+      '<tr><td>1</td><td>2</td><td>3</td></tr>',
+      '</tbody></table>',
+    ].join('');
+    const { $root } = run(markup);
+
+    // Each of the three first-row cells is expected to carry the default.
+    const expected = Array.from({ length: 3 }, () => String(DEFAULT));
+    expect(firstRowColwidths($root)).toEqual(expected);
+  });
+
+  it('is idempotent: re-running on its own output changes nothing', () => {
+    const markup = [
+      '<table>',
+      '<colgroup><col width="120"><col width="80"></colgroup>',
+      '<tbody><tr><td>a</td><td>b</td></tr></tbody>',
+      '</table>',
+    ].join('');
+    const { $, $root } = run(markup);
+    const snapshot = $root.html();
+    // Apply the transform a second time to the already-normalized document.
+    normalizeTableColumnWidths($, $root);
+    expect($root.html()).toBe(snapshot);
+    expect(firstRowColwidths($root)).toEqual(['120', '80']);
+  });
+});