07ebd8c63e
Must-fix: - Move canonicalizeFootnotes OUT of parseProsemirrorContent. It now runs only on FULL writes (createPage, updatePageContent operation==='replace'), never on an append/prepend fragment (a fragment would lose definition-only footnotes or synthesize a bogus empty list). Add a server binding spec. - Match the live plugin's list PLACEMENT: a single already-canonical footnotesList is left exactly where it sits (the plugin never repositions a sole correct list), so the first write no longer reorders content that follows the list. Applied to BOTH the editor-ext copy and the MCP mirror; pinned by a shared golden corpus case with content after the list. - Fix MCP tool count 38 -> 39 (README x3, AGENTS.md) and the transformJs param help (add canonicalizeFootnotes/insertInlineFootnote). Simplifications: - Remove the dead duplicate re-id mechanism (deriveFootnoteId/suffix/occurrence) from the PURE canonicalizer in both copies — references are never renamed, so the derived ids were never requested; first-wins-drop is the real behaviour. This also makes the editor-ext footnote-util note about "no cross-package copy" true again. - Remove the sentinel round-trip in insertInlineFootnote: a generalized insertNodesAfterAnchor core inserts the footnoteReference node directly. - Drop the redundant per-definition deep clone in step 4 (shallow id-normalizing copy; out is already deep-cloned). Docs / architecture: - Correct the editor-ext copy's "It exists because…" header to its real consumers (server import, page.service create/update, client paste). - Note markdownToProseMirror reuse for create/update comment in collaboration.ts. - A: shared golden JSON corpus exercised by BOTH the editor-ext copy and the MCP mirror (footnote-corpus.ts / .mjs) so "the two copies behave identically" is checkable. - C: split the MCP canonicalizer into a pure mirror + footnote-authoring.ts. 
- B: import services persist via a different path, so left one-line consolidation comments at the call sites rather than folding (does not fall out cleanly). Tests: insertFootnote wrapper guards + docmost_transform dryRun auto-canonicalize (MCP mock), page.service create/update + append/prepend binding (server jest), shared corpus incl. nested-container reference. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
277 lines
8.3 KiB
TypeScript
277 lines
8.3 KiB
TypeScript
import { BadRequestException, Injectable, Logger } from '@nestjs/common';
|
|
import { PageRepo } from '@docmost/db/repos/page/page.repo';
|
|
import { MultipartFile } from '@fastify/multipart';
|
|
import * as path from 'path';
|
|
import {
|
|
htmlToJson,
|
|
jsonToText,
|
|
tiptapExtensions,
|
|
} from '../../../collaboration/collaboration.util';
|
|
import { InjectKysely } from 'nestjs-kysely';
|
|
import { KyselyDB } from '@docmost/db/types/kysely.types';
|
|
import {
|
|
generateSlugId,
|
|
sanitizeFileName,
|
|
createByteCountingStream,
|
|
} from '../../../common/helpers';
|
|
import { generateJitteredKeyBetween } from 'fractional-indexing-jittered';
|
|
import { TiptapTransformer } from '@hocuspocus/transformer';
|
|
import * as Y from 'yjs';
|
|
import { markdownToHtml, canonicalizeFootnotes } from '@docmost/editor-ext';
|
|
import {
|
|
FileTaskStatus,
|
|
FileTaskType,
|
|
getFileTaskFolderPath,
|
|
} from '../utils/file.utils';
|
|
import { v7 as uuid7 } from 'uuid';
|
|
import { StorageService } from '../../storage/storage.service';
|
|
import { InjectQueue } from '@nestjs/bullmq';
|
|
import { Queue } from 'bullmq';
|
|
import { QueueJob, QueueName } from '../../queue/constants';
|
|
import { load } from 'cheerio';
|
|
import { normalizeImportHtml } from '../utils/import-formatter';
|
|
|
|
@Injectable()
|
|
export class ImportService {
|
|
private readonly logger = new Logger(ImportService.name);
|
|
|
|
/**
 * Dependencies are supplied by the NestJS injector.
 *
 * - pageRepo: used by importPage to insert the newly imported page row.
 * - storageService: used by importZip to upload the raw archive.
 * - db: Kysely handle, used for the page-position lookup and for inserting
 *   the `fileTasks` row.
 * - fileTaskQueue: the queue registered under QueueName.FILE_TASK_QUEUE;
 *   importZip adds a QueueJob.IMPORT_TASK job to it.
 */
constructor(
|
|
private readonly pageRepo: PageRepo,
|
|
private readonly storageService: StorageService,
|
|
@InjectKysely() private readonly db: KyselyDB,
|
|
@InjectQueue(QueueName.FILE_TASK_QUEUE)
|
|
private readonly fileTaskQueue: Queue,
|
|
) {}
|
|
|
|
/**
 * Imports one uploaded Markdown (`.md`) or HTML (`.html`) file as a new page
 * in the given space. The page is inserted without a parent page id and is
 * positioned after the last parent-less page of the space.
 *
 * The first level-1 heading (if it is the first node) becomes the page title;
 * otherwise the sanitized file name, without extension, is used.
 *
 * @param filePromise Resolves to the uploaded multipart file.
 * @param userId Recorded as both creator and last updater of the page.
 * @param spaceId Space the page is created in.
 * @param workspaceId Workspace the page belongs to.
 * @returns The inserted page row.
 * @throws BadRequestException when the content cannot be converted, when the
 *   extension is neither `.md` nor `.html` (no ProseMirror state is produced),
 *   or when inserting the page fails.
 */
async importPage(
  filePromise: Promise<MultipartFile>,
  userId: string,
  spaceId: string,
  workspaceId: string,
) {
  const file = await filePromise;
  const fileBuffer = await file.toBuffer();

  // path.basename() compares the suffix case-sensitively, so the extension
  // must be stripped using its ORIGINAL casing. Passing the lower-cased form
  // would leave "README.MD" untouched and leak ".MD" into the fallback title.
  const rawExtension = path.extname(file.filename);
  const fileExtension = rawExtension.toLowerCase();
  const fileName = sanitizeFileName(
    path.basename(file.filename, rawExtension),
  );
  const fileContent = fileBuffer.toString();

  let prosemirrorState = null;
  let createdPage = null;

  try {
    if (fileExtension.endsWith('.md')) {
      prosemirrorState = await this.processMarkdown(fileContent);
    } else if (fileExtension.endsWith('.html')) {
      prosemirrorState = await this.processHTML(fileContent);
    }
  } catch (err) {
    // Surface the real cause instead of a generic mask, so the failure is
    // diagnosable from the HTTP response (project convention: never swallow).
    const reason =
      err instanceof Error ? `${err.name}: ${err.message}` : String(err);
    this.logger.error(`Error processing file content: ${reason}`, err);
    throw new BadRequestException(
      `Error processing file content: ${reason}`,
    );
  }

  // Also reached for unsupported extensions, where no converter ran.
  if (!prosemirrorState) {
    const message = 'Failed to create ProseMirror state';
    this.logger.error(message);
    throw new BadRequestException(message);
  }

  const extracted = this.extractTitleAndRemoveHeading(prosemirrorState);
  const title = extracted.title;

  // Imported markdown/HTML is built via markdownToHtml -> htmlToJson, which
  // never runs the editor's footnoteSyncPlugin, so the footnote topology keeps
  // the source's PHYSICAL definition order (out of order vs. references),
  // retains orphan definitions, and is not deduped. Canonicalize before
  // persisting so the stored page matches the editor's invariant (issue #228).
  // Pure + idempotent + shape-safe: a doc with no footnotes is unchanged.
  // (Future consolidation, architecture B: this import path persists directly
  // via pageRepo.insertPage rather than through PageService.createPage, so the
  // canonicalize call lives here; folding both into one "prepare JSON for
  // persist" helper is a sensible follow-up.)
  const prosemirrorJson = canonicalizeFootnotes(extracted.prosemirrorJson);

  const pageTitle = title || fileName;

  if (prosemirrorJson) {
    try {
      const pagePosition = await this.getNewPagePosition(spaceId);

      createdPage = await this.pageRepo.insertPage({
        slugId: generateSlugId(),
        title: pageTitle,
        content: prosemirrorJson,
        textContent: jsonToText(prosemirrorJson),
        ydoc: await this.createYdoc(prosemirrorJson),
        position: pagePosition,
        spaceId: spaceId,
        creatorId: userId,
        workspaceId: workspaceId,
        lastUpdatedById: userId,
      });

      // Log the title that was actually stored (pageTitle), not the possibly
      // null heading title, and close the quote right after the file name.
      this.logger.debug(
        `Successfully imported "${pageTitle}${fileExtension}". ID: ${createdPage.id} - SlugId: ${createdPage.slugId}`,
      );
    } catch (err) {
      const reason =
        err instanceof Error ? `${err.name}: ${err.message}` : String(err);
      this.logger.error(`Failed to create imported page: ${reason}`, err);
      throw new BadRequestException(
        `Failed to create imported page: ${reason}`,
      );
    }
  }

  return createdPage;
}
|
|
|
|
/**
 * Converts Markdown into ProseMirror JSON by first rendering it to HTML and
 * then running the result through the HTML import pipeline (processHTML).
 *
 * Errors from either step propagate to the caller unchanged; the former
 * catch-and-rethrow wrapper added nothing and has been removed.
 *
 * @param markdownInput Raw Markdown source.
 * @returns The ProseMirror JSON produced by processHTML.
 */
async processMarkdown(markdownInput: string): Promise<any> {
  const html = await markdownToHtml(markdownInput);
  return this.processHTML(html);
}
|
|
|
|
/**
 * Converts an HTML string into ProseMirror JSON.
 *
 * The markup is parsed with cheerio, normalized in place by
 * normalizeImportHtml, then serialized back and handed to htmlToJson.
 * Because the method is async, any error thrown along the way reaches the
 * caller as a rejected promise; the former catch-and-rethrow wrapper was a
 * no-op and has been removed.
 *
 * @param htmlInput Raw HTML source.
 * @returns The ProseMirror JSON produced by htmlToJson.
 */
async processHTML(htmlInput: string): Promise<any> {
  const $ = load(htmlInput);
  normalizeImportHtml($, $.root());
  // Fall back to an empty string if serialization yields a falsy value.
  return htmlToJson($.html() || '');
}
|
|
|
|
/**
 * Builds the binary Yjs document that is stored alongside a page's JSON
 * content.
 *
 * @param prosemirrorJson ProseMirror document JSON; a falsy value yields null.
 * @returns The Yjs state update encoded as a Buffer, or null when no document
 *   was supplied.
 */
async createYdoc(prosemirrorJson: any): Promise<Buffer | null> {
  if (!prosemirrorJson) {
    return null;
  }

  const ydoc = TiptapTransformer.toYdoc(
    prosemirrorJson,
    'default',
    tiptapExtensions,
  );

  // Encode exactly once: the previous implementation encoded the document
  // twice and threw the first result away.
  return Buffer.from(Y.encodeStateAsUpdate(ydoc));
}
|
|
|
|
/**
 * Splits a leading level-1 heading off a ProseMirror document and returns it
 * as the page title.
 *
 * - If the first node is a heading with `attrs.level === 1`, it is removed
 *   from the returned document and the concatenated text of its inline text
 *   nodes becomes the title (null when the heading carries no text).
 * - The returned document always contains at least one node; an empty
 *   paragraph is inserted when nothing is left.
 * - The input document and its `content` array are never mutated; the
 *   returned `content` is a new array (its nodes are shared, not cloned).
 *
 * @param prosemirrorState ProseMirror document JSON (`{ type, content, ... }`).
 * @returns `{ title, prosemirrorJson }`, where `prosemirrorJson` is a shallow
 *   copy of the input with the adjusted `content`.
 */
extractTitleAndRemoveHeading(prosemirrorState: any) {
  // Shallow-copy so that shift()/push() below cannot alter the caller's array.
  const content: any[] = Array.isArray(prosemirrorState.content)
    ? [...prosemirrorState.content]
    : [];

  let title: string | null = null;

  const firstNode = content[0];
  if (firstNode?.type === 'heading' && firstNode.attrs?.level === 1) {
    // A heading with mixed marks (e.g. "Hello **World**") is stored as several
    // text nodes; join them all instead of reading only the first one.
    const inlineNodes: any[] = Array.isArray(firstNode.content)
      ? firstNode.content
      : [];
    const headingText = inlineNodes
      .map((node) => (typeof node?.text === 'string' ? node.text : ''))
      .join('');
    title = headingText || null;
    content.shift();
  }

  // ensure at least one paragraph
  if (content.length === 0) {
    content.push({
      type: 'paragraph',
      content: [],
    });
  }

  return {
    title,
    prosemirrorJson: {
      ...prosemirrorState,
      content,
    },
  };
}
|
|
|
|
/**
 * Computes a fractional-index position key that sorts after every existing
 * sibling page.
 *
 * Siblings are the pages of `spaceId` that share `parentPageId`, or that have
 * no parent when `parentPageId` is omitted. The sibling with the greatest
 * `position` (ordered with the "C" collation, descending) is looked up and a
 * jittered key following it is generated; with no siblings an initial key is
 * generated instead.
 *
 * @param spaceId Space whose pages are searched.
 * @param parentPageId Optional parent page; omitted means parent-less pages.
 * @returns The new position key.
 */
async getNewPagePosition(
  spaceId: string,
  parentPageId?: string,
): Promise<string> {
  const highestFirst = this.db
    .selectFrom('pages')
    .select(['id', 'position'])
    .where('spaceId', '=', spaceId)
    .orderBy('position', (ob) => ob.collate('C').desc())
    .limit(1);

  // Narrow to the relevant sibling set.
  const siblings = parentPageId
    ? highestFirst.where('parentPageId', '=', parentPageId)
    : highestFirst.where('parentPageId', 'is', null);

  const lastSibling = await siblings.executeTakeFirst();

  // A null lower bound asks for the very first key.
  const lowerBound = lastSibling ? lastSibling.position : null;
  return generateJitteredKeyBetween(lowerBound, null);
}
|
|
|
|
/**
 * Accepts an uploaded archive for asynchronous import.
 *
 * The upload is streamed to storage under the import file-task folder of the
 * workspace, a `fileTasks` row is inserted with status Processing, and a
 * QueueJob.IMPORT_TASK job carrying the task id is added to the file-task
 * queue.
 *
 * @param filePromise Resolves to the uploaded multipart file.
 * @param source Stored verbatim in the `source` column of the file task.
 * @param userId Recorded as creator of the file task.
 * @param spaceId Space the import targets.
 * @param workspaceId Workspace the import belongs to.
 * @returns The inserted `fileTasks` row.
 */
async importZip(
  filePromise: Promise<MultipartFile>,
  source: string,
  userId: string,
  spaceId: string,
  workspaceId: string,
) {
  const file = await filePromise;

  // path.basename() compares the suffix case-sensitively, so strip the
  // extension using its ORIGINAL casing. With the lower-cased form,
  // "Archive.ZIP" would be stored as "Archive.ZIP.zip".
  const rawExtension = path.extname(file.filename);
  const fileExtension = rawExtension.toLowerCase();
  const fileName = sanitizeFileName(
    path.basename(file.filename, rawExtension),
  );
  const fileNameWithExt = fileName + fileExtension;

  const fileTaskId = uuid7();
  const filePath = `${getFileTaskFolderPath(FileTaskType.Import, workspaceId)}/${fileTaskId}/${fileNameWithExt}`;

  // upload file
  const { stream, getBytesRead } = createByteCountingStream(file.file);

  await this.storageService.upload(filePath, stream);

  // Read the counter only after the upload has completed.
  const fileSize = getBytesRead();

  const fileTask = await this.db
    .insertInto('fileTasks')
    .values({
      id: fileTaskId,
      type: FileTaskType.Import,
      source: source,
      status: FileTaskStatus.Processing,
      fileName: fileNameWithExt,
      filePath: filePath,
      fileSize: fileSize,
      fileExt: 'zip',
      creatorId: userId,
      spaceId: spaceId,
      workspaceId: workspaceId,
    })
    .returningAll()
    .executeTakeFirst();

  await this.fileTaskQueue.add(QueueJob.IMPORT_TASK, {
    fileTaskId: fileTaskId,
  });

  return fileTask;
}
|
|
}
|