diff --git a/CHANGELOG.md b/CHANGELOG.md index 992b6af6..6c2aa9c9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- **Persistent AI-chat history as the source of truth + server-side export.** + An assistant turn is now persisted to the database step by step: the row is + inserted upfront as `streaming` and updated as each agent step finishes, then + finalized once to `completed`/`error`/`aborted`. A process that dies mid-turn + keeps every finished step, and a startup sweep flips any dangling `streaming` + row (untouched for 10 minutes) to `aborted`. Chat "Copy" now exports + server-side from these rows (`POST /ai-chat/export`) rather than from live + client state, so the export is identical whether a chat is freshly streaming, + just switched to, or reloaded — and is available from the first turn of a new + chat. (#183, #174) + - **AI-agent attribution for MCP writes.** Comments (and pages) created through the MCP endpoint by a dedicated agent account are now badged as "AI", with unspoofable provenance derived from a per-user `is_agent` flag (not from the diff --git a/apps/client/public/locales/en-US/translation.json b/apps/client/public/locales/en-US/translation.json index b57fffa8..bd8c4ed3 100644 --- a/apps/client/public/locales/en-US/translation.json +++ b/apps/client/public/locales/en-US/translation.json @@ -258,6 +258,7 @@ "Copy to space": "Copy to space", "Copy chat": "Copy chat", "Copied": "Copied", + "Failed to export chat": "Failed to export chat", "Duplicate": "Duplicate", "Select a user": "Select a user", "Select a group": "Select a group", diff --git a/apps/client/public/locales/ru-RU/translation.json b/apps/client/public/locales/ru-RU/translation.json index e3d46ad3..f8c59436 100644 --- a/apps/client/public/locales/ru-RU/translation.json +++ b/apps/client/public/locales/ru-RU/translation.json @@ -257,6 +257,7 @@ "Copy": "Копировать", "Copy to 
space": "Копировать в пространство", "Copied": "Скопировано", + "Failed to export chat": "Не удалось экспортировать чат", "Duplicate": "Дублировать", "Select a user": "Выберите пользователя", "Select a group": "Выберите группу", diff --git a/apps/client/src/features/ai-chat/components/ai-chat-window.tsx b/apps/client/src/features/ai-chat/components/ai-chat-window.tsx index 740945c4..de0b9923 100644 --- a/apps/client/src/features/ai-chat/components/ai-chat-window.tsx +++ b/apps/client/src/features/ai-chat/components/ai-chat-window.tsx @@ -6,7 +6,6 @@ import { useRef, useState, } from "react"; -import { type UIMessage } from "@ai-sdk/react"; import { Group, Loader, Tooltip } from "@mantine/core"; import { IconArrowsDiagonal, @@ -40,7 +39,7 @@ import { } from "@/features/ai-chat/queries/ai-chat-query.ts"; import ConversationList from "@/features/ai-chat/components/conversation-list.tsx"; import ChatThread from "@/features/ai-chat/components/chat-thread.tsx"; -import { buildChatMarkdown } from "@/features/ai-chat/utils/chat-markdown.ts"; +import { exportAiChat } from "@/features/ai-chat/services/ai-chat-service.ts"; import { useChatSession } from "@/features/ai-chat/hooks/use-chat-session.ts"; import { shouldCollapseOnOutsidePointer, @@ -121,7 +120,7 @@ function clampGeom(g: { * ported from the GitmostAgent.jsx design. */ export default function AiChatWindow() { - const { t } = useTranslation(); + const { t, i18n } = useTranslation(); const clipboard = useClipboard({ timeout: 500 }); const queryClient = useQueryClient(); const [windowOpen, setWindowOpen] = useAtom(aiChatWindowOpenAtom); @@ -162,30 +161,11 @@ export default function AiChatWindow() { const { data: messageRows, isLoading: messagesLoading } = useAiChatMessagesQuery(activeChatId ?? undefined); - // Live snapshot of the active thread's useChat state, kept up to date by - // ChatThread. Lets the export include the in-progress (not-yet-persisted) - // streaming turn. 
A ref avoids re-rendering this window on every token. - const liveThreadRef = useRef<{ - messages: UIMessage[]; - isStreaming: boolean; - banner: string | null; - }>({ - messages: [], - isStreaming: false, - banner: null, - }); - // Live turn-token total (reasoning + output) for the in-flight turn, pushed up // (THROTTLED to ~8 Hz inside ChatThread) so the header badge ticks mid-stream. // `null` means no turn is in flight -> the badge falls back to the persisted // context size below. const [liveTurnTokens, setLiveTurnTokens] = useState<number | null>(null); - // Whether the on-screen thread currently holds at least one message. Reported - // reactively by ChatThread (the live snapshot lives in a non-reactive ref). This - // lets the "Copy chat" button stay available for a brand-new, not-yet-persisted - // chat whose first turn is in flight or was interrupted — that case has no - // persisted rows yet, so a persisted-rows-only gate would hide the button (#174). - const [hasLiveContent, setHasLiveContent] = useState(false); // The page the user is currently viewing. AiChatWindow lives in a pathless // parent layout route, so useParams() can't see :pageSlug. Match the full @@ -214,6 +194,7 @@ export default function AiChatWindow() { threadKey, waitingForHistory, onTurnFinished, + onServerChatId, cancelPendingAdoption, } = useChatSession({ activeChatId, @@ -254,20 +235,19 @@ export default function AiChatWindow() { [cancelPendingAdoption, setActiveChatId, setDraft, setSelectedRoleId], ); - // The active chat object (for its title) and an export gate: only enable the - // export button when an existing chat with loaded persisted rows is active. + // The active chat object (for its title) and an export gate. The export is now + // SERVER-sourced (the DB is the single source of truth — #183): the assistant + // row is persisted upfront + per step, so even a brand-new chat whose first + // turn is streaming/interrupted has a server row to render.
Enable the button + // whenever a persisted chat is active (`activeChatId` is set). For a BRAND-NEW + // chat that id is adopted EARLY — at the stream's `start` chunk via + // onServerChatId (#174) — so the Copy button is available during the first + // turn's stream, not only after it terminates. const activeChat = useMemo( () => chats?.items?.find((c) => c.id === activeChatId) ?? null, [chats, activeChatId], ); - // Export is available when there is anything to export: either persisted rows - // for the active chat, OR a live on-screen thread with at least one message. - // The live arm covers a brand-new chat whose first turn is streaming or was - // interrupted before the server persisted any row (#174); the persisted arm is - // the steady-state path for an already-saved chat (#160). - const canExport = - hasLiveContent || - (!!activeChatId && !!messageRows && messageRows.length > 0); + const canExport = !!activeChatId; // The role to display in the header and as the assistant's name. Prefer the // persisted role of an existing chat (chat-list JOIN); fall back to the role @@ -284,53 +264,21 @@ export default function AiChatWindow() { return picked ? { name: picked.name, emoji: picked.emoji } : null; }, [activeChat, enabledRoles, selectedRoleId]); - // Build a Markdown export from the already-loaded persisted rows (no network - // call) and copy it to the clipboard. The "Copied" notification is the - // feedback. - const handleCopy = useCallback(() => { - // Export gate. There must be SOMETHING to export — either a live on-screen - // message or a persisted row. A brand-new chat whose first turn is streaming - // or was interrupted has live messages but no persisted rows yet; it still - // exports the on-screen thread WYSIWYG (#174). Only a truly empty chat (no - // live messages and no rows) is non-exportable (the button is hidden too — - // see `canExport`). 
- const live = liveThreadRef.current; - const hasRows = !!messageRows && messageRows.length > 0; - if (live.messages.length === 0 && !hasRows) return; - // WYSIWYG export: the live on-screen messages ARE the document (so a partial - // reply from an interrupted turn — which never reached the persisted rows — - // is exported just as it appears). The persisted rows enrich each live - // message (token usage / error / timestamp) by id and serve as the fallback - // when the live mirror is empty. The on-screen banner is appended too. See - // issues #160 and #174. `chatId` may be null for a not-yet-saved chat — use a - // placeholder so the header line still renders. - const markdown = buildChatMarkdown({ - title: activeChat?.title ?? null, - chatId: activeChatId ?? "unsaved", - live: live.messages.map((m) => ({ - id: m.id, - role: m.role, - parts: (m.parts ?? []) as { type: string; text?: string }[], - metadata: m.metadata as - | { - usage?: { - inputTokens?: number; - outputTokens?: number; - totalTokens?: number; - reasoningTokens?: number; - }; - error?: string; - } - | undefined, - })), - rows: messageRows, - isStreaming: live.isStreaming, - banner: live.banner, - t, - }); - clipboard.copy(markdown); - notifications.show({ message: t("Copied") }); - }, [activeChatId, messageRows, activeChat, clipboard, t]); + // Fetch the server-rendered Markdown export and copy it to the clipboard. The + // server is the single source of truth (#183): it renders the transcript from + // the persisted rows — including an interrupted turn's in-progress row — so the + // export is identical whether the chat is freshly streaming, just switched to, + // or reloaded. The `lang` of the active i18n drives the few localized labels. 
+ const handleCopy = useCallback(async () => { + if (!activeChatId) return; + try { + const markdown = await exportAiChat(activeChatId, i18n.language); + clipboard.copy(markdown); + notifications.show({ message: t("Copied") }); + } catch { + notifications.show({ message: t("Failed to export chat"), color: "red" }); + } + }, [activeChatId, clipboard, t, i18n.language]); // Current context size for the active chat: how much the conversation now // occupies in the model's context window — NOT the cumulative tokens spent. @@ -685,9 +633,8 @@ export default function AiChatWindow() { onRolePicked={(role) => setSelectedRoleId(role.id)} assistantName={currentRole?.name} onTurnFinished={onTurnFinished} - liveStateRef={liveThreadRef} + onServerChatId={onServerChatId} onLiveTurnTokens={setLiveTurnTokens} - onLiveContentChange={setHasLiveContent} /> )} diff --git a/apps/client/src/features/ai-chat/components/chat-thread.tsx b/apps/client/src/features/ai-chat/components/chat-thread.tsx index fb405a56..c906a940 100644 --- a/apps/client/src/features/ai-chat/components/chat-thread.tsx +++ b/apps/client/src/features/ai-chat/components/chat-thread.tsx @@ -1,11 +1,4 @@ -import { - useCallback, - useEffect, - useMemo, - useRef, - useState, - type MutableRefObject, -} from "react"; +import { useCallback, useEffect, useMemo, useRef, useState } from "react"; import { generateId } from "ai"; import { ActionIcon, Box, Group, Stack, Text } from "@mantine/core"; import { IconClockHour4, IconX } from "@tabler/icons-react"; @@ -68,30 +61,18 @@ interface ChatThreadProps { * authoritative id the server streamed on the assistant message metadata, or * undefined on a failed turn — see adopt-chat-id.ts for the full #137 design. 
*/ onTurnFinished: (serverChatId?: string) => void; - /** Parent-owned ref that this thread keeps updated with its live useChat - * snapshot (full message list + streaming flag), so the header's - * "Copy chat" export can include the in-progress, not-yet-persisted - * assistant message. A ref (not state) avoids re-rendering the parent on - * every streamed delta. */ - liveStateRef?: MutableRefObject<{ - messages: UIMessage[]; - isStreaming: boolean; - banner: string | null; - }>; + /** Called EARLY (at the stream's `start` chunk) with the authoritative server + * chat id streamed on the assistant message metadata, so a brand-new chat + * adopts its real id WHILE the first turn is still streaming (#174 — makes the + * Copy/export button available mid-stream). Distinct from onTurnFinished, + * which fires only at the terminal outcome. */ + onServerChatId?: (serverChatId?: string) => void; /** Reports the live turn-token total (reasoning + output) for the in-flight * turn so the parent can show a header badge that ticks mid-stream. THROTTLED * here (~8 Hz) so the parent re-renders a handful of times a second, not on * every streamed delta. Called with `null` when no turn is in flight (the * parent then reverts the badge to the persisted context size). */ onLiveTurnTokens?: (tokens: number | null) => void; - /** Reports whether the live thread currently holds at least one message, so the - * parent can gate the "Copy chat" button on the on-screen thread rather than on - * the persisted rows alone. This stays truthy for a brand-new, not-yet-saved - * chat the moment its first user message appears — so an interrupted very first - * turn (no persisted rows yet) is still exportable (#174). Called with `false` - * on unmount so a thread torn down by `key` on chat switch can't leave the - * button enabled for the next, possibly empty, chat. 
*/ - onLiveContentChange?: (hasContent: boolean) => void; } /** @@ -135,9 +116,8 @@ export default function ChatThread({ onRolePicked, assistantName, onTurnFinished, - liveStateRef, + onServerChatId, onLiveTurnTokens, - onLiveContentChange, }: ChatThreadProps) { const { t } = useTranslation(); @@ -306,6 +286,26 @@ export default function ChatThread({ // Keep the flush helper pointed at the latest sendMessage instance. sendMessageRef.current = sendMessage; + // EARLY chat-id adoption (#174): the server streams the authoritative chat id + // on the assistant message metadata at the `start` chunk (message.metadata. + // chatId — see adopt-chat-id.ts / chatStreamMetadata). Forward it to the parent + // AS SOON AS it appears (mid-stream), so a brand-new chat adopts its real id + // WHILE the first turn is still streaming and activeChatId-gated affordances + // (the Copy/export button) light up immediately, instead of only at onFinish. + // Keyed by the last-seen id so we forward each distinct id exactly once. The + // parent's onServerChatId is idempotent and a no-op once the chat has an id. + const lastForwardedChatIdRef = useRef<string | undefined>(undefined); + useEffect(() => { + if (!onServerChatId) return; + const tail = messages[messages.length - 1]; + if (tail?.role !== "assistant") return; + const serverChatId = extractServerChatId(tail); + if (!serverChatId || serverChatId === lastForwardedChatIdRef.current) + return; + lastForwardedChatIdRef.current = serverChatId; + onServerChatId(serverChatId); + }, [messages, onServerChatId]); + // Live "turn was interrupted" marker for the CURRENT session. The red error // banner (driven by `error`) covers the error case; this covers an aborted // turn, distinguishing a manual Stop (`isAbort`) from a dropped connection @@ -328,44 +328,6 @@ export default function ChatThread({ // the SAME on-screen banner text can be mirrored into the export (issue #160). const errorView = error ? describeChatError(error.message ??
"", t) : null; - // The exact banner the user sees under the message list, flattened to a single - // string for the "Copy chat" export so the artifact records the interruption - // WYSIWYG. Mirrors the JSX precedence below: error first, else the stop notice. - const banner = errorView - ? errorView.detail - ? `${errorView.title} — ${errorView.detail}` - : errorView.title - : stopNotice === "manual" - ? t("Response stopped.") - : stopNotice === "disconnect" - ? t("Connection lost — the answer was interrupted.") - : null; - - // Mirror the live useChat snapshot into the parent-owned ref so the export - // (handled in AiChatWindow) can include the in-progress streaming turn AND the - // on-screen banner. The cleanup clears the ref on unmount so a thread torn down - // by `key` on chat switch can't leak its (possibly still-streaming) tail into - // the next chat's export before the new thread's effect repopulates the ref. - useEffect(() => { - if (!liveStateRef) return; - liveStateRef.current = { messages, isStreaming, banner }; - return () => { - liveStateRef.current = { messages: [], isStreaming: false, banner: null }; - }; - }, [liveStateRef, messages, isStreaming, banner]); - - // Reactively report "the live thread has content" to the parent. `liveStateRef` - // above is a ref (deliberately non-reactive so streaming deltas don't re-render - // the parent), so the export button needs a SEPARATE reactive signal to flip on - // for a not-yet-persisted chat. Keyed on the boolean only — identical values are - // a no-op setState in the parent, so this does not add per-delta re-renders. 
- const hasLiveContent = messages.length > 0; - useEffect(() => { - if (!onLiveContentChange) return; - onLiveContentChange(hasLiveContent); - return () => onLiveContentChange(false); - }, [onLiveContentChange, hasLiveContent]); - // Report the live turn-token total to the parent header badge, THROTTLED to // ~8 Hz so the parent re-renders a few times a second instead of on every // streamed delta. The tail assistant message's reasoning+output (estimate while diff --git a/apps/client/src/features/ai-chat/hooks/use-chat-session.test.tsx b/apps/client/src/features/ai-chat/hooks/use-chat-session.test.tsx index 8104d1e6..0080cc80 100644 --- a/apps/client/src/features/ai-chat/hooks/use-chat-session.test.tsx +++ b/apps/client/src/features/ai-chat/hooks/use-chat-session.test.tsx @@ -64,7 +64,10 @@ describe("useChatSession", () => { result.current.onTurnFinished(undefined); expect(setActiveChatId).not.toHaveBeenCalled(); // The refetch lands with the new row => adopt it. - rerender({ activeChatId: null, chats: { items: [{ id: "x" }, { id: "new" }] } }); + rerender({ + activeChatId: null, + chats: { items: [{ id: "x" }, { id: "new" }] }, + }); expect(setActiveChatId).toHaveBeenCalledWith("new"); }); @@ -88,7 +91,10 @@ describe("useChatSession", () => { }); result.current.onTurnFinished(undefined); // a was deleted, new was added — same length, but membership changed. - rerender({ activeChatId: null, chats: { items: [{ id: "b" }, { id: "new" }] } }); + rerender({ + activeChatId: null, + chats: { items: [{ id: "b" }, { id: "new" }] }, + }); expect(setActiveChatId).toHaveBeenCalledWith("new"); }); @@ -171,6 +177,40 @@ describe("useChatSession", () => { expect(setActiveChatId).not.toHaveBeenCalledWith("late"); }); + it("#174 early adopt: onServerChatId adopts the streamed id mid-stream (Copy button available during the first turn)", () => { + // Brand-new chat: no id yet. 
The server streams the real chat id "A" on the + // `start` chunk WHILE the first turn is still streaming (before onTurnFinished + // fires at the terminal outcome). The hook must adopt it immediately so the + // window's activeChatId-gated Copy/export button lights up during the stream. + const { result, setActiveChatId } = setup({ + activeChatId: null, + chats: { items: [] }, + }); + result.current.onServerChatId("A"); + expect(setActiveChatId).toHaveBeenCalledWith("A"); + }); + + it("#174 early adopt is in-place: threadKey stays stable (live stream not torn down)", () => { + const chats = { items: [] }; + const { result, rerender } = setup({ activeChatId: null, chats }); + const keyBefore = result.current.threadKey; + result.current.onServerChatId("A"); + // Parent reflects the adopted id back in; the SAME mount key is kept so the + // in-flight useChat store (the streaming turn) is preserved. + rerender({ activeChatId: "A", chats }); + expect(result.current.threadKey).toBe(keyBefore); + }); + + it("#174 early adopt: no-op for an existing chat and for a missing id", () => { + const { result, setActiveChatId } = setup({ + activeChatId: "chat-1", + chats: { items: [{ id: "chat-1" }] }, + }); + result.current.onServerChatId("chat-1"); // already has an id + result.current.onServerChatId(undefined); // no streamed id + expect(setActiveChatId).not.toHaveBeenCalled(); + }); + it("in-place adopt keeps threadKey stable; an external switch remounts", () => { const chats = { items: [{ id: "B" }] }; const { result, rerender } = setup({ activeChatId: null, chats }); diff --git a/apps/client/src/features/ai-chat/hooks/use-chat-session.ts b/apps/client/src/features/ai-chat/hooks/use-chat-session.ts index 998f2631..d21ebd11 100644 --- a/apps/client/src/features/ai-chat/hooks/use-chat-session.ts +++ b/apps/client/src/features/ai-chat/hooks/use-chat-session.ts @@ -34,6 +34,13 @@ export interface UseChatSessionResult { /** Call when a turn finishes; `serverChatId` is the 
authoritative streamed id * (undefined on a failed turn). Handles new-chat id adoption + invalidations. */ onTurnFinished: (serverChatId?: string) => void; + /** Call EARLY (at the stream's `start` chunk) with the authoritative streamed + * chat id so a brand-new chat adopts its real id WHILE its first turn is still + * streaming — making `activeChatId`-gated affordances (e.g. the Copy/export + * button, #174) available immediately. In-place adoption only (same mount key, + * no list/messages invalidation — that is left to onTurnFinished at the end). + * Idempotent and a no-op once the chat already has an id. */ + onServerChatId: (serverChatId?: string) => void; /** Disarm any pending error-path new-chat fallback. The window calls this from * startNewChat/selectChat so a late refetch can't yank the user back into a * just-failed chat after they explicitly moved on. */ @@ -85,13 +92,10 @@ export function useChatSession( // `newThread`/`switchThread` to (re)mount, `adoptThread` for in-place adoption. // Initial: a non-null activeChatId switches to it; a null one gets a fresh // session key with no chat id yet. - const [thread, dispatch] = useReducer( - threadSessionReducer, - undefined, - () => - activeChatId === null - ? newThread(`new-${generateId()}`) - : switchThread(activeChatId), + const [thread, dispatch] = useReducer(threadSessionReducer, undefined, () => + activeChatId === null + ? newThread(`new-${generateId()}`) + : switchThread(activeChatId), ); // Error-path fallback for new-chat id adoption. When a brand-new chat's first @@ -150,6 +154,31 @@ export function useChatSession( [chats, setActiveChatId, onInvalidateChatList, onInvalidateChatMessages], ); + // EARLY adoption (#174): adopt the authoritative streamed chat id the moment + // the server emits it on the `start` chunk, so a brand-new chat gets its real + // `activeChatId` WHILE its first turn streams — not only at terminal + // onTurnFinished. 
This makes the activeChatId-gated Copy/export button + // available during the first turn. Pure in-place adoption (same mount key, like + // the primary path) with NO invalidation: the list/messages refresh stays on + // onTurnFinished at the end of the turn. Reads the live id from the ref so a + // repeat call after adoption is a no-op (resolveAdoptedChatId only fires for a + // still-new chat). + const onServerChatId = useCallback( + (serverChatId?: string) => { + const adopted = resolveAdoptedChatId( + activeChatIdRef.current, + serverChatId, + ); + if (!adopted) return; + activeChatIdRef.current = adopted; + setActiveChatId(adopted); + dispatch({ type: "adopt", chatId: adopted }); + // Early adoption beat the error-path fallback to it — disarm. + pendingNewChatRef.current = null; + }, + [setActiveChatId], + ); + // FALLBACK resolver. Armed only by onTurnFinished when a brand-new chat's first // turn errored before the `start` chunk (no authoritative id streamed). Once // the per-user list refetch lands with the just-created row, adopt the SINGLE @@ -233,6 +262,7 @@ export function useChatSession( threadKey: thread.key, waitingForHistory, onTurnFinished, + onServerChatId, cancelPendingAdoption, }; } diff --git a/apps/client/src/features/ai-chat/services/ai-chat-service.ts b/apps/client/src/features/ai-chat/services/ai-chat-service.ts index 181afc65..cc8e6b5a 100644 --- a/apps/client/src/features/ai-chat/services/ai-chat-service.ts +++ b/apps/client/src/features/ai-chat/services/ai-chat-service.ts @@ -50,6 +50,24 @@ export async function deleteAiChat(chatId: string): Promise { await api.post("/ai-chat/delete", { chatId }); } +/** + * Export a chat to Markdown (#183). The server renders the transcript from the + * persisted rows (the DB is the single source of truth — including an + * interrupted turn's in-progress row, persisted upfront + per step), so the + * client just copies the returned string. 
`lang` localizes the few fixed + * role/tool labels; defaults to English server-side when omitted. + */ +export async function exportAiChat( + chatId: string, + lang?: string, +): Promise<string> { + const req = await api.post<{ markdown: string }>("/ai-chat/export", { + chatId, + lang, + }); + return req.data.markdown; +} + /** * Agent roles API (`/ai-chat/roles`). `list` is available to any workspace * member (for the chat-creation picker); create/update/delete are admin-only @@ -76,6 +94,8 @@ export async function updateAiRole(data: IAiRoleUpdate): Promise { /** Soft-delete a role (admin). */ export async function deleteAiRole(id: string): Promise<{ success: true }> { - const req = await api.post<{ success: true }>("/ai-chat/roles/delete", { id }); + const req = await api.post<{ success: true }>("/ai-chat/roles/delete", { + id, + }); return req.data; } diff --git a/apps/client/src/features/ai-chat/utils/chat-markdown.test.ts b/apps/client/src/features/ai-chat/utils/chat-markdown.test.ts deleted file mode 100644 index a22b2f4f..00000000 --- a/apps/client/src/features/ai-chat/utils/chat-markdown.test.ts +++ /dev/null @@ -1,747 +0,0 @@ -import { describe, it, expect } from "vitest"; -import { buildChatMarkdown } from "@/features/ai-chat/utils/chat-markdown.ts"; -import type { IAiChatMessageRow } from "@/features/ai-chat/types/ai-chat.types.ts"; - -/** - * Tests for the client-only Markdown export builder. The output embeds a live - * `new Date().toISOString()` export timestamp; we never assert that value, only - * the deterministic structure (headings, numbering, fenced blocks, totals). - * - * A pass-through translator keeps role/tool labels predictable so the - * structural assertions are stable without an i18n runtime.
- */ -const t = (key: string, values?: Record): string => { - if (values && typeof values.name === "string") { - return key.replace("{{name}}", values.name); - } - return key; -}; - -function row(partial: Partial): IAiChatMessageRow { - return { - id: partial.id ?? "id", - role: partial.role ?? "user", - content: partial.content ?? null, - metadata: partial.metadata ?? null, - createdAt: partial.createdAt ?? "2026-06-21T00:00:00.000Z", - }; -} - -describe("buildChatMarkdown — structure", () => { - it("emits the title heading, chat id and message count", () => { - const md = buildChatMarkdown({ - title: "My chat", - chatId: "chat-123", - rows: [], - t, - }); - expect(md).toContain("# My chat"); - expect(md).toContain("- Chat ID: `chat-123`"); - expect(md).toContain("- Messages: 0"); - expect(md).toContain("- Exported:"); // timestamp present, value not asserted - }); - - it("falls back to the translated 'Untitled chat' for empty/blank titles", () => { - expect( - buildChatMarkdown({ title: null, chatId: "c", rows: [], t }), - ).toContain("# Untitled chat"); - expect( - buildChatMarkdown({ title: " ", chatId: "c", rows: [], t }), - ).toContain("# Untitled chat"); - }); - - it("numbers rows sequentially with role headings", () => { - const md = buildChatMarkdown({ - title: "t", - chatId: "c", - rows: [ - row({ role: "user", content: "hi" }), - row({ role: "assistant", content: "hello" }), - row({ role: "user", content: "again" }), - ], - t, - }); - expect(md).toContain("## 1. You"); - expect(md).toContain("## 2. AI agent"); - expect(md).toContain("## 3. You"); - // Heading numbering is strictly index+1, not e.g. role-relative. 
- expect(md).not.toContain("## 0."); - }); - - it("renders the per-row text content from `content` when no metadata.parts", () => { - const md = buildChatMarkdown({ - title: "t", - chatId: "c", - rows: [row({ role: "user", content: "plain body" })], - t, - }); - expect(md).toContain("plain body"); - }); -}); - -describe("buildChatMarkdown — text parts", () => { - it("skips empty / whitespace-only text parts", () => { - const md = buildChatMarkdown({ - title: "t", - chatId: "c", - rows: [ - row({ - role: "assistant", - content: "ignored-content", - metadata: { - parts: [ - { type: "text", text: " " }, - { type: "text", text: "" }, - { type: "text", text: "kept line" }, - // eslint-disable-next-line @typescript-eslint/no-explicit-any - ] as any, - }, - }), - ], - t, - }); - expect(md).toContain("kept line"); - // Whitespace-only part contributed no block of its own. - expect(md).not.toContain(" \n\n"); - // When metadata.parts exists, the plain `content` fallback is NOT used. - expect(md).not.toContain("ignored-content"); - }); -}); - -describe("buildChatMarkdown — tool parts", () => { - it("renders a tool label, name, state and fenced Input/Output blocks", () => { - const md = buildChatMarkdown({ - title: "t", - chatId: "c", - rows: [ - row({ - role: "assistant", - content: "", - metadata: { - parts: [ - { - type: "tool-getPage", - state: "output-available", - input: { pageId: "p1" }, - output: { id: "p1", title: "Home" }, - // eslint-disable-next-line @typescript-eslint/no-explicit-any - } as any, - ], - }, - }), - ], - t, - }); - // Known tool name maps to its label key; raw name in backticks; done state. - expect(md).toContain("**Tool: Read page** (`getPage`) — done"); - expect(md).toContain("Input:"); - expect(md).toContain("Output:"); - // Fenced JSON blocks contain the stringified payloads. 
- expect(md).toContain('"pageId": "p1"'); - expect(md).toContain('"title": "Home"'); - expect(md).toContain("```json"); - }); - - it("renders the generic label for an unknown tool and surfaces errorText", () => { - const md = buildChatMarkdown({ - title: "t", - chatId: "c", - rows: [ - row({ - role: "assistant", - content: "", - metadata: { - parts: [ - { - type: "tool-mysteryTool", - state: "output-error", - input: { a: 1 }, - errorText: "boom", - // eslint-disable-next-line @typescript-eslint/no-explicit-any - } as any, - ], - }, - }), - ], - t, - }); - expect(md).toContain( - "**Tool: Ran tool mysteryTool** (`mysteryTool`) — error", - ); - expect(md).toContain("**Error:** boom"); - }); - - it("does not throw on a circular tool input (falls back to String)", () => { - // eslint-disable-next-line @typescript-eslint/no-explicit-any - const circular: any = {}; - circular.self = circular; - expect(() => - buildChatMarkdown({ - title: "t", - chatId: "c", - rows: [ - row({ - role: "assistant", - content: "", - metadata: { - parts: [ - { - type: "tool-getPage", - state: "input-available", - input: circular, - // eslint-disable-next-line @typescript-eslint/no-explicit-any - } as any, - ], - }, - }), - ], - t, - }), - ).not.toThrow(); - }); -}); - -describe("buildChatMarkdown — fence anti-breakout", () => { - it("lengthens the delimiter so embedded ``` cannot break out of the block", () => { - // Tool input whose stringified string form contains a literal ``` run. - const md = buildChatMarkdown({ - title: "t", - chatId: "c", - rows: [ - row({ - role: "assistant", - content: "", - metadata: { - parts: [ - { - type: "tool-getPage", - state: "output-available", - // A bare string passes through stringify() verbatim. 
- input: "before ``` after", - output: "x", - // eslint-disable-next-line @typescript-eslint/no-explicit-any - } as any, - ], - }, - }), - ], - t, - }); - // The fence around the 3-backtick content must use at least 4 backticks so - // the embedded ``` run cannot terminate the block. - expect(md).toContain("````json\nbefore ``` after\n````"); - // Robust anti-breakout check: the opening fence delimiter is strictly - // longer than the longest backtick run inside the wrapped content. (A naive - // `not.toContain("```json...")` is a false negative — a 4-backtick fence - // textually contains the 3-backtick substring.) - const open = md.match(/(`{3,})json\nbefore/); - expect(open).not.toBeNull(); - expect(open![1].length).toBeGreaterThan(3); // > the 3-backtick run in content - }); - - it("uses a 5-backtick fence when the content has a 4-backtick run", () => { - const md = buildChatMarkdown({ - title: "t", - chatId: "c", - rows: [ - row({ - role: "assistant", - content: "", - metadata: { - parts: [ - { - type: "tool-getPage", - state: "output-available", - input: "a ```` b", - // eslint-disable-next-line @typescript-eslint/no-explicit-any - } as any, - ], - }, - }), - ], - t, - }); - expect(md).toContain("`````json\na ```` b\n`````"); - }); -}); - -describe("buildChatMarkdown — token totals", () => { - it("prints the total-tokens line only when the summed usage is > 0", () => { - const withTokens = buildChatMarkdown({ - title: "t", - chatId: "c", - rows: [ - row({ - role: "assistant", - content: "x", - metadata: { usage: { inputTokens: 10, outputTokens: 5 } }, - }), - ], - t, - }); - expect(withTokens).toContain("- Total tokens: 15"); - // Per-row usage footer too. 
- expect(withTokens).toContain("_Tokens — in: 10, out: 5, total: 15_"); - }); - - it("omits the total-tokens line when the sum is 0 / usage absent", () => { - const noTokens = buildChatMarkdown({ - title: "t", - chatId: "c", - rows: [ - row({ role: "user", content: "hi" }), - row({ - role: "assistant", - content: "x", - metadata: { usage: { inputTokens: 0, outputTokens: 0 } }, - }), - ], - t, - }); - expect(noTokens).not.toContain("- Total tokens:"); - }); - - it("uses totalTokens when present rather than summing in/out", () => { - const md = buildChatMarkdown({ - title: "t", - chatId: "c", - rows: [ - row({ - role: "assistant", - content: "x", - metadata: { - usage: { inputTokens: 3, outputTokens: 4, totalTokens: 99 }, - }, - }), - ], - t, - }); - expect(md).toContain("- Total tokens: 99"); - }); - - it("appends the reasoning figure to the row footer when reasoningTokens > 0", () => { - const md = buildChatMarkdown({ - title: "t", - chatId: "c", - rows: [ - row({ - role: "assistant", - content: "x", - metadata: { - usage: { inputTokens: 10, outputTokens: 8, reasoningTokens: 3 }, - }, - }), - ], - t, - }); - expect(md).toContain("_Tokens — in: 10, out: 8, reasoning: 3, total: 18_"); - }); - - it("omits the reasoning figure when reasoningTokens is 0 / absent", () => { - const zero = buildChatMarkdown({ - title: "t", - chatId: "c", - rows: [ - row({ - role: "assistant", - content: "x", - metadata: { - usage: { inputTokens: 10, outputTokens: 5, reasoningTokens: 0 }, - }, - }), - ], - t, - }); - expect(zero).toContain("_Tokens — in: 10, out: 5, total: 15_"); - expect(zero).not.toContain("reasoning:"); - - const absent = buildChatMarkdown({ - title: "t", - chatId: "c", - rows: [ - row({ - role: "assistant", - content: "x", - metadata: { usage: { inputTokens: 10, outputTokens: 5 } }, - }), - ], - t, - }); - expect(absent).not.toContain("reasoning:"); - }); -}); - -// A minimal on-screen (live) message, matching the subset buildChatMarkdown reads. 
-function live(partial: { - id?: string; - role?: string; - parts?: { type: string; text?: string }[]; - metadata?: { usage?: Record; error?: string }; -}) { - return { - id: partial.id ?? "live-id", - role: partial.role ?? "assistant", - parts: partial.parts ?? [], - metadata: partial.metadata, - }; -} - -describe("buildChatMarkdown — live (WYSIWYG) source", () => { - it("uses the live messages as the document (what's on screen), numbered from 1", () => { - const md = buildChatMarkdown({ - title: "t", - chatId: "c", - // Persisted rows hold only the user turn; the assistant reply is live-only. - rows: [row({ id: "u1", role: "user", content: "persisted user" })], - live: [ - live({ - id: "u1", - role: "user", - parts: [{ type: "text", text: "on-screen user" }], - }), - live({ - id: "a1", - role: "assistant", - parts: [{ type: "text", text: "on-screen reply" }], - }), - ], - isStreaming: false, - t, - }); - expect(md).toContain("## 1. You"); - expect(md).toContain("## 2. AI agent"); - expect(md).toContain("on-screen user"); - expect(md).toContain("on-screen reply"); - // Message count reflects the LIVE document, not rows + live. - expect(md).toContain("- Messages: 2"); - }); - - it("captures a partial reply from an interrupted (non-streaming) turn — no 'generating' note", () => { - const md = buildChatMarkdown({ - title: "t", - chatId: "c", - rows: [row({ id: "u1", role: "user", content: "q" })], - live: [ - live({ id: "u1", role: "user", parts: [{ type: "text", text: "q" }] }), - live({ - id: "a-live", - role: "assistant", - parts: [{ type: "text", text: "partial plan before the drop" }], - }), - ], - isStreaming: false, // the stream dropped — not streaming anymore - banner: "Connection lost — the answer was interrupted.", - t, - }); - // The partial assistant answer that was on screen IS in the export. - expect(md).toContain("partial plan before the drop"); - // It is NOT flagged still-generating (the turn is over, just interrupted). 
- expect(md).not.toContain("still being generated"); - // The on-screen banner is recorded at the end. - expect(md).toContain("Connection lost — the answer was interrupted."); - }); - - it("flags ONLY the tail assistant as still generating, and only while streaming", () => { - const streaming = buildChatMarkdown({ - title: "t", - chatId: "c", - rows: [], - live: [ - live({ - id: "a", - role: "assistant", - parts: [{ type: "text", text: "done earlier" }], - }), - live({ - id: "u", - role: "user", - parts: [{ type: "text", text: "next q" }], - }), - live({ - id: "b", - role: "assistant", - parts: [{ type: "text", text: "streaming now" }], - }), - ], - isStreaming: true, - t, - }); - // Exactly one "still being generated" note (the tail assistant). - expect(streaming.match(/still being generated/g)?.length).toBe(1); - - const idle = buildChatMarkdown({ - title: "t", - chatId: "c", - rows: [], - live: [ - live({ - id: "b", - role: "assistant", - parts: [{ type: "text", text: "final" }], - }), - ], - isStreaming: false, - t, - }); - expect(idle).not.toContain("still being generated"); - }); - - it("does NOT flag a completed assistant as generating when the streaming tail is a user message", () => { - // The `status === "submitted"` window: the user just sent, isStreaming is - // already true, but the new assistant turn has no message yet so the tail is - // the USER message. The previous assistant answer is complete on screen and - // must not be marked still-generating (WYSIWYG; regression for #160 review). 
- const md = buildChatMarkdown({ - title: "t", - chatId: "c", - rows: [], - live: [ - live({ - id: "a", - role: "assistant", - parts: [{ type: "text", text: "completed answer" }], - }), - live({ - id: "u", - role: "user", - parts: [{ type: "text", text: "the new question" }], - }), - ], - isStreaming: true, - t, - }); - expect(md).toContain("completed answer"); - expect(md).not.toContain("still being generated"); - }); - - it("emits the heading + note for a streaming tail assistant with empty parts", () => { - const md = buildChatMarkdown({ - title: "t", - chatId: "c", - rows: [row({ id: "u1", role: "user", content: "q" })], - live: [ - live({ id: "u1", role: "user", parts: [{ type: "text", text: "q" }] }), - live({ id: "a-live", role: "assistant", parts: [] }), - ], - isStreaming: true, - t, - }); - expect(md).toContain("## 2. AI agent"); - expect(md).toContain("still being generated"); - }); -}); - -describe("buildChatMarkdown — live enrichment from persisted rows", () => { - it("pulls usage / error / timestamp from the persisted row matched by id", () => { - const md = buildChatMarkdown({ - title: "t", - chatId: "c", - rows: [ - row({ - id: "a1", - role: "assistant", - content: "x", - createdAt: "2026-06-22T10:00:00.000Z", - metadata: { - usage: { inputTokens: 10, outputTokens: 5 }, - error: "rate limited", - }, - }), - ], - live: [ - // Same id as the persisted row, but no usage/error/timestamp on the live msg. - live({ - id: "a1", - role: "assistant", - parts: [{ type: "text", text: "reply" }], - }), - ], - isStreaming: false, - t, - }); - expect(md).toContain("reply"); - // Token footer + total come from the enriched row. - expect(md).toContain("_Tokens — in: 10, out: 5, total: 15_"); - expect(md).toContain("- Total tokens: 15"); - expect(md).toContain("**⚠️ Error:** rate limited"); - // The persisted timestamp is carried into the export. 
- expect(md).toContain(""); - }); - - it("prefers authoritative usage already on the live message over the row's", () => { - const md = buildChatMarkdown({ - title: "t", - chatId: "c", - rows: [ - row({ - id: "a1", - role: "assistant", - content: "x", - metadata: { - usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, - }, - }), - ], - live: [ - live({ - id: "a1", - role: "assistant", - parts: [{ type: "text", text: "reply" }], - metadata: { - usage: { inputTokens: 100, outputTokens: 50, totalTokens: 150 }, - }, - }), - ], - isStreaming: false, - t, - }); - // The live (authoritative, freshest) usage wins, not the stale row usage. - expect(md).toContain("- Total tokens: 150"); - expect(md).not.toContain("- Total tokens: 2"); - }); - - it("a current-turn live message with no matching row renders without a footer", () => { - const md = buildChatMarkdown({ - title: "t", - chatId: "c", - rows: [row({ id: "u1", role: "user", content: "q" })], - live: [ - live({ id: "u1", role: "user", parts: [{ type: "text", text: "q" }] }), - live({ - id: "a-live", - role: "assistant", - parts: [{ type: "text", text: "fresh reply" }], - }), - ], - isStreaming: false, - t, - }); - expect(md).toContain("fresh reply"); - // No persisted row for the live assistant -> no token footer, no timestamp. - expect(md).not.toContain("_Tokens —"); - expect(md).not.toContain(""); - }); -}); - -describe("buildChatMarkdown — fallback + banner", () => { - it("falls back to the persisted rows when there are no live messages", () => { - const md = buildChatMarkdown({ - title: "t", - chatId: "c", - rows: [ - row({ role: "user", content: "from rows" }), - row({ - role: "assistant", - content: "answer", - metadata: { usage: { inputTokens: 4, outputTokens: 6 } }, - }), - ], - live: [], // empty live mirror -> fallback path - isStreaming: false, - t, - }); - expect(md).toContain("## 1. You"); - expect(md).toContain("## 2. 
AI agent"); - expect(md).toContain("from rows"); - expect(md).toContain("- Messages: 2"); - expect(md).toContain("- Total tokens: 10"); - }); - - it("appends the on-screen banner once, after the messages", () => { - const md = buildChatMarkdown({ - title: "t", - chatId: "c", - rows: [row({ role: "user", content: "q" })], - live: [ - live({ id: "u", role: "user", parts: [{ type: "text", text: "q" }] }), - ], - isStreaming: false, - banner: "Rate limit reached — try again shortly.", - t, - }); - expect(md).toContain("_⚠️ Rate limit reached — try again shortly._"); - // Banner comes after the (only) message block. - expect(md.indexOf("Rate limit reached")).toBeGreaterThan( - md.indexOf("## 1."), - ); - }); - - it("omits the banner block when there is no banner", () => { - const md = buildChatMarkdown({ - title: "t", - chatId: "c", - rows: [row({ role: "user", content: "q" })], - live: [ - live({ id: "u", role: "user", parts: [{ type: "text", text: "q" }] }), - ], - isStreaming: false, - banner: null, - t, - }); - expect(md).not.toContain("_⚠️"); - }); -}); - -// #174: a brand-new, not-yet-persisted chat whose first turn is streaming (or was -// interrupted) has live messages but NO persisted rows yet, and its chat id is not -// known (the caller passes a placeholder). The export must still capture the -// on-screen thread WYSIWYG from the live messages alone. -describe("buildChatMarkdown — first-turn export with no persisted base (#174)", () => { - it("builds the document from live messages alone when rows are empty", () => { - const md = buildChatMarkdown({ - title: null, - chatId: "unsaved", - rows: [], - live: [ - live({ - id: "u1", - role: "user", - parts: [{ type: "text", text: "hello" }], - }), - live({ - id: "a1", - role: "assistant", - parts: [{ type: "text", text: "partial reply" }], - }), - ], - isStreaming: true, - t, - }); - // Both on-screen messages are serialized, numbered from 1. - expect(md).toContain("## 1. 
You"); - expect(md).toContain("hello"); - expect(md).toContain("## 2. AI agent"); - expect(md).toContain("partial reply"); - // The streaming tail assistant is flagged as in-progress. - expect(md).toContain("still being generated"); - // The placeholder chat id and the live message count are recorded. - expect(md).toContain("- Chat ID: `unsaved`"); - expect(md).toContain("- Messages: 2"); - // No persisted timestamp exists for a current-turn live message. - expect(md).not.toContain("`); - - blocks.push(...renderMessageParts(item.parts, t)); - - // A generating assistant may have empty/no parts yet — the heading (above) - // and this note still record the in-progress turn. - if (item.generating) { - blocks.push( - "_⏳ This message is still being generated — the export captured a partial, in-progress response._", - ); - } - - // A persisted per-message error (the raw provider text) may coexist with the - // trailing `banner` (the classified on-screen alert) when the failed turn's - // row has already been refetched by export time. They describe the same - // failure at different fidelity; showing both is an accepted, minor redundancy. - if (item.error) { - blocks.push(`**⚠️ Error:** ${item.error}`); - } - - const usage = item.usage; - if (usage) { - const total = usage.totalTokens ?? rowTokens(usage); - // Reasoning (thinking) tokens are shown only when the provider reported a - // positive count; old rows / non-reasoning providers omit it. - const reasoning = - usage.reasoningTokens && usage.reasoningTokens > 0 - ? `, reasoning: ${usage.reasoningTokens}` - : ""; - blocks.push( - `_Tokens — in: ${usage.inputTokens ?? "?"}, out: ${usage.outputTokens ?? "?"}${reasoning}, total: ${total}_`, - ); - } - }); - - // Record the on-screen banner (error / dropped connection / manual stop) so - // the export reflects exactly what the user saw, including an interruption. 
- if (banner && banner.trim().length > 0) { - blocks.push("---"); - blocks.push(`_⚠️ ${banner.trim()}_`); - } - - // Blank line between blocks so the Markdown renders cleanly. - return blocks.join("\n\n"); -} diff --git a/apps/server/src/core/ai-chat/ai-chat.controller.export.spec.ts b/apps/server/src/core/ai-chat/ai-chat.controller.export.spec.ts new file mode 100644 index 00000000..f46aeaa0 --- /dev/null +++ b/apps/server/src/core/ai-chat/ai-chat.controller.export.spec.ts @@ -0,0 +1,159 @@ +import { ForbiddenException } from '@nestjs/common'; +import { AiChatController } from './ai-chat.controller'; +import { + planFinalizeAssistant, + applyFinalize, + flushAssistant, + type AssistantFlush, +} from './ai-chat.service'; +import type { User, Workspace } from '@docmost/db/types/entity.types'; + +/** + * Wiring spec for the #183 `POST /ai-chat/export` endpoint. It must: own-gate via + * the chat lookup (workspace-scoped + creator-owned), load the FULL transcript + * via findAllByChat, render server-side, and return `{ markdown }`. Exercised by + * instantiating the controller with hand-rolled mocks — no Nest graph, no DB. + */ +describe('AiChatController.export', () => { + const user = { id: 'u1' } as User; + const workspace = { id: 'ws1' } as Workspace; + + function makeController( + over: { + chat?: unknown; + rows?: unknown[]; + } = {}, + ) { + const chat = + 'chat' in over + ? over.chat + : { id: 'c1', creatorId: 'u1', title: 'My chat' }; + const aiChatRepo = { + findById: jest.fn().mockResolvedValue(chat), + }; + const aiChatMessageRepo = { + findAllByChat: jest.fn().mockResolvedValue( + over.rows ?? 
[ + { + id: 'm1', + role: 'user', + content: 'hi', + metadata: null, + status: null, + }, + { + id: 'm2', + role: 'assistant', + content: 'hello', + metadata: null, + status: 'completed', + }, + ], + ), + }; + const controller = new AiChatController( + {} as never, + aiChatRepo as never, + aiChatMessageRepo as never, + {} as never, + ); + return { controller, aiChatRepo, aiChatMessageRepo }; + } + + it('renders the full transcript and returns { markdown }', async () => { + const { controller, aiChatMessageRepo } = makeController(); + const res = await controller.export({ chatId: 'c1' }, user, workspace); + expect(aiChatMessageRepo.findAllByChat).toHaveBeenCalledWith('c1', 'ws1'); + expect(res.markdown).toContain('# My chat'); + expect(res.markdown).toContain('## 1. You'); + expect(res.markdown).toContain('## 2. AI agent'); + }); + + it('forbids a chat the user does not own', async () => { + const { controller } = makeController({ + chat: { id: 'c1', creatorId: 'someone-else', title: 'X' }, + }); + await expect( + controller.export({ chatId: 'c1' }, user, workspace), + ).rejects.toBeInstanceOf(ForbiddenException); + }); + + it('forbids a missing / foreign-workspace chat', async () => { + const { controller } = makeController({ chat: null }); + await expect( + controller.export({ chatId: 'c1' }, user, workspace), + ).rejects.toBeInstanceOf(ForbiddenException); + }); + + it('localizes labels when lang=ru is passed', async () => { + const { controller } = makeController(); + const res = await controller.export( + { chatId: 'c1', lang: 'ru' }, + user, + workspace, + ); + expect(res.markdown).toContain('## 1. Вы'); + expect(res.markdown).toContain('## 2. ИИ-агент'); + }); +}); + +/** + * The terminal-finalize dispatch (#183): the assistant row is INSERTed upfront + * as 'streaming' and finalized once on the terminal callback. 
When the upfront + * insert SUCCEEDED (we hold an id) finalize UPDATEs that row; when it FAILED + * (assistantId is undefined) finalize falls back to INSERTing the terminal row + * so the turn is not lost — the only safety against losing the turn entirely. + * + * `planFinalizeAssistant` is the pure decision; `applyFinalize` is the REAL + * dispatch the service uses, exercised here over a mock repo (not a copy of the + * logic) so a production drift would fail the test (#186 review). + */ +describe('finalizeAssistant dispatch (planFinalizeAssistant + applyFinalize)', () => { + const workspaceId = 'ws1'; + + // Drive the SAME applyFinalize the service calls (no duplicated logic). + async function dispatchFinalize( + repo: { insert: jest.Mock; update: jest.Mock }, + assistantId: string | undefined, + flushed: AssistantFlush, + ): Promise { + await applyFinalize( + repo, + planFinalizeAssistant(assistantId), + { chatId: 'c1', workspaceId, userId: 'u1' }, + flushed, + ); + } + + it('plan: update when the upfront insert returned an id', () => { + expect(planFinalizeAssistant('a1')).toEqual({ kind: 'update', id: 'a1' }); + }); + + it('plan: insert (fallback) when there is no upfront id', () => { + expect(planFinalizeAssistant(undefined)).toEqual({ kind: 'insert' }); + }); + + it('(a) upfront insert succeeded -> finalize UPDATEs the row by id', async () => { + const repo = { insert: jest.fn(), update: jest.fn() }; + const flushed = flushAssistant([], 'final answer', 'completed', { + finishReason: 'stop', + }); + await dispatchFinalize(repo, 'a1', flushed); + expect(repo.update).toHaveBeenCalledWith('a1', workspaceId, flushed); + expect(repo.insert).not.toHaveBeenCalled(); + }); + + it('(b) upfront insert failed -> finalize INSERTs the terminal payload', async () => { + const repo = { insert: jest.fn(), update: jest.fn() }; + const flushed = flushAssistant([], 'partial', 'error', { error: 'boom' }); + await dispatchFinalize(repo, undefined, flushed); + 
expect(repo.update).not.toHaveBeenCalled(); + expect(repo.insert).toHaveBeenCalledTimes(1); + const arg = repo.insert.mock.calls[0][0]; + // The fallback insert carries the terminal content/status/metadata. + expect(arg.role).toBe('assistant'); + expect(arg.content).toBe('partial'); + expect(arg.status).toBe('error'); + expect((arg.metadata as { error?: string }).error).toBe('boom'); + }); +}); diff --git a/apps/server/src/core/ai-chat/ai-chat.controller.ts b/apps/server/src/core/ai-chat/ai-chat.controller.ts index a8ddccb1..0f243dec 100644 --- a/apps/server/src/core/ai-chat/ai-chat.controller.ts +++ b/apps/server/src/core/ai-chat/ai-chat.controller.ts @@ -20,7 +20,7 @@ import { JwtAuthGuard } from '../../common/guards/jwt-auth.guard'; import { AuthUser } from '../../common/decorators/auth-user.decorator'; import { AuthWorkspace } from '../../common/decorators/auth-workspace.decorator'; import { SkipTransform } from '../../common/decorators/skip-transform.decorator'; -import { User, Workspace } from '@docmost/db/types/entity.types'; +import { AiChat, User, Workspace } from '@docmost/db/types/entity.types'; import { PaginationOptions } from '@docmost/db/pagination/pagination-options'; import { AiChatRepo } from '@docmost/db/repos/ai-chat/ai-chat.repo'; import { AiChatMessageRepo } from '@docmost/db/repos/ai-chat/ai-chat-message.repo'; @@ -31,10 +31,12 @@ import { AiChatService, AiChatStreamBody } from './ai-chat.service'; import { AiTranscriptionService } from './ai-transcription.service'; import { ChatIdDto, + ExportChatDto, GetChatMessagesDto, RenameChatDto, } from './dto/ai-chat.dto'; import { describeProviderError } from '../../integrations/ai/ai-error.util'; +import { buildChatMarkdown } from './chat-markdown.util'; /** * Per-user AI chat API (§6.1). Routes are POST to match this codebase's @@ -81,6 +83,36 @@ export class AiChatController { ); } + /** + * Export a chat to Markdown (#183). 
The DB is the single source of truth: the + * whole transcript is loaded (oldest -> newest) and rendered server-side. Now + * that the assistant row is persisted upfront and per step, an interrupted + * turn is included up to its last finished step. Workspace-scoped and owner- + * gated via assertOwnedChat (same as the other read endpoints). Returns + * `{ markdown }`. `lang` localizes the few fixed labels (default English). + */ + @HttpCode(HttpStatus.OK) + @Post('export') + async export( + @Body() dto: ExportChatDto, + @AuthUser() user: User, + @AuthWorkspace() workspace: Workspace, + ): Promise<{ markdown: string }> { + const chat = await this.assertOwnedChat(dto.chatId, user, workspace); + const rows = await this.aiChatMessageRepo.findAllByChat( + dto.chatId, + workspace.id, + ); + const markdown = buildChatMarkdown({ + title: chat.title ?? null, + chatId: dto.chatId, + rows, + // normalizeLang(undefined) already yields 'en', so no `?? 'en'` is needed. + lang: dto.lang, + }); + return { markdown }; + } + /** Rename a chat. */ @HttpCode(HttpStatus.OK) @Post('rename') @@ -90,7 +122,11 @@ export class AiChatController { @AuthWorkspace() workspace: Workspace, ) { await this.assertOwnedChat(dto.chatId, user, workspace); - await this.aiChatRepo.update(dto.chatId, { title: dto.title }, workspace.id); + await this.aiChatRepo.update( + dto.chatId, + { title: dto.title }, + workspace.id, + ); return { success: true }; } @@ -145,7 +181,10 @@ export class AiChatController { // Resolve the agent role for this turn BEFORE hijack: existing chats read it // from ai_chats.role_id (authoritative), a new chat from body.roleId. The // role drives both the persona and the optional model override below. 
- const role = await this.aiChatService.resolveRoleForRequest(workspace, body); + const role = await this.aiChatService.resolveRoleForRequest( + workspace, + body, + ); // Resolve the model (applying the role's optional override) BEFORE hijack so // an unconfigured provider — including a role pointing at an unconfigured @@ -232,7 +271,9 @@ export class AiChatController { let file = null; try { // Whisper hard-caps uploads at 25MB; allow a single file. - file = await req.file({ limits: { fileSize: 25 * 1024 * 1024, files: 1 } }); + file = await req.file({ + limits: { fileSize: 25 * 1024 * 1024, files: 1 }, + }); } catch (err: any) { if (err?.statusCode === 413) { throw new BadRequestException('Audio file too large (max 25MB)'); @@ -283,11 +324,12 @@ export class AiChatController { chatId: string, user: User, workspace: Workspace, - ): Promise { + ): Promise { const chat = await this.aiChatRepo.findById(chatId, workspace.id); if (!chat || chat.creatorId !== user.id) { throw new ForbiddenException(); } + return chat; } } diff --git a/apps/server/src/core/ai-chat/ai-chat.service.lifecycle.spec.ts b/apps/server/src/core/ai-chat/ai-chat.service.lifecycle.spec.ts new file mode 100644 index 00000000..77e9d3c4 --- /dev/null +++ b/apps/server/src/core/ai-chat/ai-chat.service.lifecycle.spec.ts @@ -0,0 +1,61 @@ +import { Logger } from '@nestjs/common'; +import { AiChatService } from './ai-chat.service'; + +/** + * Lifecycle unit tests for AiChatService.onModuleInit (#183 crash-recovery + * sweep). The sweep is BEST-EFFORT: a failure must be logged (warn) but must + * NEVER throw out of onModuleInit and block server startup. Exercised with a + * hand-rolled mock repo — no Nest graph, no DB. Only `aiChatMessageRepo` is + * touched by onModuleInit, so the other constructor deps are stubbed as never. 
+ */ +describe('AiChatService.onModuleInit (startup sweep)', () => { + function makeService(sweepStreaming: jest.Mock) { + const aiChatMessageRepo = { sweepStreaming }; + const service = new AiChatService( + {} as never, // ai + {} as never, // aiChatRepo + aiChatMessageRepo as never, + {} as never, // aiSettings + {} as never, // tools + {} as never, // mcpClients + {} as never, // aiAgentRoleRepo + {} as never, // pageRepo + {} as never, // pageAccess + ); + return { service, aiChatMessageRepo }; + } + + afterEach(() => jest.restoreAllMocks()); + + it('happy path: calls sweepStreaming and resolves', async () => { + const sweepStreaming = jest.fn().mockResolvedValue(0); + const { service } = makeService(sweepStreaming); + await expect(service.onModuleInit()).resolves.toBeUndefined(); + expect(sweepStreaming).toHaveBeenCalledTimes(1); + }); + + it('logs how many rows were swept when > 0', async () => { + const sweepStreaming = jest.fn().mockResolvedValue(3); + const logSpy = jest + .spyOn(Logger.prototype, 'log') + .mockImplementation(() => undefined); + const { service } = makeService(sweepStreaming); + await service.onModuleInit(); + expect(logSpy).toHaveBeenCalledTimes(1); + expect(String(logSpy.mock.calls[0][0])).toContain('3'); + }); + + it('sweepStreaming throws -> onModuleInit resolves (does NOT throw) and warns', async () => { + const sweepStreaming = jest + .fn() + .mockRejectedValue(new Error('db unavailable')); + const warnSpy = jest + .spyOn(Logger.prototype, 'warn') + .mockImplementation(() => undefined); + const { service } = makeService(sweepStreaming); + // Must not throw — a sweep failure may never block startup. 
+ await expect(service.onModuleInit()).resolves.toBeUndefined(); + expect(warnSpy).toHaveBeenCalledTimes(1); + expect(String(warnSpy.mock.calls[0][0])).toContain('db unavailable'); + }); +}); diff --git a/apps/server/src/core/ai-chat/ai-chat.service.spec.ts b/apps/server/src/core/ai-chat/ai-chat.service.spec.ts index 31281fd4..bfeafb97 100644 --- a/apps/server/src/core/ai-chat/ai-chat.service.spec.ts +++ b/apps/server/src/core/ai-chat/ai-chat.service.spec.ts @@ -6,7 +6,7 @@ import { serializeSteps, rowToUiMessage, prepareAgentStep, - buildPartialAssistantRecord, + flushAssistant, chatStreamMetadata, accumulateStepUsage, MAX_AGENT_STEPS, @@ -233,101 +233,108 @@ describe('prepareAgentStep', () => { // The synthesis instruction is appended. expect(result?.system).toContain(FINAL_STEP_INSTRUCTION); }); - - it('pins the off-by-one boundary (MAX-2 is not final, MAX-1 is)', () => { - // Boundary expressed via the constant, not a hardcoded 18/19, so the test - // tracks MAX_AGENT_STEPS if the cap ever changes. - expect(prepareAgentStep(MAX_AGENT_STEPS - 2, 'SYS')).toBeUndefined(); - const atBoundary = prepareAgentStep(MAX_AGENT_STEPS - 1, 'SYS'); - expect(atBoundary).toBeDefined(); - expect(atBoundary?.toolChoice).toBe('none'); - }); }); /** - * Unit test for buildPartialAssistantRecord: the pure helper that shapes the - * assistant-message record persisted on a partial/failed turn (the streamText - * onError / onAbort paths). It captures the PARTIAL answer the user already saw - * (finished steps' text + tool parts, plus the in-progress step's text) so a - * provider error / disconnect no longer throws the streamed answer away. Pinning - * the record shape here covers the persist-partial logic without seaming - * streamText itself. + * flushAssistant (#183): the PURE row builder behind the step-granular durable + * write path. 
It runs identically for the upfront insert (empty steps, + * 'streaming'), every per-step update, and the terminal finalize — so a future + * background worker can call the same function. These tests pin the four status + * shapes and the `metadata.parts` shape that rowToUiMessage/findRecent depend on + * (per-step text + tool parts via assistantParts, in-progress text appended). */ -describe('buildPartialAssistantRecord', () => { +describe('flushAssistant', () => { type AnyPart = Record; - it('records an empty turn with the error text (preserves old behavior)', () => { - const rec = buildPartialAssistantRecord( - [], - '', - 'error', - '401: Unauthorized', - ); - expect(rec).toEqual({ - text: '', - toolCalls: null, - metadata: { - finishReason: 'error', - parts: [], - error: '401: Unauthorized', - }, - }); + const toolStep = { + text: 'looked it up', + toolCalls: [{ toolCallId: 'c1', toolName: 'getPage', input: { id: 'p1' } }], + toolResults: [ + { toolCallId: 'c1', toolName: 'getPage', output: { title: 'T' } }, + ], + }; + + it('upfront seed: empty streaming row (no content, no toolCalls, empty parts)', () => { + const f = flushAssistant([], '', 'streaming'); + expect(f.status).toBe('streaming'); + expect(f.content).toBe(''); + expect(f.toolCalls).toBeNull(); + expect(f.metadata.parts).toEqual([]); + // No finishReason while streaming (it is not a terminal state). 
+ expect('finishReason' in f.metadata).toBe(false); }); - it('persists in-progress text (no finished steps) as the partial answer', () => { - const rec = buildPartialAssistantRecord( - [], - 'partial answer', - 'error', - 'boom', - ); - expect(rec.text).toBe('partial answer'); - expect(rec.metadata.parts).toEqual([ + it('streaming update folds in finished steps but keeps status streaming', () => { + const f = flushAssistant([toolStep], '', 'streaming'); + expect(f.status).toBe('streaming'); + expect(f.content).toBe('looked it up'); + const parts = f.metadata.parts as AnyPart[]; + expect(parts).toContainEqual({ type: 'text', text: 'looked it up' }); + const toolPart = parts.find((p) => p.type === 'tool-getPage'); + expect(toolPart!.state).toBe('output-available'); + expect(f.toolCalls).not.toBeNull(); + }); + + it('completed: attaches finishReason + normalized usage + contextTokens', () => { + const f = flushAssistant([toolStep], '', 'completed', { + finishReason: 'stop', + usage: { inputTokens: 10, outputTokens: 5, totalTokens: 15 }, + contextTokens: 15, + }); + expect(f.status).toBe('completed'); + expect(f.metadata.finishReason).toBe('stop'); + expect(f.metadata.usage).toEqual({ + inputTokens: 10, + outputTokens: 5, + totalTokens: 15, + reasoningTokens: undefined, + }); + expect(f.metadata.contextTokens).toBe(15); + }); + + it('error: records the error and a derived finishReason', () => { + const f = flushAssistant([], 'partial answer', 'error', { error: 'boom' }); + expect(f.status).toBe('error'); + expect(f.content).toBe('partial answer'); + expect(f.metadata.error).toBe('boom'); + // Derives finishReason from the terminal status when none is supplied. 
+ expect(f.metadata.finishReason).toBe('error'); + expect(f.metadata.parts).toEqual([ { type: 'text', text: 'partial answer' }, ]); - expect(rec.metadata.error).toBe('boom'); }); - it('combines a finished tool step with trailing in-progress text', () => { - const steps = [ - { - text: 'looked it up', - toolCalls: [ - { toolCallId: 'c1', toolName: 'getPage', input: { id: 'p1' } }, - ], - toolResults: [ - { toolCallId: 'c1', toolName: 'getPage', output: { title: 'T' } }, - ], - }, - ]; - const rec = buildPartialAssistantRecord( - steps, - ' and then', - 'error', - 'boom', - ); - const parts = rec.metadata.parts as AnyPart[]; - // The finished step's text part is present. - expect(parts).toContainEqual({ type: 'text', text: 'looked it up' }); - // The paired tool call+result becomes an output-available part. - const toolPart = parts.find((p) => p.type === 'tool-getPage'); - expect(toolPart).toBeDefined(); - expect(toolPart!.state).toBe('output-available'); - // The in-progress text is appended LAST so the parts match the stream order. 
+ it('aborted: in-progress text appended last, no error key', () => { + const f = flushAssistant([toolStep], ' and then', 'aborted'); + expect(f.status).toBe('aborted'); + expect(f.metadata.finishReason).toBe('aborted'); + expect('error' in f.metadata).toBe(false); + expect(f.content).toBe('looked it up and then'); + const parts = f.metadata.parts as AnyPart[]; expect(parts[parts.length - 1]).toEqual({ type: 'text', text: ' and then', }); - expect(rec.text).toBe('looked it up and then'); - expect(rec.toolCalls).not.toBeNull(); - expect(rec.metadata.error).toBe('boom'); }); - it('omits the error key on the abort path (no errorText)', () => { - const rec = buildPartialAssistantRecord([], 'half', 'aborted'); - expect(rec.metadata.finishReason).toBe('aborted'); - expect('error' in rec.metadata).toBe(false); - expect(rec.text).toBe('half'); + it('combines a finished tool step with trailing in-progress text (error path)', () => { + // The error path captures the PARTIAL answer the user already saw: each + // finished step's text + tool parts, then the in-progress step's text last. + const flushed = flushAssistant([toolStep], ' and then', 'error', { + error: 'boom', + }); + const parts = flushed.metadata.parts as AnyPart[]; + expect(parts).toContainEqual({ type: 'text', text: 'looked it up' }); + const toolPart = parts.find((p) => p.type === 'tool-getPage'); + expect(toolPart!.state).toBe('output-available'); + // In-progress text appended LAST so the parts match the stream order. 
+ expect(parts[parts.length - 1]).toEqual({ + type: 'text', + text: ' and then', + }); + expect(flushed.content).toBe('looked it up and then'); + expect(flushed.toolCalls).not.toBeNull(); + expect(flushed.metadata.error).toBe('boom'); }); }); diff --git a/apps/server/src/core/ai-chat/ai-chat.service.ts b/apps/server/src/core/ai-chat/ai-chat.service.ts index 8a807ba5..5c4b1f0e 100644 --- a/apps/server/src/core/ai-chat/ai-chat.service.ts +++ b/apps/server/src/core/ai-chat/ai-chat.service.ts @@ -1,4 +1,9 @@ -import { ForbiddenException, Injectable, Logger } from '@nestjs/common'; +import { + ForbiddenException, + Injectable, + Logger, + OnModuleInit, +} from '@nestjs/common'; import { FastifyReply } from 'fastify'; import { streamText, @@ -124,7 +129,7 @@ export interface AiChatStreamArgs { * can be rebuilt for `convertToModelMessages`. */ @Injectable() -export class AiChatService { +export class AiChatService implements OnModuleInit { private readonly logger = new Logger(AiChatService.name); constructor( @@ -139,6 +144,32 @@ export class AiChatService { private readonly pageAccess: PageAccessService, ) {} + /** + * Crash-recovery sweep on server start (#183): any assistant row left in the + * 'streaming' state is the relic of a turn whose process died before it + * reached a terminal status. Flip those to 'aborted' so history/export show + * them settled (with whatever finished steps were already persisted) instead + * of perpetually "streaming". Best-effort: a sweep failure is logged but must + * never block server startup. + */ + async onModuleInit(): Promise { + try { + const swept = await this.aiChatMessageRepo.sweepStreaming(); + if (swept > 0) { + this.logger.log( + `Startup sweep: marked ${swept} dangling 'streaming' assistant ` + + `message(s) as 'aborted'.`, + ); + } + } catch (err) { + this.logger.warn( + `Startup sweep of dangling 'streaming' messages failed: ${ + err instanceof Error ? 
err.message : 'unknown error' + }`, + ); + } + } + /** * Resolve the agent role that applies to this stream request, scoped to the * workspace and soft-delete aware. For an EXISTING chat the role is read from @@ -395,31 +426,6 @@ export class AiChatService { const tools = { ...external.tools, ...docmostTools }; - // Persist the assistant message. Used by onFinish (full result) and the - // abort/error paths (partial result). Guarded so we persist at most once. - let persisted = false; - const persistAssistant = async (data: { - text: string; - toolCalls: unknown; - metadata: Record; - }): Promise => { - if (persisted) return; - persisted = true; - try { - await this.aiChatMessageRepo.insert({ - chatId, - workspaceId: workspace.id, - userId: user.id, - role: 'assistant', - content: data.text ?? '', - toolCalls: (data.toolCalls ?? null) as never, - metadata: data.metadata as never, - }); - } catch (err) { - this.logger.error('Failed to persist assistant message', err as Error); - } - }; - // Accumulate the turn's streamed output so a provider error / disconnect can // persist the PARTIAL answer the user already saw — the SDK's onError/onAbort // callbacks don't hand us the in-progress text. `capturedSteps` holds finished @@ -428,6 +434,101 @@ export class AiChatService { const capturedSteps: StepLike[] = []; let inProgressText = ''; + // Step-granular durability (#183): create the assistant row UPFRONT in the + // 'streaming' state (before any token), then UPDATE it as each step finishes + // and finalize it once on the terminal callback. If the process dies + // mid-turn the row survives with every finished step already persisted; the + // startup sweep (sweepStreaming) later flips a dangling 'streaming' row to + // 'aborted'. The DB is now the single source of truth for the turn — the + // socket is never required for the write path. 
A failed upfront insert is + // logged and leaves assistantId undefined; the per-step/terminal updates then + // no-op (guarded below) so the turn still streams to the user. + let assistantId: string | undefined; + try { + const seed = flushAssistant([], '', 'streaming'); + const seeded = await this.aiChatMessageRepo.insert({ + chatId, + workspaceId: workspace.id, + userId: user.id, + role: 'assistant', + content: seed.content, + // jsonb columns: cast through never (same as the user insert above). + toolCalls: (seed.toolCalls ?? null) as never, + metadata: seed.metadata as never, + status: seed.status, + }); + assistantId = seeded?.id; + } catch (err) { + this.logger.error( + `Failed to insert upfront assistant row (chat ${chatId}, workspace ${workspace.id})`, + err as Error, + ); + } + + // Per-step (non-terminal) update: persist the finished steps the moment a + // step ends. Tolerant — a failed update is logged and swallowed so it never + // throws into the stream. Keeps status 'streaming'. + const updateStreaming = async (): Promise => { + if (!assistantId) return; + // Cheap short-circuit once the turn is finalized (see `finalized` below). + // The AUTHORITATIVE guard is `onlyIfStreaming` on the UPDATE: a late + // fire-and-forget step update could still be in flight on another pool + // connection when finalize runs, so the SQL `WHERE status='streaming'` + // (not this flag) is what prevents it clobbering the terminal row. + if (finalized) return; + try { + await this.aiChatMessageRepo.update( + assistantId, + workspace.id, + flushAssistant(capturedSteps, '', 'streaming'), + { onlyIfStreaming: true }, + ); + } catch (err) { + this.logger.warn( + `Failed to update streaming assistant row: ${ + err instanceof Error ? 
err.message : 'unknown error' + }`, + ); + } + }; + + // Serialize the per-step updates (#183 review): onStepFinish fires them + // without await, so two could otherwise commit out of order on different pool + // connections (step N landing after N+1). Chaining each onto the previous + // keeps the persisted row monotonic with step order; each link short-circuits + // on `finalized`, so a tail of late updates is cheap. + let stepUpdateChain: Promise = Promise.resolve(); + + // Terminal finalize: write the completed/error/aborted row exactly once + // across the (mutually-exclusive, at-most-once) onFinish/onError/onAbort + // callbacks — mirroring the pre-#183 persist-at-most-once guard for the + // TERMINAL status (the row may be updated many times with 'streaming' before + // this fires once). + let finalized = false; + const finalizeAssistant = async ( + flushed: AssistantFlush, + ): Promise => { + if (finalized) return; + finalized = true; + const plan = planFinalizeAssistant(assistantId); + try { + // Shared dispatch (see applyFinalize): UPDATE the upfront row, or — when + // the upfront insert failed (kind 'insert') — INSERT the terminal row as + // the only safety against losing the turn entirely. + await applyFinalize( + this.aiChatMessageRepo, + plan, + { chatId, workspaceId: workspace.id, userId: user.id }, + flushed, + ); + } catch (err) { + this.logger.error( + `Failed to finalize assistant message (kind=${plan.kind})`, + err as Error, + ); + } + }; + // DIAGNOSTIC (Safari stream-drop investigation) — temporary. Measure // first-chunk latency, the model-silent gap right before a disconnect, and // how many SSE heartbeats were written, so a Safari drop can be classified @@ -476,6 +577,12 @@ export class AiChatService { // the in-progress accumulator for the next step. 
capturedSteps.push(step as StepLike); inProgressText = ''; + // Step-granular durability (#183): persist this finished step (its text + + // tool calls + tool RESULTS) the moment it ends, so a process death after + // this point still recovers the step. Not awaited here (never block the + // stream), but SERIALIZED via stepUpdateChain so the writes commit in + // step order; updateStreaming is error-tolerant (logs + swallows). + stepUpdateChain = stepUpdateChain.then(() => updateStreaming()); }, onFinish: async ({ text, finishReason, totalUsage, usage, steps }) => { // DIAGNOSTIC (Safari stream-drop investigation) — temporary: success @@ -486,30 +593,31 @@ export class AiChatService { `firstChunkLatency=${firstModelChunkAt ? firstModelChunkAt - streamStartedAt : 'none'}ms ` + `heartbeatsSent=${heartbeatsSent} steps=${steps.length}`, ); - await persistAssistant({ - text, - toolCalls: serializeSteps(steps), - metadata: { - finishReason, - // Persist the turn's cumulative usage WITH reasoning tokens resolved - // from either the new `outputTokenDetails` or the deprecated top-level - // field, so reopened history / the Markdown export show the thinking - // token cost too. - usage: - normalizeStreamUsage(totalUsage as StreamUsage) ?? totalUsage, - // Final-step usage = the context actually fed to the model on the last LLM - // call (full history + tool results) plus the answer it just generated. - // input+output of the FINAL step ≈ the conversation's CURRENT context size, - // distinct from totalUsage which sums every step (cumulative tokens spent). + // Finalize the assistant row (#183): the upfront 'streaming' row is + // UPDATEd to 'completed' with the turn's final text, cumulative usage and + // full UIMessage parts. We pass the SDK `steps` (which carry the final + // step's text) as the captured steps so metadata.parts matches the + // pre-#183 onFinish record exactly; `inProgressText` is '' here (the last + // step already finished). 
Final-step usage (usage.input+output) ≈ the + // conversation's CURRENT context size, distinct from totalUsage. + // + // COLUMN-SEMANTICS NOTE (#183): `content` is built by flushAssistant as + // the CONCATENATION of every step's text (stepsText), whereas pre-#183 + // it stored only the FINAL step's text. This is a deliberate, harmless + // change: the UI and the Markdown export render from `metadata.parts` + // (per-step text + tool parts), not from `content`; `content` is the + // plain-text projection (full-text search / fallback). A multi-step + // turn's `content` therefore now holds all steps' prose, not just the + // last block. + await finalizeAssistant( + flushAssistant(steps as StepLike[], '', 'completed', { + finishReason: finishReason as string, + usage: totalUsage as StreamUsage, contextTokens: (usage?.inputTokens ?? 0) + (usage?.outputTokens ?? 0) || undefined, - // Persist the FULL set of UIMessage parts for the turn (text + - // tool-call/result), so the rebuilt history replays prior tool - // context to the model on later turns. - parts: assistantParts(steps, text), - }, - }); + }), + ); // Lifecycle: release the external MCP clients leased for this turn. await closeExternalClients(); @@ -545,16 +653,14 @@ export class AiChatService { `firstChunkLatency=${firstModelChunkAt ? firstModelChunkAt - streamStartedAt : 'none'}ms ` + `silentGapBeforeDrop=${diagNow - lastModelChunkAt}ms heartbeatsSent=${heartbeatsSent}`, ); - // Persist the PARTIAL answer streamed before the failure (text + any + // Finalize the PARTIAL answer streamed before the failure (text + any // finished tool steps) WITH the error in metadata, so the turn shows what - // the user already saw plus the cause — not just a bare error. - await persistAssistant( - buildPartialAssistantRecord( - capturedSteps, - inProgressText, - 'error', - errorText, - ), + // the user already saw plus the cause — not just a bare error. Status + // 'error' (#183). 
+ await finalizeAssistant( + flushAssistant(capturedSteps, inProgressText, 'error', { + error: errorText, + }), ); await closeExternalClients(); }, @@ -578,12 +684,8 @@ export class AiChatService { `silentGapBeforeDrop=${diagNow - lastModelChunkAt}ms heartbeatsSent=${heartbeatsSent} ` + `steps=${steps.length}`, ); - await persistAssistant( - buildPartialAssistantRecord( - capturedSteps, - inProgressText, - 'aborted', - ), + await finalizeAssistant( + flushAssistant(capturedSteps, inProgressText, 'aborted'), ); await closeExternalClients(); }, @@ -1032,38 +1134,132 @@ export function rowToUiMessage(row: AiChatMessage): Omit & { } /** - * Build the assistant-message record persisted on a partial/failed turn (the - * streamText onError / onAbort paths). Captures the partial answer the user - * already saw: each finished step's text + tool parts (via assistantParts), - * then the in-progress step's text appended last. When `errorText` is provided - * it is recorded in metadata.error so the cause shows in history; an aborted - * turn passes none. Pure, so the partial-recording shape is unit-testable - * without seaming streamText. + * The persisted-row patch shape produced by {@link flushAssistant}. It is the + * SAME shape the assistant repo insert/update consume (content + toolCalls + + * metadata) plus the lifecycle `status` column added in #183. */ -export function buildPartialAssistantRecord( - steps: ReadonlyArray | undefined, +export interface AssistantFlush { + content: string; + toolCalls: unknown; + metadata: Record; + status: 'streaming' | 'completed' | 'error' | 'aborted'; +} + +/** + * Pure decision for the terminal finalize (#183): given whether the upfront + * assistant row exists (`assistantId`), choose whether the terminal payload is + * written by UPDATEing that row or — when the upfront insert failed and there is + * no id — by INSERTing a fresh terminal row so the turn is not lost entirely. 
+ * Returns `{ kind: 'update', id }` or `{ kind: 'insert' }`. Extracted so the + * fallback-insert branch (the only safety against losing a turn whose upfront + * insert failed) is unit-testable without seaming streamText. + */ +export function planFinalizeAssistant( + assistantId: string | undefined, +): { kind: 'update'; id: string } | { kind: 'insert' } { + return assistantId ? { kind: 'update', id: assistantId } : { kind: 'insert' }; +} + +/** The repo surface the terminal finalize needs (structural — the real repo and + * a test mock both satisfy it). */ +export interface FinalizeRepo { + insert(insertable: Record): Promise; + update( + id: string, + workspaceId: string, + patch: AssistantFlush, + ): Promise; +} + +/** + * Apply a finalize `plan` to the repo with the terminal `flushed` payload (#183): + * UPDATE the upfront row, or INSERT a fresh terminal row as the fallback when the + * upfront insert failed. The SINGLE dispatch shared by the service's + * finalizeAssistant and its test, so the test exercises the real path instead of + * a copy (#186 review). Pure of error handling — the caller wraps it. + */ +export async function applyFinalize( + repo: FinalizeRepo, + plan: { kind: 'update'; id: string } | { kind: 'insert' }, + base: { chatId: string; workspaceId: string; userId: string }, + flushed: AssistantFlush, +): Promise { + if (plan.kind === 'update') { + await repo.update(plan.id, base.workspaceId, flushed); + return; + } + await repo.insert({ + chatId: base.chatId, + workspaceId: base.workspaceId, + userId: base.userId, + role: 'assistant', + content: flushed.content, + toolCalls: flushed.toolCalls ?? null, + metadata: flushed.metadata, + status: flushed.status, + }); +} + +/** + * PURE assistant-row builder (#183 step-granular durability). Given the turn's + * accumulated steps + the in-progress (not-yet-finished) text + the lifecycle + * status, it returns the row patch to persist. 
The SAME path runs for the + * upfront insert (empty steps, status 'streaming'), every per-step update, and + * the terminal finalize (completed/error/aborted) — and a future background + * worker can call it identically, so it must stay a pure function of its inputs + * (NO `this`, no IO). + * + * `metadata.parts` is built by assistantParts over the finished steps, then the + * in-progress text appended as a trailing text part, so rowToUiMessage / + * findRecent keep replaying the turn unchanged. `metadata.finishReason`, + * `metadata.error`, `metadata.usage` and `metadata.contextTokens` are attached + * only when provided/relevant, matching the pre-#183 onFinish/onError records. + */ +export function flushAssistant( + capturedSteps: ReadonlyArray | undefined, inProgressText: string, - finishReason: 'error' | 'aborted', - errorText?: string, -): { text: string; toolCalls: unknown; metadata: Record } { - const finished = steps ?? []; + status: 'streaming' | 'completed' | 'error' | 'aborted', + extra?: { + finishReason?: string; + usage?: ChatStreamUsage | StreamUsage | undefined; + contextTokens?: number; + error?: string; + }, +): AssistantFlush { + const finished = capturedSteps ?? []; const stepsText = finished.map((s) => s.text ?? '').join(''); const trailing = inProgressText ?? ''; // assistantParts emits text parts only for FINISHED steps; append the - // in-progress step's text (the answer cut off by the error) as the last text - // part so the persisted parts match what streamed to the client. + // in-progress step's text (the partial answer cut off by an error/abort, or + // simply not yet flushed mid-stream) as the last text part so the persisted + // parts match what streamed to the client. 
const parts = assistantParts(finished, '') as unknown as Array< Record >; if (trailing) parts.push({ type: 'text', text: trailing }); + + const metadata: Record = { + parts: parts as unknown as UIMessage['parts'], + }; + // finishReason: prefer an explicit one; else derive a sensible value from the + // terminal status (so onError/onAbort records keep their historical reason). + if (extra?.finishReason) { + metadata.finishReason = extra.finishReason; + } else if (status === 'error' || status === 'aborted') { + metadata.finishReason = status; + } + if (extra?.usage !== undefined) { + metadata.usage = + normalizeStreamUsage(extra.usage as StreamUsage) ?? extra.usage; + } + if (extra?.contextTokens) metadata.contextTokens = extra.contextTokens; + if (extra?.error) metadata.error = extra.error; + return { - text: stepsText + trailing, + content: stepsText + trailing, toolCalls: serializeSteps(finished), - metadata: { - finishReason, - parts: parts as unknown as UIMessage['parts'], - ...(errorText ? { error: errorText } : {}), - }, + metadata, + status, }; } diff --git a/apps/server/src/core/ai-chat/chat-markdown.util.spec.ts b/apps/server/src/core/ai-chat/chat-markdown.util.spec.ts new file mode 100644 index 00000000..791d5a61 --- /dev/null +++ b/apps/server/src/core/ai-chat/chat-markdown.util.spec.ts @@ -0,0 +1,295 @@ +import { buildChatMarkdown, normalizeLang } from './chat-markdown.util'; +import type { AiChatMessage } from '@docmost/db/types/entity.types'; + +/** + * normalizeLang: the client sends `i18n.language` — a FULL locale tag like + * 'en-US' / 'ru-RU', NOT a bare 'en'/'ru'. A `@IsIn(['en','ru'])` DTO rejected + * that with a 400 (caught in real-browser testing); the export now accepts any + * string and normalizes here. Guards that regression. 
+ */ +describe('normalizeLang', () => { + it("maps any 'ru…' locale tag to ru", () => { + expect(normalizeLang('ru')).toBe('ru'); + expect(normalizeLang('ru-RU')).toBe('ru'); + expect(normalizeLang('RU-ru')).toBe('ru'); + }); + + it('maps everything else (incl. region-qualified English) to en', () => { + expect(normalizeLang('en')).toBe('en'); + expect(normalizeLang('en-US')).toBe('en'); + expect(normalizeLang('fr-FR')).toBe('en'); + expect(normalizeLang(undefined)).toBe('en'); + expect(normalizeLang('')).toBe('en'); + }); +}); + +/** + * Unit tests for the SERVER Markdown export (#183). Mirrors the coverage of the + * (now-removed) client chat-markdown tests: heading/metadata, role labels, text + * + tool blocks, token footers, the interrupted-turn note, and NULL-status + * (legacy) rows. The export embeds a live `new Date().toISOString()` timestamp; + * we never assert it, only the deterministic structure. + */ + +function row(partial: Partial): AiChatMessage { + return { + id: partial.id ?? 'id', + chatId: partial.chatId ?? 'chat-1', + workspaceId: partial.workspaceId ?? 'ws-1', + userId: partial.userId ?? null, + role: partial.role ?? 'user', + content: partial.content ?? null, + toolCalls: partial.toolCalls ?? null, + metadata: partial.metadata ?? null, + status: partial.status ?? null, + createdAt: partial.createdAt ?? ('2026-06-21T00:00:00.000Z' as never), + updatedAt: partial.updatedAt ?? ('2026-06-21T00:00:00.000Z' as never), + deletedAt: partial.deletedAt ?? 
null, + } as AiChatMessage; +} + +describe('buildChatMarkdown (server) — structure', () => { + it('emits the title heading, chat id and message count', () => { + const md = buildChatMarkdown({ + title: 'My chat', + chatId: 'chat-123', + rows: [], + }); + expect(md).toContain('# My chat'); + expect(md).toContain('- Chat ID: `chat-123`'); + expect(md).toContain('- Messages: 0'); + }); + + it('falls back to "Untitled chat" with no title (en)', () => { + const md = buildChatMarkdown({ title: null, chatId: 'c', rows: [] }); + expect(md).toContain('# Untitled chat'); + }); + + it('localizes fixed labels with lang=ru (structure stays English)', () => { + const md = buildChatMarkdown({ + title: null, + chatId: 'c', + lang: 'ru', + rows: [row({ role: 'assistant', content: 'hi' })], + }); + expect(md).toContain('# Без названия'); + expect(md).toContain('## 1. ИИ-агент'); + // Structural words remain English. + expect(md).toContain('- Chat ID:'); + }); + + it('numbers messages and labels roles (You / AI agent)', () => { + const md = buildChatMarkdown({ + title: 'T', + chatId: 'c', + rows: [ + row({ role: 'user', content: 'question' }), + row({ role: 'assistant', content: 'answer' }), + ], + }); + expect(md).toContain('## 1. You'); + expect(md).toContain('question'); + expect(md).toContain('## 2. 
AI agent'); + expect(md).toContain('answer'); + }); + + it('renders a tool part with fenced input/output and the friendly label', () => { + const md = buildChatMarkdown({ + title: 'T', + chatId: 'c', + rows: [ + row({ + role: 'assistant', + content: 'done', + metadata: { + parts: [ + { + type: 'tool-getPage', + state: 'output-available', + input: { id: 'p1' }, + output: { title: 'Hello' }, + }, + { type: 'text', text: 'done' }, + ], + } as never, + }), + ], + }); + expect(md).toContain('**Tool: Read page** (`getPage`) — done'); + expect(md).toContain('Input:'); + expect(md).toContain('"id": "p1"'); + expect(md).toContain('Output:'); + expect(md).toContain('"title": "Hello"'); + }); + + // #186 re-review pt 1: restore the parity coverage of the removed client spec — + // error state, unknown-tool fallback (en + ru), and the circular-stringify catch. + it('renders a tool part in the error state with its errorText', () => { + const md = buildChatMarkdown({ + title: 'T', + chatId: 'c', + rows: [ + row({ + role: 'assistant', + metadata: { + parts: [ + { + type: 'tool-getPage', + state: 'output-error', + input: { id: 'p1' }, + errorText: 'page not found', + }, + ], + } as never, + }), + ], + }); + expect(md).toContain('**Tool: Read page** (`getPage`) — error'); + expect(md).toContain('**Error:** page not found'); + }); + + it('falls back to "Ran tool " for an unknown tool (en) and the ru variant', () => { + const parts = [ + { + type: 'tool-mysteryTool', + state: 'output-available', + output: { ok: 1 }, + }, + ]; + const en = buildChatMarkdown({ + title: 'T', + chatId: 'c', + rows: [row({ role: 'assistant', metadata: { parts } as never })], + }); + expect(en).toContain('**Tool: Ran tool mysteryTool** (`mysteryTool`)'); + const ru = buildChatMarkdown({ + title: 'T', + chatId: 'c', + lang: 'ru', + rows: [row({ role: 'assistant', metadata: { parts } as never })], + }); + expect(ru).toContain('Выполнил инструмент mysteryTool'); + }); + + it('does not throw on a circular tool 
output (falls back to String)', () => { + const circular: Record = {}; + circular.self = circular; + expect(() => + buildChatMarkdown({ + title: 'T', + chatId: 'c', + rows: [ + row({ + role: 'assistant', + metadata: { + parts: [ + { + type: 'tool-getPage', + state: 'output-available', + output: circular, + }, + ], + } as never, + }), + ], + }), + ).not.toThrow(); + }); + + it('emits a token footer + total when usage is present', () => { + const md = buildChatMarkdown({ + title: 'T', + chatId: 'c', + rows: [ + row({ + role: 'assistant', + content: 'a', + metadata: { + usage: { + inputTokens: 100, + outputTokens: 20, + totalTokens: 120, + reasoningTokens: 8, + }, + } as never, + }), + ], + }); + expect(md).toContain('- Total tokens: 120'); + expect(md).toContain( + '_Tokens — in: 100, out: 20, reasoning: 8, total: 120_', + ); + }); + + it('flags a still-streaming (interrupted) row', () => { + const md = buildChatMarkdown({ + title: 'T', + chatId: 'c', + rows: [ + row({ role: 'assistant', content: 'partial', status: 'streaming' }), + ], + }); + expect(md).toContain('still being generated'); + }); + + it('does NOT flag a completed row', () => { + const md = buildChatMarkdown({ + title: 'T', + chatId: 'c', + rows: [row({ role: 'assistant', content: 'final', status: 'completed' })], + }); + expect(md).not.toContain('still being generated'); + }); + + it('renders a legacy NULL-status row (no parts) from plain content', () => { + const md = buildChatMarkdown({ + title: 'T', + chatId: 'c', + rows: [ + row({ role: 'assistant', content: 'legacy answer', status: null }), + ], + }); + expect(md).toContain('legacy answer'); + expect(md).not.toContain('still being generated'); + }); + + it('renders a persisted error', () => { + const md = buildChatMarkdown({ + title: 'T', + chatId: 'c', + rows: [ + row({ + role: 'assistant', + content: '', + status: 'error', + metadata: { error: '401: Unauthorized' } as never, + }), + ], + }); + expect(md).toContain('**⚠️ Error:** 401: 
Unauthorized'); + }); + + it('escapes embedded triple-backtick fences with a longer delimiter', () => { + const md = buildChatMarkdown({ + title: 'T', + chatId: 'c', + rows: [ + row({ + role: 'assistant', + content: 'x', + metadata: { + parts: [ + { + type: 'tool-getPage', + state: 'output-available', + output: '```inner```', + }, + ], + } as never, + }), + ], + }); + // A 4-backtick fence wraps content that itself contains a 3-backtick run. + expect(md).toContain('````'); + }); +}); diff --git a/apps/server/src/core/ai-chat/chat-markdown.util.ts b/apps/server/src/core/ai-chat/chat-markdown.util.ts new file mode 100644 index 00000000..ebbed474 --- /dev/null +++ b/apps/server/src/core/ai-chat/chat-markdown.util.ts @@ -0,0 +1,299 @@ +/** + * Server-side Markdown export for an AI agent chat (#183). The DB is the single + * source of truth: this renders a chat purely from its persisted message rows + * (`AiChatMessage[]` — role / content / metadata.parts / toolCalls / usage). + * Because the assistant row is now persisted UPFRONT and updated per step, an + * interrupted turn is included up to its last finished step. + * + * Ported from the client `utils/chat-markdown.ts`. It is a PURE function (apart + * from `new Date()` for the export timestamp), so it is straightforward to + * unit-test and a future background worker can reuse it. + * + * Only a few fixed role/tool labels are localized via the `lang` param; the + * structural document words (Input/Output/Error/Tokens/...) stay English because + * the output is a technical artifact. + */ + +import type { AiChatMessage } from '@docmost/db/types/entity.types'; + +/** Supported export label languages. Defaults to English. */ +export type ExportLang = 'en' | 'ru'; + +/** + * Normalize an arbitrary client locale code to a supported export language. The + * client sends `i18n.language`, which is a FULL locale tag (e.g. 
`en-US`, + * `ru-RU`), not a bare `en`/`ru` — so match on the language subtag and fall back + * to English for anything non-Russian. + */ +export function normalizeLang(lang?: string): ExportLang { + return lang?.toLowerCase().startsWith('ru') ? 'ru' : 'en'; +} + +/** A single AI SDK UIMessage part (text part or a tool part). */ +interface ExportPart { + type: string; + text?: string; + state?: string; + toolName?: string; + input?: unknown; + output?: unknown; + errorText?: string; +} + +/** Authoritative per-turn usage the server attaches to a message row. */ +interface UsageLike { + inputTokens?: number; + outputTokens?: number; + totalTokens?: number; + reasoningTokens?: number; +} + +/** Localized label table. The client-side Markdown builder was removed by #183 + * (the export is now server-side only), so this no longer mirrors a second + * exporter — instead the tool-action labels are kept in parity with the + * on-screen action-log labels in the client's `tool-parts.tsx` (`toolLabelKey`) + * so the export reads the same as the UI. Only role + tool-action labels are + * localized; everything structural is an English constant in the renderer. 
*/ +const LABELS: Record< + ExportLang, + { + untitled: string; + aiAgent: string; + you: string; + tools: Record; + ranTool: (name: string) => string; + stillGenerating: string; + } +> = { + en: { + untitled: 'Untitled chat', + aiAgent: 'AI agent', + you: 'You', + tools: { + searchPages: 'Searched pages', + getPage: 'Read page', + createPage: 'Created page', + updatePageContent: 'Updated page', + renamePage: 'Renamed page', + movePage: 'Moved page', + deletePage: 'Deleted page (to trash)', + createComment: 'Commented', + resolveComment: 'Resolved comment', + }, + ranTool: (name) => `Ran tool ${name}`, + stillGenerating: + 'This message is still being generated — the export captured a partial, in-progress response.', + }, + ru: { + untitled: 'Без названия', + aiAgent: 'ИИ-агент', + you: 'Вы', + tools: { + searchPages: 'Искал по страницам', + getPage: 'Прочитал страницу', + createPage: 'Создал страницу', + updatePageContent: 'Обновил страницу', + renamePage: 'Переименовал страницу', + movePage: 'Переместил страницу', + deletePage: 'Удалил страницу (в корзину)', + createComment: 'Прокомментировал', + resolveComment: 'Закрыл комментарий', + }, + ranTool: (name) => `Выполнил инструмент ${name}`, + stillGenerating: + 'Это сообщение всё ещё генерируется — экспорт захватил частичный, незавершённый ответ.', + }, +}; + +/** True for AI SDK tool parts (static `tool-*` or `dynamic-tool`). */ +function isToolPart(type: string): boolean { + return type.startsWith('tool-') || type === 'dynamic-tool'; +} + +/** Extract the tool name from a part `type` of `tool-${name}` (or dynamic). */ +function getToolName(part: ExportPart): string { + if (part.type === 'dynamic-tool') return part.toolName ?? ''; + return part.type.startsWith('tool-') + ? part.type.slice('tool-'.length) + : part.type; +} + +/** Map an AI SDK tool-part state to the 3 states the action-log renders. 
*/ +function toolRunState(state: string | undefined): 'running' | 'done' | 'error' { + if (state === 'output-error' || state === 'output-denied') return 'error'; + if (state === 'output-available') return 'done'; + return 'running'; +} + +/** Resolve a tool's friendly action-log label (localized) from its name. */ +function toolLabel(name: string, lang: ExportLang): string { + return LABELS[lang].tools[name] ?? LABELS[lang].ranTool(name); +} + +/** + * Stringify an arbitrary tool input/output value for a fenced block. Strings + * pass through as-is; everything else is pretty-printed JSON, falling back to + * `String(value)` if serialization throws (e.g. a circular structure). + */ +function stringify(value: unknown): string { + if (typeof value === 'string') return value; + try { + return JSON.stringify(value, null, 2); + } catch { + return String(value); + } +} + +/** + * Wrap `code` in a fenced code block whose backtick delimiter is LONGER than the + * longest backtick run inside the content, so embedded backticks (or a literal + * ``` fence) never break out of the block. Minimum 3 backticks. + */ +function fence(code: string, lang = ''): string { + const runs: string[] = code.match(/`+/g) ?? []; + const longest = runs.reduce((m, s) => Math.max(m, s.length), 0); + const delim = '`'.repeat(Math.max(3, longest + 1)); + return `${delim}${lang}\n${code}\n${delim}`; +} + +/** Per-row token count, mirroring the header sum in the client window. */ +function rowTokens(usage: UsageLike): number { + return ( + usage.totalTokens ?? (usage.inputTokens ?? 0) + (usage.outputTokens ?? 0) + ); +} + +/** Render one message's UIMessage parts into an array of Markdown blocks + * (text blocks + tool blocks). Mirrors the client renderer / MessageItem. */ +function renderMessageParts(parts: ExportPart[], lang: ExportLang): string[] { + const out: string[] = []; + + for (const part of parts) { + if (part.type === 'text') { + const text = (part.text ?? 
'').trim(); + if (text.length > 0) out.push(text); + continue; + } + + if (!isToolPart(part.type)) continue; + + const name = getToolName(part); + const label = toolLabel(name, lang); + const state = toolRunState(part.state); + + const toolLines: string[] = [`**Tool: ${label}** (\`${name}\`) — ${state}`]; + if (part.input !== undefined) { + toolLines.push('Input:'); + toolLines.push(fence(stringify(part.input), 'json')); + } + if (part.output !== undefined) { + toolLines.push('Output:'); + toolLines.push(fence(stringify(part.output), 'json')); + } + if (part.errorText) { + toolLines.push(`**Error:** ${part.errorText}`); + } + out.push(toolLines.join('\n\n')); + } + + return out; +} + +/** Resolve a persisted row's parts: prefer the rich persisted parts, else a + * single text part built from the plain-text content (mirrors rowToUiMessage). */ +function rowParts(row: AiChatMessage): ExportPart[] { + const meta = (row.metadata ?? {}) as { parts?: ExportPart[] }; + return Array.isArray(meta.parts) && meta.parts.length > 0 + ? meta.parts + : [{ type: 'text', text: row.content ?? '' }]; +} + +/** + * Serialize a chat to a Markdown string from its persisted rows. Source = DB + * ONLY (no live client state). A row whose `status` is still 'streaming' is an + * interrupted turn that the export captured mid-flight; it is rendered up to its + * last finished step and flagged "still generating". + */ +export function buildChatMarkdown(args: { + title: string | null; + chatId: string; + rows: AiChatMessage[]; + // Accepts a full client locale tag (e.g. 'en-US'/'ru-RU'); normalized below. + lang?: string; +}): string { + const { title, chatId, rows } = args; + const lang: ExportLang = normalizeLang(args.lang); + const L = LABELS[lang]; + const blocks: string[] = []; + + const heading = (title ?? '').trim() || L.untitled; + blocks.push(`# ${heading}`); + + const usageOf = (row: AiChatMessage): UsageLike | undefined => { + const meta = (row.metadata ?? 
{}) as { usage?: UsageLike }; + return meta.usage; + }; + const errorOf = (row: AiChatMessage): string | undefined => { + const meta = (row.metadata ?? {}) as { error?: string }; + return meta.error; + }; + + // Metadata bullet list. Total tokens is only shown when there is a sum. + const totalTokens = rows.reduce((sum, row) => { + const usage = usageOf(row); + return usage ? sum + rowTokens(usage) : sum; + }, 0); + const meta = [ + `- Chat ID: \`${chatId}\``, + `- Exported: ${new Date().toISOString()}`, + `- Messages: ${rows.length}`, + ]; + if (totalTokens > 0) meta.push(`- Total tokens: ${totalTokens}`); + blocks.push(meta.join('\n')); + + rows.forEach((row, index) => { + blocks.push('---'); + + const roleLabel = row.role === 'assistant' ? L.aiAgent : L.you; + blocks.push(`## ${index + 1}. ${roleLabel}`); + + // Created-at kept in source as an HTML comment (out of the rendered prose). + if (row.createdAt) { + const iso = + row.createdAt instanceof Date + ? row.createdAt.toISOString() + : String(row.createdAt); + blocks.push(``); + } + + blocks.push(...renderMessageParts(rowParts(row), lang)); + + // A still-'streaming' row is an interrupted/in-progress turn captured by the + // export; record that so the partial answer is not mistaken for complete. + if (row.status === 'streaming') { + blocks.push(`_⏳ ${L.stillGenerating}_`); + } + + const error = errorOf(row); + if (error) { + blocks.push(`**⚠️ Error:** ${error}`); + } + + const usage = usageOf(row); + if (usage) { + const total = usage.totalTokens ?? rowTokens(usage); + const reasoning = + usage.reasoningTokens && usage.reasoningTokens > 0 + ? `, reasoning: ${usage.reasoningTokens}` + : ''; + blocks.push( + `_Tokens — in: ${usage.inputTokens ?? '?'}, out: ${ + usage.outputTokens ?? '?' + }${reasoning}, total: ${total}_`, + ); + } + }); + + // Blank line between blocks so the Markdown renders cleanly. 
+ return blocks.join('\n\n'); +} diff --git a/apps/server/src/core/ai-chat/dto/ai-chat.dto.ts b/apps/server/src/core/ai-chat/dto/ai-chat.dto.ts index f6775f0c..a48f2b84 100644 --- a/apps/server/src/core/ai-chat/dto/ai-chat.dto.ts +++ b/apps/server/src/core/ai-chat/dto/ai-chat.dto.ts @@ -26,3 +26,17 @@ export class GetChatMessagesDto { @IsString() cursor?: string; } + +/** Export a chat to Markdown (#183). `lang` localizes the few fixed + * role/tool-action labels; defaults to English server-side. */ +export class ExportChatDto { + @IsString() + chatId: string; + + // A full client locale tag (e.g. 'en-US', 'ru-RU') — normalized server-side to + // a supported export language (see normalizeLang). Accept any string so a + // region-qualified locale is not rejected (the 400 that broke the real client). + @IsOptional() + @IsString() + lang?: string; +} diff --git a/apps/server/src/database/migrations/20260626T120000-ai-chat-message-status.ts b/apps/server/src/database/migrations/20260626T120000-ai-chat-message-status.ts new file mode 100644 index 00000000..e6d096f2 --- /dev/null +++ b/apps/server/src/database/migrations/20260626T120000-ai-chat-message-status.ts @@ -0,0 +1,18 @@ +import { type Kysely } from 'kysely'; + +export async function up(db: Kysely): Promise { + // Step-granular durability for the assistant turn (#183). The assistant row is + // now created UPFRONT (status 'streaming') and UPDATEd as each step completes, + // so a process death mid-turn no longer loses the whole answer. The column is + // NULLABLE on purpose: rows written before this migration carry NULL, which the + // app treats as 'completed' (a settled, pre-status message). Values written by + // the app: 'streaming' | 'completed' | 'error' | 'aborted'. 
+ await db.schema + .alterTable('ai_chat_messages') + .addColumn('status', 'text', (col) => col) + .execute(); +} + +export async function down(db: Kysely): Promise { + await db.schema.alterTable('ai_chat_messages').dropColumn('status').execute(); +} diff --git a/apps/server/src/database/repos/ai-chat/ai-chat-message.repo.ts b/apps/server/src/database/repos/ai-chat/ai-chat-message.repo.ts index 108f2b63..fc283792 100644 --- a/apps/server/src/database/repos/ai-chat/ai-chat-message.repo.ts +++ b/apps/server/src/database/repos/ai-chat/ai-chat-message.repo.ts @@ -1,4 +1,4 @@ -import { Injectable } from '@nestjs/common'; +import { Injectable, Logger } from '@nestjs/common'; import { InjectKysely } from 'nestjs-kysely'; import { KyselyDB, KyselyTransaction } from '../../types/kysely.types'; import { dbOrTx } from '../../utils'; @@ -9,8 +9,24 @@ import { import { PaginationOptions } from '@docmost/db/pagination/pagination-options'; import { executeWithCursorPagination } from '@docmost/db/pagination/cursor-pagination'; +// Crash-recovery sweep recency threshold (#183 review): a 'streaming' row is +// only swept to 'aborted' once it has been UNTOUCHED for this long. A live turn +// bumps `updatedAt` on every step (well under this window), so its row never +// matches; only a turn whose process truly died (no step update for >threshold) +// is swept. Chosen safely ABOVE the longest realistic turn so a fresh replica's +// boot-sweep can never abort a turn another replica is actively streaming +// (multi-instance deploy). +const SWEEP_STREAMING_STALE_MS = 10 * 60 * 1000; // 10 minutes + +// Hard upper bound on the rows materialized by `findAllByChat` (export path). +// A generous cap so a pathologically huge chat cannot load an unbounded result +// into memory; far above any realistic transcript length. 
+const FIND_ALL_BY_CHAT_LIMIT = 5000; + @Injectable() export class AiChatMessageRepo { + private readonly logger = new Logger(AiChatMessageRepo.name); + constructor(@InjectKysely() private readonly db: KyselyDB) {} // The `tsv` column is a trigger-maintained tsvector used only for @@ -25,6 +41,7 @@ export class AiChatMessageRepo { 'content', 'toolCalls', 'metadata', + 'status', 'createdAt', 'updatedAt', 'deletedAt', @@ -60,6 +77,46 @@ export class AiChatMessageRepo { }); } + // Load ALL (non-deleted) messages of a chat in ascending chronological order + // (oldest -> newest), unpaginated. Used by the server-side Markdown export + // (#183), where the DB is the single source of truth and the whole transcript + // must be rendered in one pass (findByChat is cursor-paginated and would only + // return the first page). + // + // Hard-capped at FIND_ALL_BY_CHAT_LIMIT rows (a generous bound, far above any + // realistic transcript) so exporting a pathologically huge chat cannot + // materialize an unbounded result set in memory. + async findAllByChat( + chatId: string, + workspaceId: string, + // Injectable for tests so truncation can be exercised on a modest volume. + limit: number = FIND_ALL_BY_CHAT_LIMIT, + ): Promise { + // Fetch newest-first (+1 to DETECT truncation), so on overflow we keep the + // NEWEST `limit` messages — the recent conversation matters most for an + // export — rather than silently dropping the tail (#183 review). Reverse back + // to chronological for rendering, like findRecent. 
+ const rows = await this.db + .selectFrom('aiChatMessages') + .select(this.baseFields) + .where('chatId', '=', chatId) + .where('workspaceId', '=', workspaceId) + .where('deletedAt', 'is', null) + .orderBy('createdAt', 'desc') + .orderBy('id', 'desc') + .limit(limit + 1) + .execute(); + + if (rows.length > limit) { + rows.length = limit; // keep the newest `limit` (rows are newest-first here) + this.logger.warn( + `Chat ${chatId} export truncated to the newest ${limit} messages ` + + `(older messages omitted).`, + ); + } + return rows.reverse(); + } + // Load the most RECENT `limit` messages for a chat and return them in // ascending chronological order (oldest -> newest), as the model expects. // `findByChat` returns the FIRST page ASC (the OLDEST messages), which loses @@ -96,4 +153,68 @@ export class AiChatMessageRepo { .returning(this.baseFields) .executeTakeFirst(); } + + /** + * Update a single message in place by id + workspace (#183 step-granular + * durability). The assistant row is created UPFRONT (status 'streaming') and + * patched as each step completes, then finalized once on the terminal status. + * `updatedAt` is always bumped. Returns the updated row (baseFields) or + * undefined when no row matched (e.g. a foreign workspace / deleted row). + */ + async update( + id: string, + workspaceId: string, + patch: Partial<{ + content: string | null; + toolCalls: unknown; + metadata: unknown; + status: string | null; + }>, + opts?: { onlyIfStreaming?: boolean; trx?: KyselyTransaction }, + ): Promise { + const db = dbOrTx(this.db, opts?.trx); + let query = db + .updateTable('aiChatMessages') + .set({ ...(patch as Record), updatedAt: new Date() }) + .where('id', '=', id) + .where('workspaceId', '=', workspaceId); + // Concurrency guard (#183 review): a per-step 'streaming' update must NEVER + // overwrite a row the terminal callback already finalized. 
onStepFinish + // fires the streaming update fire-and-forget, so its UPDATE can land AFTER + // finalize on a DIFFERENT pool connection (commit order is not guaranteed). + // Scoping the streaming update to rows STILL in 'streaming' makes a late + // update a no-op once the row is completed/error/aborted — regardless of + // commit order. The terminal finalize runs WITHOUT this guard so it always + // wins. + if (opts?.onlyIfStreaming) { + query = query.where('status', '=', 'streaming'); + } + return query.returning(this.baseFields).executeTakeFirst(); + } + + /** + * Crash-recovery sweep (#183): flip every assistant row still left in the + * 'streaming' state (a turn that died mid-write before reaching a terminal + * status) to 'aborted'. Run once on server start. Returns the number of rows + * swept so the caller can log it. Workspace-wide on purpose — a crash can have + * dangling streaming rows across any workspace. + * + * Bounded by recency (#183 review): only rows UNTOUCHED for + * SWEEP_STREAMING_STALE_MS are swept. A live turn bumps `updatedAt` on every + * step, so an actively-streaming row never matches; this prevents a fresh + * replica's boot-sweep from aborting a turn another replica is still streaming + * in a multi-instance deploy. 
+ */ + async sweepStreaming(trx?: KyselyTransaction): Promise { + const db = dbOrTx(this.db, trx); + const staleBefore = new Date(Date.now() - SWEEP_STREAMING_STALE_MS); + const rows = await db + .updateTable('aiChatMessages') + .set({ status: 'aborted', updatedAt: new Date() }) + .where('status', '=', 'streaming') + .where('updatedAt', '<', staleBefore) + .returning('id') + .execute(); + return rows.length; + } } diff --git a/apps/server/src/database/types/db.d.ts b/apps/server/src/database/types/db.d.ts index 8574d613..169d8e60 100644 --- a/apps/server/src/database/types/db.d.ts +++ b/apps/server/src/database/types/db.d.ts @@ -620,6 +620,10 @@ export interface AiChatMessages { content: string | null; toolCalls: Json | null; metadata: Json | null; + // Turn lifecycle status (#183): 'streaming' | 'completed' | 'error' | + // 'aborted'. NULL on rows written before the status column existed; the app + // treats NULL as 'completed' (a settled, pre-status message). + status: string | null; tsv: string | null; createdAt: Generated; updatedAt: Generated; diff --git a/apps/server/test/integration/ai-chat-message-status.int-spec.ts b/apps/server/test/integration/ai-chat-message-status.int-spec.ts new file mode 100644 index 00000000..5e7eba1b --- /dev/null +++ b/apps/server/test/integration/ai-chat-message-status.int-spec.ts @@ -0,0 +1,270 @@ +import { Kysely } from 'kysely'; +import { AiChatMessageRepo } from '@docmost/db/repos/ai-chat/ai-chat-message.repo'; +import { + getTestDb, + destroyTestDb, + createWorkspace, + createUser, + createChat, + createMessage, +} from './db'; + +/** + * Integration coverage for the #183 step-granular durability primitives on + * AiChatMessageRepo: `update` (in-place patch by id+workspace, bumps updatedAt, + * returns the row) and `sweepStreaming` (crash recovery: flip dangling + * 'streaming' rows to 'aborted'). Real SQL against docmost_test, not a mock. 
+ */ +describe('AiChatMessageRepo.update + sweepStreaming [integration]', () => { + let db: Kysely; + let repo: AiChatMessageRepo; + let workspaceId: string; + let otherWorkspaceId: string; + let userId: string; + let chatId: string; + let otherChatId: string; + + beforeAll(async () => { + db = getTestDb(); + repo = new AiChatMessageRepo(db as any); + workspaceId = (await createWorkspace(db)).id; + otherWorkspaceId = (await createWorkspace(db)).id; + userId = (await createUser(db, workspaceId)).id; + chatId = (await createChat(db, { workspaceId, creatorId: userId })).id; + const otherUser = await createUser(db, otherWorkspaceId); + otherChatId = ( + await createChat(db, { + workspaceId: otherWorkspaceId, + creatorId: otherUser.id, + }) + ).id; + }); + + afterAll(async () => { + await destroyTestDb(); + }); + + it('update patches content/status/metadata and bumps updatedAt', async () => { + const seeded = await repo.insert({ + chatId, + workspaceId, + userId, + role: 'assistant', + content: '', + status: 'streaming', + metadata: { parts: [] } as never, + }); + const before = seeded.updatedAt; + // Ensure a measurable timestamp delta. + await new Promise((r) => setTimeout(r, 5)); + + const updated = await repo.update(seeded.id, workspaceId, { + content: 'final answer', + status: 'completed', + metadata: { parts: [{ type: 'text', text: 'final answer' }] }, + }); + + expect(updated).toBeDefined(); + expect(updated!.content).toBe('final answer'); + expect(updated!.status).toBe('completed'); + expect((updated!.metadata as any).parts).toHaveLength(1); + // The 5ms sleep above guarantees a strictly-later timestamp. + expect(new Date(updated!.updatedAt).getTime()).toBeGreaterThan( + new Date(before).getTime(), + ); + }); + + it('onlyIfStreaming update is a NO-OP once the row is finalized (race guard)', async () => { + // Reproduce the step-update-vs-finalize race (#183 review): the row is + // finalized to 'completed', then a LATE per-step 'streaming' update lands. 
+ // With `onlyIfStreaming` it must match nothing and leave the finalized row + // untouched (no clobber back to 'streaming', no lost usage). + const seeded = await repo.insert({ + chatId, + workspaceId, + userId, + role: 'assistant', + content: 'partial', + status: 'streaming', + }); + // Terminal finalize (unguarded) wins. + await repo.update(seeded.id, workspaceId, { + content: 'final answer', + status: 'completed', + metadata: { usage: { totalTokens: 42 } } as never, + }); + // A straggler per-step update arrives AFTER finalize. + const late = await repo.update( + seeded.id, + workspaceId, + { content: 'partial', status: 'streaming', metadata: {} as never }, + { onlyIfStreaming: true }, + ); + expect(late).toBeUndefined(); // matched no 'streaming' row -> no-op + const rows = await repo.findAllByChat(chatId, workspaceId); + const row = rows.find((r) => r.id === seeded.id)!; + expect(row.status).toBe('completed'); // NOT clobbered back to streaming + expect(row.content).toBe('final answer'); + expect((row.metadata as any).usage.totalTokens).toBe(42); // usage preserved + }); + + it('update is workspace-scoped: a foreign workspace id matches nothing', async () => { + const seeded = await repo.insert({ + chatId, + workspaceId, + userId, + role: 'assistant', + content: 'orig', + status: 'streaming', + }); + const res = await repo.update(seeded.id, otherWorkspaceId, { + status: 'completed', + }); + expect(res).toBeUndefined(); + // The row in the real workspace is untouched. + const rows = await repo.findAllByChat(chatId, workspaceId); + const stillThere = rows.find((r) => r.id === seeded.id); + expect(stillThere!.status).toBe('streaming'); + // Clean up so it does not pollute the sweep test below. 
+ await repo.update(seeded.id, workspaceId, { status: 'completed' }); + }); + + // Backdate a row's updatedAt so it qualifies as a STALE streaming row (the + // sweep only flips rows untouched for >10 minutes — a live turn bumps + // updatedAt every step, so it would never match). + async function backdateUpdatedAt( + id: string, + minutesAgo: number, + ): Promise { + await db + .updateTable('aiChatMessages') + .set({ updatedAt: new Date(Date.now() - minutesAgo * 60 * 1000) }) + .where('id', '=', id) + .execute(); + } + + it('sweepStreaming flips STALE dangling streaming rows to aborted and counts them', async () => { + // Two dangling streaming rows in our workspace + one in another workspace — + // all backdated past the staleness threshold so the sweep picks them up. + const a = await createMessage(db, { + workspaceId, + chatId, + role: 'assistant', + status: 'streaming', + }); + const b = await createMessage(db, { + workspaceId, + chatId, + role: 'assistant', + status: 'streaming', + }); + const other = await createMessage(db, { + workspaceId: otherWorkspaceId, + chatId: otherChatId, + role: 'assistant', + status: 'streaming', + }); + await backdateUpdatedAt(a.id, 20); + await backdateUpdatedAt(b.id, 20); + await backdateUpdatedAt(other.id, 20); + + // A settled row must NOT be touched. + const done = await createMessage(db, { + workspaceId, + chatId, + role: 'assistant', + status: 'completed', + }); + // A legacy NULL-status row must NOT be touched. + const legacy = await createMessage(db, { + workspaceId, + chatId, + role: 'assistant', + status: null, + }); + + const swept = await repo.sweepStreaming(); + // At least the 3 stale streaming rows we created (2 here + 1 in the other ws). 
+ expect(swept).toBeGreaterThanOrEqual(3); + + const rows = await repo.findAllByChat(chatId, workspaceId); + const byId = new Map(rows.map((r) => [r.id, r])); + expect(byId.get(a.id)!.status).toBe('aborted'); + expect(byId.get(b.id)!.status).toBe('aborted'); + expect(byId.get(done.id)!.status).toBe('completed'); + expect(byId.get(legacy.id)!.status).toBeNull(); + + // Idempotent: a second sweep finds nothing left in our seeded set. + const again = await repo.sweepStreaming(); + const rows2 = await repo.findAllByChat(chatId, workspaceId); + // Our two rows stay aborted regardless of `again`'s global count. + expect(rows2.find((r) => r.id === a.id)!.status).toBe('aborted'); + expect(again).toBeGreaterThanOrEqual(0); + }); + + it('sweepStreaming does NOT sweep a FRESH streaming row (recency bound, #183 review)', async () => { + // A row that is actively streaming (recent updatedAt) must survive the sweep: + // a fresh replica's boot-sweep must never abort a turn another replica is + // still streaming in a multi-instance deploy. + const fresh = await createMessage(db, { + workspaceId, + chatId, + role: 'assistant', + status: 'streaming', + }); + // A STALE streaming row created alongside it IS swept — proving the sweep + // ran and the only difference is recency. + const stale = await createMessage(db, { + workspaceId, + chatId, + role: 'assistant', + status: 'streaming', + }); + await backdateUpdatedAt(stale.id, 20); + + await repo.sweepStreaming(); + + const rows = await repo.findAllByChat(chatId, workspaceId); + const byId = new Map(rows.map((r) => [r.id, r])); + // Fresh (recently-updated) streaming row is left untouched... + expect(byId.get(fresh.id)!.status).toBe('streaming'); + // ...while the stale one alongside it was swept to 'aborted'. 
+ expect(byId.get(stale.id)!.status).toBe('aborted'); + }); + + it('findAllByChat caps the result, keeping the NEWEST messages in order (#183 review)', async () => { + // A dedicated chat so the cap test is independent of the rows above. + const cappedChat = ( + await createChat(db, { workspaceId, creatorId: userId }) + ).id; + const base = Date.now(); + // Three messages at strictly increasing timestamps. + await createMessage(db, { + workspaceId, + chatId: cappedChat, + content: 'm1-oldest', + createdAt: new Date(base), + }); + await createMessage(db, { + workspaceId, + chatId: cappedChat, + content: 'm2', + createdAt: new Date(base + 1000), + }); + await createMessage(db, { + workspaceId, + chatId: cappedChat, + content: 'm3-newest', + createdAt: new Date(base + 2000), + }); + + // Cap of 2 -> the OLDEST message is dropped; the newest two stay, in + // chronological order (oldest -> newest). + const capped = await repo.findAllByChat(cappedChat, workspaceId, 2); + expect(capped.map((r) => r.content)).toEqual(['m2', 'm3-newest']); + + // Without a cap (well above the row count) all three come back in order. + const all = await repo.findAllByChat(cappedChat, workspaceId, 100); + expect(all.map((r) => r.content)).toEqual(['m1-oldest', 'm2', 'm3-newest']); + }); +}); diff --git a/apps/server/test/integration/db.ts b/apps/server/test/integration/db.ts index 8cf11fdb..ede53494 100644 --- a/apps/server/test/integration/db.ts +++ b/apps/server/test/integration/db.ts @@ -104,7 +104,8 @@ export async function createWorkspace( name: overrides.name ?? `ws-${suffix}`, // hostname is uniquely constrained; keep it unique per workspace. hostname: `host-${suffix}`, - settings: overrides.settings === undefined ? null : (overrides.settings as any), + settings: + overrides.settings === undefined ? 
null : (overrides.settings as any), }) .returning(['id', 'settings']) .executeTakeFirstOrThrow(); @@ -226,3 +227,37 @@ export async function createChat( .executeTakeFirstOrThrow(); return { id: row.id as string }; } + +export async function createMessage( + db: Kysely, + args: { + workspaceId: string; + chatId: string; + userId?: string | null; + role?: string; + content?: string | null; + status?: string | null; + metadata?: unknown; + // Explicit timestamp so a test can control message ORDER (the default DB + // now() can tie within a millisecond, and the v4 id is not time-ordered). + createdAt?: Date; + }, +): Promise<{ id: string }> { + const id = randomUUID(); + const row = await db + .insertInto('aiChatMessages') + .values({ + id, + workspaceId: args.workspaceId, + chatId: args.chatId, + userId: args.userId ?? null, + role: args.role ?? 'assistant', + content: args.content ?? null, + status: args.status ?? null, + metadata: (args.metadata ?? null) as any, + ...(args.createdAt ? { createdAt: args.createdAt } : {}), + }) + .returning(['id']) + .executeTakeFirstOrThrow(); + return { id: row.id as string }; +} diff --git a/packages/git-sync/build/engine/client.types.d.ts b/packages/git-sync/build/engine/client.types.d.ts new file mode 100644 index 00000000..9a1f8fb8 --- /dev/null +++ b/packages/git-sync/build/engine/client.types.d.ts @@ -0,0 +1,109 @@ +/** + * The client seam. `pull.ts`/`push.ts` depend on a narrow STRUCTURAL interface + * rather than any concrete client, because the gitmost server writes NATIVELY — + * through repositories + collab `openDirectConnection`. + * + * `GitSyncClient` is that interface: the native datasource (server side) + * implements it, and the engine only ever uses `Pick` + * subsets of it. 
The signatures below MIRROR exactly the methods the engine's + * `pull.ts`/`push.ts` actually call (arg shapes + the fields the engine reads + * off each result), so a REST-style client is still structurally assignable and + * the native adapter has a precise contract. + */ +/** + * A page node as returned by `listSpaceTree` (the sidebar/tree walk, no body). + * The engine layout (`buildVaultLayout`) consumes `PageNode` from `./layout`, + * which only requires `id` (+ optional `title`/`slugId`/`parentPageId`); this + * lite shape documents the fields the tree walk surfaces. Real tree nodes also + * carry `position`, `icon`, `hasChildren` — kept open via the index signature. + */ +export interface GitSyncPageNodeLite { + id: string; + slugId?: string; + title?: string; + parentPageId?: string | null; + hasChildren?: boolean; + /** `listSpaceTree` nodes carry extra fields (position, icon, …). */ + [key: string]: unknown; +} +/** + * The structural client the engine depends on. Only `Pick` + * subsets are ever used: + * - pull reads: `getPageJson` (+ the tree walk's `listSpaceTree`), + * - push writes: `importPageMarkdown` / `createPage` / `deletePage` / + * `movePage` / `renamePage`, + * - continuous (phase B+): `listRecentSince` / `listTrash` / `restorePage`. + */ +export interface GitSyncClient { + /** + * Full tree of page nodes for the space (or the subtree rooted at + * `rootPageId`), each WITHOUT body content. `complete` is `false` when the + * walk was truncated / a fetch failed — the pull side suppresses absence + * deletions on an incomplete tree (SPEC §8). Native impl returns + * `complete: true` always (reads the DB, not a paginated REST endpoint). + */ + listSpaceTree(spaceId: string, rootPageId?: string): Promise<{ + pages: GitSyncPageNodeLite[]; + complete: boolean; + }>; + /** + * One page WITH its ProseMirror body content. 
`applyPullActions` reads + * `id`, `slugId`, `title`, `parentPageId`, `spaceId` (for the file meta) and + * `content` (to stabilize/serialize). `updatedAt` is carried for the + * poll-suppression loop-guard. + */ + getPageJson(pageId: string): Promise<{ + id: string; + slugId: string; + title: string; + parentPageId: string | null; + spaceId: string; + updatedAt: string; + content: unknown; + }>; + /** + * Merge a page's body from a self-contained markdown file (meta + body). The + * collab/Yjs write path (SPEC §2/§15.6) — never a raw jsonb overwrite. + * `applyPushActions` reads only an optional `updatedAt` off the result + * (via `extractUpdatedAt`, tolerant of extra fields). + * + * `baseMarkdown` is the last-synced version of the file (`refs/docmost/ + * last-pushed`), the common ancestor for a THREE-WAY merge against the live + * doc so concurrent human edits survive (review #5). Optional/null -> 2-way. + */ + importPageMarkdown(pageId: string, fullMarkdown: string, baseMarkdown?: string | null): Promise<{ + updatedAt?: string; + [key: string]: unknown; + }>; + /** + * Create a new page and return the assigned id at `data.id` + * (`applyPushActions` reads `result.data.id`, then writes it back into the + * file's meta). An optional top-level/`data.updatedAt` feeds the loop-guard. + */ + createPage(title: string, content: string, spaceId: string, parentPageId?: string): Promise<{ + data: { + id: string; + }; + updatedAt?: string; + [key: string]: unknown; + }>; + /** Soft-delete a page to Trash (SPEC §8). Result is not inspected. */ + deletePage(pageId: string): Promise; + /** + * Reparent a page (and optionally set its fractional-index `position`). The + * engine passes `position` UNDEFINED for now; the native impl computes a + * default between siblings. Result is not inspected. + */ + movePage(pageId: string, parentPageId: string | null, position?: string): Promise; + /** Change a page's title only (no body touch). Result is not inspected. 
*/ + renamePage(pageId: string, title: string): Promise; + /** + * Pages updated since `sinceIso` (the poll-safety reconciliation, SPEC §8). + * `spaceId` may be undefined (all spaces); `hardPageCap` bounds the walk. + */ + listRecentSince(spaceId: string | undefined, sinceIso: string | null, hardPageCap?: number): Promise; + /** List soft-deleted (trashed) pages for the space (deletion detection). */ + listTrash(spaceId: string): Promise; + /** Restore a soft-deleted page from Trash. Result is not inspected. */ + restorePage(pageId: string): Promise; +} diff --git a/packages/git-sync/build/engine/client.types.js b/packages/git-sync/build/engine/client.types.js new file mode 100644 index 00000000..199e849e --- /dev/null +++ b/packages/git-sync/build/engine/client.types.js @@ -0,0 +1,13 @@ +/** + * The client seam. `pull.ts`/`push.ts` depend on a narrow STRUCTURAL interface + * rather than any concrete client, because the gitmost server writes NATIVELY — + * through repositories + collab `openDirectConnection`. + * + * `GitSyncClient` is that interface: the native datasource (server side) + * implements it, and the engine only ever uses `Pick` + * subsets of it. The signatures below MIRROR exactly the methods the engine's + * `pull.ts`/`push.ts` actually call (arg shapes + the fields the engine reads + * off each result), so a REST-style client is still structurally assignable and + * the native adapter has a precise contract. 
/**
 * Run a settings factory and convert a zod validation failure into a readable
 * startup report instead of a raw stack trace.
 *
 * @param factory Zero-argument function that builds (and validates) settings.
 * @returns Whatever `factory` returns when it succeeds.
 * @throws Re-throws, unchanged, any error that is not a `ZodError`.
 *
 * On a `ZodError` the offending variables are listed on stderr, split into
 * "missing" and "invalid", and the process exits with status 1.
 */
export function loadSettingsOrExit(factory) {
    let failure;
    try {
        return factory();
    }
    catch (err) {
        failure = err;
    }
    // Only validation failures are reported here; everything else propagates.
    if (!(failure instanceof ZodError))
        throw failure;
    // A Set keeps first-seen order while collapsing repeated variable names.
    const missingNames = new Set();
    const invalidItems = [];
    for (const issue of failure.issues) {
        // Only the first path segment is used as the variable name.
        const varName = issue.path.length > 0 ? String(issue.path[0]) : '?';
        // "Missing" is an `invalid_type` issue whose received value was
        // undefined. Two shapes are recognized: a `received` field equal to
        // 'undefined', or a message containing "received undefined". Any other
        // issue (different code, or a non-undefined received value such as NaN)
        // is reported as invalid.
        const receivedUndefined = issue.received === 'undefined' ||
            /received undefined/i.test(issue.message ?? '');
        if (issue.code === 'invalid_type' && receivedUndefined) {
            missingNames.add(varName);
        }
        else {
            invalidItems.push(`${varName}: ${issue.message}`);
        }
    }
    const report = ['Configuration error in environment / .env:'];
    if (missingNames.size > 0) {
        report.push(' Missing required variable(s):');
        missingNames.forEach((n) => report.push(` - ${n}`));
    }
    if (invalidItems.length > 0) {
        report.push(' Invalid value(s):');
        invalidItems.forEach((item) => report.push(` - ${item}`));
    }
    report.push('', 'Set them in .env (see .env.example) and try again.');
    process.stderr.write(report.join('\n') + '\n');
    process.exit(1);
}
Called with the push + * dry-run's planned delete count (`Number.POSITIVE_INFINITY` when the dry-run + * itself failed, so the hook can fail safe) and the live client; returns the + * client to use for the REAL apply. The default (omitted) applies every op + * unmodified. gitmost uses it to neutralize deletes when over its cap. + * + * When omitted, NO dry-run is performed (one fewer push planning pass). + */ + resolveApplyClient?: (plannedDeletes: number, client: GitSyncClient) => GitSyncClient; +} +export interface RunCycleResult { + ran: boolean; + /** Set when the cycle short-circuited without running pull/push. */ + skipped?: "merge-in-progress"; + pull?: { + written: number; + deleted: number; + conflict: boolean; + }; + push?: { + mode: string; + failures: number; + }; +} +/** + * Run ONE full reconcile cycle for a space: PULL (Docmost -> vault) then PUSH + * (vault -> Docmost), under the engine's required branch choreography. This is + * the single entry point the app drives — it owns the staging order so it can + * never drift from the engine it ships with. + * + * Staging (the ⭐ data-loss-critical order, SPEC §6/§9): + * 1. assertGitAvailable + ensureRepo (the git state store must exist). + * 2. refuse on an unresolved merge (a prior conflicting pull); next checkout + * would fail otherwise. + * 3. ensureBranch('docmost','main') + checkout('docmost'). Pull writes MUST + * land on `docmost`, not `main`: applyPullActions commits on `docmost`, + * then checks out `main` and merges docmost -> main. Writing Docmost + * content straight onto `main` would clobber local file edits before push + * can diff them. + * 4. PULL: readExisting -> listSpaceTree -> computePullActions -> apply. + * 5. PUSH: optional dry-run to feed the delete-cap hook, then the real apply. + * + * Lock + cap POLICY live in the caller; this owns only the mechanics. 
/**
 * Execute one reconcile pass for a space: PULL (Docmost -> vault) and then
 * PUSH (vault -> Docmost), in the fixed staging order below.
 *
 * Staging:
 *  1. Verify git is runnable and the vault repo exists.
 *  2. If the vault is mid-merge, log and return `{ ran: false, skipped }`
 *     without touching anything else.
 *  3. Make sure the `docmost` branch exists (created from `main`) and check it
 *     out, so pull writes are applied while on `docmost`.
 *  4. PULL: read tracked `*.md` files, list the space tree, compute the pull
 *     actions and apply them.
 *  5. PUSH: when `resolveApplyClient` is supplied, run a dry-run push first to
 *     obtain the planned delete count (Infinity if the dry-run throws) and let
 *     the hook choose the client for the real push; then run the real push.
 *
 * @param deps `{ spaceId, client, vault, settings, fs, log, resolveApplyClient? }`.
 * @returns `{ ran: false, skipped: "merge-in-progress" }` when short-circuited,
 *   otherwise `{ ran: true, pull: { written, deleted, conflict },
 *   push: { mode, failures } }`.
 */
export async function runCycle(deps) {
    const { spaceId, client, vault, settings, fs, log, resolveApplyClient } = deps;
    const vaultRoot = settings.vaultPath;
    // Vault-relative path -> absolute path (plain string join, no normalization).
    const toAbsolute = (relPath) => `${vaultRoot}/${relPath}`;
    const readVaultFile = (relPath) => fs.readFile(toAbsolute(relPath));

    // Stage 1: the state store is git, so the binary and the repo must exist
    // before any tracked-file listing or diff.
    await vault.assertGitAvailable();
    await vault.ensureRepo();

    // Stage 2: never run on top of an unresolved merge.
    const midMerge = await vault.isMergeInProgress();
    if (midMerge) {
        log(`vault has an unresolved merge — resolve it (or 'git merge --abort') and re-run (SPEC §9); skipping cycle.`);
        return { ran: false, skipped: "merge-in-progress" };
    }

    // Stage 3: be on `docmost` before any pull write is applied.
    await vault.ensureBranch("docmost", "main");
    await vault.checkout("docmost");

    // Stage 4: PULL.
    const existing = await readExisting({
        listTracked: () => vault.listTrackedFiles("*.md"),
        readFile: readVaultFile,
    });
    const spaceTree = await client.listSpaceTree(spaceId);
    const pullPlan = computePullActions({
        pages: spaceTree.pages,
        treeComplete: spaceTree.complete,
        existing,
    });
    const pullIo = {
        client,
        git: vault,
        writeFile: (absPath, text) => fs.writeFile(absPath, text),
        mkdir: (absDir) => fs.mkdir(absDir),
        rm: (absPath) => fs.rm(absPath),
    };
    const pulled = await applyPullActions(pullIo, pullPlan, vaultRoot);
    // NOTE(review): the push below runs even when `pulled.merge.conflict` is
    // true. Presumably `runPush` guards against a mid-merge vault itself —
    // confirm against push.js.

    // Stage 5: PUSH.
    const basePushDeps = {
        settings,
        git: vault,
        makeClient: () => client,
        readFile: readVaultFile,
        writeFile: (relPath, text) => fs.writeFile(toAbsolute(relPath), text),
        log,
    };
    // Dry-run the push to learn how many deletes it would perform. A failed
    // dry-run reports Infinity so the caller's hook can fail safe.
    const countPlannedDeletes = async () => {
        try {
            const dry = await runPush(basePushDeps, { dryRun: true });
            return dry.planned?.deletes ?? 0;
        }
        catch (err) {
            const why = err instanceof Error ? err.message : String(err);
            log(`push dry-run planning failed (${why}); deferring deletion policy to the cap hook (fail-safe).`);
            return Number.POSITIVE_INFINITY;
        }
    };
    let applyClient = client;
    if (resolveApplyClient) {
        // Without a hook no dry-run happens at all.
        const plannedDeletes = await countPlannedDeletes();
        applyClient = resolveApplyClient(plannedDeletes, client);
    }
    const pushed = await runPush({ ...basePushDeps, makeClient: () => applyClient }, { dryRun: false });

    const pull = {
        written: pulled.written,
        deleted: pulled.deleted,
        conflict: pulled.merge.conflict,
    };
    const push = {
        mode: pushed.mode,
        failures: pushed.failures?.length ?? 0,
    };
    return { ran: true, pull, push };
}
/**
 * A git wrapper bound to a single vault path. Construct once per vault; every
 * method runs git with `cwd = vaultPath`.
 */
export declare class VaultGit {
    private readonly vaultPath;
    constructor(vaultPath: string);
    /**
     * Preflight: verify a runnable `git` binary is on PATH, so a missing binary
     * fails fast with an actionable message instead of an ENOENT inside the
     * first real git call. Presence check only (no version gate). Runs
     * `git --version` with NO `cwd`, because the vault dir may not exist yet.
     *
     * @throws Error when `git --version` exits non-zero or cannot be spawned.
     */
    assertGitAvailable(): Promise<void>;
    /**
     * Run a git command in the vault and return trimmed stdout. Thin wrapper
     * over `runRaw`: throws a unified Error (including stderr/stdout) on a
     * non-zero exit.
     */
    private run;
    /**
     * The one primitive every git invocation flows through. Builds the full
     * argv (`--no-pager -c core.quotepath=false <args>`), env, cwd and
     * maxBuffer, runs git, and never throws — it returns the exit info so a
     * non-zero exit can be treated as an error (`run`) or as a meaningful state
     * (e.g. a merge conflict).
     *
     *  - cwd: `opts.cwd === null` means "do not set cwd" (the preflight);
     *    otherwise `opts.cwd ?? this.vaultPath`.
     *  - env: `vaultGitEnv(opts?.env)`.
     *  - On a spawn/exec error the error `message` is captured when stderr is
     *    empty, so a failure such as ENOENT is not lost.
     */
    private runRaw;
    /**
     * Ensure the vault directory exists and is an initialized git repo on
     * `main` with an initial (empty) commit so branches exist. Idempotent. Sets
     * a LOCAL bot identity for the vault repo if none is set, and pins the
     * vault's local git config on every call.
     */
    ensureRepo(): Promise<void>;
    /** True if `cwd` is inside a git work-tree (the vault is initialized). */
    private isRepo;
    /** True if a LOCAL git config key is set in the vault repo. */
    private hasLocalConfig;
    /** True if the repo has at least one commit (HEAD resolves). */
    private hasAnyCommit;
    /** True if a branch with the given name exists. */
    branchExists(name: string): Promise<boolean>;
    /**
     * Create `name` from `fromBranch` if it does not already exist. No-op (and
     * no checkout) when the branch is already present.
     */
    ensureBranch(name: string, fromBranch: string): Promise<void>;
    /** Name of the currently checked-out branch. */
    currentBranch(): Promise<string>;
    /** Check out an existing branch. */
    checkout(name: string): Promise<void>;
    /** Stage everything (adds, modifications, deletions). */
    stageAll(): Promise<void>;
    /**
     * True if the vault is mid-merge (an unresolved merge from a previous run,
     * SPEC §9 / §12). Detected via a `MERGE_HEAD` ref OR any unmerged
     * (conflicted) index entries (`git ls-files -u`). Checked BEFORE any
     * checkout so a left-over merge produces a clear message instead of a raw
     * failure inside `checkout`.
     */
    isMergeInProgress(): Promise<boolean>;
    /**
     * Commit the currently STAGED changes with an explicit author/committer
     * identity and the given trailers appended to the message body (SPEC §7.3
     * provenance). The caller is expected to have staged its changes first
     * (e.g. via `stageAll`).
     *
     * @returns `true` if a commit was made, `false` if nothing was staged.
     */
    commit(message: string, opts: CommitOptions): Promise<boolean>;
    /**
     * Low-level commit used by both `commit` and `ensureRepo`'s initial commit.
     * Builds the full message with appended trailers and sets author +
     * committer identity via env vars (so the committer matches the author).
     */
    private commitRaw;
    /**
     * Merge `fromBranch` into the current branch (`git merge --no-edit`).
     * Conflict state is SURFACED in the result, NOT auto-resolved (SPEC §9):
     * conflict markers are left in the worktree for manual resolution.
     */
    merge(fromBranch: string): Promise<MergeResult>;
    /** True if the index has any unmerged (conflicted) paths. */
    private hasUnmergedPaths;
    /**
     * List tracked files on the current branch (paths relative to the vault
     * root, forward-slash separated). An optional glob (a git pathspec) narrows
     * the listing, e.g. `"*.md"`.
     *
     * Vault file names routinely contain Cyrillic (e.g. `Колонка.md`). With
     * git's default `core.quotepath=true`, `ls-files` would return such paths
     * octal-escaped and double-quoted. That is avoided two ways:
     *  - `core.quotepath=false` is part of the `runRaw` argv baseline.
     *  - `-z` emits NUL-delimited raw UTF-8 paths, which are split on `\0`
     *    (read from raw stdout, not the trimming `run()` helper), dropping
     *    empty entries. Paths are returned verbatim.
     */
    listTrackedFiles(glob?: string): Promise<string[]>;
    /**
     * Diff two refs with `--name-status -M -z` and parse the NUL-delimited
     * output (SPEC §6: the push direction diffs `main` against
     * `refs/docmost/last-pushed`). Rename detection is ON (`-M`), so a moved
     * file is reported as a single `R` row with both paths rather than a
     * delete+add pair (SPEC §8 "Move vs delete").
     *
     * Record shape by status:
     *  - A/M/D: `status\0path\0`
     *  - R/C:   `Rnnn\0oldPath\0newPath\0` (nnn = similarity score, e.g. `R100`)
     * Paths are returned verbatim.
     */
    diffNameStatus(fromRef: string, toRef: string): Promise<DiffEntry[]>;
    /**
     * Resolve a ref/commit-ish to its full SHA, or `null` if it does not exist
     * (`rev-parse --verify --quiet` exits non-zero for an unknown ref). Used to
     * read `refs/docmost/last-pushed` (SPEC §5), which is absent before the
     * first push.
     */
    revParse(ref: string): Promise<string | null>;
    /**
     * Read a ref to its SHA, or `null` if unset. Thin alias over `revParse`,
     * named for the push direction's marker `refs/docmost/last-pushed` (SPEC §5:
     * "what from `main` is already reflected in Docmost").
     */
    readRef(ref: string): Promise<string | null>;
    /**
     * Point `ref` at `target` (`git update-ref <ref> <target>`). Used to advance
     * `refs/docmost/last-pushed` to the just-pushed `main` commit after a push
     * (SPEC §6 step 3 / §5). `target` may be a SHA or any commit-ish git accepts.
     */
    updateRef(ref: string, target: string): Promise<void>;
    /**
     * Fast-forward `branch` to `toCommit` — but ONLY if it is a true
     * fast-forward, i.e. the current `branch` tip is an ancestor of `toCommit`
     * (`git merge-base --is-ancestor <branch> <toCommit>`). Used to advance the
     * `docmost` mirror branch after a clean push (SPEC §6 step 3 / §10).
     *
     * Never forces and never clobbers divergent history:
     *  - If `branch` IS an ancestor, it is advanced with
     *    `git update-ref refs/heads/<branch> <toCommit>` (no working-tree touch).
     *  - If it is NOT an ancestor, the branch is left alone and
     *    `{ ok: false, reason: 'not-fast-forward' }` is returned.
     *
     * @returns `{ ok: true }` when the branch was advanced (or was already at
     *   `toCommit`); `{ ok: false, reason }` otherwise, including when `branch`
     *   or `toCommit` is missing.
     */
    fastForwardBranch(branch: string, toCommit: string): Promise<{
        ok: boolean;
        reason?: string;
    }>;
    /**
     * Read a file's content at a specific ref (`git show <ref>:<path>`), or
     * `null` if the path does not exist there. Used by the push direction to
     * read the pre-image of a DELETED file (e.g. at `refs/docmost/last-pushed`)
     * so its `docmost:meta` — and therefore its `pageId` — can be recovered
     * (SPEC §6/§8). A non-zero exit maps to `null` rather than throwing.
     */
    showFileAtRef(ref: string, path: string): Promise<string | null>;
}
+ */ +export declare function buildCommitMessage(subject: string, trailers?: string[]): string; diff --git a/packages/git-sync/build/engine/git.js b/packages/git-sync/build/engine/git.js new file mode 100644 index 00000000..7a67f2eb --- /dev/null +++ b/packages/git-sync/build/engine/git.js @@ -0,0 +1,570 @@ +/** + * Thin async wrapper over the system `git` binary (SPEC §5: state store = git). + * + * IMPORTANT — VAULT-SCOPED: every operation here runs with `cwd = vaultPath`, + * which is the vault's OWN git repository (default `data/vault`), SEPARATE from + * the gitmost application repo. This module MUST NEVER run git against the + * application repo. `data/` is gitignored, so a nested repo under `data/vault` + * is safe. The pull cycle is READ-ONLY toward Docmost; this module only touches + * the local vault git, never a git remote (push is deferred, see SPEC §7). + * + * Implementation notes: + * - We shell out via `node:child_process` `execFile` (promisified), passing + * ARGS AS AN ARRAY — no shell, so there is no command injection surface even + * if a page title / branch name contains shell metacharacters. + * - EVERY git invocation funnels through the single `runRaw` primitive, which + * ALWAYS prepends `--no-pager -c core.quotepath=false` to the argv (so git + * never blocks on a pager and always prints verbatim UTF-8 paths). There is + * no exception — even the `git --version` preflight goes through `runRaw`. + * - "nothing to commit" is treated as a graceful no-op, not an error. + */ +import { execFile } from "node:child_process"; +import { mkdir } from "node:fs/promises"; +import { promisify } from "node:util"; +const execFileAsync = promisify(execFile); +/** Bot identity used for engine-authored vault commits (SPEC §7.3). */ +export const BOT_AUTHOR_NAME = "Docmost Sync"; +export const BOT_AUTHOR_EMAIL = "docmost-sync@local"; +/** Default branch the vault repo is initialized on. 
*/ +export const DEFAULT_BRANCH = "main"; +/** + * A git wrapper bound to a single vault path. Construct once per vault; every + * method runs git with `cwd = vaultPath`. + */ +export class VaultGit { + vaultPath; + constructor(vaultPath) { + this.vaultPath = vaultPath; + } + /** + * Preflight: verify a runnable `git` binary is on PATH. The daemon shells out + * to system `git` for every vault operation, so a missing binary (e.g. a slim + * container image without git) must fail fast with an actionable message + * rather than a cryptic ENOENT deep inside the first real git call. Presence + * check only — we do NOT gate on a specific version. Runs `git --version` + * with NO `cwd` (the vault dir may not exist yet at preflight time). + */ + async assertGitAvailable() { + // Goes through the single `runRaw` primitive like every other invocation. + // `cwd: null` means "do not set a cwd" — the vault dir may not exist yet at + // preflight time, so we must not point git at a missing directory. + const r = await this.runRaw(["--version"], { cwd: null }); + if (r.code !== 0) { + const detail = (r.stderr || r.stdout || "").trim(); + throw new Error("git binary not found or not runnable — install git (the vault state " + + `store requires it). Underlying error: ${detail}`); + } + } + /** + * Run a git command in the vault and return trimmed stdout. THIN wrapper over + * the single `runRaw` primitive: throws a clear, unified Error (including + * stderr/stdout) on a non-zero exit. + */ + async run(args, opts) { + const r = await this.runRaw(args, opts); + if (r.code !== 0) { + const detail = (r.stderr || r.stdout || "").trim(); + throw new Error(`git ${args.join(" ")} failed: ${detail}`); + } + return r.stdout.trim(); + } + /** + * The ONE primitive every git invocation in this module flows through. 
Builds + * the full argv (`--no-pager -c core.quotepath=false `), env, cwd, and + * maxBuffer, runs git, and NEVER throws — it returns the exit info so callers + * can treat a non-zero exit as either an error (`run`) or a meaningful state + * (e.g. a merge conflict, a porcelain diff that "fails" deliberately). + * + * - argv: ALWAYS prepends `--no-pager -c core.quotepath=false`, so git never + * blocks on a pager and always prints verbatim UTF-8 paths (no octal + * escaping/quoting). `quotepath=false` is the baseline for ALL path- + * printing commands (ls-files, diff --name-only, …). + * - cwd: `opts.cwd === null` -> do NOT set cwd (the preflight, where the + * vault dir may not exist); otherwise `opts.cwd ?? this.vaultPath`. + * - env: `vaultGitEnv(opts?.env)` (cwd-isolation + caller extras). + * - On a spawn/exec error we capture the error `message` too, so a failure + * before git could write to stderr (e.g. ENOENT) is NOT lost. + */ + async runRaw(args, opts) { + const cwd = opts?.cwd === null ? undefined : (opts?.cwd ?? this.vaultPath); + try { + const { stdout, stderr } = await execFileAsync("git", ["--no-pager", "-c", "core.quotepath=false", ...args], { + // Generous buffer: file listings / porcelain output on a large vault + // can be sizable. + ...(cwd !== undefined ? { cwd } : {}), + maxBuffer: 64 * 1024 * 1024, + env: vaultGitEnv(opts?.env), + }); + return { code: 0, stdout, stderr }; + } + catch (err) { + const e = err; + return { + code: typeof e.code === "number" ? e.code : 1, + stdout: e.stdout ?? "", + // Preserve the error message when there is no stderr (e.g. a spawn + // failure like ENOENT, where promisified execFile sets stderr to an + // EMPTY STRING — so `||`, not `??`, to fall through to `message`). + stderr: e.stderr || e.message || "", + }; + } + } + /** + * Ensure the vault directory exists and is an initialized git repo on `main` + * with an initial (empty) commit so branches exist. Idempotent: safe to call + * on every run. 
Sets a LOCAL bot identity for the vault repo if none is set + * (so engine commits never fall back to a global/unset identity). + */ + async ensureRepo() { + await mkdir(this.vaultPath, { recursive: true }); + if (!(await this.isRepo())) { + // `git init -b main` sets the initial branch on modern git; we still + // guard the branch name below for safety on older binaries. + await this.run(["init", "-b", DEFAULT_BRANCH]); + } + // Set a local identity for the vault repo if unset, so engine commits have + // a deterministic committer even on a machine with no global git config. + if (!(await this.hasLocalConfig("user.name"))) { + await this.run(["config", "user.name", BOT_AUTHOR_NAME]); + } + if (!(await this.hasLocalConfig("user.email"))) { + await this.run(["config", "user.email", BOT_AUTHOR_EMAIL]); + } + // Neutralize correctness-affecting git config in the vault's LOCAL config so + // a user's GLOBAL/system config cannot change porcelain BEHAVIOR (not just + // output) and corrupt the vault. The vault is OUR dedicated repo, so LOCAL + // values (which override global/system) are the right scope. Set + // UNCONDITIONALLY every run — idempotent and cheap; `git config ` + // writes to `--local` by default inside the repo. These MUST be in place + // before any add/commit/checkout that could be affected, hence they run + // before the initial-commit block below. + // - core.autocrlf=false — CRITICAL (SPEC §11): a global core.autocrlf=true + // would rewrite LF<->CRLF on add/checkout, making our deterministic, + // byte-stable markdown churn and breaking the round-trip invariant. + // `false` guarantees git stores/checks out verbatim bytes. + // - core.safecrlf=false — avoid CRLF-related warnings/aborts on add. + // - commit.gpgsign=false — the headless daemon must never try to GPG-sign + // a commit (would fail/hang; we already set GIT_TERMINAL_PROMPT=0). 
+ // - core.attributesFile=/dev/null — neutralize the user's GLOBAL + // gitattributes so a global clean/smudge filter (filter..clean) + // cannot rewrite the STORED blob and break §11 byte-stability (a config + // that core.autocrlf=false does not cover). POSIX-only path, which is + // fine: the daemon runs on Linux (Docker) / macOS. A system + // /etc/gitattributes remains the host admin's domain (out of scope). + // NOTE: these stay PERSISTED LOCAL config (not `-c` flags) on purpose — a + // human running git by hand in the vault must inherit the same neutralized + // behavior; a transient `-c` would not persist. (core.quotepath, by + // contrast, only affects OUR parsing of output and so is baked into the + // `runRaw` argv baseline instead.) + try { + await this.run(["config", "core.autocrlf", "false"]); + await this.run(["config", "core.safecrlf", "false"]); + await this.run(["config", "commit.gpgsign", "false"]); + await this.run(["config", "core.attributesFile", "/dev/null"]); + } + catch (err) { + const detail = err instanceof Error ? err.message : String(err); + throw new Error(`failed to pin vault git config (SPEC §11) — ensure ${this.vaultPath}` + + "/.git/config is writable and not locked (e.g. stale config.lock): " + + detail); + } + // Create the initial empty commit on `main` if the repo has no commits yet, + // so both `main` and (later) `docmost` branches have a common base. + if (!(await this.hasAnyCommit())) { + // Make sure we are on the default branch before the first commit (covers + // the older-git case where `init -b` was not honored). + await this.run(["checkout", "-B", DEFAULT_BRANCH]); + await this.commitRaw("init vault", { + authorName: BOT_AUTHOR_NAME, + authorEmail: BOT_AUTHOR_EMAIL, + allowEmpty: true, + }); + } + } + /** True if `cwd` is inside a git work-tree (the vault is initialized). 
*/ + async isRepo() { + const r = await this.runRaw(["rev-parse", "--is-inside-work-tree"]); + return r.code === 0 && r.stdout.trim() === "true"; + } + /** True if a LOCAL git config key is set in the vault repo. */ + async hasLocalConfig(key) { + const r = await this.runRaw(["config", "--local", "--get", key]); + return r.code === 0 && r.stdout.trim().length > 0; + } + /** True if the repo has at least one commit (HEAD resolves). */ + async hasAnyCommit() { + const r = await this.runRaw(["rev-parse", "--verify", "HEAD"]); + return r.code === 0; + } + /** True if a branch with the given name exists. */ + async branchExists(name) { + const r = await this.runRaw([ + "rev-parse", + "--verify", + `refs/heads/${name}`, + ]); + return r.code === 0; + } + /** + * Create `name` from `fromBranch` if it does not already exist. No-op (and no + * checkout) when the branch is already present. + */ + async ensureBranch(name, fromBranch) { + if (await this.branchExists(name)) + return; + await this.run(["branch", name, fromBranch]); + } + /** Name of the currently checked-out branch. */ + async currentBranch() { + return this.run(["rev-parse", "--abbrev-ref", "HEAD"]); + } + /** Check out an existing branch. */ + async checkout(name) { + await this.run(["checkout", name]); + } + /** Stage everything (adds, modifications, deletions). */ + async stageAll() { + await this.run(["add", "-A"]); + } + /** + * True if the vault is mid-merge (an unresolved merge from a previous run, + * SPEC §9 / §12). Detected via a `MERGE_HEAD` ref OR any unmerged + * (conflicted) index entries (`git ls-files -u`). The pull cycle checks this + * BEFORE any checkout so a left-over merge produces a clear, actionable + * message instead of a raw "you need to resolve your current index first" + * failure deep inside `checkout`. This is what makes re-runs converge + * (resumability, SPEC §12). + */ + async isMergeInProgress() { + // MERGE_HEAD exists exactly while a merge is in progress. 
+ const mergeHead = await this.runRaw([ + "rev-parse", + "--verify", + "--quiet", + "MERGE_HEAD", + ]); + if (mergeHead.code === 0 && mergeHead.stdout.trim().length > 0) + return true; + // Fallback / belt-and-suspenders: any unmerged index entries also mean the + // working tree is mid-conflict and a checkout would refuse. + const unmerged = await this.runRaw(["ls-files", "-u"]); + return unmerged.code === 0 && unmerged.stdout.trim().length > 0; + } + /** + * Commit the currently STAGED changes with an explicit author/committer + * identity and the given trailers appended to the message body (SPEC §7.3 + * provenance). Returns `true` if a commit was made, `false` if there was + * nothing to commit (graceful no-op). The caller is expected to have staged + * its changes first (e.g. via `stageAll`). + */ + async commit(message, opts) { + // Nothing staged -> nothing to commit. Treat as a no-op (SPEC §11: a + // deterministic re-pull of unchanged pages produces identical bytes, so + // git sees no diff and we must not error). + const staged = await this.runRaw([ + "diff", + "--cached", + "--quiet", + ]); + // `diff --cached --quiet` exits 0 when the index matches HEAD (nothing + // staged), 1 when there are staged changes. + if (staged.code === 0) + return false; + await this.commitRaw(message, opts); + return true; + } + /** + * Low-level commit used by both `commit` and `ensureRepo`'s initial commit. + * Builds the full message with appended trailers and sets author + committer + * identity via env vars (so the committer matches the author, not the repo + * default). + */ + async commitRaw(message, opts) { + const fullMessage = buildCommitMessage(message, opts.trailers); + // `--no-verify` skips pre-commit/commit-msg hooks: a global core.hooksPath + // (or any injected hook) must never interfere with engine commits in our + // dedicated vault repo. 
+ const args = ["commit", "--no-verify", "-m", fullMessage]; + if (opts.allowEmpty) + args.push("--allow-empty"); + // Route through the single `runRaw` primitive; set author + committer + // identity via env vars (so the committer matches the author, not the repo + // default). Throw via the same unified message on a non-zero exit. + const r = await this.runRaw(args, { + env: { + GIT_AUTHOR_NAME: opts.authorName, + GIT_AUTHOR_EMAIL: opts.authorEmail, + GIT_COMMITTER_NAME: opts.authorName, + GIT_COMMITTER_EMAIL: opts.authorEmail, + }, + }); + if (r.code !== 0) { + const detail = (r.stderr || r.stdout || "").trim(); + throw new Error(`git ${args.join(" ")} failed: ${detail}`); + } + } + /** + * Merge `fromBranch` into the current branch (`git merge --no-edit`). + * Fast-forwards when possible; performs a real 3-way merge otherwise. Conflict + * state is SURFACED (returned), NOT auto-resolved (SPEC §9): the conflict + * markers are left in the worktree for manual resolution by a later increment, + * and — critically — nothing is pushed to Docmost (we never write to Docmost + * anyway). + */ + async merge(fromBranch) { + const r = await this.runRaw(["merge", "--no-edit", fromBranch]); + const output = `${r.stdout}\n${r.stderr}`.trim(); + if (r.code === 0) { + return { ok: true, conflict: false, output }; + } + // A non-zero exit on merge most commonly means a conflict. Confirm by + // checking for unmerged paths (porcelain "U" status) so we don't mislabel + // an unrelated failure as a conflict. + const conflict = await this.hasUnmergedPaths(); + return { ok: false, conflict, output }; + } + /** True if the index has any unmerged (conflicted) paths. */ + async hasUnmergedPaths() { + const r = await this.runRaw(["diff", "--name-only", "--diff-filter=U"]); + return r.code === 0 && r.stdout.trim().length > 0; + } + /** + * List tracked files on the current branch (paths relative to the vault + * root, forward-slash separated). 
An optional glob (a git pathspec) narrows + * the listing, e.g. `"*.md"`. + * + * The target wiki is RUSSIAN, so vault file names routinely contain Cyrillic + * (e.g. `Колонка.md`). With git's DEFAULT `core.quotepath=true`, `ls-files` + * returns non-ASCII paths octal-escaped and double-quoted (`"\320\232..."`), + * which `src/pull.ts` `readExisting` would then parse as garbage paths, + * breaking move/duplicate detection. We defeat that two ways at once: + * - `core.quotepath=false` disables the octal-escape/quoting. It is now the + * `runRaw` argv baseline (prepended to EVERY invocation), so we no longer + * pass it inline here. + * - `-z` emits NUL-delimited RAW UTF-8 paths (no quoting, no newline + * ambiguity), which we split on `\0`. + * We read the RAW stdout (NOT the trimming `run()` helper, which would mangle + * the NUL-delimited bytes) and split on `\0`, dropping empty entries. Paths + * are returned verbatim — git already emits forward slashes. + */ + async listTrackedFiles(glob) { + const r = await this.runRaw(["ls-files", "-z", ...(glob ? [glob] : [])]); + if (r.code !== 0) { + const detail = (r.stderr || r.stdout || "").trim(); + throw new Error(`git ls-files failed: ${detail}`); + } + return r.stdout.split("\0").filter((p) => p.length > 0); + } + /** + * Diff two refs with `--name-status -M -z` and parse the NUL-delimited output + * (SPEC §6: the FS→Docmost push direction diffs `main` against + * `refs/docmost/last-pushed`). Rename detection is ON (`-M`), so a moved/renamed + * file is reported as a single `R` row with both its old and new path instead + * of a delete+add pair — that distinction is what lets the push planner tell a + * move from a delete+create (SPEC §8 "Move vs delete"). + * + * `-z` makes git emit NUL-delimited RAW UTF-8 records (the Russian wiki has + * Cyrillic file names) with NO quoting/escaping. 
The record shape differs by + * status: + * - A/M/D: `status\0path\0` + * - R/C: `Rnnn\0oldPath\0newPath\0` (nnn = similarity score, e.g. `R100`) + * We read the RAW stdout (not the trimming `run()` helper, which would mangle + * the NUL bytes), split on `\0`, drop the trailing empty entry, and walk the + * tokens pulling 1 or 2 path tokens per status. Paths are returned verbatim. + */ + async diffNameStatus(fromRef, toRef) { + const r = await this.runRaw([ + "diff", + "--name-status", + "-M", + "-z", + fromRef, + toRef, + ]); + if (r.code !== 0) { + const detail = (r.stderr || r.stdout || "").trim(); + throw new Error(`git diff --name-status failed: ${detail}`); + } + // Tokens alternate: ... With `-z`, + // each token (status code AND each path) is its own NUL-delimited field. + const tokens = r.stdout.split("\0").filter((t) => t.length > 0); + const entries = []; + let i = 0; + while (i < tokens.length) { + const raw = tokens[i++]; + // The status token is e.g. `A`, `M`, `D`, or `R100` / `C075`. The leading + // letter is the change kind; any trailing digits are the similarity score. + const letter = raw[0]; + if (letter === "R" || letter === "C") { + const score = Number.parseInt(raw.slice(1), 10); + const oldPath = tokens[i++]; + const path = tokens[i++]; + if (oldPath === undefined || path === undefined) + break; // malformed tail + entries.push({ + status: letter, + path, + oldPath, + ...(Number.isFinite(score) ? { score } : {}), + }); + } + else if (letter === "A" || letter === "M" || letter === "D") { + const path = tokens[i++]; + if (path === undefined) + break; // malformed tail + entries.push({ status: letter, path }); + } + else { + // Unknown/other status (e.g. T type-change, U unmerged) — consume one + // path token defensively so the walk stays aligned, but do not emit it + // (the push planner only handles A/M/D/R/C). + i++; + } + } + return entries; + } + /** + * Resolve a ref/commit-ish to its full SHA, or `null` if it does not exist. 
+ * `rev-parse --verify --quiet` exits non-zero (and prints nothing) for an + * unknown ref, so a non-zero exit maps cleanly to `null`. Used to read + * `refs/docmost/last-pushed` (SPEC §5) — which is absent before the first push. + */ + async revParse(ref) { + const r = await this.runRaw(["rev-parse", "--verify", "--quiet", ref]); + if (r.code !== 0) + return null; + const sha = r.stdout.trim(); + return sha.length > 0 ? sha : null; + } + /** + * Read a ref to its SHA, or `null` if unset. Thin alias over `revParse`, + * named for the push direction's marker `refs/docmost/last-pushed` (SPEC §5: + * "что из `main` уже отражено в Docmost"). + */ + async readRef(ref) { + return this.revParse(ref); + } + /** + * Point `ref` at `target` (`git update-ref `). Used to advance + * `refs/docmost/last-pushed` to the just-pushed `main` commit after a push + * (SPEC §6 step 3 / §5). `target` may be a SHA or any commit-ish git accepts. + */ + async updateRef(ref, target) { + await this.run(["update-ref", ref, target]); + } + /** + * Fast-forward `branch` to `toCommit` — but ONLY if it is a TRUE fast-forward, + * i.e. the current `branch` tip is an ancestor of `toCommit` (verified via + * `git merge-base --is-ancestor `). Used to advance the + * `docmost` mirror branch after a clean push (SPEC §6 step 3 / §10): once a + * push succeeds, Docmost already contains the pushed `main` content, so the + * mirror must reflect it — otherwise the NEXT pull would diff our own write + * back and re-pull it (loop-guard). + * + * SAFETY — never force, never clobber divergent history: + * - If `branch` IS an ancestor of `toCommit`, advance it with + * `git update-ref refs/heads/ `. The `docmost` branch is + * NOT checked out during a push (push works on `main`), so updating the ref + * directly is safe and avoids any working-tree touch. 
+ * - If `branch` is NOT an ancestor (divergent / would-be non-fast-forward), + * do NOT move it — return `{ ok: false, reason: 'not-fast-forward' }` and + * let the caller log it. We must never overwrite a `docmost` history that + * has commits the push base does not contain. + * + * Returns `{ ok: true }` when the branch was advanced (or already at + * `toCommit`, a degenerate fast-forward), `{ ok: false, reason }` otherwise. + * A missing `branch` or `toCommit` also yields `{ ok: false }` with a reason. + */ + async fastForwardBranch(branch, toCommit) { + const branchRef = `refs/heads/${branch}`; + // Resolve both endpoints first so a missing ref is a clean refusal, not a + // confusing `merge-base` failure. + const branchSha = await this.revParse(branchRef); + if (branchSha === null) { + return { ok: false, reason: `branch ${branch} does not exist` }; + } + const targetSha = await this.revParse(toCommit); + if (targetSha === null) { + return { ok: false, reason: `target ${toCommit} does not resolve` }; + } + // Already at the target -> a no-op fast-forward (still ok). + if (branchSha === targetSha) + return { ok: true }; + // `merge-base --is-ancestor A B` exits 0 iff A is an ancestor of B. Only a + // true ancestor is a fast-forward; anything else is divergent and refused. + const ancestor = await this.runRaw([ + "merge-base", + "--is-ancestor", + branchSha, + targetSha, + ]); + if (ancestor.code !== 0) { + return { ok: false, reason: "not-fast-forward" }; + } + // Safe to advance: the branch is not checked out during push, so a direct + // ref update avoids a checkout/working-tree touch. + await this.updateRef(branchRef, targetSha); + return { ok: true }; + } + /** + * Read a file's content at a specific ref (`git show :`), or `null` + * if the path does not exist there. Used by the push direction to read the + * PRE-IMAGE of a DELETED file (e.g. 
at `refs/docmost/last-pushed`) so its + * `docmost:meta` — and therefore its `pageId` — can be recovered to translate + * the deletion into a `delete_page` (SPEC §6/§8: only TRACKED files, i.e. ones + * that had a pageId, are deleted in Docmost). A non-zero exit (path absent at + * that ref) maps to `null` rather than throwing. + */ + async showFileAtRef(ref, path) { + // `git show :` requires the path relative to the repo root; pass + // it verbatim (forward-slash, matching `listTrackedFiles` / diff output). + const r = await this.runRaw(["show", `${ref}:${path}`]); + if (r.code !== 0) + return null; + return r.stdout; + } +} +/** + * Build the environment for a vault git invocation (SPEC §12 cwd-isolation). + * Used by the single `runRaw` primitive every git command flows through, so + * these pins apply uniformly (including the `git --version` preflight). + * + * cwd-isolation is this module's central safety guarantee: every git command + * MUST operate on the vault repo at `cwd: vaultPath` and nothing else. An + * inherited `GIT_DIR` / `GIT_WORK_TREE` in `process.env` would silently + * redirect the operation away from `cwd` (e.g. to the source repo or another + * checkout), defeating that guarantee. So we always strip them, regardless of + * whatever else the caller adds (author/committer identity, etc.). + * + * Exported for unit testing. + */ +export function vaultGitEnv(extra) { + const env = { + ...process.env, + // Locale-independent output (defense in depth). We never parse localized + // prose, but pinning the locale prevents a future regression where some + // git message we DO key on is translated by an inherited LC_ALL/LANG. + LC_ALL: "C", + LANG: "C", + // Never page (we already pass --no-pager, but a stray GIT_PAGER could still + // bite) and never block on an interactive prompt (e.g. credentials) — the + // daemon runs unattended and must not hang. 
+ GIT_PAGER: "cat", + GIT_TERMINAL_PROMPT: "0", + ...extra, + }; + delete env.GIT_DIR; + delete env.GIT_WORK_TREE; + return env; +} +/** + * Build a commit message body with trailer lines appended (SPEC §7.3). The + * trailers are separated from the subject by a blank line so `git interpret- + * trailers` / `git log --format=%(trailers)` parse them as trailers. + * Exported for unit testing. + */ +export function buildCommitMessage(subject, trailers) { + if (!trailers || trailers.length === 0) + return subject; + return `${subject}\n\n${trailers.join("\n")}`; +} diff --git a/packages/git-sync/build/engine/layout.d.ts b/packages/git-sync/build/engine/layout.d.ts new file mode 100644 index 00000000..8e6d14b4 --- /dev/null +++ b/packages/git-sync/build/engine/layout.d.ts @@ -0,0 +1,44 @@ +/** + * Pure page-tree -> vault path mapping (SPEC §12). + * + * Given the flat list of page nodes for a space (as returned by + * `listAllSpacePages`), compute for every page a deterministic, collision-free + * destination: a folder path (root -> leaf ancestors) plus a file stem (the + * page's own name, no extension). This module is intentionally PURE and + * dependency-free apart from the sanitization helpers, so the whole tree -> + * path logic is unit-testable without any I/O. The names are COSMETIC; identity + * lives in each file's meta block (pageId / slugId). + */ +/** Flat page node as returned by `listAllSpacePages` (no content). */ +export interface PageNode { + id: string; + title?: string; + slugId?: string; + parentPageId?: string | null; + hasChildren?: boolean; +} +/** A page's resolved vault destination: folder path + file stem. */ +export interface VaultEntry { + /** Folder path, root -> leaf (the page's ancestors). Empty for a root page. */ + segments: string[]; + /** The page's own file name without extension. */ + stem: string; +} +/** + * Build the full vault layout for a space. + * + * Returns a Map keyed by pageId -> `{ segments, stem }`. 
The result is + * deterministic for a given input and guarantees every full destination path + * (`[...segments, stem].join("/")`) is unique, so no page can silently overwrite + * another. + * + * Disambiguation is layered: + * 1. Sibling collisions (same sanitized title under the same parent) are + * resolved with a stable ` ~` suffix (the suffix is itself + * sanitized, since slugId/id is untrusted data that must never inject a + * path separator). + * 2. A final full-path pass catches residual collisions that sibling-scoping + * cannot see — e.g. two pages whose parents are BOTH outside the input set + * both bucket at the root with `segments: []`. + */ +export declare function buildVaultLayout(pages: PageNode[]): Map; diff --git a/packages/git-sync/build/engine/layout.js b/packages/git-sync/build/engine/layout.js new file mode 100644 index 00000000..7142c29d --- /dev/null +++ b/packages/git-sync/build/engine/layout.js @@ -0,0 +1,170 @@ +/** + * Pure page-tree -> vault path mapping (SPEC §12). + * + * Given the flat list of page nodes for a space (as returned by + * `listAllSpacePages`), compute for every page a deterministic, collision-free + * destination: a folder path (root -> leaf ancestors) plus a file stem (the + * page's own name, no extension). This module is intentionally PURE and + * dependency-free apart from the sanitization helpers, so the whole tree -> + * path logic is unit-testable without any I/O. The names are COSMETIC; identity + * lives in each file's meta block (pageId / slugId). + */ +import { sanitizeTitle, disambiguate } from "./sanitize.js"; +/** + * Build the full vault layout for a space. + * + * Returns a Map keyed by pageId -> `{ segments, stem }`. The result is + * deterministic for a given input and guarantees every full destination path + * (`[...segments, stem].join("/")`) is unique, so no page can silently overwrite + * another. + * + * Disambiguation is layered: + * 1. 
Sibling collisions (same sanitized title under the same parent) are + * resolved with a stable ` ~` suffix (the suffix is itself + * sanitized, since slugId/id is untrusted data that must never inject a + * path separator). + * 2. A final full-path pass catches residual collisions that sibling-scoping + * cannot see — e.g. two pages whose parents are BOTH outside the input set + * both bucket at the root with `segments: []`. + */ +export function buildVaultLayout(pages) { + // Index pages by id so the parent chain can be walked. Guard against + // duplicate ids in the input (first one wins). + const byId = new Map(); + for (const p of pages) { + if (p && p.id && !byId.has(p.id)) + byId.set(p.id, p); + } + // Resolve each node's display name once, deterministically, tracking sibling + // collisions per parent. `usedBySibling` maps a parent key -> set of names + // already taken under that parent. The bucket key is the node's parent ONLY + // when that parent is actually present in `byId`; otherwise (null parent, or + // an orphan whose parent is outside the input set) the node buckets at + // `"__root__"`. This is critical: orphans land at the vault root (see + // `folderSegmentsFor`), so they MUST share the root bucket with real root + // pages to be disambiguated against each other here — making `nameById` final + // before any `segments` are computed, so no ancestor name can drift later. + const usedBySibling = new Map(); + const nameById = new Map(); + for (const p of pages) { + if (p && p.id && !nameById.has(p.id)) { + const parentKey = p.parentPageId && byId.has(p.parentPageId) ? p.parentPageId : "__root__"; + nameById.set(p.id, nameForNode(p, parentKey, usedBySibling)); + } + } + // Every id we index above MUST get a resolved name; this helper returns it + // and THROWS if it is somehow absent, rather than silently recomputing a + // DIFFERENT, non-disambiguated name (which would desync a folder segment from + // its target file). 
+ const nameOf = (id) => { + const name = nameById.get(id); + if (name === undefined) { + throw new Error(`buildVaultLayout: no resolved name for page id ${id}`); + } + return name; + }; + // Build the folder path for a page by walking parentPageId to the root. The + // page's OWN name is the file stem; its ancestors become folders. A `visited` + // guard prevents an infinite loop on a malformed parent cycle. + const folderSegmentsFor = (node) => { + const ancestors = []; + const visited = new Set(); + let current = node.parentPageId + ? byId.get(node.parentPageId) + : undefined; + while (current && current.id && !visited.has(current.id)) { + visited.add(current.id); + ancestors.unshift(nameOf(current.id)); + current = current.parentPageId + ? byId.get(current.parentPageId) + : undefined; + } + return ancestors; + }; + // First pass: compute the provisional { segments, stem } for every node. + const layout = new Map(); + for (const p of pages) { + if (!p || !p.id || layout.has(p.id)) + continue; + layout.set(p.id, { + segments: folderSegmentsFor(p), + stem: nameOf(p.id), + }); + } + // FOLDER-NOTE transform (native-Obsidian layout): a page WITH CHILDREN lives at + // `<…>//.md` — its body is the folder-note INSIDE its own folder + // (LostPaul Folder Notes convention), and its children sit alongside it in that + // folder. A leaf stays `<…>/.md`. Children's segments already point into + // the parent's folder (folderSegmentsFor walks ancestor NAMES), so only the + // parent's own file relocates here; the sibling name pass above already made + // the parent name unique, so folder == file name stays consistent. + for (const p of pages) { + if (!p || !p.id) + continue; + const entry = layout.get(p.id); + if (entry && p.hasChildren) { + entry.segments = [...entry.segments, entry.stem]; + } + } + // Final full-path uniqueness pass — a belt-and-suspenders safety net. 
Note + // that cross-bucket (orphan/root) collisions are now resolved in the name pass + // above (orphans share the "__root__" bucket), so ancestor names are final + // before `segments` are built and this pass should rarely/never re-stem an + // ancestor. It only re-stems the colliding LATER leaf via the sanitized + // slugId/id, then (if still colliding) appends the id. + // + // Process FOLDER-NOTES (pages with children) FIRST so a parent claims its + // canonical `/.md` before a same-named CHILD — the child (a leaf) + // is the one that disambiguates, never the folder-note. + const usedPaths = new Set(); + const seenIds = new Set(); + const pathKey = (e) => [...e.segments, e.stem].join("/"); + const ordered = pages + .filter((p) => Boolean(p && p.id)) + .sort((a, b) => Number(Boolean(b.hasChildren)) - Number(Boolean(a.hasChildren))); + for (const p of ordered) { + if (seenIds.has(p.id)) + continue; + seenIds.add(p.id); + const entry = layout.get(p.id); + if (!entry) + continue; + if (usedPaths.has(pathKey(entry))) { + // First attempt: disambiguate the stem with the sanitized slugId (or id). + entry.stem = disambiguate(entry.stem, sanitizeTitle(p.slugId ?? p.id)); + if (usedPaths.has(pathKey(entry))) { + // Still colliding: append the (sanitized) id as a last resort. The id + // is globally unique, so this always resolves the collision. + entry.stem = disambiguate(entry.stem, sanitizeTitle(p.id)); + } + } + usedPaths.add(pathKey(entry)); + } + return layout; +} +/** + * Compute a deterministic, collision-free name for a node among its SIBLINGS. + * `usedBySibling` maps a parent key -> set of names already taken, so two + * siblings that sanitize to the same name get a stable ` ~slugId` suffix + * (SPEC §12). The suffix is itself passed through `sanitizeTitle`, because the + * slugId/id is a second untrusted-data channel that must never leak a path + * separator into the name. 
`parentKey` is supplied by the caller (it resolves + * to `"__root__"` for root pages AND for orphans whose parent is outside the + * input set, so they share one bucket). The name is COSMETIC; identity lives in + * the meta block. + */ +function nameForNode(node, parentKey, usedBySibling) { + let used = usedBySibling.get(parentKey); + if (!used) { + used = new Set(); + usedBySibling.set(parentKey, used); + } + let name = sanitizeTitle(node.title ?? ""); + if (used.has(name)) { + // Sibling collision: disambiguate with the stable, sanitized slugId (fall + // back to the sanitized pageId if no slugId is present). + name = disambiguate(name, sanitizeTitle(node.slugId ?? node.id)); + } + used.add(name); + return name; +} diff --git a/packages/git-sync/build/engine/loop-guard.d.ts b/packages/git-sync/build/engine/loop-guard.d.ts new file mode 100644 index 00000000..95980d02 --- /dev/null +++ b/packages/git-sync/build/engine/loop-guard.d.ts @@ -0,0 +1,13 @@ +/** + * Stable hash of a page's markdown BODY (SPEC §10 "хэш тела"). Deterministic: + * the same input string always yields the same digest, a different input a + * different one. Used to recognize our own write later (loop suppression). + * + * We hash the body STRING as-is (UTF-8) with SHA-256 and return lowercase hex. + * SPEC §10 keys on the body hash rather than file bytes; callers decide WHAT + * counts as "the body" (here it is the exact string passed in — typically the + * self-contained markdown that was pushed). No normalization is applied: the + * caller is responsible for passing a canonical/stable representation if it + * wants hash equality across cosmetic-only differences. 
+ */ +export declare function bodyHash(markdownBody: string): string; diff --git a/packages/git-sync/build/engine/loop-guard.js b/packages/git-sync/build/engine/loop-guard.js new file mode 100644 index 00000000..a85047e4 --- /dev/null +++ b/packages/git-sync/build/engine/loop-guard.js @@ -0,0 +1,28 @@ +/** + * Loop-guard primitives (SPEC §10). The sync engine must never re-pull its OWN + * write as if it were a remote edit: after a push, the next poll will see the + * page it just wrote with a fresh `updatedAt`. To suppress that, we key on two + * signals — the body HASH of what we pushed (this module) and the `updatedAt` + * returned by the write — recorded per page at push time. + * + * This module owns the PURE, deterministic body-hash. The CONSUMPTION on the + * pull side (comparing an incoming page's body hash against the last pushed hash + * to decide "this is our own write, ignore it") is a future increment — here we + * only PRODUCE the hash and the per-page push record (see `src/push.ts`). + */ +import { createHash } from "node:crypto"; +/** + * Stable hash of a page's markdown BODY (SPEC §10 "хэш тела"). Deterministic: + * the same input string always yields the same digest, a different input a + * different one. Used to recognize our own write later (loop suppression). + * + * We hash the body STRING as-is (UTF-8) with SHA-256 and return lowercase hex. + * SPEC §10 keys on the body hash rather than file bytes; callers decide WHAT + * counts as "the body" (here it is the exact string passed in — typically the + * self-contained markdown that was pushed). No normalization is applied: the + * caller is responsible for passing a canonical/stable representation if it + * wants hash equality across cosmetic-only differences. 
+ */ +export function bodyHash(markdownBody) { + return createHash("sha256").update(markdownBody, "utf8").digest("hex"); +} diff --git a/packages/git-sync/build/engine/pull.d.ts b/packages/git-sync/build/engine/pull.d.ts new file mode 100644 index 00000000..f6f7cbd4 --- /dev/null +++ b/packages/git-sync/build/engine/pull.d.ts @@ -0,0 +1,136 @@ +import type { GitSyncClient } from "./client.types.js"; +import { type PageNode } from "./layout.js"; +import { VaultGit } from "./git.js"; +import { type MovedEntry, type DeletionDecision } from "./reconcile.js"; +/** + * Injectable IO for `readExisting` (R-Pull-1, test-strategy report §5). The real + * `main` wires these to `git.listTrackedFiles("*.md")` and an `fs.readFile` + * rooted at the vault; tests pass fakes so the parsing/skip rules are unit- + * testable without a real git repo or filesystem. + */ +export interface ReadExistingDeps { + /** List tracked .md paths (forward-slash, vault-relative). */ + listTracked: () => Promise; + /** Read a tracked file's text by its (forward-slash) vault-relative path. */ + readFile: (relPath: string) => Promise; +} +/** + * Read every tracked .md file in the vault and recover `{ pageId, relPath }` from + * its `gitmost_id` frontmatter (native-Obsidian format). Files without a + * `gitmost_id` are skipped (they are not engine-tracked pages yet — e.g. a stray + * hand-written Obsidian file; PUSH adopts those separately). + * + * The IO is injected (R-Pull-1) so this is testable with fakes. Skip rules: + * - a `readFile` rejection (tracked but missing on disk, a mid-operation race) + * -> skipped, NOT thrown; the next pull converges; + * - no `gitmost_id` frontmatter (`parsePageFile` -> id null) -> skipped. + */ +export declare function readExisting(deps: ReadExistingDeps): Promise<{ + pageId: string; + relPath: string; +}[]>; +/** + * Input to the PURE `computePullActions` (R-Pull-2). 
All data, no IO: the live + * tree nodes + completeness flag (from `listSpaceTree`) and the parsed + * `existing` tracked files (from `readExisting`). + */ +export interface PullActionsInput { + /** Live page nodes for the space (from `listSpaceTree`). */ + pages: PageNode[]; + /** Whether the live tree fetch was COMPLETE (SPEC §8 suppression). */ + treeComplete: boolean; + /** Parsed tracked files: `{ pageId, relPath }` (from `readExisting`). */ + existing: { + pageId: string; + relPath: string; + }[]; +} +/** + * The PURE decisions object computed by `computePullActions` (no IO). It holds + * the reconciliation plan plus the SPEC §8 absence-deletion decision, with the + * suppression already folded in: `toDelete` is the POST-suppression set the + * caller should actually remove (empty when `deletionDecision.apply` is false). + */ +export interface PullActions { + /** Pages to (re)write at their relPath (add + update + move target). */ + toWrite: { + pageId: string; + relPath: string; + }[]; + /** Moves: write new path, then remove old path (only on a successful write). */ + moved: MovedEntry[]; + /** + * Absence-based paths to delete AFTER suppression. Empty when the decision + * suppressed deletions this cycle, so the caller can apply it unconditionally. + */ + toDelete: string[]; + /** Why absence deletions were (or were not) applied (for logging + tests). */ + deletionDecision: DeletionDecision; + /** Tracked-file count (for the suppression log messages). */ + existingCount: number; + /** Planned absence-delete count BEFORE suppression (for the log message). */ + plannedDeleteCount: number; +} +/** + * PURE pull-action planner (R-Pull-2, test-strategy report §5). 
Takes the live + * tree nodes + completeness + existing tracked files and returns the full set of + * decisions with NO IO: + * + * - builds the vault layout (deterministic relPath per live page), + * - `planReconciliation` -> toWrite / moved / absence-toDelete, + * - `decideAbsenceDeletions` -> the SPEC §8 suppression (incomplete-fetch + + * empty-live + mass-delete guard), folded IN here so `toDelete` is the + * POST-suppression set (empty when suppressed). + * + * Moves are NOT governed by the suppression: a moved page is present in `live`, + * so its old-path removal is real (the caller still gates it on the write + * succeeding). The expensive content fetch / file write / git ops happen in the + * thin `applyPullActions`. + */ +export declare function computePullActions(input: PullActionsInput): PullActions; +/** + * Injectable IO for `applyPullActions` (R-Pull-2). The real `main` wires these + * to the live client, the vault git wrapper, and `node:fs/promises`; tests pass + * fakes that RECORD calls so the ordering + the move-on-success data-loss guard + * are testable without real git/fs/network. + */ +export interface ApplyPullActionsDeps { + client: Pick; + git: Pick; + /** Write a file by ABSOLUTE path (mkdir of the parent is done internally). */ + writeFile: (absPath: string, text: string) => Promise; + /** Recursive mkdir of an ABSOLUTE directory path. */ + mkdir: (absDir: string) => Promise; + /** Remove a file by ABSOLUTE path (force: a missing file is a no-op). */ + rm: (absPath: string) => Promise; +} +/** Outcome counters from `applyPullActions` (for the summary + tests). */ +export interface ApplyResult { + written: number; + movedApplied: number; + deleted: number; + failed: number; + committed: boolean; + merge: { + ok: boolean; + conflict: boolean; + output: string; + }; +} +/** + * THIN IO applier (R-Pull-2). Performs the side effects in the EXACT current + * order, with all the original safety guards preserved bit-for-bit: + * + * 1. 
for each `toWrite`: fetch content (`client.getPageJson`) -> stabilize + * (normalize-on-write fixpoint, SPEC §11) -> mkdir + write. One bad page + * never aborts the pull (bounded-concurrency pool, fault-tolerant). + * 2. apply MOVE old-path removals — ONLY when the planner marked the old path + * removable AND the new-path write SUCCEEDED (the ⭐ data-loss guard: a + * failed move-write keeps the old path so the page never vanishes). + * 3. apply (post-suppression) absence deletes. + * 4. stageAll + commit on `docmost` (subject from ACTUAL written/deleted + * counts) + checkout main + merge docmost (conflicts surfaced, SPEC §9). + * + * `vaultRoot` roots the relPath -> absolute-path conversion for the fs deps. + */ +export declare function applyPullActions(deps: ApplyPullActionsDeps, actions: PullActions, vaultRoot: string): Promise; diff --git a/packages/git-sync/build/engine/pull.js b/packages/git-sync/build/engine/pull.js new file mode 100644 index 00000000..22b008bd --- /dev/null +++ b/packages/git-sync/build/engine/pull.js @@ -0,0 +1,284 @@ +/** + * Pull cycle — Docmost -> vault (SPEC §6 "Docmost -> ФС", i.e. Docmost -> file system). + * + * This increment turns the read-only mirror into the git-backed pull cycle: + * + * 1. ensureRepo(vault); refuse if a merge is in progress (SPEC §9/§12); + * ensureBranch("docmost", "main") (SPEC §5 branches) + * 2. checkout docmost + * 3. fetch the live tree (listSpaceTree -> {pages, complete}) -> compute the + * desired `live` files (relPath via the pure sanitize/disambiguation layout) + * 4. parse `existing` tracked .md files (pageId + relPath from gitmost_id frontmatter) + * 5. plan = planReconciliation(live, existing) (pure, SPEC §5/§8); toDelete + * is absence-only, moves are separate + * 6. decideAbsenceDeletions: SUPPRESS absence deletions on an incomplete tree + * fetch (SPEC §8) and behind the mass-delete guard (defense in depth) + * 7. 
write each live page in its fixpoint form (normalize-on-write, SPEC §11); + * apply moved-old-path removals (only when the move write SUCCEEDED) and + * absence-delete removals (only when the decision allowed them) + * 8. stageAll + commit on `docmost` with the provenance trailer (SPEC §7.3) + * 9. checkout main + merge docmost (conflicts are surfaced, NOT auto-resolved, + * SPEC §9); push is deferred (SPEC §7) + * 10. one-line summary + * + * DIRECTION IS Docmost -> vault ONLY. Nothing here ever writes to Docmost + * (read-only: listSpaceTree + getPageJson). All git operations run against + * the vault repo (`cwd = vaultPath`), never the source repo (see ./git.ts). + * + * The client seam is the native `GitSyncClient` (`Pick`); + * the gitmost server drives the engine in-process (there is no standalone CLI + * entry point). + */ +import { dirname } from "node:path"; +import { sep } from "node:path"; +import { parsePageFile, serializePageFile } from "../lib/page-file.js"; +import { buildVaultLayout } from "./layout.js"; +import { BOT_AUTHOR_NAME, BOT_AUTHOR_EMAIL, DEFAULT_BRANCH, } from "./git.js"; +import { planReconciliation, decideAbsenceDeletions, } from "./reconcile.js"; +import { stabilizePageBody } from "./stabilize.js"; +// Engine-only mirror branch (SPEC §5): the engine writes here, humans never do. +const DOCMOST_BRANCH = "docmost"; +// Machine-readable provenance the loop-guard keys on (SPEC §7.3 / §12). +const SOURCE_TRAILER = "Docmost-Sync-Source: docmost"; +// Number of pages fetched/stabilized concurrently. Bounded so a large space +// does not open thousands of simultaneous requests/conversions at once. +const CONCURRENCY = 6; +// How often to log incremental progress (every N completed pages). +const PROGRESS_EVERY = 25; +/** Convert a vault-relative path (forward-slash) to an FS path under `vaultRoot` by re-joining its segments with "/" (plain string join, no normalization; the result is absolute iff `vaultRoot` is). 
*/ +function relToAbs(vaultRoot, relPath) { + return [vaultRoot, ...relPath.split("/")].join("/"); +} +/** Build a vault-relative (forward-slash) path from a layout entry: the folder `segments` followed by the file name made of `stem` plus the `.md` extension. */ +function segmentsToRelPath(segments, stem) { + return [...segments, `${stem}.md`].join("/"); +} +/** + * Read every tracked .md file in the vault and recover `{ pageId, relPath }` from + * its `gitmost_id` frontmatter (native-Obsidian format). Files without a + * `gitmost_id` are skipped (they are not engine-tracked pages yet — e.g. a stray + * hand-written Obsidian file; PUSH adopts those separately). + * + * The IO is injected (R-Pull-1) so this is testable with fakes. Skip rules: + * - a `readFile` rejection (tracked but missing on disk, a mid-operation race) + * -> skipped, NOT thrown; the next pull converges; + * - no `gitmost_id` frontmatter (`parsePageFile` -> id null) -> skipped. + */ +export async function readExisting(deps) { + const tracked = await deps.listTracked(); + const existing = []; + for (const relPath of tracked) { + // git ls-files always emits forward-slash paths; normalize just in case. + const rel = relPath.split(sep).join("/"); + let text; + try { + text = await deps.readFile(rel); + } + catch { + // Tracked but missing on disk (mid-operation race) — skip; the next pull + // converges. + continue; + } + const { id } = parsePageFile(text); + if (id) + existing.push({ pageId: id, relPath: rel }); + } + return existing; +} +/** + * PURE pull-action planner (R-Pull-2, test-strategy report §5). Takes the live + * tree nodes + completeness + existing tracked files and returns the full set of + * decisions with NO IO: + * + * - builds the vault layout (deterministic relPath per live page), + * - `planReconciliation` -> toWrite / moved / absence-toDelete, + * - `decideAbsenceDeletions` -> the SPEC §8 suppression (incomplete-fetch + + * empty-live + mass-delete guard), folded IN here so `toDelete` is the + * POST-suppression set (empty when suppressed). 
+ * + * Moves are NOT governed by the suppression: a moved page is present in `live`, + * so its old-path removal is real (the caller still gates it on the write + * succeeding). The expensive content fetch / file write / git ops happen in the + * thin `applyPullActions`. + */ +export function computePullActions(input) { + const { pages, treeComplete, existing } = input; + const layout = buildVaultLayout(pages); + const live = []; + for (const p of pages) { + if (!p || !p.id) + continue; + const entry = layout.get(p.id); + if (!entry) + continue; + live.push({ + pageId: p.id, + relPath: segmentsToRelPath(entry.segments, entry.stem), + }); + } + // Plan reconciliation (pure) over `live`, which omits nodes without an id or without a layout entry. `plan.toDelete` is ABSENCE-based only; + // `plan.moved` carries move old-path removals separately. + const plan = planReconciliation(live, existing); + // Decide whether the ABSENCE-based deletions may be applied this cycle + // (SPEC §8): incomplete-fetch suppression + empty-live + mass-delete guard. + // Moves are NOT governed by this. + const deletionDecision = decideAbsenceDeletions({ + treeComplete, + liveCount: live.length, + existingCount: existing.length, + deleteCount: plan.toDelete.length, + }); + return { + toWrite: plan.toWrite, + moved: plan.moved, + // Fold the suppression in: a suppressed cycle deletes nothing. + toDelete: deletionDecision.apply ? plan.toDelete : [], + deletionDecision, + existingCount: existing.length, + plannedDeleteCount: plan.toDelete.length, + }; +} +/** + * THIN IO applier (R-Pull-2). Performs the side effects in the EXACT current + * order, with all the original safety guards preserved bit-for-bit: + * + * 1. for each `toWrite`: fetch content (`client.getPageJson`) -> stabilize + * (normalize-on-write fixpoint, SPEC §11) -> mkdir + write. One bad page + * never aborts the pull (bounded-concurrency pool, fault-tolerant). + * 2. 
apply MOVE old-path removals — ONLY when the planner marked the old path + * removable AND the new-path write SUCCEEDED (the ⭐ data-loss guard: a + * failed move-write keeps the old path so the page never vanishes). + * 3. apply (post-suppression) absence deletes. + * 4. stageAll + commit on `docmost` (subject from ACTUAL written/deleted + * counts) + checkout main + merge docmost (conflicts surfaced, SPEC §9). + * + * `vaultRoot` roots the relPath -> absolute-path conversion for the fs deps. + */ +export async function applyPullActions(deps, actions, vaultRoot) { + const { client, git } = deps; + // Emit the SPEC §8 suppression warnings (preserved from the original `main`). + const decision = actions.deletionDecision; + if (!decision.apply) { + if (decision.reason === "incomplete-fetch") { + console.warn("pull: tree fetch incomplete — deletions suppressed this cycle (SPEC §8)"); + } + else if (decision.reason === "empty-live") { + console.warn(`pull: live fetch returned 0 pages but ${actions.existingCount} file(s) are ` + + `tracked — deletions suppressed this cycle (SPEC §8). Re-run when ` + + `Docmost is reachable.`); + } + else { + console.warn(`pull: plan would delete ${actions.plannedDeleteCount} of ${actions.existingCount} ` + + `tracked file(s) (mass-delete guard) — deletions suppressed this ` + + `cycle (SPEC §8). Verify the live Docmost tree, then re-run.`); + } + } + // 1. Write each live page in its fixpoint form (normalize-on-write, SPEC §11). + let written = 0; + let failed = 0; + let completed = 0; + let nextIndex = 0; + // pageIds whose write FAILED. A moved page whose new-path write failed must + // NOT have its old path removed (otherwise the page vanishes entirely). + const failedPageIds = new Set(); + const writeOne = async (w) => { + try { + const page = await client.getPageJson(w.pageId); + // Native-Obsidian format: a minimal `gitmost_id` frontmatter + the fixpoint + // markdown body. 
title/parent/space are DERIVED (filename / folder / repo), + // so nothing but the pageId is persisted as meta. + const text = serializePageFile(page.id, await stabilizePageBody(page.content)); + const abs = relToAbs(vaultRoot, w.relPath); + await deps.mkdir(dirname(abs)); + await deps.writeFile(abs, text); + written++; + } + catch (err) { + failed++; + failedPageIds.add(w.pageId); + console.error(`pull: failed page ${w.pageId}:`, err instanceof Error ? err.message : String(err)); + } + finally { + completed++; + if (completed % PROGRESS_EVERY === 0) { + console.log(`pulled ${completed}/${actions.toWrite.length}`); + } + } + }; + // Bounded-concurrency pool (dependency-free): a fixed set of runners each + // take the next index until the write list is exhausted. One bad page never + // aborts the whole pull (mirrors the fault-tolerant tree walk). The `|| 1` below still starts one runner for an empty list; it returns immediately. + const runner = async () => { + while (true) { + const i = nextIndex++; + if (i >= actions.toWrite.length) + return; + await writeOne(actions.toWrite[i]); + } + }; + await Promise.all(Array.from({ length: Math.min(CONCURRENCY, actions.toWrite.length) || 1 }, () => runner())); + // Helper: remove one vault-relative path via the injected `deps.rm` (`rm` with force:true is a no-op if the file is already gone); a failure is logged and reported as `false`, never thrown. + const removePath = async (rel, what) => { + try { + await deps.rm(relToAbs(vaultRoot, rel)); + return true; + } + catch (err) { + console.error(`pull: failed to ${what} ${rel}:`, err instanceof Error ? err.message : String(err)); + return false; + } + }; + // 2. Apply MOVE old-path removals. A moved page IS present in `live`, so its + // old path is genuinely stale — NOT subject to the incomplete-fetch + // suppression. BUT only remove the old path when (a) the planner marked it + // removable (not reused by another live page) AND (b) the new-path write + // actually SUCCEEDED — otherwise we would delete the only copy of a page + // whose move-write failed (⭐ data-loss guard). 
+ let movedApplied = 0; + for (const m of actions.moved) { + if (!m.removeOldPath) + continue; + if (failedPageIds.has(m.pageId)) { + console.warn(`pull: move write for ${m.pageId} failed — keeping old path ` + + `${m.fromRelPath} (SPEC §8)`); + continue; + } + if (await removePath(m.fromRelPath, "remove moved old path")) + movedApplied++; + } + // 3. Apply ABSENCE-based deletions — `actions.toDelete` is ALREADY the + // post-suppression set (empty when the decision suppressed them, SPEC §8). + let deleted = 0; + for (const rel of actions.toDelete) { + if (await removePath(rel, "delete")) + deleted++; + } + // 4. Stage + commit on `docmost`. `stageAll` and `commit` are ALWAYS called here; whether a commit was actually made is whatever `git.commit` returns, surfaced as `committed` (presumably false when nothing changed — confirm in ./git.ts). + // Deterministic stabilized output means unchanged pages produce identical + // bytes -> git sees no diff -> no churn (SPEC §11). The subject reflects the + // ACTUAL work applied (pages written + files deleted), not the planned size, + // so a run with failures does not over-report (SPEC §5 nit). + const subject = deleted > 0 + ? `docmost: sync ${written} page(s), ${deleted} deleted` + : `docmost: sync ${written} page(s)`; + await git.stageAll(); + const committed = await git.commit(subject, { + authorName: BOT_AUTHOR_NAME, + authorEmail: BOT_AUTHOR_EMAIL, + trailers: [SOURCE_TRAILER], + }); + // Merge docmost -> main. Conflicts are surfaced and left in git (SPEC §9); + // we never push to Docmost. Push to a git remote is deferred (SPEC §7). + await git.checkout(DEFAULT_BRANCH); + const merge = await git.merge(DOCMOST_BRANCH); + if (merge.conflict) { + console.error("pull: merge of docmost -> main CONFLICTED. Conflict markers were left " + + "in the vault for manual resolution (SPEC §9). Nothing is pushed to " + + "Docmost (read-only). 
Resolve locally, then re-run."); + } + else if (!merge.ok) { + console.error(`pull: merge of docmost -> main failed: ${merge.output}`); + } + console.log("pull: git push to remote is DEFERRED in this increment (SPEC §7)."); + return { written, movedApplied, deleted, failed, committed, merge }; +} diff --git a/packages/git-sync/build/engine/push.d.ts b/packages/git-sync/build/engine/push.d.ts new file mode 100644 index 00000000..c72d37a5 --- /dev/null +++ b/packages/git-sync/build/engine/push.d.ts @@ -0,0 +1,504 @@ +/** + * Push cycle — vault -> Docmost (SPEC §6 "ФС → Docmost", i.e. file system -> Docmost), FIRST increment. + * + * This module mirrors the structure of `./pull.ts`: a set of VaultGit diff/ref + * primitives (in `./git.ts`), a PURE planner (`computePushActions`) that turns + * a git diff into a classified action set with NO IO, and a THIN injectable + * applier (`applyPushActions`) exercised in tests via fakes only. + * + * Direction is vault -> Docmost. The diff is `main` against + * `refs/docmost/last-pushed` (SPEC §6 step 2); each `A`/`M`/`D`/`R` row is + * translated into a Docmost mutation by `pageId` identity (SPEC §4): + * - A without pageId -> create_page (then write the assigned pageId back). + * - A with pageId -> update (restored/copied file; the page already exists). + * - M -> update content (collab/Yjs path, SPEC §2/§15.6). + * - D -> delete_page (pageId recovered from the PRE-IMAGE meta). + * - R -> rename/move (CLASSIFIED here, APPLIED in push #3). + * + * MOVE/RENAME APPLY (push #3) — DONE here. `classifyRenameMoves` (PURE) resolves + * each `renamesMoves` entry into the Docmost op(s) it needs, comparing the PATH- + * derived parent (SPEC §5: the file path is the source of truth for tree + * position, NOT stale `meta.parentPageId`) and the meta title; `applyPushActions` + * then calls `move_page` / `rename_page` (both for a reparent+retitle), or + * records a NO-OP for a cosmetic local-only file-path rename. 
+ * + * The client seam is the native `GitSyncClient` (`Pick`); + * the gitmost server drives the engine in-process (there is no standalone CLI + * entry point). + */ +import { type DocmostMdMeta } from "../lib/index.js"; +import type { GitSyncClient } from "./client.types.js"; +import type { DiffEntry } from "./git.js"; +import { VaultGit } from "./git.js"; +import { type Settings } from "./settings.js"; +export type { DiffEntry } from "./git.js"; +/** A page to CREATE in Docmost (new local file, meta has no pageId yet). */ +export interface CreateAction { + /** Vault-relative path of the new file. */ + path: string; +} +/** A page whose CONTENT changed (meta carries the existing pageId). */ +export interface UpdateAction { + pageId: string; + /** Vault-relative path of the changed file. */ + path: string; +} +/** A page to soft-delete in Docmost (Trash, SPEC §8). */ +export interface DeleteAction { + pageId: string; +} +/** A renamed/moved page (same pageId, new path). The planner only records it; resolving it into move / rename / no-op is done by `classifyRenameMoves`. */ +export interface RenameMoveAction { + pageId: string; + oldPath: string; + newPath: string; +} +/** + * A CLASSIFIED rename/move (push #3): a `RenameMoveAction` resolved into the + * Docmost op(s) it actually needs. The file PATH is the source of truth for tree + * position (SPEC §5: "истина связи — pageId, не путь", i.e. "the truth of the link is the pageId, not the path" — the path is COSMETIC and + * LOCAL, the page identity is its pageId), so we compare the RESOLVED parent of + * the new path against the resolved parent of the old path, and the title in the + * current meta against the title in the previous meta. Each sub-op is emitted + * ONLY when something real changed: + * - `move` — the resolved parent page changed (reparent in Docmost). A `null` + * `parentPageId` means the new parent is ROOT (the file sits at the space + * root, no enclosing folder). + * - `rename` — the page title changed (a pure title edit in Docmost). + * - `noop` — neither changed: a purely LOCAL file-path rename (same parent, + * same title). 
The page identity is its pageId, so Docmost is NOT called. + * `move` and `rename` are independent and may BOTH be present (reparent + retitle). + */ +export interface RenameMoveActionClassified { + pageId: string; + oldPath: string; + newPath: string; + /** Present iff the resolved parent changed -> `move_page` (reparent); a `null` `parentPageId` means the new parent is ROOT. */ + move?: { + parentPageId: string | null; + }; + /** Present iff the title changed -> `rename_page` (title-only); the classifier emits it only for a NON-EMPTY new title. */ + rename?: { + title: string; + }; + /** True iff neither parent nor title changed (cosmetic local-only rename). */ + noop?: true; +} +/** + * Injected resolvers for the PURE `classifyRenameMoves` (push #3). Both are PURE + * given a path + side; the real `main` (a follow-up) wires them to the file tree + * (`readFile` for `current`, `git.showFileAtRef` for `prev`), tests pass plain + * lookups. SPEC §5 path-as-truth: + * - `metaAt`: the file's synthetic native meta at that side (title from the + * filename, pageId from the `gitmost_id` frontmatter). + * - `resolveParentPageId`: the pageId of the page whose FILE is the parent + * FOLDER's `.md` (one level up from the given path), or `null` for ROOT. + */ +export interface ClassifyRenameMovesDeps { + metaAt: (path: string, side: MetaSide) => DocmostMdMeta | null; + resolveParentPageId: (path: string, side: MetaSide) => string | null; +} +/** + * PURE classifier for the `renamesMoves` produced by `computePushActions` + * (push #3, SPEC §5/§6/§8). Resolves each `{pageId, oldPath, newPath}` into the + * Docmost op(s) it needs, with NO IO (both resolvers are injected). + * + * SPEC §5 — the file PATH is the source of truth for tree position, NOT the + * (possibly stale) `meta.parentPageId`. So the NEW parent is resolved from + * `newPath`'s enclosing folder, and the OLD parent from `oldPath`'s enclosing + * folder, via `deps.resolveParentPageId`. The title comes from the meta. 
+ * + * For each entry: + * - `newParent = resolveParentPageId(newPath, 'current')`, + * `oldParent = resolveParentPageId(oldPath, 'prev')`. + * - `newTitle = metaAt(newPath,'current')?.title`, + * `oldTitle = metaAt(oldPath,'prev')?.title`. + * - include `move` iff `newParent !== oldParent` (a real reparent), + * - include `rename` iff `newTitle` is a NON-EMPTY string AND differs from + * `oldTitle` (a real title edit; an empty/absent new title is never a rename), + * - if NEITHER applies -> `noop: true` (a cosmetic local-only file-path rename; + * the page is its pageId, so Docmost is not touched). + */ +export declare function classifyRenameMoves(renamesMoves: RenameMoveAction[], deps: ClassifyRenameMovesDeps): RenameMoveActionClassified[]; +/** The classified set of push actions (PURE output of `computePushActions`). */ +export interface PushActions { + creates: CreateAction[]; + updates: UpdateAction[]; + deletes: DeleteAction[]; + renamesMoves: RenameMoveAction[]; + /** + * Diff rows that could NOT be classified into an action, with a reason — e.g. + * a deleted file whose PRE-IMAGE meta carried no recoverable pageId (the + * untracked-file guard, SPEC §8: only files that were tracked with a pageId + * are deleted in Docmost). Carried so the caller can log them. + */ + skipped: { + path: string; + status: DiffEntry["status"]; + reason: string; + }[]; +} +/** + * Which tree a `metaAt` lookup reads the file's native meta from: + * - `current`: the current `main` tree (the live file content) — used for + * A/M/R, where the file still exists. + * - `prev`: the last-pushed PRE-IMAGE (e.g. the file as stored at `refs/docmost/last-pushed`) + * — used for D, where the file is gone from `main` but its pageId must be + * recovered from the version Docmost last knew (SPEC §6/§8); `classifyRenameMoves` also reads it for the OLD side of a rename/move. + */ +export type MetaSide = "current" | "prev"; +/** Input to the PURE planner. `metaAt` is injected (no IO inside the planner). 
*/ +export interface PushActionsInput { + /** Diff rows of `main` vs `refs/docmost/last-pushed` (SPEC §6 step 2). */ + changes: DiffEntry[]; + /** + * Resolve a file's synthetic native meta at a given side, or `null` if the file is + * absent there / has no parseable meta. PURE injection: the real `main` reads + * the working tree (current) or `git show REF:PATH` at the last-pushed ref (prev); tests + * pass a plain lookup. + */ + metaAt: (path: string, side: MetaSide) => DocmostMdMeta | null; + /** + * The pageIds present at ANY path in the current `main` tree (optional). When + * given, a deleted file whose pageId still lives somewhere in the tree is NOT + * a deletion but a MOVE — guards against trashing a live page when a layout + * reshuffle relocated its file (possibly across two cycles, so the matching + * add isn't in THIS diff). When omitted, only the in-diff D+A/M coalescing + * applies. + */ + currentPageIds?: Set; +} +/** + * PURE push planner (SPEC §4/§6/§8). Classifies each diff row into a Docmost + * action by `pageId` identity, with NO IO (the `metaAt` resolver is injected). + * + * Classification rules: + * - `A` (added): + * - current meta HAS a pageId -> UPDATE (a restored/copied file whose + * page already exists; we push its content rather than create a dup). + * - current meta has NO pageId but HAS a non-empty spaceId -> CREATE (a + * brand-new local file; the page does not exist in Docmost yet). + * - current meta has NO pageId and NO usable spaceId -> SKIP with reason + * `create-without-spaceId`: Docmost `create_page` REQUIRES a spaceId + * (§16), and a new local file may carry only partial human meta. We + * refuse to create rather than guess a space (SPEC §8 guard spirit). + * - `M` (modified): current meta has a pageId -> UPDATE content. (If a modified + * file somehow lost its pageId it is skipped — there is nothing to target.) + * - `D` (deleted): recover the pageId from the PRE-IMAGE meta (`metaAt(path, + * 'prev')`) -> DELETE. 
If no pageId can be recovered, SKIP with a reason + * (untracked-file guard, SPEC §8: never delete an untracked page). + * - `R` (renamed/moved): same pageId (from current meta), path changed -> + * RENAME/MOVE. Resolution of move-vs-rename + the new parentPageId is + * DEFERRED to the next increment; here we only record oldPath/newPath/ + * pageId. If the renamed file has no recoverable pageId it is SKIPPED. + * (`C` copy is treated the same as `R` for recording purposes.) + */ +export declare function computePushActions(input: PushActionsInput): PushActions; +/** The marker the push direction advances after a successful push (SPEC §5/§6). */ +export declare const LAST_PUSHED_REF = "refs/docmost/last-pushed"; +/** + * The mirror branch fast-forwarded after a clean push (SPEC §5/§6 step 3). It + * reflects "what Docmost currently contains"; advancing it to the pushed `main` + * commit closes the loop so the next pull diffs empty for the pushed pages. + */ +export declare const DOCMOST_BRANCH = "docmost"; +/** + * Injectable IO for `applyPushActions`. The real `main` (NEXT increment) wires + * these to the live client, `node:fs/promises`, and the vault git wrapper; this + * increment drives them only through FAKES in tests (no live destructive run). + * - `client`: the create/update/delete/move/rename subset of `GitSyncClient`. + * - `readFile`/`writeFile`: read a changed file's body / write a file back + * (by vault-relative path; the applier does not resolve absolute paths so + * fakes stay trivial). + * - `git`: `updateRef` (advance `refs/docmost/last-pushed`) and + * `fastForwardBranch` (advance the `docmost` mirror after a clean push, the + * loop-close — SPEC §6 step 3 / §10), plus `showFileAtRef` (read a file's text at a ref, for the move/rename classifier). + */ +export interface ApplyPushDeps { + client: Pick; + /** Read a changed file's full text by its vault-relative path. */ + readFile: (path: string) => Promise; + /** Write a file's full text by its vault-relative path. 
*/ + writeFile: (path: string, text: string) => Promise; + /** + * The Docmost spaceId this vault mirrors. A CREATE targets this space (the + * native file carries no spaceId — every file in the vault belongs to it), and + * it backs the synthetic native meta the classifier reads. + */ + spaceId: string; + /** + * `updateRef` advances `refs/docmost/last-pushed`; `fastForwardBranch` advances + * the `docmost` mirror after a clean push. `showFileAtRef` reads a file's text + * at a ref (used by the move/rename classifier to resolve the PREVIOUS parent + * folder's `.md` at `refs/docmost/last-pushed`, SPEC §5 path-as-truth). + */ + git: Pick; +} +/** A file whose meta was rewritten with a freshly-assigned pageId (post-create). */ +export interface WrittenBackPage { + path: string; + pageId: string; +} +/** + * The per-page push record consulted by a FUTURE poll-suppression (SPEC §10): a + * pulled page whose body hash + `updatedAt` match a record here is OUR OWN write + * and must not be re-pulled. PRODUCED here; CONSUMED on the pull side later. + */ +export interface PushedPageRecord { + /** The Docmost pageId that was updated/created. */ + pageId: string; + /** + * The `updatedAt` from the create/update client result, when the result + * exposed one. Absent when the (fake) client did not return it. + */ + updatedAt?: string; + /** Stable hash of the markdown BODY that was pushed (SPEC §10 "хэш тела", i.e. "body hash"). */ + bodyHash: string; +} +/** + * One page whose operation FAILED during apply (SPEC §12 resumability). The bad + * page is isolated — recorded here — and the rest of the batch still runs; the + * refs are NOT advanced when there is any failure, so a re-run retries cleanly. + */ +export interface PushFailure { + kind: "update" | "create" | "delete" | "move" | "rename"; + /** The pageId for update/delete/move/rename; absent for a never-id'd create. */ + pageId?: string; + /** The vault-relative path for create/update/move/rename; absent for delete. 
*/ + path?: string; + /** The error message captured from the thrown error. */ + error: string; +} +/** + * A rename/move action that resolved to a NO-OP (push #3, SPEC §5): a purely + * LOCAL file-path rename whose resolved parent AND title are both unchanged. The + * page identity is its pageId and the path is COSMETIC/local-only, so Docmost is + * NOT called — the skip is recorded here (with the reason) for logging. + */ +export interface PushNoop { + pageId: string; + oldPath: string; + newPath: string; + /** Why no Docmost op was emitted (currently always a path-only rename). */ + reason: "path-only-rename"; +} +/** Structured outcome of `applyPushActions` (counts + write-backs + noops + failures + ref-advance results). `created` / `updated` / `deleted` count the create / update / delete actions applied (presumably successful ones only, since failed pages go to `failures` — confirm in push.ts). */ +export interface ApplyPushResult { + created: number; + updated: number; + deleted: number; + /** Pages reparented in Docmost via `move_page` (push #3, SPEC §5/§16). */ + moved: number; + /** Pages retitled in Docmost via `rename_page` (push #3, SPEC §5/§6). */ + renamed: number; + /** + * Files whose `gitmost_id` frontmatter was written with the pageId Docmost assigned on + * create — these now need a FOLLOW-UP commit (the meta on disk changed). The + * commit itself is the caller's job (NEXT increment); recorded here so it is + * not lost. + */ + writtenBack: WrittenBackPage[]; + /** + * Per-page push records (pageId + optional `updatedAt` + body hash) for every + * page successfully updated/created — the §10 loop-guard data a future + * poll-suppression (pull side) will consult so it does not re-pull our own + * write. Deletes are not included (no body was pushed). + */ + pushed: PushedPageRecord[]; + /** + * Pages whose operation threw — isolated and recorded, the batch continued + * (SPEC §12). Non-empty here means the refs were NOT advanced. + */ + failures: PushFailure[]; + /** + * Rename/move actions that resolved to a NO-OP — a purely LOCAL file-path + * rename (same parent, same title). 
NO Docmost call was made for these (SPEC + * §5: the page is its pageId, the path is local-only). Recorded for logging. + */ + noops: PushNoop[]; + /** Diff rows the planner could not classify (carried through for logging). */ + skipped: PushActions["skipped"]; + /** Whether `refs/docmost/last-pushed` was advanced (only on a CLEAN push). */ + lastPushedAdvanced: boolean; + /** + * Result of fast-forwarding the `docmost` mirror branch after a CLEAN push + * (the loop-close, SPEC §6 step 3 / §10). `null` when no advance was attempted + * (no `pushedCommit`, or there were failures). `{ ok:false, reason }` when a + * non-fast-forward was REFUSED (divergent `docmost` history is never clobbered). + */ + docmostFastForward: { + ok: boolean; + reason?: string; + } | null; +} +/** + * THIN IO applier for the COMMON push cases (create/update/delete). Exercised + * via FAKES only in this increment — there is no live wiring. + * + * - UPDATE: read the file body, then `client.importPageMarkdown(pageId, body)`. + * This is the collab/Yjs write path (SPEC §2/§15.6) — NEVER a raw jsonb + * overwrite. The full self-contained markdown (meta + body) is sent as-is; + * `importPageMarkdown` parses the meta/body itself. + * - CREATE: derive title/spaceId/parentPageId from the file's current meta, + * `client.createPage(...)`, take the assigned pageId from the result, and + * write it BACK as the file's `gitmost_id` frontmatter (re-serialized via + * `serializePageFile`, body preserved) so the file becomes + * tracked. The write-back is recorded in `writtenBack` (a follow-up commit + * is needed — NEXT increment). + * - DELETE: `client.deletePage(pageId)` — soft-delete to Trash (SPEC §8). 
+ * - RENAME/MOVE (push #3, SPEC §5/§6/§16): classify each `renamesMoves` entry + * with `classifyRenameMoves` (resolvers read the parent FOLDER's `.md` for + * the parent pageId — path-as-truth — and the meta for the title), then: + * - `move` -> `client.movePage(pageId, parentPageId, position?)` (reparent; + * `position` is UNDEFINED for now — the client supplies a default), + * - `rename` -> `client.renamePage(pageId, title)` (title-only), + * - BOTH -> move (reparent) THEN rename (title), in that order, + * - `noop` -> NO client call; recorded in `noops` (a cosmetic local-only + * file-path rename: the page is its pageId, the path is local, SPEC §5). + * + * FAIL-SAFE / per-page isolation (SPEC §12 resumability). Each page's operation + * is wrapped in its own try/catch: a single failing page is recorded in + * `failures[]` (with its kind + pageId/path + error) and the batch CONTINUES — + * one bad page must never block the rest. Crucially, the refs are advanced ONLY + * when `failures.length === 0`: a PARTIAL push must NOT advance + * `refs/docmost/last-pushed` or the `docmost` mirror, so a re-run retries the + * whole batch cleanly (the already-applied pages are idempotent re-applies). + * + * LOOP-CLOSE (SPEC §6 step 3 / §10). After a fully-successful push, when a + * `pushedCommit` is supplied: + * - advance `refs/docmost/last-pushed` to it (what of `main` is in Docmost), AND + * - fast-forward the `docmost` mirror branch to it via + * `git.fastForwardBranch('docmost', pushedCommit)` — so the mirror reflects + * what Docmost now contains and the NEXT pull diffs EMPTY for these pages + * (it does not re-pull our own write). The ff is REFUSED (not forced) if + * `docmost` is not an ancestor of the pushed commit; the result is surfaced + * in `docmostFastForward`. On ANY failure, NEITHER ref is advanced. + * + * LOOP-GUARD DATA (SPEC §10). 
For every page successfully updated/created the + * result carries a `pushed` record `{ pageId, updatedAt?, bodyHash }` — the body + * hash of what was pushed plus the write's `updatedAt` (when the client returned + * one). A future pull-side poll-suppression consults this so it does not re-pull + * our own write; producing it is in scope here, consuming it is deferred. + * + * @param pushedCommit The `main` commit just reflected into Docmost (SHA or + * commit-ish). When omitted, NEITHER ref is advanced (e.g. a dry plan). + * @returns The structured outcome of the batch (`ApplyPushResult`). + */ +export declare function applyPushActions(deps: ApplyPushDeps, actions: PushActions, pushedCommit?: string): Promise<ApplyPushResult>; +/** + * SPEC §5 path-as-truth: the file of the PARENT page for a vault-relative + * (forward-slash) path, derived from the path alone (no IO). The page that owns + * a folder is that folder's folder-note (the `.md` inside it, named after the + * folder), so: + * - `A/B/Child.md` -> `A/B/B.md` (the enclosing folder's folder-note), + * - `A/B/B.md` (itself a folder-note) -> `A/A.md` (one level up), + * - `Child.md` (no enclosing folder) or `A/A.md` (a top-level folder-note) + * -> `null` (the parent is ROOT). + */ +export declare function parentFolderFile(path: string): string | null; +/** + * Whether a vault path is a Docmost PAGE file (design §"Адопция" — "Adoption"): a `.md` file + * with NO dot-segment anywhere in its path. This excludes `.obsidian/` config, + * `.trash/`, dotfiles (`.foo.md`), and every non-`.md` file (attachments, JSON, + * …) — Obsidian owns those; they live in the vault but are never pages. Used to + * screen the PUSH diff so non-page files are never created/updated/deleted in + * Docmost (and never get a `gitmost_id` frontmatter written into them). + */ +export declare function isPageFile(path: string): boolean; +/** + * The human ("local") git identity used for engine-made commits on `main` in the + * push direction (SPEC §7.3). The provenance is carried by the trailer (below), + * which the loop-guard keys on; the identity is for history readability only.
+ * When the vault repo already has a configured `user.name`/`user.email`, git + * uses that for the working-tree commit; this is the fallback the daemon stamps. + */ +export declare const LOCAL_AUTHOR_NAME = "Local"; +export declare const LOCAL_AUTHOR_EMAIL = "local@local"; +/** The provenance trailer marking a `main`-side (human/local) commit (SPEC §7.3). */ +export declare const LOCAL_SOURCE_TRAILER = "Docmost-Sync-Source: local"; +/** + * Injectable deps for `runPush` (mirrors `pull.ts`'s wiring; everything that + * touches the outside world is here so tests pass fakes). `makeClient` is a + * FACTORY, not a client — a dry-run must build NO client at all (it is never + * called), and only `--apply` invokes it. + */ +export interface PushDeps { + settings: Settings; + /** NOTE(review): the type arguments of `Pick` are missing here (they look stripped); restore them from the source `push.ts`. */ + git: Pick; + /** Build a real client — called ONLY on `--apply`, never on dry-run. */ + makeClient: (settings: Settings) => ApplyPushDeps["client"]; + /** Read a file's full text by its vault-relative (forward-slash) path. */ + readFile: (path: string) => Promise<string>; + /** Write a file's full text by its vault-relative path. */ + writeFile: (path: string, text: string) => Promise<void>; + /** Structured logger (defaults to console in `main`; a recorder in tests). */ + log: (line: string) => void; +} +/** The structured outcome of a `runPush` cycle (returned + summarized). */ +export interface PushRunResult { + /** Which path ran: `dry-run` (plan only) or `apply` (Docmost mutated). */ + mode: "dry-run" | "apply"; + /** Why the cycle stopped before planning, if it did (e.g. a left-over merge). */ + aborted?: "merge-in-progress"; + /** The diff base the plan was computed against (`last-pushed` else `docmost`). */ + base?: { + ref: string; + source: "last-pushed" | "docmost"; + sha: string | null; + }; + /** The `main` commit the plan targets (the would-be pushed commit). */ + pushedCommit?: string; + /** Planned action counts from the PURE planner (present once a plan was built).
*/ + planned?: { + creates: number; + updates: number; + deletes: number; + renamesMoves: number; + skipped: number; + }; + /** The applier's structured result — ONLY present on the `--apply` path. */ + applied?: ApplyPushResult; + /** + * True when `applyPushActions` REFUSED to fast-forward a divergent `docmost` + * mirror (SPEC §5 invariant broken). Escalated (logged prominently) and folded + * into the CLI's non-zero exit. + */ + divergentDocmost?: boolean; + /** Per-page failures from the applier (empty/absent on a clean run). */ + failures?: PushFailure[]; +} +/** + * Run one FS->Docmost push cycle (SPEC §6 "ФС → Docmost" — "FS -> Docmost"), DRY-RUN BY DEFAULT. + * + * Steps (mirrors `pull.ts`): + * 1. Preflight git: `assertGitAvailable` + `ensureRepo`; ABORT (clear message + + * non-zero-ish result) if a merge is in progress — never push on top of an + * unresolved conflict (SPEC §9/§12). Conflict markers must NEVER reach + * Docmost (SPEC §9). + * 2. Checkout `main` (the human-facing branch the push reads from). + * 3. Commit the human's pending working-tree changes on `main` with the + * `local` provenance trailer (SPEC §7.3). A no-op when nothing changed. + * 4. Pick the diff BASE: `refs/docmost/last-pushed` if it resolves, else the + * `docmost` mirror branch (what Docmost currently has). Resolve `main`. + * 5. `diffNameStatus(base, main)` -> changes; build the `metaAt(path, side)` + * resolver (current = working tree, prev = `git show <base>:<path>`); run + * the PURE `computePushActions`. + * 6. DRY-RUN (default): LOG the full plan and RETURN — NO client, NO Docmost + * calls, NO ref advance. + * 7.
`--apply`: build the client, run `applyPushActions(..., pushedCommit=main)`, + * then (a) if any pageIds were written back (creates), commit them on `main` + * with the `local` trailer and RE-advance `refs/docmost/last-pushed` to the + * new commit so the recorded pageIds are persisted in what Docmost mirrors; + * (b) ESCALATE a divergent-`docmost` ff refusal (SPEC §5) with a prominent + * WARNING and a non-zero-ish flag. Then log a one-line summary. + * + * @returns The structured outcome of the cycle (`PushRunResult`). + */ +export declare function runPush(deps: PushDeps, opts: { + dryRun: boolean; +}): Promise<PushRunResult>; +/** Parsed `push` CLI flags. DRY-RUN is the default; `--apply` opts into writes. */ +export interface PushParsedArgs { + /** True when `--apply` was passed (the ONLY path that writes to Docmost). */ + apply: boolean; +} +/** + * Parse the `push` CLI flags. SAFE BY DEFAULT: without `--apply` the run is a + * DRY-RUN (plan only). Exported so the flag handling is unit-testable. + */ +export declare function parseArgs(argv: string[]): PushParsedArgs; diff --git a/packages/git-sync/build/engine/push.js b/packages/git-sync/build/engine/push.js new file mode 100644 index 00000000..841fb105 --- /dev/null +++ b/packages/git-sync/build/engine/push.js @@ -0,0 +1,971 @@ +import { parsePageFile, serializePageFile } from "../lib/page-file.js"; +import { DEFAULT_BRANCH } from "./git.js"; +import { bodyHash } from "./loop-guard.js"; +/** + * PURE classifier for the `renamesMoves` produced by `computePushActions` + * (push #3, SPEC §5/§6/§8). Resolves each `{pageId, oldPath, newPath}` into the + * Docmost op(s) it needs, with NO IO (both resolvers are injected). + * + * SPEC §5 — the file PATH is the source of truth for tree position, NOT the + * (possibly stale) `meta.parentPageId`. So the NEW parent is resolved from + * `newPath`'s enclosing folder, and the OLD parent from `oldPath`'s enclosing + * folder, via `deps.resolveParentPageId`. The title comes from the meta.
+ * + * For each entry: + * - `newParent = resolveParentPageId(newPath, 'current')`, + * `oldParent = resolveParentPageId(oldPath, 'prev')`. + * - `newTitle = metaAt(newPath,'current')?.title`, + * `oldTitle = metaAt(oldPath,'prev')?.title`. + * - include `move` iff `newParent !== oldParent` (a real reparent), + * - include `rename` iff `newTitle` is a NON-EMPTY string AND differs from + * `oldTitle` (a real title edit; an empty/absent new title is never a rename), + * - if NEITHER applies -> `noop: true` (a cosmetic local-only file-path rename; + * the page is its pageId, so Docmost is not touched). + * + * NOTE: parents are compared with strict `!==`, so the injected resolver must + * return the SAME "no parent" value on both sides (e.g. always `null`); + * otherwise `null` vs `undefined` would read as a reparent. + * + * @param renamesMoves The `{pageId, oldPath, newPath}` entries to classify. + * @param deps Injected sync resolvers: `resolveParentPageId(path, side)` and + * `metaAt(path, side)`, with `side` being `'current'` or `'prev'`. + * @returns One entry per input, in input order, carrying `pageId`/`oldPath`/ + * `newPath` plus `move`, `rename`, both, or `noop: true`. + */ +export function classifyRenameMoves(renamesMoves, deps) { + return renamesMoves.map((rm) => { + const newParent = deps.resolveParentPageId(rm.newPath, "current"); + const oldParent = deps.resolveParentPageId(rm.oldPath, "prev"); + const newTitle = deps.metaAt(rm.newPath, "current")?.title; + const oldTitle = deps.metaAt(rm.oldPath, "prev")?.title; + const out = { + pageId: rm.pageId, + oldPath: rm.oldPath, + newPath: rm.newPath, + }; + // A reparent: the new path's resolved parent page differs from the old's. + if (newParent !== oldParent) { + out.move = { parentPageId: newParent }; + } + // A title edit: only when there is a real, non-empty new title that changed. + if (typeof newTitle === "string" && + newTitle.length > 0 && + newTitle !== oldTitle) { + out.rename = { title: newTitle }; + } + // Neither changed -> a purely LOCAL file-path rename; do NOT call Docmost. + if (!out.move && !out.rename) { + out.noop = true; + } + return out; + }); +} +/** + * PURE push planner (SPEC §4/§6/§8). Classifies each diff row into a Docmost + * action by `pageId` identity, with NO IO (the `metaAt` resolver is injected). + * + * Classification rules: + * - `A` (added): + * - current meta HAS a pageId -> UPDATE (a restored/copied file whose + * page already exists; we push its content rather than create a dup).
+ * - current meta has NO pageId but HAS a non-empty spaceId -> CREATE (a + * brand-new local file; the page does not exist in Docmost yet). + * - current meta has NO pageId and NO usable spaceId -> SKIP with reason + * `create-without-spaceId`: Docmost `create_page` REQUIRES a spaceId + * (§16), and a new local file may carry only partial human meta. We + * refuse to create rather than guess a space (SPEC §8 guard spirit). + * - `M` (modified): current meta has a pageId -> UPDATE content. (If a modified + * file somehow lost its pageId it is skipped — there is nothing to target.) + * - `D` (deleted): recover the pageId from the PRE-IMAGE meta (`metaAt(path, + * 'prev')`) -> DELETE. If no pageId can be recovered, SKIP with a reason + * (untracked-file guard, SPEC §8: never delete an untracked page). + * - `R` (renamed/moved): same pageId (from current meta), path changed -> + * RENAME/MOVE. Resolution of move-vs-rename + the new parentPageId is NOT + * done here (`classifyRenameMoves` does it at apply time); here we only + * record oldPath/newPath/pageId. If the renamed file has no recoverable + * pageId it is SKIPPED. (`C` copy is treated the same as `R` for recording + * purposes.) + * + * Two guards sit on top of these rules (details inline below): + * - rows whose path is not a page file (`isPageFile`) are dropped BEFORE any + * classification; + * - a pageId that is DELETED at one path and ADDED/MODIFIED at another in the + * same diff is one relocated page: it yields a single rename/move and its + * `D` row is skipped; a `D` whose pageId is still in `currentPageIds` is + * skipped too. A moved page is never turned into a delete. + * + * @param input `{ changes, metaAt, currentPageIds? }` — the diff rows, the meta + * resolver, and (optionally) the set of pageIds present in the current tree. + * @returns `{ creates, updates, deletes, renamesMoves, skipped }`. + */ +export function computePushActions(input) { + const { metaAt, currentPageIds } = input; + // PAGE-FILE FILTER (design §"Адопция" — "Adoption"): only `.md` files OUTSIDE any dot-folder + // are Docmost pages. `.obsidian/*`, attachments, and other non-page files are + // committed to the vault (no `.gitignore`) and so appear in the diff, but they + // are NEVER pages — Obsidian owns them. Without this filter every ADDED such + // file would be mis-classified as a CREATE (nativeMeta always supplies a + // spaceId, so the old `create-without-spaceId` skip no longer screens them), + // creating junk pages in Docmost and corrupting the file with a `gitmost_id` + // frontmatter. Filter BEFORE any classification so non-page A/M/D/R are ignored.
+ const changes = input.changes.filter((c) => isPageFile(c.path)); + const actions = { + creates: [], + updates: [], + deletes: [], + renamesMoves: [], + skipped: [], + }; + // GHOST-MOVE coalescing (⭐ data-loss guard). git's rename detection (`-M`) + // can miss a move when the two files are too dissimilar — which is exactly the + // case for the tiny meta-only files a layout RESHUFFLE produces (e.g. + // several untitled pages sharing the `_` fallback name; retitling one frees the + // bare `_` and another page's file relocates `_ ~slug.md` -> `_.md`). git then + // reports the move as a DELETE of the old path + an ADD of the new one. Taken + // literally that soft-deletes a page that merely MOVED — a live page vanishing + // into Trash. Identity is the pageId, not git's heuristic: a pageId that is + // BOTH deleted (pre-image) and added (current) is one page that relocated, so + // we classify it as a rename/move and NEVER as a delete. + // A pageId can land at its new path two ways: as an ADD (the path was free) or + // as a MODIFY (the path was occupied by ANOTHER page that left — the reshuffle + // case, where `_.md`'s occupant changes pageId). Both are "the page survives at + // a new path", so the surviving side is the CURRENT-meta pageId of A *and* M.
+ const deletedPath = new Map(); + const survivingPath = new Map(); + for (const change of changes) { + if (change.status === "D") { + const pid = metaAt(change.path, "prev")?.pageId; + if (pid) + deletedPath.set(pid, change.path); + } + else if (change.status === "A" || change.status === "M") { + const pid = metaAt(change.path, "current")?.pageId; + if (pid) + survivingPath.set(pid, change.path); + } + } + const ghostMove = new Map(); + for (const [pid, oldPath] of deletedPath) { + const newPath = survivingPath.get(pid); + if (newPath && newPath !== oldPath) { + ghostMove.set(pid, { oldPath, newPath }); + } + } + for (const change of changes) { + switch (change.status) { + case "A": { + const meta = metaAt(change.path, "current"); + const pageId = meta?.pageId; + if (pageId && ghostMove.has(pageId)) { + // Half of a git-undetected move (a matching DELETE exists): record it + // as a rename/move (like a real `R`), NOT an update — the `D` side is + // suppressed so the page is never soft-deleted. + actions.renamesMoves.push({ + pageId, + oldPath: ghostMove.get(pageId).oldPath, + newPath: change.path, + }); + } + else if (pageId) { + // Added but already carries a pageId (restored/copied file): the page + // exists in Docmost, so push content as an UPDATE — never a duplicate. + actions.updates.push({ pageId, path: change.path }); + } + else if (meta?.spaceId) { + // Brand-new local file with a target space -> create the page, then + // write the assigned pageId back into its meta (in `applyPushActions`). + // `meta.spaceId` is truthy here, so empty-string is also rejected. + actions.creates.push({ path: change.path }); + } + else { + // A create needs a spaceId (Docmost `create_page` requires it, §16). A + // new file with partial meta and no usable spaceId is SKIPPED rather + // than created into a guessed space (SPEC §8 guard spirit).
+ actions.skipped.push({ + path: change.path, + status: "A", + reason: "create-without-spaceId", + }); + } + break; + } + case "M": { + const meta = metaAt(change.path, "current"); + const pageId = meta?.pageId; + if (pageId && ghostMove.has(pageId)) { + // This path's occupant changed pageId: the previous page left and THIS + // page relocated here (a reshuffle). Its old file was DELETED elsewhere + // — coalesce into a rename/move so the page is never trashed. + actions.renamesMoves.push({ + pageId, + oldPath: ghostMove.get(pageId).oldPath, + newPath: change.path, + }); + } + else if (pageId) { + actions.updates.push({ pageId, path: change.path }); + } + else { + // A modified file with no pageId has no Docmost target to update. + actions.skipped.push({ + path: change.path, + status: "M", + reason: "modified file has no pageId in meta", + }); + } + break; + } + case "D": { + // The file is gone from `main`; recover its pageId from the PRE-IMAGE + // (the version last pushed to Docmost) so we delete the RIGHT page. + const prevMeta = metaAt(change.path, "prev"); + const pageId = prevMeta?.pageId; + if (pageId && ghostMove.has(pageId)) { + // The same pageId survives at a new path (re-ADDED, or MODIFIED into an + // occupied path): this is a git-undetected MOVE, handled by the `A`/`M` + // branches above. Suppress the delete so a moved page is never trashed + // (⭐ data-loss guard). + actions.skipped.push({ + path: change.path, + status: "D", + reason: "ghost-move (re-added at a new path) — not a deletion", + }); + } + else if (pageId && currentPageIds?.has(pageId)) { + // The pageId still EXISTS elsewhere in the current tree: the file moved + // (a layout reshuffle whose matching add was in an earlier cycle, so it + // is not in this diff). A live page must never be trashed because its + // FILENAME changed — identity is the pageId (⭐ data-loss guard).
+ actions.skipped.push({ + path: change.path, + status: "D", + reason: "pageId still present in the tree (moved) — not a deletion", + }); + } + else if (pageId) { + actions.deletes.push({ pageId }); + } + else { + // Untracked-file guard (SPEC §8): a file with no recoverable pageId was + // never a Docmost page — do NOT translate its removal into a delete. + actions.skipped.push({ + path: change.path, + status: "D", + reason: "deleted file has no recoverable pageId (pre-image meta)", + }); + } + break; + } + case "R": + case "C": { + // Same page, new path. Identity comes from the CURRENT (post-rename) meta + // since the file still exists. RESOLUTION (move vs rename, parentPageId) + // is deferred to `classifyRenameMoves` — record oldPath/newPath/pageId only. + // NOTE(review): for `C` the source file still exists with the same pageId, + // yet the row is recorded like a move — confirm this is intended. + const meta = metaAt(change.path, "current"); + const pageId = meta?.pageId; + const oldPath = change.oldPath ?? change.path; + if (pageId) { + actions.renamesMoves.push({ + pageId, + oldPath, + newPath: change.path, + }); + } + else { + actions.skipped.push({ + path: change.path, + status: change.status, + reason: "renamed/moved file has no pageId in meta", + }); + } + break; + } + default: { + // Unreachable for A/M/D/R/C; defensive for any future status. + actions.skipped.push({ + path: change.path, + status: change.status, + reason: `unhandled diff status ${change.status}`, + }); + } + } + } + return actions; +} +// --- thin apply (create/update/delete/rename/move) --------------------------- +/** The marker the push direction advances after a successful push (SPEC §5/§6). */ +export const LAST_PUSHED_REF = "refs/docmost/last-pushed"; +/** + * The mirror branch fast-forwarded after a clean push (SPEC §5/§6 step 3). It + * reflects "what Docmost currently contains"; advancing it to the pushed `main` + * commit closes the loop so the next pull diffs empty for the pushed pages. + */ +export const DOCMOST_BRANCH = "docmost"; +/** + * THIN IO applier for the COMMON push cases (create/update/delete).
Every + * external effect goes through the injected `deps`, so tests drive it with fakes. + * + * - UPDATE: read the file, strip the `gitmost_id` frontmatter + * (`parsePageFile(...).body`) and call + * `client.importPageMarkdown(pageId, body, baseMarkdown)`, where `baseMarkdown` + * is the body of the same file at `refs/docmost/last-pushed` (`null` when + * absent) — the common ancestor for the 3-way merge. This is the collab/Yjs + * write path (SPEC §2/§15.6) — NEVER a raw jsonb overwrite. + * - CREATE: derive the title from the filename, the parent from the enclosing + * folder's folder-note and the space from `deps.spaceId`, + * `client.createPage(...)`, take the assigned pageId from the result + * (`result.data.id`), and write it BACK as the file's `gitmost_id` + * frontmatter (re-serialized via `serializePageFile`, body preserved) so the + * file becomes tracked. The write-back is recorded in `writtenBack`; + * committing it is left to the caller. + * - DELETE: `client.deletePage(pageId)` — soft-delete to Trash (SPEC §8). + * - RENAME/MOVE (push #3, SPEC §5/§6/§16): classify each `renamesMoves` entry + * with `classifyRenameMoves` (resolvers read the parent folder's folder-note + * `.md` for the parent pageId — path-as-truth — and the meta for the title), then: + * - `move` -> `client.movePage(pageId, parentPageId, position?)` (reparent; + * `position` is UNDEFINED for now — the client supplies a default), + * - `rename` -> `client.renamePage(pageId, title)` (title-only), + * - BOTH -> move (reparent) THEN rename (title), in that order, + * - `noop` -> NO client call; recorded in `noops` (a cosmetic local-only + * file-path rename: the page is its pageId, the path is local, SPEC §5). + * + * FAIL-SAFE / per-page isolation (SPEC §12 resumability). Each page's operation + * is wrapped in its own try/catch: a single failing page is recorded in + * `failures[]` (with its kind + pageId/path + error) and the batch CONTINUES — + * one bad page must never block the rest.
Crucially, the refs are advanced ONLY + * when `failures.length === 0`: a PARTIAL push must NOT advance + * `refs/docmost/last-pushed` or the `docmost` mirror, so a re-run retries the + * whole batch cleanly (the already-applied pages are idempotent re-applies). + * + * LOOP-CLOSE (SPEC §6 step 3 / §10). After a fully-successful push, when a + * `pushedCommit` is supplied: + * - advance `refs/docmost/last-pushed` to it (what of `main` is in Docmost), AND + * - fast-forward the `docmost` mirror branch to it via + * `git.fastForwardBranch('docmost', pushedCommit)` — so the mirror reflects + * what Docmost now contains and the NEXT pull diffs EMPTY for these pages + * (it does not re-pull our own write). The ff is REFUSED (not forced) if + * `docmost` is not an ancestor of the pushed commit; the result is surfaced + * in `docmostFastForward`. On ANY failure, NEITHER ref is advanced. + * + * LOOP-GUARD DATA (SPEC §10). For every page successfully updated/created the + * result carries a `pushed` record `{ pageId, updatedAt?, bodyHash }` — the body + * hash of what was pushed plus the write's `updatedAt` (when the client returned + * one). A future pull-side poll-suppression consults this so it does not re-pull + * our own write; producing it is in scope here, consuming it is deferred. + * + * @param deps Injected effects: `client`, `git`, `readFile`, `writeFile` and the + * run's `spaceId`. + * @param actions The plan from `computePushActions` (`creates`, `updates`, + * `deletes`, `renamesMoves`, `skipped`). + * @param pushedCommit The `main` commit just reflected into Docmost (SHA or + * commit-ish). When omitted, NEITHER ref is advanced (e.g. a dry plan). + * @returns Counters (`created`/`updated`/`deleted`/`moved`/`renamed`) plus + * `writtenBack`, `pushed`, `failures`, `noops`, `skipped`, + * `lastPushedAdvanced` and `docmostFastForward`. + */ +export async function applyPushActions(deps, actions, pushedCommit) { + const { client, git } = deps; + let created = 0; + let updated = 0; + let deleted = 0; + let moved = 0; + let renamed = 0; + const writtenBack = []; + const pushed = []; + const failures = []; + const noops = []; + // 1. UPDATES — collab/Yjs write path (SPEC §2/§15.6), never a raw overwrite. + // Each update is isolated: a thrown page is recorded and the batch goes on.
+ for (const u of actions.updates) { + try { + // Push the CLEAN body only (no `gitmost_id` frontmatter): the frontmatter + // is engine metadata, never page content. The server converts the markdown + // it receives verbatim, so stripping here keeps the id out of Docmost. + const body = parsePageFile(await deps.readFile(u.path)).body; + // The last-synced version of this file (pre-image) is the common ancestor + // for a 3-way merge against the live page, so concurrent human edits are + // not clobbered (review #5). Null when the file is new at last-pushed. Its + // body is stripped the SAME way so the merge compares body-to-body. + const baseFull = await deps.git.showFileAtRef(LAST_PUSHED_REF, u.path); + const baseMarkdown = baseFull === null ? null : parsePageFile(baseFull).body; + const result = await client.importPageMarkdown(u.pageId, body, baseMarkdown); + updated++; + // §10 loop-guard data: hash the BODY we pushed + capture `updatedAt`. + pushed.push({ + pageId: u.pageId, + ...extractUpdatedAt(result), + bodyHash: bodyHash(body), + }); + } + catch (err) { + failures.push({ + kind: "update", + pageId: u.pageId, + path: u.path, + error: errMessage(err), + }); + } + } + // 2. CREATES — create the page, then write the assigned pageId back to meta so + // the file becomes tracked (SPEC §4 "записать присвоенный pageId обратно" — + // "write the assigned pageId back"). + // Isolated per page like updates. + for (const c of actions.creates) { + try { + const text = await deps.readFile(c.path); + const { body } = parsePageFile(text); + // Derive create args from the PATH (native-Obsidian, SPEC §5): title from + // the filename, parent from the enclosing folder's folder-note, space from + // the run (the vault's space). `parentPageId: null` -> created at ROOT. + // NOTE(review): the parent id is read from the parent file as it is NOW, so a + // child whose parent folder-note is itself created LATER in this loop finds + // no id and is created at ROOT — presumably `actions.creates` lists parents + // first; confirm. + const title = titleFromPath(c.path); + const parentPageId = (await resolveParentPageIdViaTree(deps, c.path, "current")) ??
undefined; + const result = await client.createPage(title, body, deps.spaceId, parentPageId); + // `createPage` returns `{ data: { id, ... }, success }`; the assigned + // pageId is at `result.data.id`. + const assignedPageId = result?.data?.id; + if (assignedPageId) { + // Write the assigned pageId back as the `gitmost_id` frontmatter, body + // preserved — the file becomes engine-tracked (SPEC §4). + const rewritten = serializePageFile(assignedPageId, body); + await deps.writeFile(c.path, rewritten); + writtenBack.push({ path: c.path, pageId: assignedPageId }); + // §10 loop-guard data for the created page (hash the pushed BODY). + pushed.push({ + pageId: assignedPageId, + ...extractUpdatedAt(result), + bodyHash: bodyHash(body), + }); + } + // NOTE(review): `created` is incremented even when the result carries no id; + // in that case nothing is written back and NO failure is recorded, so the + // file stays untracked while the refs below may still advance. + created++; + } + catch (err) { + failures.push({ kind: "create", path: c.path, error: errMessage(err) }); + } + } + // 3. DELETES — soft-delete to Trash (SPEC §8), reversible. Isolated per page. + for (const d of actions.deletes) { + try { + await client.deletePage(d.pageId); + deleted++; + } + catch (err) { + failures.push({ + kind: "delete", + pageId: d.pageId, + error: errMessage(err), + }); + } + } + // 4. RENAME/MOVE (push #3, SPEC §5/§6/§16). Classify each entry against the + // tree-backed resolvers (the NEW parent comes from the new path's enclosing + // folder `.md`, the OLD parent from the old path's at last-pushed — PATH is + // the truth, not stale `meta.parentPageId`; the title from the meta), then + // apply only the real ops. Each page is isolated like the cases above: a + // thrown op is recorded in `failures` and the batch continues. ORDER for a + // page that needs both: reparent (move) FIRST, then retitle (rename). + if (actions.renamesMoves.length > 0) { + // The classifier is PURE over sync resolvers; the tree reads are async, so + // prefetch every (path, side) lookup it will make into plain tables first.
+ const parentTable = new Map(); + const metaTable = new Map(); + // A tree read (readFile / git.showFileAtRef) throwing must isolate THAT page + // into `failures`, NOT abort the whole batch (§12 resumability). The helpers + // already swallow their own errors, but this per-entry try/catch keeps the + // batch-isolation invariant holding regardless of future changes to them. + const prefetchFailed = new Set(); + for (const rm of actions.renamesMoves) { + // newParent + newTitle from the CURRENT tree; oldParent + oldTitle from the + // last-pushed pre-image (`prev`). Keyed by `path|side` so duplicates fold. + try { + parentTable.set(`${rm.newPath}|current`, await resolveParentPageIdViaTree(deps, rm.newPath, "current")); + parentTable.set(`${rm.oldPath}|prev`, await resolveParentPageIdViaTree(deps, rm.oldPath, "prev")); + metaTable.set(`${rm.newPath}|current`, await metaAtViaTree(deps, rm.newPath, "current", deps.spaceId)); + metaTable.set(`${rm.oldPath}|prev`, await metaAtViaTree(deps, rm.oldPath, "prev", deps.spaceId)); + } + catch (err) { + prefetchFailed.add(rm.pageId); + failures.push({ + kind: "move", + pageId: rm.pageId, + path: rm.newPath, + error: errMessage(err), + }); + } + } + const classified = classifyRenameMoves(actions.renamesMoves.filter((rm) => !prefetchFailed.has(rm.pageId)), { + metaAt: (path, side) => metaTable.get(`${path}|${side}`) ?? null, + resolveParentPageId: (path, side) => parentTable.get(`${path}|${side}`) ?? null, + }); + for (const c of classified) { + if (c.noop) { + // Cosmetic local-only file-path rename — no Docmost op (SPEC §5). + noops.push({ + pageId: c.pageId, + oldPath: c.oldPath, + newPath: c.newPath, + reason: "path-only-rename", + }); + continue; + } + // Track which op is in flight so a failure is attributed to the op that + // ACTUALLY threw: for a page needing both, a move that succeeds then a + // rename that throws must be recorded as `rename`, not `move`. + let failingKind = c.move ?
"move" : "rename"; + try { + // Reparent FIRST so the page is in its new tree position, THEN retitle. + if (c.move) { + failingKind = "move"; + // TODO(next): compute a fractional-index position between siblings + // (SPEC §16). `position` is UNDEFINED here; the client supplies a valid + // default. Pass `parentPageId: null` for a move to the space ROOT. + await client.movePage(c.pageId, c.move.parentPageId); + moved++; + } + if (c.rename) { + failingKind = "rename"; + await client.renamePage(c.pageId, c.rename.title); + renamed++; + } + } + catch (err) { + // Isolate the failed page: the op that ACTUALLY threw is recorded so a + // re-run can retry. A move that threw before its rename leaves `rename` + // for the next run (idempotent re-apply); refs are NOT advanced (below). + failures.push({ + kind: failingKind, + pageId: c.pageId, + path: c.newPath, + error: errMessage(err), + }); + } + } + } + // 5. Advance the refs ONLY on a CLEAN push (no failures) AND when a pushed + // commit is supplied. A partial push must advance NEITHER ref, so a re-run + // retries the whole batch (SPEC §12). The loop-close (SPEC §6 step 3 / §10): + // advance `refs/docmost/last-pushed` AND fast-forward the `docmost` mirror, + // so Docmost's new content is mirrored and the next pull diffs empty. + let lastPushedAdvanced = false; + let docmostFastForward = null; + if (pushedCommit && failures.length === 0) { + await git.updateRef(LAST_PUSHED_REF, pushedCommit); + lastPushedAdvanced = true; + // Fast-forward the mirror (refused, not forced, on a non-fast-forward — the + // caller logs the reason). Surfaced in the result. + docmostFastForward = await git.fastForwardBranch(DOCMOST_BRANCH, pushedCommit); + } + return { + created, + updated, + deleted, + moved, + renamed, + writtenBack, + pushed, + failures, + noops, + skipped: actions.skipped, + lastPushedAdvanced, + docmostFastForward, + }; +} +/** Stringify a thrown value into a stable error message.
*/ +function errMessage(err) { + return err instanceof Error ? err.message : String(err); +} +/** + * SPEC §5 path-as-truth: the file of the PARENT page for a vault-relative + * (forward-slash) path, derived from the path alone (no IO). The page that owns + * a folder is that folder's folder-note (the `.md` inside it, named after the + * folder), so: + * - `A/B/Child.md` -> `A/B/B.md` (the enclosing folder's folder-note), + * - `A/B/B.md` (itself a folder-note) -> `A/A.md` (one level up), + * - `Child.md` (no enclosing folder) or `A/A.md` (a top-level folder-note) + * -> `null` (the parent is ROOT). + */ +export function parentFolderFile(path) { + const slash = path.lastIndexOf("/"); + if (slash < 0) + return null; // root-level file: parent is ROOT. + const dir = path.slice(0, slash); // the enclosing folder + // The page that OWNS the enclosing folder is its folder-note: the `.md` file + // inside `dir` whose base name equals `dir`'s last segment. + const folderNote = `${dir}/${baseSegment(dir)}.md`; + if (path === folderNote) { + // This path IS its folder's folder-note, so its parent is ONE LEVEL UP: the + // folder-note of the grandparent folder (or ROOT at the top level). + const up = dir.lastIndexOf("/"); + if (up < 0) + return null; // top-level folder -> parent is ROOT. + const grandDir = dir.slice(0, up); + return `${grandDir}/${baseSegment(grandDir)}.md`; + } + // A leaf (or a nested folder-note) sitting inside `dir`: its parent is `dir`'s + // folder-note. + return folderNote; +} +/** + * Whether a vault path is a Docmost PAGE file (design §"Адопция" — "Adoption"): a `.md` file + * with NO dot-segment anywhere in its path. This excludes `.obsidian/` config, + * `.trash/`, dotfiles (`.foo.md`), and every non-`.md` file (attachments, JSON, + * …) — Obsidian owns those; they live in the vault but are never pages. Used to + * screen the PUSH diff so non-page files are never created/updated/deleted in + * Docmost (and never get a `gitmost_id` frontmatter written into them).
+ */ +export function isPageFile(path) { + if (!path.endsWith(".md")) + return false; + return !path.split("/").some((seg) => seg.startsWith(".")); +} +/** The last path segment of a forward-slash path (the folder/file base name). */ +function baseSegment(path) { + const slash = path.lastIndexOf("/"); + return slash < 0 ? path : path.slice(slash + 1); +} +/** + * The page TITLE derived from a vault path: the file's base name without the + * `.md` extension. In the native-Obsidian layout the filename IS the title — for + * a folder-note (e.g. `Folder/Folder.md`) that base equals the folder name, so + * the same rule yields the folder's title. Self-consistent across pull/push: a pulled + * (possibly disambiguated) filename round-trips to the same title, so a stable + * file never pushes a spurious rename. + */ +function titleFromPath(path) { + const base = baseSegment(path); + return base.endsWith(".md") ? base.slice(0, -3) : base; +} +/** + * Build the synthetic `DocmostMdMeta` the planner/classifier consume, from the + * NATIVE format: `pageId` from the `gitmost_id` frontmatter, `title` from the + * filename, `spaceId` from the run (the vault's space — every file belongs to + * it). `parentPageId` is intentionally absent: tree position is resolved from the + * PATH (`resolveParentPageId`), never from a stored field (SPEC §5). + */ +function nativeMeta(text, path, spaceId) { + const { id } = parsePageFile(text); + const meta = { version: 1, title: titleFromPath(path), spaceId }; + if (id) + meta.pageId = id; + return meta; +} +/** + * Async implementation of the `resolveParentPageId(path, side)` lookup + * `classifyRenameMoves` needs: read the PARENT page's file, i.e. + * `parentFile = parentFolderFile(path)` (SPEC §5 path-as-truth): + * - `current` -> `deps.readFile(parentFile)` (the live working tree), + * - `prev` -> `deps.git.showFileAtRef('refs/docmost/last-pushed', parentFile)` (the + * last-pushed pre-image), + * then read its `gitmost_id` frontmatter and return that page's pageId.
A root-level path + * (no enclosing folder), a missing/unreadable parent file, or a parent file with + * no parseable pageId all resolve to `null` (parent is ROOT / unknown -> + * `parentPageId: null`, SPEC §16 "parentPageId: null -> в корень"). + * + * The IO is async, so this returns an ASYNC resolver; the call sites prefetch the + * parent pageIds (the classifier itself stays pure/sync over a plain table). + */ +async function resolveParentPageIdViaTree(deps, path, side) { + const parentFile = parentFolderFile(path); + if (parentFile === null) + return null; // root-level: parent is ROOT. + let text; + try { + text = + side === "current" + ? await deps.readFile(parentFile) + : await deps.git.showFileAtRef(LAST_PUSHED_REF, parentFile); + } + catch { + // Parent folder file missing/unreadable at that side -> treat as ROOT. + return null; + } + if (text === null) + return null; // showFileAtRef returns null when absent. + // The parent page's identity is its `gitmost_id` frontmatter; folder position + // is irrelevant here, only the pageId. + return parsePageFile(text).id; +} +/** + * Resolve the synthetic native meta at a side for the rename/move classifier (the + * title — derived from the path — comes from here). Mirrors + * `resolveParentPageIdViaTree`'s IO sides: `current` reads the working tree, + * `prev` reads `refs/docmost/last-pushed`. Returns `null` only when the file is + * missing/unreadable at that side (a real absence the classifier must see). + */ +async function metaAtViaTree(deps, path, side, spaceId) { + let text; + try { + text = + side === "current" + ? await deps.readFile(path) + : await deps.git.showFileAtRef(LAST_PUSHED_REF, path); + } + catch { + return null; + } + if (text === null) + return null; + return nativeMeta(text, path, spaceId); +} +/** + * Pull an `updatedAt` out of a create/update client result, if present. The + * shape is `{ data: { updatedAt? }, ... 
/**
 * Pull an `updatedAt` out of a create/update client result, if present. The
 * value is looked up at `result.data.updatedAt` first, then `result.updatedAt`.
 * When no STRING is found the field is omitted (an empty object is returned)
 * rather than being set to `undefined`, so the result can be spread safely.
 *
 * @returns `{ updatedAt }` or `{}`.
 */
function extractUpdatedAt(result) {
  const raw = result?.data?.updatedAt ?? result?.updatedAt;
  return typeof raw === "string" ? { updatedAt: raw } : {};
}

// --- runnable push orchestration (`runPush`) ---------------------------------
//
// `runPush` is the FS->Docmost twin of `pull.ts`'s `main`: it wires the VaultGit
// diff/ref primitives + the PURE `computePushActions` planner + the THIN
// `applyPushActions` applier into one runnable cycle. SAFE BY DEFAULT — the
// engine's FIRST write path to Docmost defaults to DRY-RUN (plan only, NO
// Docmost writes, NO ref advance); an explicit `--apply` is the ONLY path that
// builds a client and mutates Docmost.
//
// Every external effect is injected (`PushDeps`) so the whole orchestration is
// driven by FAKES in tests — no live Docmost, git, fs, or network.

/**
 * The human ("local") git identity used for engine-made commits on `main` in the
 * push direction (SPEC §7.3). The provenance is carried by the trailer (below),
 * which the loop-guard keys on; the identity is for history readability only.
 */
export const LOCAL_AUTHOR_NAME = "Local";
export const LOCAL_AUTHOR_EMAIL = "local@local";

/** The provenance trailer marking a `main`-side (human/local) commit (SPEC §7.3). */
export const LOCAL_SOURCE_TRAILER = "Docmost-Sync-Source: local";
Conflict markers must NEVER reach + * Docmost (SPEC §9). + * 2. Checkout `main` (the human-facing branch the push reads from). + * 3. Commit the human's pending working-tree changes on `main` with the + * `local` provenance trailer (SPEC §7.3). A no-op when nothing changed. + * 4. Pick the diff BASE: `refs/docmost/last-pushed` if it resolves, else the + * `docmost` mirror branch (what Docmost currently has). Resolve `main`. + * 5. `diffNameStatus(base, main)` -> changes; build the `metaAt(path, side)` + * resolver (current = working tree, prev = `git show :`); run + * the PURE `computePushActions`. + * 6. DRY-RUN (default): LOG the full plan and RETURN — NO client, NO Docmost + * calls, NO ref advance. + * 7. `--apply`: build the client, run `applyPushActions(..., pushedCommit=main)`, + * then (a) if any pageIds were written back (creates), commit them on `main` + * with the `local` trailer and RE-advance `refs/docmost/last-pushed` to the + * new commit so the recorded pageIds are persisted in what Docmost mirrors; + * (b) ESCALATE a divergent-`docmost` ff refusal (SPEC §5) with a prominent + * WARNING and a non-zero-ish flag. Then log a one-line summary. + */ +export async function runPush(deps, opts) { + const { git, settings, log } = deps; + const dryRun = opts.dryRun; + // 1. Preflight git. Fail fast (actionable message via main().catch) if the git + // binary is missing — the vault state store relies on it. + await git.assertGitAvailable(); + await git.ensureRepo(); + // 1b. Refuse to push on top of an unresolved merge (SPEC §9/§12). A previous + // conflicting pull leaves the vault mid-merge; pushing now could leak + // conflict markers into Docmost (SPEC §9, the cardinal invariant). Detect + // it BEFORE any checkout/diff and stop with a clear, actionable message so + // re-runs converge once the human resolves (or aborts) the merge. 
+ if (await git.isMergeInProgress()) { + log(`push: vault has an unresolved merge at ${settings.vaultPath} — resolve ` + + `it (or 'git merge --abort') and re-run. Nothing was pushed to Docmost ` + + `(conflict markers must never reach Docmost, SPEC §9).`); + return { mode: dryRun ? "dry-run" : "apply", aborted: "merge-in-progress" }; + } + // 2. Work on `main` — the human-facing branch the push diffs FROM. + await git.checkout(DEFAULT_BRANCH); + // 3. Commit the human's pending working-tree changes on `main` with the `local` + // provenance trailer (SPEC §7.3). A no-op commit when nothing changed is + // fine (`commit` returns false). The loop-guard keys on the trailer. + // Even on a "plan only" dry-run this commits the working tree (it is the + // only way to diff `base..main`, acceptable §6.1 behavior) — so make that + // LOCAL git mutation VISIBLE, never silent: a created commit is local-only + // and nothing is sent to Docmost. + await git.stageAll(); + const committedWorkingTree = await git.commit("local: working-tree changes", { + authorName: LOCAL_AUTHOR_NAME, + authorEmail: LOCAL_AUTHOR_EMAIL, + trailers: [LOCAL_SOURCE_TRAILER], + }); + if (committedWorkingTree) { + const sha = await git.revParse(DEFAULT_BRANCH); + log(`push: committed local working-tree changes on main` + + (sha ? ` as ${sha.slice(0, 8)}` : "") + + ` (local git only — nothing sent to Docmost).`); + } + else { + log("push: working tree clean (no local changes to push)."); + } + // 4. Pick the diff BASE (SPEC §5/§6): `refs/docmost/last-pushed` if it resolves + // (the marker of what `main` is already in Docmost), else fall back to the + // `docmost` mirror branch (the mirror of what Docmost currently has) — which + // is what exists before the first push ever advanced last-pushed. 
+ let base; + const lastPushedSha = await git.readRef(LAST_PUSHED_REF); + if (lastPushedSha) { + base = { ref: LAST_PUSHED_REF, source: "last-pushed", sha: lastPushedSha }; + } + else { + base = { + ref: DOCMOST_BRANCH, + source: "docmost", + sha: await git.revParse(DOCMOST_BRANCH), + }; + } + const pushedCommit = await git.revParse(DEFAULT_BRANCH); + if (!pushedCommit) { + // `main` has no commit — `ensureRepo` always makes an initial one, so this is + // defensive. Nothing to diff. + log("push: `main` has no commit to push — nothing to do."); + return { mode: dryRun ? "dry-run" : "apply", base }; + } + // 5. Diff the base against `main` and build the `metaAt` resolver (PURE planner + // input). `current` reads the live working tree; `prev` reads the base ref's + // pre-image via `git show :` (so a DELETE recovers its pageId). + const changes = await git.diffNameStatus(base.ref, DEFAULT_BRANCH); + // Synchronous resolver over PREFETCHED meta tables: `computePushActions` is + // PURE/sync, but the file/ref reads are async — so we prefetch every (path, + // side) the diff will ask for into a table first, then resolve from it. + const metaTable = new Map(); + for (const change of changes) { + // `current`: A/M/R/C still have the file on `main`. `prev`: D needs the + // pre-image; R/C also benefit (old title). Prefetch both sides per path. + const currentPath = change.path; + const prevPath = change.oldPath ?? change.path; + if (!metaTable.has(`${currentPath}|current`)) { + metaTable.set(`${currentPath}|current`, await readMetaCurrent(deps, currentPath, settings.docmostSpaceId)); + } + if (!metaTable.has(`${prevPath}|prev`)) { + metaTable.set(`${prevPath}|prev`, await readMetaPrev(deps, base.ref, prevPath, settings.docmostSpaceId)); + } + } + const metaAt = (path, side) => metaTable.get(`${path}|${side}`) ?? null; + // The set of pageIds that STILL EXIST somewhere in the current `main` tree. 
+ // Identity is the pageId, NOT the filename: a file vanishing from one path + // while the SAME pageId lives at another path is a MOVE (often a layout + // reshuffle of `_`-fallback names, whose two halves can even land in separate + // cycles), never a deletion. Built only when the diff contains deletes — the + // guard's whole job is to stop a phantom delete from trashing a live page. + let currentPageIds; + if (changes.some((c) => c.status === "D")) { + currentPageIds = new Set(); + for (const relPath of await git.listTrackedFiles("*.md")) { + const pid = (await readMetaCurrent(deps, relPath, settings.docmostSpaceId)) + ?.pageId; + if (pid) + currentPageIds.add(pid); + } + } + const actions = computePushActions({ changes, metaAt, currentPageIds }); + const planned = { + creates: actions.creates.length, + updates: actions.updates.length, + deletes: actions.deletes.length, + renamesMoves: actions.renamesMoves.length, + skipped: actions.skipped.length, + }; + // 6. DRY-RUN (default): log the full plan and RETURN — build NO client, make + // ZERO Docmost calls, advance NO refs. This is the SAFE default. + logPlan(log, base, pushedCommit, actions, planned, dryRun); + if (dryRun) { + return { mode: "dry-run", base, pushedCommit, planned }; + } + // 7. --apply: build the REAL client and execute. This is the ONLY write path. + const client = deps.makeClient(settings); + const applied = await applyPushActions({ + client, + // Pass the WHOLE `git` object (it satisfies the applier's + // `Pick` deps surface). Passing bare method references + // (`git.updateRef`, …) would lose their `this` binding, so on a REAL + // `VaultGit` they would throw `this.runRaw is not a function`. Hand over + // the object so the methods keep their receiver — exactly as `pull.ts` + // does for `applyPullActions`. + git, + readFile: deps.readFile, + writeFile: deps.writeFile, + spaceId: settings.docmostSpaceId, + }, actions, pushedCommit); + // 7a. 
Persist freshly-assigned pageIds (creates) back into git. `applyPushActions` + // rewrote those files on disk; commit them on `main` with the `local` trailer + // so the new pageIds are recorded, then RE-advance `refs/docmost/last-pushed` + // to the new commit so what Docmost mirrors and what last-pushed points at + // stay in lock-step (the write-back commit is part of `main` now). + // Track a divergent-`docmost` mirror across BOTH ff sites (the applier's main + // push ff in 7b, and the write-back ff here). A divergent mirror is a §5 + // invariant breach in EITHER branch and must escalate identically (exit 1). + let divergentDocmost = false; + if (applied.writtenBack.length > 0) { + await git.stageAll(); + const recorded = await git.commit("local: record created pageIds", { + authorName: LOCAL_AUTHOR_NAME, + authorEmail: LOCAL_AUTHOR_EMAIL, + trailers: [LOCAL_SOURCE_TRAILER], + }); + if (recorded) { + const newCommit = await git.revParse(DEFAULT_BRANCH); + // Only re-advance when the original push was CLEAN (last-pushed was already + // advanced by the applier); a partial push left the refs untouched and a + // re-run retries the whole batch, so we must not move them either. + if (newCommit && applied.lastPushedAdvanced) { + await git.updateRef(LAST_PUSHED_REF, newCommit); + const ff = await git.fastForwardBranch(DOCMOST_BRANCH, newCommit); + if (!ff.ok) { + // SYMMETRIC with the main escalation (7b): a divergent mirror in the + // write-back branch is the SAME §5 invariant breach and must escalate + // (exit 1), not just log a soft warning. + divergentDocmost = true; + log(`push: WARNING — the 'docmost' mirror branch DIVERGED and was NOT ` + + `fast-forwarded to the pageId write-back commit ` + + `(${ff.reason ?? "not-fast-forward"}). The §5 invariant ('docmost' ` + + `mirrors what Docmost contains) is broken: reconcile 'docmost' ` + + `against the live Docmost tree before the next cycle.`); + } + } + } + } + // 7b. 
ESCALATE a divergent-`docmost` fast-forward refusal (SPEC §5 invariant + // broken). The applier already refused to clobber a divergent mirror; make + // it LOUD (not silent) so the operator notices, and fold it into the exit. + if (applied.docmostFastForward && !applied.docmostFastForward.ok) { + divergentDocmost = true; + log(`push: WARNING — the 'docmost' mirror branch DIVERGED and was NOT ` + + `fast-forwarded (${applied.docmostFastForward.reason ?? "not-fast-forward"}). ` + + `The §5 invariant ('docmost' mirrors what Docmost contains) is broken: ` + + `reconcile 'docmost' against the live Docmost tree before the next cycle.`); + } + // 7c. One-line summary (mirrors pull.ts's summary line). + log(`push complete: ${applied.created} created, ${applied.updated} updated, ` + + `${applied.deleted} deleted, ${applied.moved} moved, ${applied.renamed} ` + + `renamed, ${applied.noops.length} no-op(s), ${applied.skipped.length} ` + + `skipped, ${applied.failures.length} failure(s)` + + (divergentDocmost ? " [DIVERGENT docmost mirror]" : "")); + return { + mode: "apply", + base, + pushedCommit, + planned, + applied, + divergentDocmost, + failures: applied.failures, + }; +} +/** Synthetic native meta from the live working tree (`current` side). */ +async function readMetaCurrent(deps, path, spaceId) { + let text; + try { + text = await deps.readFile(path); + } + catch { + return null; // absent on disk (e.g. a D row's path) -> no current meta. + } + return nativeMeta(text, path, spaceId); +} +/** Synthetic native meta from the base ref's pre-image (`prev` side). */ +async function readMetaPrev(deps, baseRef, path, spaceId) { + let text; + try { + text = await deps.git.showFileAtRef(baseRef, path); + } + catch { + return null; + } + if (text === null) + return null; // path absent at the base ref. + return nativeMeta(text, path, spaceId); +} +/** Emit the full plan (counts + per-item) to the injected logger. 
*/ +function logPlan(log, base, pushedCommit, actions, planned, dryRun) { + log(`push plan (${dryRun ? "DRY-RUN — no Docmost writes" : "APPLY"}): base=` + + `${base.ref} (${base.source}${base.sha ? ` ${base.sha.slice(0, 8)}` : ""}) ` + + `-> main ${pushedCommit.slice(0, 8)}`); + log(`push plan counts: ${planned.creates} create, ${planned.updates} update, ` + + `${planned.deletes} delete, ${planned.renamesMoves} rename/move, ` + + `${planned.skipped} skipped`); + for (const c of actions.creates) + log(` create: ${c.path}`); + for (const u of actions.updates) + log(` update: ${u.pageId} (${u.path})`); + for (const d of actions.deletes) + log(` delete: ${d.pageId}`); + for (const rm of actions.renamesMoves) + log(` rename/move: ${rm.oldPath} -> ${rm.newPath} (${rm.pageId})`); + for (const s of actions.skipped) + log(` skipped [${s.status}] ${s.path}: ${s.reason}`); +} +/** + * Parse the `push` CLI flags. SAFE BY DEFAULT: without `--apply` the run is a + * DRY-RUN (plan only). Exported so the flag handling is unit-testable. + */ +export function parseArgs(argv) { + return { apply: argv.includes("--apply") }; +} diff --git a/packages/git-sync/build/engine/reconcile.d.ts b/packages/git-sync/build/engine/reconcile.d.ts new file mode 100644 index 00000000..28a58e92 --- /dev/null +++ b/packages/git-sync/build/engine/reconcile.d.ts @@ -0,0 +1,126 @@ +/** + * Pure reconciliation planner (SPEC §5/§6/§8). + * + * Given the desired live set of files (computed from the current Docmost tree) + * and the set of files currently tracked in the vault, compute what to write, + * what to move (old path to remove), and what to delete. Identity is `pageId` + * (the stable file<->page anchor, SPEC §4): a page that keeps its pageId but + * changes relPath is a MOVE, not delete+add; a tracked pageId that is gone from + * the live tree is a DELETE. + * + * This module is intentionally PURE (no IO, no git) so the whole plan is + * unit-testable. 
/**
 * Pure reconciliation planner (SPEC §5/§6/§8).
 *
 * Given the desired live set of files (computed from the current Docmost tree)
 * and the set of files currently tracked in the vault, compute what to write,
 * what to move (old path to remove), and what to delete. Identity is `pageId`
 * (the stable file<->page anchor, SPEC §4): a page that keeps its pageId but
 * changes relPath is a MOVE, not delete+add; a tracked pageId that is gone from
 * the live tree is a DELETE.
 *
 * This module is intentionally PURE (no IO, no git) so the whole plan is
 * unit-testable. The actual file writing / git operations happen in pull.ts.
 */

/** A page that SHOULD exist in the vault at a given path. */
export interface LiveEntry {
  pageId: string;
  /** Vault-relative path (forward-slash), e.g. `Space/Parent/Child.md`. */
  relPath: string;
}

/** A page currently tracked in the vault (pageId parsed from its meta). */
export interface ExistingEntry {
  pageId: string;
  /** Vault-relative path (forward-slash) of the tracked file. */
  relPath: string;
}

/** A page to (re)write at its destination path. */
export interface WriteEntry {
  pageId: string;
  relPath: string;
}

/** A page that moved: written at its NEW relPath, with the OLD path removed. */
export interface MovedEntry {
  pageId: string;
  fromRelPath: string;
  toRelPath: string;
  /**
   * Whether the old path (`fromRelPath`) is SAFE to remove. False when another
   * live page will (re)write that exact path (path reuse): removing it would
   * destroy real data, so the caller must skip the removal. The move itself is
   * still recorded (the new path is written regardless).
   */
  removeOldPath: boolean;
}

/** The full reconciliation plan. */
export interface ReconciliationPlan {
  /**
   * Pages present in `live` -> (re)write at their relPath. This naturally
   * covers add, content-update (same path) AND move (same pageId, new path),
   * since every live page is (re)written regardless of whether it existed.
   */
  toWrite: WriteEntry[];
  /**
   * Vault-relative paths to delete because their tracked pageId is ABSENT from
   * `live` (page removed/trashed). This set is ONLY absence-based deletions —
   * the OLD paths of moved pages are NOT here (they live in `moved` and are
   * applied separately by the caller). Keeping the two apart lets pull.ts gate
   * absence deletions behind the incomplete-fetch suppression + mass-delete
   * guard (SPEC §8) while still applying real moves.
   */
  toDelete: string[];
  /**
   * Tracked pages whose relPath changed. The caller writes the page at
   * `toRelPath`, then removes `fromRelPath` — but ONLY after the new-path write
   * succeeded. The old path is NOT in `toDelete`.
   */
  moved: MovedEntry[];
}

/**
 * Compute the reconciliation plan.
 *
 * Rules:
 *  - Every `live` page is written at its relPath (covers add + update + move).
 *  - A tracked pageId present in `live` whose relPath changed is `moved`; its
 *    OLD relPath goes into `moved` ONLY (the caller removes it after the new
 *    path is written) and is NEVER added to `toDelete`.
 *  - A tracked pageId NOT present in `live` is an ABSENCE delete; its relPath
 *    is added to `toDelete`.
 *
 * Notes:
 *  - Safety filter (no data loss): no path that is a live TARGET path of any
 *    page is ever deleted/removed (a write owns it). This applies to BOTH the
 *    absence `toDelete` set AND a moved page's old-path removal.
 *  - `existing` may legitimately contain duplicate pageIds (two stray files
 *    carrying the same meta pageId); each such file that is not the live target
 *    path is removed (as an absence/move) so the vault converges to exactly the
 *    live set.
 */
export declare function planReconciliation(live: LiveEntry[], existing: ExistingEntry[]): ReconciliationPlan;

/**
 * Below this many tracked files the mass-delete fraction guard is not applied
 * (a tiny vault where deleting "most" files is normal, e.g. 1-of-2).
 */
export declare const MASS_DELETE_MIN_EXISTING = 4;

/** Fraction of tracked files above which a delete plan is a suspected wipe. */
export declare const MASS_DELETE_FRACTION = 0.5;

/** Why absence-based deletions were (or were not) applied this cycle. */
export type DeletionDecision =
  | { apply: true }
  | { apply: false; reason: "incomplete-fetch" | "empty-live" | "mass-delete" };

/**
 * Pure decision: should the ABSENCE-based deletions (`plan.toDelete`) be applied
 * this cycle? Encapsulates the SPEC §8 safety invariants so they are
 * unit-testable without live creds or git:
 *
 *  - `treeComplete === false` (a partial Docmost tree fetch) -> SUPPRESS. A page
 *    missing from a partial tree is NOT proof of deletion (SPEC §8).
 *  - The live fetch returned 0 pages while files are tracked -> SUPPRESS
 *    (almost always a failed fetch, never a real "delete everything").
 *  - The plan would delete more than `MASS_DELETE_FRACTION` of a non-trivial
 *    vault -> SUPPRESS as a mass-deletion guard (defense in depth).
 *
 * Moves are NOT governed by this decision: a moved page IS present in `live`, so
 * its old-path removal is real (handled by the caller separately).
 */
export declare function decideAbsenceDeletions(args: {
  treeComplete: boolean;
  liveCount: number;
  existingCount: number;
  deleteCount: number;
}): DeletionDecision;

// diff --git a/packages/git-sync/build/engine/reconcile.js b/packages/git-sync/build/engine/reconcile.js
// (new file mode 100644, index 00000000..9a111bb5, hunk @@ -0,0 +1,117 @@)
/**
 * Pure reconciliation planner (SPEC §5/§6/§8).
 *
 * Given the desired live set of files (computed from the current Docmost tree)
 * and the set of files currently tracked in the vault, compute what to write,
 * what to move (old path to remove), and what to delete. Identity is `pageId`
 * (the stable file<->page anchor, SPEC §4): a page that keeps its pageId but
 * changes relPath is a MOVE, not delete+add; a tracked pageId that is gone from
 * the live tree is a DELETE.
 *
 * This module is intentionally PURE (no IO, no git) so the whole plan is
 * unit-testable. The actual file writing / git operations happen in pull.ts.
 */

/**
 * Compute the reconciliation plan.
 *
 * Rules:
 *  - Every `live` page is written at its relPath (covers add + update + move).
 *  - A tracked pageId present in `live` whose relPath changed is `moved`; its
 *    OLD relPath is recorded in `moved` ONLY and is NEVER added to `toDelete`.
 *  - A tracked pageId NOT present in `live` is an ABSENCE delete; its relPath
 *    is added to `toDelete`.
 *
 * Notes:
 *  - Safety filter (no data loss): a path that is a live TARGET path of any
 *    page is never queued for deletion, and a move whose old path is reused by
 *    another live page is flagged `removeOldPath: false`.
 *  - `existing` may contain duplicate pageIds; each such file that is not at
 *    the live target path is handled as an absence/move.
 *  - If `live` lists the same pageId twice, the LAST entry's path wins for
 *    move detection (both entries still appear in `toWrite`).
 *
 * @param live pages that should exist: `{ pageId, relPath }[]`.
 * @param existing pages currently tracked: `{ pageId, relPath }[]`.
 * @returns `{ toWrite, toDelete, moved }`.
 */
export function planReconciliation(live, existing) {
  // Desired path for each live pageId.
  const liveByPageId = new Map();
  // Every path that WILL be written (never delete/remove one of these).
  const liveTargetPaths = new Set();
  for (const entry of live) {
    liveByPageId.set(entry.pageId, entry.relPath);
    liveTargetPaths.add(entry.relPath);
  }

  const toWrite = live.map(({ pageId, relPath }) => ({ pageId, relPath }));
  const moved = [];
  // Absence-based deletions ONLY. A Set so the same path coming from several
  // existing rows is queued once.
  const toDeleteSet = new Set();

  for (const ex of existing) {
    const liveRel = liveByPageId.get(ex.pageId);
    if (liveRel === undefined) {
      // Tracked page is gone from the live tree -> absence delete, unless a
      // live page will (re)write this very path (path reuse -> no loss).
      if (!liveTargetPaths.has(ex.relPath)) toDeleteSet.add(ex.relPath);
      continue;
    }
    if (liveRel !== ex.relPath) {
      // Same pageId, different path -> a MOVE. If the old path is itself a live
      // target, the write owns it, so it must NOT be removed.
      moved.push({
        pageId: ex.pageId,
        fromRelPath: ex.relPath,
        toRelPath: liveRel,
        removeOldPath: !liveTargetPaths.has(ex.relPath),
      });
    }
    // liveRel === ex.relPath -> in-place content update; the write covers it.
  }

  return { toWrite, toDelete: [...toDeleteSet], moved };
}

/**
 * Below this many tracked files the mass-delete fraction guard is not applied
 * (a tiny vault where deleting "most" files is normal, e.g. 1-of-2).
 */
export const MASS_DELETE_MIN_EXISTING = 4;

/** Fraction of tracked files above which a delete plan is a suspected wipe. */
export const MASS_DELETE_FRACTION = 0.5;

/**
 * Pure decision: should the ABSENCE-based deletions (`plan.toDelete`) be applied
 * this cycle? (SPEC §8 safety invariants.) Checks run in this order:
 *
 *  1. No tracked files, or nothing to delete -> apply (trivially safe).
 *  2. `treeComplete` false (partial Docmost tree fetch) -> suppress,
 *     `"incomplete-fetch"`: absence from a partial tree is not proof of deletion.
 *  3. `liveCount === 0` while files are tracked -> suppress, `"empty-live"`.
 *  4. `existingCount >= MASS_DELETE_MIN_EXISTING` and `deleteCount` strictly
 *     greater than `existingCount * MASS_DELETE_FRACTION` -> suppress,
 *     `"mass-delete"`.
 *
 * Moves are NOT governed by this decision.
 *
 * @returns `{ apply: true }` or `{ apply: false, reason }`.
 */
export function decideAbsenceDeletions(args) {
  const { treeComplete, liveCount, existingCount, deleteCount } = args;
  if (existingCount === 0 || deleteCount === 0) return { apply: true };
  if (!treeComplete) return { apply: false, reason: "incomplete-fetch" };
  if (liveCount === 0) return { apply: false, reason: "empty-live" };

  const nonTrivialVault = existingCount >= MASS_DELETE_MIN_EXISTING;
  if (nonTrivialVault && deleteCount > existingCount * MASS_DELETE_FRACTION) {
    return { apply: false, reason: "mass-delete" };
  }
  return { apply: true };
}

// diff --git a/packages/git-sync/build/engine/roundtrip-helpers.d.ts b/packages/git-sync/build/engine/roundtrip-helpers.d.ts
// (new file mode 100644, index 00000000..30bcfa8f, hunk @@ -0,0 +1,21 @@)

/**
 * Pure, IO-free comparison helpers for the idempotency round-trip checks. The
 * round-trip harness that drives these lives in the package's tests, not in the
 * engine.
 */

/**
 * Recursively strip every `attrs.id` from a ProseMirror node tree. Block ids
 * are regenerated by `markdownToProseMirror` (SPEC §11), so they must be
 * ignored when comparing the semantic shape of two documents. Returns a NEW
 * tree; the input is not mutated.
 */
export declare function stripBlockIds(node: any): any;
/**
 * Find the first divergence between two values via a recursive deep compare.
 * Returns a short path + the two differing values, or null if they are equal.
 */
export declare function firstDivergence(a: any, b: any, path?: string): {
  path: string;
  a: any;
  b: any;
} | null;

// diff --git a/packages/git-sync/build/engine/roundtrip-helpers.js b/packages/git-sync/build/engine/roundtrip-helpers.js
// (new file mode 100644, index 00000000..9fe4c495, hunk @@ -0,0 +1,70 @@)

/**
 * Pure, IO-free comparison helpers for the idempotency round-trip checks. The
 * round-trip harness that drives these lives in the package's tests, not in the
 * engine.
 */

/**
 * Recursively strip every `attrs.id` from a ProseMirror node tree. Block ids
 * are regenerated by `markdownToProseMirror` (SPEC §11), so they must be
 * ignored when comparing the semantic shape of two documents.
 *
 * Arrays are mapped element-wise, plain objects are rebuilt key by key (own
 * enumerable keys, original order), and every other value is returned as-is.
 * For an object-valued `attrs` key the `id` entry is dropped and the remaining
 * attributes are processed recursively. Returns a NEW tree; the input is not
 * mutated.
 *
 * The result is assembled with `Object.fromEntries`, so a key literally named
 * `__proto__` becomes an ordinary own property instead of rewriting the
 * result's prototype.
 */
export function stripBlockIds(node) {
  if (Array.isArray(node)) {
    return node.map(stripBlockIds);
  }
  if (!node || typeof node !== "object") {
    return node;
  }
  return Object.fromEntries(
    Object.keys(node).map((key) => {
      if (key === "attrs" && node.attrs && typeof node.attrs === "object") {
        // Drop the `id` attr; keep every other attribute.
        const { id, ...rest } = node.attrs;
        void id;
        return [key, stripBlockIds(rest)];
      }
      return [key, stripBlockIds(node[key])];
    }),
  );
}
/**
 * Find the first divergence between two values via a recursive deep compare.
 *
 * Comparison rules, in order:
 *  - identical values (`===`), or two `NaN`s, are equal;
 *  - differing `typeof`, or either side `null`, diverge at `path`;
 *  - non-object primitives that are not identical diverge at `path`;
 *  - array vs non-array diverges at `path`;
 *  - arrays of different length diverge at `<path>.length` (reporting the two
 *    lengths); otherwise elements are compared in index order at `<path>[i]`;
 *  - objects are compared over the union of both sides' own enumerable keys at
 *    `<path>.<key>` (a key missing on one side compares as `undefined`).
 *
 * @param a left value.
 * @param b right value.
 * @param path label for the current position; defaults to `"$"` (the root).
 * @returns `{ path, a, b }` for the first difference found, or `null` if equal.
 */
export function firstDivergence(a, b, path = "$") {
  if (a === b) return null;
  // NaN is the one value that is not `===` itself; two NaNs are not a divergence.
  if (typeof a === "number" && typeof b === "number" && Number.isNaN(a) && Number.isNaN(b)) {
    return null;
  }

  const here = { path, a, b };
  if (typeof a !== typeof b || a === null || b === null) return here;
  if (typeof a !== "object") return here;

  const aIsArr = Array.isArray(a);
  if (aIsArr !== Array.isArray(b)) return here;

  if (aIsArr) {
    if (a.length !== b.length) {
      return { path: `${path}.length`, a: a.length, b: b.length };
    }
    for (let i = 0; i < a.length; i++) {
      const d = firstDivergence(a[i], b[i], `${path}[${i}]`);
      if (d) return d;
    }
    return null;
  }

  const keys = new Set([...Object.keys(a), ...Object.keys(b)]);
  for (const k of keys) {
    const d = firstDivergence(a[k], b[k], `${path}.${k}`);
    if (d) return d;
  }
  return null;
}

// diff --git a/packages/git-sync/build/engine/sanitize.d.ts b/packages/git-sync/build/engine/sanitize.d.ts
// (new file mode 100644, index 00000000..0889a9f6, hunk @@ -0,0 +1,23 @@)

/**
 * Deterministic filename strategy (SPEC §12).
 *
 * The file name is COSMETIC — the source of truth for the file<->page link is
 * `pageId` / `slugId` inside the meta block, so renaming a file is safe. These
 * functions are intentionally dependency-free and pure, so they are trivially
 * unit-testable.
 */

/**
 * Sanitize a page title into a safe file-name component (WITHOUT extension).
 *
 * Steps: replace forbidden / control characters with "-", collapse whitespace
 * runs to a single space, trim, cap the length, then guard against an empty
 * result, an all-dots result, or a reserved Windows device name by prefixing
 * with "_".
 */
export declare function sanitizeTitle(title: string): string;
Appends a stable suffix built from the page's `slugId`, so + * the result stays deterministic across runs (SPEC §12: `Title ~slugId`). + */ +export declare function disambiguate(name: string, slugId: string): string; diff --git a/packages/git-sync/build/engine/sanitize.js b/packages/git-sync/build/engine/sanitize.js new file mode 100644 index 00000000..2aff0f3c --- /dev/null +++ b/packages/git-sync/build/engine/sanitize.js @@ -0,0 +1,97 @@ +/** + * Deterministic filename strategy (SPEC §12). + * + * The file name is COSMETIC — the source of truth for the file<->page link is + * `pageId` / `slugId` inside the meta block, so renaming a file is safe. These + * functions are intentionally dependency-free and pure, so they are trivially + * unit-testable. + */ +// Printable characters forbidden in file names on common filesystems (mainly +// Windows): / \ < > : " | ? *. Each match is replaced with a single "-". +// Spaces are NOT in this set; whitespace is normalized separately below. +// ASCII control characters (code points 0..31) are stripped in a separate pass +// (see stripControlChars) to keep this literal free of embedded control bytes. +const FORBIDDEN_PRINTABLE_RE = /[/\\<>:"|?*]/g; +// Runs of whitespace (including tabs/newlines) collapse to a single space. +const WHITESPACE_RUN_RE = /\s+/g; +// Reserved Windows device names (case-insensitive). A bare match (with or +// without an extension) is unusable as a file name, so it is prefixed with "_". +const RESERVED_WINDOWS_NAMES = new Set([ + "con", + "prn", + "aux", + "nul", + "com1", + "com2", + "com3", + "com4", + "com5", + "com6", + "com7", + "com8", + "com9", + "lpt1", + "lpt2", + "lpt3", + "lpt4", + "lpt5", + "lpt6", + "lpt7", + "lpt8", + "lpt9", +]); +// Cap on the sanitized length to stay well within filesystem path-component +// limits (255 bytes on most FSes) while leaving room for an extension and a +// disambiguation suffix. 
+const MAX_LENGTH = 120; +/** + * Replace every ASCII control character (code points 0..31) with "-". Done by + * scanning code points rather than a control-range regex literal, so the source + * file carries no embedded control bytes. + */ +function stripControlChars(input) { + let out = ""; + for (let i = 0; i < input.length; i++) { + out += input.charCodeAt(i) < 32 ? "-" : input[i]; + } + return out; +} +/** + * Sanitize a page title into a safe file-name component (WITHOUT extension). + * + * Steps: replace forbidden / control characters with "-", collapse whitespace + * runs to a single space, trim, cap the length, then guard against an empty + * result, an all-dots result, or a reserved Windows device name by prefixing + * with "_". + */ +export function sanitizeTitle(title) { + let name = stripControlChars(title ?? "") + .replace(FORBIDDEN_PRINTABLE_RE, "-") + .replace(WHITESPACE_RUN_RE, " ") + .trim(); + if (name.length > MAX_LENGTH) { + name = name.slice(0, MAX_LENGTH).trim(); + } + // Compare the base name (before the first dot) against reserved names, so + // both "CON" and "con.md" are caught. + const base = name.split(".")[0]?.toLowerCase() ?? ""; + // A name that is empty, consists only of dots ("." / ".." / "..."), or is a + // reserved Windows device name is unusable as a path component. The all-dots + // case is a path-traversal hazard in particular: an unprefixed ".." would + // become a parent-directory segment and let a page escape the vault, so it + // MUST be neutralized here (becomes "_..", which is a literal file name). + if (name.length === 0 || + /^\.+$/.test(name) || + RESERVED_WINDOWS_NAMES.has(base)) { + name = "_" + name; + } + return name; +} +/** + * Disambiguate a sanitized name when two siblings in the same folder collapse + * to the same name. Appends a stable suffix built from the page's `slugId`, so + * the result stays deterministic across runs (SPEC §12: `Title ~slugId`). 
+ */ +export function disambiguate(name, slugId) { + return `${name} ~${slugId}`; +} diff --git a/packages/git-sync/build/engine/settings.d.ts b/packages/git-sync/build/engine/settings.d.ts new file mode 100644 index 00000000..8539b439 --- /dev/null +++ b/packages/git-sync/build/engine/settings.d.ts @@ -0,0 +1,41 @@ +/** + * Engine settings. + * + * The engine is driven IN-PROCESS by the NestJS server, which builds the + * `Settings` object from `EnvironmentService` — so this module must NOT reach + * into `process.env`. It exposes only: + * - the `Settings` type the engine consumes, and + * - `parseSettings(env)` as a PURE function (validate a raw env object -> typed + * `Settings`), kept for unit tests and for the server to reuse if it wants + * to validate an env-shaped object. + * There is no `.env`-loading side-effecting entry point. + */ +import { z } from 'zod'; +export declare const envSchema: z.ZodObject<{ + DOCMOST_API_URL: z.ZodString; + DOCMOST_EMAIL: z.ZodString; + DOCMOST_PASSWORD: z.ZodString; + DOCMOST_SPACE_ID: z.ZodString; + VAULT_PATH: z.ZodDefault; + GIT_REMOTE: z.ZodPipe, z.ZodOptional>; + POLL_INTERVAL_MS: z.ZodDefault>; + DEBOUNCE_MS: z.ZodDefault>; + LOG_LEVEL: z.ZodDefault>; +}, z.core.$strip>; +export type Settings = { + docmostApiUrl: string; + docmostEmail: string; + docmostPassword: string; + docmostSpaceId: string; + vaultPath: string; + gitRemote?: string; + pollIntervalMs: number; + debounceMs: number; + logLevel: 'debug' | 'info' | 'warn' | 'error'; +}; +export declare function parseSettings(env: NodeJS.ProcessEnv): Settings; diff --git a/packages/git-sync/build/engine/settings.js b/packages/git-sync/build/engine/settings.js new file mode 100644 index 00000000..b75f8435 --- /dev/null +++ b/packages/git-sync/build/engine/settings.js @@ -0,0 +1,49 @@ +/** + * Engine settings. 
+ * + * The engine is driven IN-PROCESS by the NestJS server, which builds the + * `Settings` object from `EnvironmentService` — so this module must NOT reach + * into `process.env`. It exposes only: + * - the `Settings` type the engine consumes, and + * - `parseSettings(env)` as a PURE function (validate a raw env object -> typed + * `Settings`), kept for unit tests and for the server to reuse if it wants + * to validate an env-shaped object. + * There is no `.env`-loading side-effecting entry point. + */ +import { z } from 'zod'; +// Schema keyed by the real ENV variable names so validation errors name the +// exact variable. Credentials and the address of our OWN Docmost instance have +// NO default — a missing value must fail at startup, never silently fall back. +export const envSchema = z.object({ + // Docmost connection — address of our own instance, no default. + DOCMOST_API_URL: z.string().url(), + // Credentials for /auth/login — no default, never hardcoded. + DOCMOST_EMAIL: z.string().min(1), + DOCMOST_PASSWORD: z.string().min(1), + // Which Docmost space to mirror. + DOCMOST_SPACE_ID: z.string().min(1), + // Local git vault (state store) — kept under data/ so the volume persists it. + VAULT_PATH: z.string().min(1).default('data/vault'), + // Optional git remote the vault pushes to. Empty string is treated as unset. + GIT_REMOTE: z.preprocess((v) => (v === '' ? undefined : v), z.string().min(1).optional()), + // Non-secret tunables — sensible defaults are fine. + POLL_INTERVAL_MS: z.coerce.number().int().positive().default(15000), + DEBOUNCE_MS: z.coerce.number().int().positive().default(2000), + LOG_LEVEL: z.enum(['debug', 'info', 'warn', 'error']).default('info'), +}); +// Pure: validate a raw environment object and map it to a typed Settings. +// Throws ZodError on bad config. No side effects — safe to import in tests. 
+export function parseSettings(env) { + const e = envSchema.parse(env); + return { + docmostApiUrl: e.DOCMOST_API_URL, + docmostEmail: e.DOCMOST_EMAIL, + docmostPassword: e.DOCMOST_PASSWORD, + docmostSpaceId: e.DOCMOST_SPACE_ID, + vaultPath: e.VAULT_PATH, + gitRemote: e.GIT_REMOTE, + pollIntervalMs: e.POLL_INTERVAL_MS, + debounceMs: e.DEBOUNCE_MS, + logLevel: e.LOG_LEVEL, + }; +} diff --git a/packages/git-sync/build/engine/stabilize.d.ts b/packages/git-sync/build/engine/stabilize.d.ts new file mode 100644 index 00000000..0c1f4921 --- /dev/null +++ b/packages/git-sync/build/engine/stabilize.d.ts @@ -0,0 +1,41 @@ +/** + * Meta object as `exportPageBody` builds it (SPEC §4). Kept byte-for-byte + * compatible so files produced here match `exportPageBody`'s output exactly. + */ +export interface PageMeta { + version: 1; + pageId: string; + slugId: string; + title: string; + spaceId: string; + parentPageId: string | null; +} +/** + * Produce the self-contained `.md` file text for a page from its raw + * ProseMirror `content` + identity meta, in the verified fixpoint form. + * + * md1 = convertProseMirrorToMarkdown(content) + * doc2 = markdownToProseMirror(md1) // one import... + * stableBody = convertProseMirrorToMarkdown(doc2) // ...and re-export + * file = serializeDocmostMarkdownBody(meta, stableBody) + * + * The single export->import->export pass is the verified fixpoint (SPEC §11): + * idempotent for already-stable content, and the convergence point for the + * known converter asymmetries. + */ +export declare function stabilizePageFile(content: unknown, meta: PageMeta): Promise; +/** + * The fixpoint markdown BODY for a page's ProseMirror `content`, WITHOUT any meta + * envelope: + * + * md1 = convertProseMirrorToMarkdown(content) // export... + * doc2 = markdownToProseMirror(md1) // ...import... 
+ * stableBody = convertProseMirrorToMarkdown(doc2) // ...re-export + * + * The single export->import->export pass is the verified fixpoint (SPEC §11): + * idempotent for already-stable content, and the convergence point for the known + * converter asymmetries. The native-Obsidian writer (`serializePageFile`) wraps + * this body with a minimal `gitmost_id` frontmatter; determinism here is what + * keeps re-pulls of an unchanged page byte-identical (no churn, loop-guard). + */ +export declare function stabilizePageBody(content: unknown): Promise; diff --git a/packages/git-sync/build/engine/stabilize.js b/packages/git-sync/build/engine/stabilize.js new file mode 100644 index 00000000..0734d84a --- /dev/null +++ b/packages/git-sync/build/engine/stabilize.js @@ -0,0 +1,52 @@ +/** + * Normalize-on-write helper (SPEC §11 "Резолюция"). + * + * git diffs byte-for-byte, so writing a page in a NON-fixpoint markdown form + * would make the next pull re-export it to a slightly different (but stable) + * form and produce a phantom diff -> churny commits. The converter has a couple + * of known one-pass asymmetries (a block image after a paragraph adds an empty + * paragraph; a diagram materializes `data-align`), all of which converge to a + * fixpoint after ONE `export -> import -> export` round-trip. + * + * So at write time we run exactly that one pass and persist the fixpoint form. + * Already-stable content is unaffected (the pass is idempotent), so re-pulls of + * unchanged pages produce identical bytes and git sees no diff. + */ +import { convertProseMirrorToMarkdown, markdownToProseMirror, serializeDocmostMarkdownBody, } from "../lib/index.js"; +/** + * Produce the self-contained `.md` file text for a page from its raw + * ProseMirror `content` + identity meta, in the verified fixpoint form. + * + * md1 = convertProseMirrorToMarkdown(content) + * doc2 = markdownToProseMirror(md1) // one import... 
+ * stableBody = convertProseMirrorToMarkdown(doc2) // ...and re-export + * file = serializeDocmostMarkdownBody(meta, stableBody) + * + * The single export->import->export pass is the verified fixpoint (SPEC §11): + * idempotent for already-stable content, and the convergence point for the + * known converter asymmetries. + */ +export async function stabilizePageFile(content, meta) { + // The meta shape is exactly what `exportPageBody` writes; cast to the lib's + // DocmostMdMeta (a superset with optional fields) for the serializer. + return serializeDocmostMarkdownBody(meta, await stabilizePageBody(content)); +} +/** + * The fixpoint markdown BODY for a page's ProseMirror `content`, WITHOUT any meta + * envelope: + * + * md1 = convertProseMirrorToMarkdown(content) // export... + * doc2 = markdownToProseMirror(md1) // ...import... + * stableBody = convertProseMirrorToMarkdown(doc2) // ...re-export + * + * The single export->import->export pass is the verified fixpoint (SPEC §11): + * idempotent for already-stable content, and the convergence point for the known + * converter asymmetries. The native-Obsidian writer (`serializePageFile`) wraps + * this body with a minimal `gitmost_id` frontmatter; determinism here is what + * keeps re-pulls of an unchanged page byte-identical (no churn, loop-guard). + */ +export async function stabilizePageBody(content) { + const md1 = convertProseMirrorToMarkdown(content); + const doc2 = await markdownToProseMirror(md1); + return convertProseMirrorToMarkdown(doc2); +} diff --git a/packages/git-sync/build/index.d.ts b/packages/git-sync/build/index.d.ts new file mode 100644 index 00000000..47ec1fdf --- /dev/null +++ b/packages/git-sync/build/index.d.ts @@ -0,0 +1,31 @@ +/** + * Public surface of `@docmost/git-sync`. 
+ * + * Exposes the pure converter (markdown <-> ProseMirror, file envelope, + * canonicalization) and the sync engine (reconcile planner, vault layout, + * pull/push, the git wrapper, and the settings parser) that the gitmost server + * drives in-process. + */ +export { serializeDocmostMarkdown, serializeDocmostMarkdownBody, parseDocmostMarkdown, convertProseMirrorToMarkdown, markdownToProseMirror, canonicalizeContent, docsCanonicallyEqual, } from "./lib/index.js"; +export type { DocmostMdMeta } from "./lib/index.js"; +export { planReconciliation, decideAbsenceDeletions, MASS_DELETE_MIN_EXISTING, MASS_DELETE_FRACTION, } from "./engine/reconcile.js"; +export type { LiveEntry, ExistingEntry, WriteEntry, MovedEntry, ReconciliationPlan, DeletionDecision, } from "./engine/reconcile.js"; +export { buildVaultLayout } from "./engine/layout.js"; +export type { PageNode, VaultEntry } from "./engine/layout.js"; +export { sanitizeTitle, disambiguate } from "./engine/sanitize.js"; +export { stabilizePageFile } from "./engine/stabilize.js"; +export type { PageMeta } from "./engine/stabilize.js"; +export { bodyHash } from "./engine/loop-guard.js"; +export type { GitSyncClient, GitSyncPageNodeLite } from "./engine/client.types.js"; +export { VaultGit, vaultGitEnv, buildCommitMessage, BOT_AUTHOR_NAME, BOT_AUTHOR_EMAIL, DEFAULT_BRANCH, } from "./engine/git.js"; +export type { DiffEntry, MergeResult, CommitOptions } from "./engine/git.js"; +export { readExisting, computePullActions, applyPullActions, } from "./engine/pull.js"; +export type { ReadExistingDeps, PullActionsInput, PullActions, ApplyPullActionsDeps, ApplyResult, } from "./engine/pull.js"; +export { classifyRenameMoves, computePushActions, applyPushActions, runPush, parentFolderFile, parseArgs, LAST_PUSHED_REF, DOCMOST_BRANCH, LOCAL_AUTHOR_NAME, LOCAL_AUTHOR_EMAIL, LOCAL_SOURCE_TRAILER, } from "./engine/push.js"; +export type { CreateAction, UpdateAction, DeleteAction, RenameMoveAction, RenameMoveActionClassified, 
ClassifyRenameMovesDeps, PushActions, PushActionsInput, MetaSide, ApplyPushDeps, WrittenBackPage, PushedPageRecord, PushFailure, PushNoop, ApplyPushResult, PushDeps, PushRunResult, PushParsedArgs, } from "./engine/push.js"; +export { parseSettings, envSchema } from "./engine/settings.js"; +export type { Settings } from "./engine/settings.js"; +export { loadSettingsOrExit } from "./engine/config-errors.js"; +export { runCycle } from "./engine/cycle.js"; +export type { RunCycleDeps, RunCycleResult, CycleFs, } from "./engine/cycle.js"; +export { parsePageFile, serializePageFile } from "./lib/page-file.js"; diff --git a/packages/git-sync/build/index.js b/packages/git-sync/build/index.js new file mode 100644 index 00000000..4dffdfc0 --- /dev/null +++ b/packages/git-sync/build/index.js @@ -0,0 +1,24 @@ +/** + * Public surface of `@docmost/git-sync`. + * + * Exposes the pure converter (markdown <-> ProseMirror, file envelope, + * canonicalization) and the sync engine (reconcile planner, vault layout, + * pull/push, the git wrapper, and the settings parser) that the gitmost server + * drives in-process. + */ +// Pure converter (markdown <-> ProseMirror, file envelope, canonicalization). +export { serializeDocmostMarkdown, serializeDocmostMarkdownBody, parseDocmostMarkdown, convertProseMirrorToMarkdown, markdownToProseMirror, canonicalizeContent, docsCanonicallyEqual, } from "./lib/index.js"; +// Pure engine (no IO): reconcile planner, vault layout, sanitize, stabilize, +// loop-guard body hash. 
+export { planReconciliation, decideAbsenceDeletions, MASS_DELETE_MIN_EXISTING, MASS_DELETE_FRACTION, } from "./engine/reconcile.js"; +export { buildVaultLayout } from "./engine/layout.js"; +export { sanitizeTitle, disambiguate } from "./engine/sanitize.js"; +export { stabilizePageFile } from "./engine/stabilize.js"; +export { bodyHash } from "./engine/loop-guard.js"; +export { VaultGit, vaultGitEnv, buildCommitMessage, BOT_AUTHOR_NAME, BOT_AUTHOR_EMAIL, DEFAULT_BRANCH, } from "./engine/git.js"; +export { readExisting, computePullActions, applyPullActions, } from "./engine/pull.js"; +export { classifyRenameMoves, computePushActions, applyPushActions, runPush, parentFolderFile, parseArgs, LAST_PUSHED_REF, DOCMOST_BRANCH, LOCAL_AUTHOR_NAME, LOCAL_AUTHOR_EMAIL, LOCAL_SOURCE_TRAILER, } from "./engine/push.js"; +export { parseSettings, envSchema } from "./engine/settings.js"; +export { loadSettingsOrExit } from "./engine/config-errors.js"; +export { runCycle } from "./engine/cycle.js"; +export { parsePageFile, serializePageFile } from "./lib/page-file.js"; diff --git a/packages/git-sync/build/lib/canonicalize.d.ts b/packages/git-sync/build/lib/canonicalize.d.ts new file mode 100644 index 00000000..7f7017c0 --- /dev/null +++ b/packages/git-sync/build/lib/canonicalize.d.ts @@ -0,0 +1,38 @@ +/** + * Semantic canonicalization of ProseMirror/TipTap documents for the round-trip + * idempotency check (SPEC §11, "Задача №0", option (б): compare a CANONICALIZED + * form rather than raw bytes). + * + * `markdownToProseMirror` reconstructs schema DEFAULT attributes (e.g. + * `indent: null` where the source omitted it) and regenerates per-block ids on + * every import. A raw deep-equal of the source doc against the re-imported doc + * therefore diverges even when the two are semantically identical. This module + * normalizes a document so that two semantically-equal docs compare deep-equal + * regardless of block ids and absent-vs-explicit-default-null attributes. 
+ * + * It is a self-contained module with no external dependencies. + */ +/** + * Return a DEEP COPY of a ProseMirror node tree, canonicalized so that two + * semantically-equal documents compare deep-equal. Rules (applied recursively + * to the node, its `content`, and its `marks`): + * + * 1. Remove node-level `attrs.id` (regenerated on import). Mark attrs are NOT + * touched for `id` (marks carry no block id; only their meaningful attrs). + * 2. In any `attrs` object (node OR mark) drop keys whose value is `null`/ + * `undefined` (absent ≡ explicit default null) OR equals that node/mark + * type's known non-null schema default (absent ≡ explicit default). + * Keep every non-default value. The type is passed into the attrs + * normalizer so it can look up `KNOWN_DEFAULTS`. + * 3. If an `attrs` object becomes empty after pruning, drop the `attrs` key. + * 4. Preserve `marks` (including the `comment` mark and its `commentId` — a + * meaningful anchor per SPEC §3; never strip it). + * 5. Preserve `text`, `type`, and `content` order exactly. + * 6. Never mutate the input. + */ +export declare function canonicalizeContent(node: any): any; +/** + * True when two ProseMirror documents are semantically equal: equal after + * canonicalization (block ids stripped, absent-vs-default-null normalized). + */ +export declare function docsCanonicallyEqual(a: any, b: any): boolean; diff --git a/packages/git-sync/build/lib/canonicalize.js b/packages/git-sync/build/lib/canonicalize.js new file mode 100644 index 00000000..d2f36c73 --- /dev/null +++ b/packages/git-sync/build/lib/canonicalize.js @@ -0,0 +1,245 @@ +/** + * Semantic canonicalization of ProseMirror/TipTap documents for the round-trip + * idempotency check (SPEC §11, "Задача №0", option (б): compare a CANONICALIZED + * form rather than raw bytes). + * + * `markdownToProseMirror` reconstructs schema DEFAULT attributes (e.g. + * `indent: null` where the source omitted it) and regenerates per-block ids on + * every import. 
A raw deep-equal of the source doc against the re-imported doc + * therefore diverges even when the two are semantically identical. This module + * normalizes a document so that two semantically-equal docs compare deep-equal + * regardless of block ids and absent-vs-explicit-default-null attributes. + * + * It is a self-contained module with no external dependencies. + */ +/** + * Known NON-NULL schema defaults that `markdownToProseMirror` materializes on + * import, keyed by node/mark type → { attr: defaultValue }. + * + * Why this exists: `canonicalizeAttrs` already treats an absent attr as + * equivalent to an explicit `null`/`undefined`. But several Docmost schema + * attributes default to a NON-null value, so import fills them in even when the + * source omitted them — making "attr absent" diverge from "attr at its default + * value" under a raw deep-equal. To keep "absent ≡ explicit-default", we ALSO + * drop any attr whose value equals its known schema default. A non-default + * value (e.g. `orderedList.start: 5`) is NOT a default, so it is KEPT. + * + * Every entry below was read from `packages/docmost-client/src/lib/ + * docmost-schema.ts` (the line refs are the exact `default:` declarations) and + * confirmed to be materialized by an export→import→export round-trip: + * - mark `link` target / rel — DocmostAttributes + StarterKit link. + * StarterKit's link extension defaults `target: "_blank"` and + * `rel: "noopener noreferrer nofollow"`; both materialize on import + * (empirically confirmed) even when the source had only `href`. + * - mark `comment` resolved — docmost-schema.ts L213-214 (`default: false`). + * - node `orderedList` start — provided by StarterKit's orderedList + * (`default: 1`); materializes on import (empirically confirmed). 
+ * - node `drawio`/`excalidraw`/`video`/`youtube`/`embed` align — the diagram + * attribute set and the media nodes declare `align: { default: "center" }` + * (docmost-schema.ts L745-750 diagramAttributes; L564 video; L626 youtube; + * L667 embed). The diagram `align` is the one the round-trip materializes + * (docmost-schema.ts L745); the media/embed entries normalize the SAME + * `align` default for consistency. Note: this only normalizes `align` — + * full canonical stability of `embed` is separately limited by the + * converter coercing numeric `width`/`height` to strings, which is outside + * canonicalize's scope. + * + * NOTE: `image` has NO non-null align default — its `align` defaults to `null` + * (docmost-schema.ts L174), so it is already handled by the null-drop rule and + * is intentionally NOT listed here. + */ +const KNOWN_DEFAULTS = { + // mark types + link: { + target: "_blank", + rel: "noopener noreferrer nofollow", + }, + comment: { + resolved: false, + }, + // node types + orderedList: { + start: 1, + }, + drawio: { + align: "center", + }, + excalidraw: { + align: "center", + }, + video: { + align: "center", + }, + youtube: { + align: "center", + }, + embed: { + align: "center", + }, +}; +/** + * Prune an `attrs` object in place on a fresh copy: drop keys whose value is + * `null` or `undefined` (an absent attribute and an explicit default of `null` + * are semantically equivalent here). Optionally also drop a node-level `id` + * (block ids are regenerated on import, SPEC §11). ALSO drop any attr whose + * value equals the node/mark `type`'s known NON-null schema default + * (`KNOWN_DEFAULTS`), so "attr absent" ≡ "attr at its default value" — without + * this, the import-materialized `link.target`/`comment.resolved`/ + * `orderedList.start`/diagram `align` defaults would be a phantom diff. Every + * non-default attribute value is KEPT (level, language, src, href, commentId, + * width, a non-default `start`/`align`, ...). 
+ * + * Returns the pruned attrs object, or `undefined` if nothing meaningful is + * left (so the caller can drop the `attrs` key entirely: `{attrs:{}}` ≡ no + * attrs). + */ +function canonicalizeAttrs(attrs, dropId, type) { + const defaults = type ? KNOWN_DEFAULTS[type] : undefined; + const out = {}; + // Stable key order so a JSON.stringify of the canonical form is comparable + // regardless of the input's key order. + for (const key of Object.keys(attrs).sort()) { + // Block ids are regenerated on import; drop them on NODE attrs only. + if (dropId && key === "id") + continue; + const value = attrs[key]; + // Absent ≡ explicit-default-null/undefined. + if (value === null || value === undefined) + continue; + // Absent ≡ explicit known non-null default (e.g. link.target="_blank"). + // A non-default value (e.g. orderedList.start=5) does NOT match, so it is + // kept. The `comment` mark's `commentId` is never a default, so it always + // survives (SPEC §3); only its `resolved: false` default is normalized away. + if (defaults && key in defaults && value === defaults[key]) + continue; + out[key] = value; + } + return Object.keys(out).length > 0 ? out : undefined; +} +/** + * Return a DEEP COPY of a ProseMirror node tree, canonicalized so that two + * semantically-equal documents compare deep-equal. Rules (applied recursively + * to the node, its `content`, and its `marks`): + * + * 1. Remove node-level `attrs.id` (regenerated on import). Mark attrs are NOT + * touched for `id` (marks carry no block id; only their meaningful attrs). + * 2. In any `attrs` object (node OR mark) drop keys whose value is `null`/ + * `undefined` (absent ≡ explicit default null) OR equals that node/mark + * type's known non-null schema default (absent ≡ explicit default). + * Keep every non-default value. The type is passed into the attrs + * normalizer so it can look up `KNOWN_DEFAULTS`. + * 3. If an `attrs` object becomes empty after pruning, drop the `attrs` key. + * 4. 
Preserve `marks` (including the `comment` mark and its `commentId` — a + * meaningful anchor per SPEC §3; never strip it). + * 5. Preserve `text`, `type`, and `content` order exactly. + * 6. Never mutate the input. + */ +export function canonicalizeContent(node) { + if (Array.isArray(node)) { + return node.map((child) => canonicalizeContent(child)); + } + if (node === null || typeof node !== "object") { + // Primitive leaf (string/number/boolean/null): returned as-is. + return node; + } + // A node is a mark when it has a `type` but never carries block `content` + // and lives inside a `marks` array. We cannot tell from the node alone, so + // we distinguish at the recursion site: node `attrs` drop `id`, mark `attrs` + // do not. This is handled by passing a `dropId` flag down for the `attrs` + // key specifically (nodes) vs the `marks[].attrs` path (marks). + const out = {}; + for (const key of Object.keys(node)) { + if (key === "attrs" && node.attrs && typeof node.attrs === "object") { + // Node-level attrs: drop the block id, null/undefined attrs, and any + // attr at this node type's known non-null schema default. + const canon = canonicalizeAttrs(node.attrs, true, typeof node.type === "string" ? node.type : undefined); + if (canon !== undefined) + out.attrs = canon; + // else: drop the `attrs` key entirely (rule 3). + } + else if (key === "marks" && Array.isArray(node.marks)) { + // Marks: keep them all (incl. comment); canonicalize their attrs but do + // NOT drop `id` (a mark's `id` would be a meaningful attr, not a block + // id). An empty marks array is dropped so `marks:[]` ≡ no marks. 
+ const marks = node.marks.map((mark) => canonicalizeMark(mark)); + if (marks.length > 0) + out.marks = marks; + } + else { + out[key] = canonicalizeContent(node[key]); + } + } + return out; +} +/** + * Canonicalize a single mark: keep `type`, prune its `attrs` (null/undefined + * AND known non-null defaults dropped, empty attrs removed) but NEVER drop a + * mark's attribute as a "block id" — marks have no block id, only meaningful + * attrs (href, commentId, color, level, ...). Meaningful NON-default attrs + * survive (the `comment` mark's `commentId` is never a default, so it always + * survives — SPEC §3); only known defaults like `link.target="_blank"`, + * `link.rel="noopener…"` and `comment.resolved=false` are normalized away. + */ +function canonicalizeMark(mark) { + if (mark === null || typeof mark !== "object") + return mark; + const out = {}; + for (const key of Object.keys(mark)) { + if (key === "attrs" && mark.attrs && typeof mark.attrs === "object") { + const canon = canonicalizeAttrs(mark.attrs, false, typeof mark.type === "string" ? mark.type : undefined); + if (canon !== undefined) + out.attrs = canon; + } + else { + out[key] = canonicalizeContent(mark[key]); + } + } + return out; +} +/** + * Deep structural equality of two values that is key-order-insensitive. + * Used to compare canonical forms. (`canonicalizeContent` already emits + * `attrs` in a stable key order, but the top-level node keys preserve input + * order, so we compare structurally rather than by string.) 
+ */ +function deepEqual(a, b) { + if (a === b) + return true; + if (typeof a !== typeof b) + return false; + if (a === null || b === null) + return a === b; + if (typeof a !== "object") + return false; + const aIsArr = Array.isArray(a); + const bIsArr = Array.isArray(b); + if (aIsArr !== bIsArr) + return false; + if (aIsArr) { + if (a.length !== b.length) + return false; + for (let i = 0; i < a.length; i++) { + if (!deepEqual(a[i], b[i])) + return false; + } + return true; + } + const aKeys = Object.keys(a); + const bKeys = Object.keys(b); + if (aKeys.length !== bKeys.length) + return false; + for (const k of aKeys) { + if (!Object.prototype.hasOwnProperty.call(b, k)) + return false; + if (!deepEqual(a[k], b[k])) + return false; + } + return true; +} +/** + * True when two ProseMirror documents are semantically equal: equal after + * canonicalization (block ids stripped, absent-vs-default-null normalized). + */ +export function docsCanonicallyEqual(a, b) { + return deepEqual(canonicalizeContent(a), canonicalizeContent(b)); +} diff --git a/packages/git-sync/build/lib/diff.d.ts b/packages/git-sync/build/lib/diff.d.ts new file mode 100644 index 00000000..60997f4a --- /dev/null +++ b/packages/git-sync/build/lib/diff.d.ts @@ -0,0 +1,54 @@ +/** + * Headless, Docmost-equivalent document diff. + * + * Docmost's history editor computes a change set with the exact pipeline below + * (recreateTransform -> ChangeSet.addSteps -> simplifyChanges) and renders it as + * editor decorations. This module runs the SAME computation but serializes the + * result to text + integrity counts instead of decorations, so a diff can be + * previewed without a browser. + * + * recreateTransform here comes from @fellow/prosemirror-recreate-transform, the + * maintained published fork of the MIT prosemirror-recreate-steps source that + * Docmost vendors in @docmost/editor-ext; it exposes the identical + * recreateTransform(fromDoc, toDoc, { complexSteps, wordDiffs, simplifyDiff }) + * signature. 
+ * + * If recreateTransform / the changeset throws on a pathological document pair, + * we fall back to a coarse block-level text diff so the tool never hard-fails. + */ +/** A single inserted/deleted change with its containing-block context. */ +export interface DiffChange { + op: "insert" | "delete"; + /** Lead (plain) text of the block that contains the change, for context. */ + block: string; + /** The inserted or deleted text. */ + text: string; +} +/** Integrity counts as [old, new] tuples; footnoteMarkers as [oldList, newList]. */ +export interface DiffIntegrity { + images: [number, number]; + links: [number, number]; + tables: [number, number]; + callouts: [number, number]; + footnoteMarkers: [number[], number[]]; +} +export interface DiffResult { + summary: { + inserted: number; + deleted: number; + blocksChanged: number; + }; + integrity: DiffIntegrity; + changes: DiffChange[]; + /** Human-readable unified-ish summary. */ + markdown: string; +} +/** + * Diff two ProseMirror JSON documents the way Docmost's history editor does and + * serialize the result to text + integrity counts. + * + * @param oldDocJson the earlier document + * @param newDocJson the later document + * @param notesHeading heading delimiting body from notes for footnote counting + */ +export declare function diffDocs(oldDocJson: any, newDocJson: any, notesHeading?: string): DiffResult; diff --git a/packages/git-sync/build/lib/diff.js b/packages/git-sync/build/lib/diff.js new file mode 100644 index 00000000..5205aff1 --- /dev/null +++ b/packages/git-sync/build/lib/diff.js @@ -0,0 +1,273 @@ +/** + * Headless, Docmost-equivalent document diff. + * + * Docmost's history editor computes a change set with the exact pipeline below + * (recreateTransform -> ChangeSet.addSteps -> simplifyChanges) and renders it as + * editor decorations. 
This module runs the SAME computation but serializes the + * result to text + integrity counts instead of decorations, so a diff can be + * previewed without a browser. + * + * recreateTransform here comes from @fellow/prosemirror-recreate-transform, the + * maintained published fork of the MIT prosemirror-recreate-steps source that + * Docmost vendors in @docmost/editor-ext; it exposes the identical + * recreateTransform(fromDoc, toDoc, { complexSteps, wordDiffs, simplifyDiff }) + * signature. + * + * If recreateTransform / the changeset throws on a pathological document pair, + * we fall back to a coarse block-level text diff so the tool never hard-fails. + */ +import { getSchema } from "@tiptap/core"; +import { Node } from "@tiptap/pm/model"; +import { ChangeSet, simplifyChanges } from "@tiptap/pm/changeset"; +import { recreateTransform } from "@fellow/prosemirror-recreate-transform"; +import { docmostExtensions } from "./docmost-schema.js"; +/** Build the schema once; it is pure and reused across calls. */ +const schema = getSchema(docmostExtensions); +/** Recursively concatenate the plain text of a JSON node. */ +function plainText(node) { + if (!node || typeof node !== "object") + return ""; + let out = ""; + if (typeof node.text === "string") + out += node.text; + if (Array.isArray(node.content)) { + for (const child of node.content) + out += plainText(child); + } + return out; +} +/** Count nodes in a JSON doc that satisfy `pred` (recursive). */ +function countNodes(doc, pred) { + let n = 0; + const visit = (node) => { + if (!node || typeof node !== "object") + return; + if (pred(node)) + n++; + if (Array.isArray(node.content)) + for (const c of node.content) + visit(c); + }; + visit(doc); + return n; +} +/** + * Count UNIQUE links in a JSON doc by their `href`. A single link can be split + * across several adjacent text runs (e.g. a "link+bold" run followed by a "link" + * run); counting link-bearing runs would over-count it. 
Walking the tree and + * collecting hrefs into a Set keys each distinct link once. Link marks with a + * missing/empty href are bucketed under a single "" key so a malformed link is + * still counted as one. + */ +function countUniqueLinks(doc) { + const hrefs = new Set(); + const visit = (node) => { + if (!node || typeof node !== "object") + return; + if (node.type === "text" && Array.isArray(node.marks)) { + for (const m of node.marks) { + if (m && m.type === "link") { + const href = m.attrs && typeof m.attrs.href === "string" ? m.attrs.href : ""; + hrefs.add(href); + } + } + } + if (Array.isArray(node.content)) + for (const c of node.content) + visit(c); + }; + visit(doc); + return hrefs.size; +} +/** + * Parse the ordered list of integers from `[N]` footnote markers found in the + * BODY only (every top-level block before the first "Примечания..." notes + * heading; if no such heading, the whole doc). Returned in reading order. + */ +function footnoteMarkers(doc, notesHeading) { + const top = Array.isArray(doc?.content) ? doc.content : []; + const notesIdx = top.findIndex((n) => n && + n.type === "heading" && + plainText(n).trim() === notesHeading); + const bodyBlocks = notesIdx >= 0 ? top.slice(0, notesIdx) : top; + const markers = []; + const re = /\[(\d+)\]/g; + for (const block of bodyBlocks) { + const text = plainText(block); + let m; + re.lastIndex = 0; + while ((m = re.exec(text)) !== null) { + markers.push(Number(m[1])); + } + } + return markers; +} +/** Compute the [old,new] integrity tuples for two JSON docs. 
*/ +function computeIntegrity(oldDoc, newDoc, notesHeading) { + const images = [ + countNodes(oldDoc, (n) => n.type === "image"), + countNodes(newDoc, (n) => n.type === "image"), + ]; + const links = [ + countUniqueLinks(oldDoc), + countUniqueLinks(newDoc), + ]; + const tables = [ + countNodes(oldDoc, (n) => n.type === "table"), + countNodes(newDoc, (n) => n.type === "table"), + ]; + const callouts = [ + countNodes(oldDoc, (n) => n.type === "callout"), + countNodes(newDoc, (n) => n.type === "callout"), + ]; + const fns = [ + footnoteMarkers(oldDoc, notesHeading), + footnoteMarkers(newDoc, notesHeading), + ]; + return { images, links, tables, callouts, footnoteMarkers: fns }; +} +/** + * Resolve the lead text of the top-level block in a ProseMirror Node that + * contains the given document position. Returns "" when out of range. + */ +function blockContextAt(node, pos) { + try { + const clamped = Math.max(0, Math.min(pos, node.content.size)); + const $pos = node.resolve(clamped); + // depth 1 is the top-level block in a doc node. + const block = $pos.depth >= 1 ? $pos.node(1) : $pos.node(0); + const text = block.textContent || ""; + return text.length > 80 ? text.slice(0, 77) + "..." : text; + } + catch { + return ""; + } +} +/** Truncate a string for the markdown summary. */ +function truncate(s, n = 120) { + return s.length > n ? s.slice(0, n - 3) + "..." : s; +} +/** + * Coarse fallback: a block-by-block plain-text diff. Used only when the precise + * changeset pipeline throws, so the tool degrades gracefully instead of failing. + */ +function coarseDiff(oldDoc, newDoc) { + const oldBlocks = Array.isArray(oldDoc?.content) ? oldDoc.content : []; + const newBlocks = Array.isArray(newDoc?.content) ? 
newDoc.content : []; + const oldTexts = oldBlocks.map(plainText); + const newTexts = newBlocks.map(plainText); + const oldSet = new Set(oldTexts); + const newSet = new Set(newTexts); + const changes = []; + for (const t of oldTexts) { + if (!newSet.has(t) && t.trim() !== "") { + changes.push({ op: "delete", block: truncate(t, 80), text: t }); + } + } + for (const t of newTexts) { + if (!oldSet.has(t) && t.trim() !== "") { + changes.push({ op: "insert", block: truncate(t, 80), text: t }); + } + } + return changes; +} +/** Build the human-readable unified-ish markdown summary. */ +function renderMarkdown(result, fellBack) { + const lines = []; + const { summary, integrity, changes } = result; + lines.push(`# Diff: ${summary.inserted} inserted / ${summary.deleted} deleted (${summary.blocksChanged} blocks changed)`); + if (fellBack) { + lines.push(""); + lines.push("> note: precise diff failed; coarse block-level diff shown."); + } + lines.push(""); + lines.push("## Integrity (old -> new)"); + lines.push(`- images: ${integrity.images[0]} -> ${integrity.images[1]}`); + lines.push(`- links: ${integrity.links[0]} -> ${integrity.links[1]}`); + lines.push(`- tables: ${integrity.tables[0]} -> ${integrity.tables[1]}`); + lines.push(`- callouts: ${integrity.callouts[0]} -> ${integrity.callouts[1]}`); + lines.push(`- footnoteMarkers: [${integrity.footnoteMarkers[0].join(", ")}] -> [${integrity.footnoteMarkers[1].join(", ")}]`); + lines.push(""); + lines.push("## Changes"); + if (changes.length === 0) { + lines.push("(no textual changes)"); + } + else { + for (const c of changes) { + const sign = c.op === "insert" ? "+" : "-"; + const ctx = c.block ? ` @ ${truncate(c.block, 60)}` : ""; + lines.push(`${sign} ${truncate(c.text)}${ctx}`); + } + } + return lines.join("\n"); +} +/** + * Diff two ProseMirror JSON documents the way Docmost's history editor does and + * serialize the result to text + integrity counts. 
+ * + * @param oldDocJson the earlier document + * @param newDocJson the later document + * @param notesHeading heading delimiting body from notes for footnote counting + */ +export function diffDocs(oldDocJson, newDocJson, notesHeading = "Примечания переводчика") { + const integrity = computeIntegrity(oldDocJson, newDocJson, notesHeading); + let changes = []; + let inserted = 0; + let deleted = 0; + let fellBack = false; + const changedBlocks = new Set(); + try { + const oldNode = Node.fromJSON(schema, oldDocJson); + const newNode = Node.fromJSON(schema, newDocJson); + const tr = recreateTransform(oldNode, newNode, { + complexSteps: false, + wordDiffs: true, + simplifyDiff: true, + }); + const changeSet = ChangeSet.create(oldNode).addSteps(tr.doc, tr.mapping.maps, []); + const simplified = simplifyChanges(changeSet.changes, newNode); + for (const change of simplified) { + // Deleted text lives in the OLD doc coordinate range [fromA, toA). + if (change.toA > change.fromA) { + const text = oldNode.textBetween(change.fromA, change.toA, "\n", " "); + if (text.length > 0) { + deleted += text.length; + const block = blockContextAt(oldNode, change.fromA); + changes.push({ op: "delete", block, text }); + if (block) + changedBlocks.add("d:" + block); + } + } + // Inserted text lives in the NEW doc coordinate range [fromB, toB). + if (change.toB > change.fromB) { + const text = newNode.textBetween(change.fromB, change.toB, "\n", " "); + if (text.length > 0) { + inserted += text.length; + const block = blockContextAt(newNode, change.fromB); + changes.push({ op: "insert", block, text }); + if (block) + changedBlocks.add("i:" + block); + } + } + } + } + catch { + // Pathological pair: degrade to a coarse block-level diff so we never throw. 
+ fellBack = true; + changes = coarseDiff(oldDocJson, newDocJson); + for (const c of changes) { + if (c.op === "insert") + inserted += c.text.length; + else + deleted += c.text.length; + if (c.block) + changedBlocks.add(c.op[0] + ":" + c.block); + } + } + const partial = { + summary: { inserted, deleted, blocksChanged: changedBlocks.size }, + integrity, + changes, + }; + return { ...partial, markdown: renderMarkdown(partial, fellBack) }; +} diff --git a/packages/git-sync/build/lib/docmost-schema.d.ts b/packages/git-sync/build/lib/docmost-schema.d.ts new file mode 100644 index 00000000..8684e1bc --- /dev/null +++ b/packages/git-sync/build/lib/docmost-schema.d.ts @@ -0,0 +1,9 @@ +import { Node, Extension, Mark } from "@tiptap/core"; +export declare const clampCalloutType: (value: string | null | undefined) => string; +export declare const sanitizeCssColor: (value: string | null | undefined) => string | null; +/** + * Full extension list. Image is block-level (matches Docmost); the + * ProseMirror DOM parser hoists found inside

automatically. + * StarterKit v3 already bundles the link extension, configured here. + */ +export declare const docmostExtensions: (Node | Mark | Extension | Extension | Node | Node | Node | Mark | Mark)[]; diff --git a/packages/git-sync/build/lib/docmost-schema.js b/packages/git-sync/build/lib/docmost-schema.js new file mode 100644 index 00000000..97cdcafd --- /dev/null +++ b/packages/git-sync/build/lib/docmost-schema.js @@ -0,0 +1,999 @@ +/** + * Full TipTap extension set matching the real Docmost document schema. + * + * The default StarterKit-only schema silently destroys Docmost-specific + * nodes (callout, table) and drops attributes it does not know about + * (node ids, image sizing, link targets). Every code path that converts + * to or from ProseMirror JSON must use THIS set, otherwise a round-trip + * loses content. + */ +import StarterKit from "@tiptap/starter-kit"; +import Image from "@tiptap/extension-image"; +import TaskList from "@tiptap/extension-task-list"; +import TaskItem from "@tiptap/extension-task-item"; +import Highlight from "@tiptap/extension-highlight"; +import Subscript from "@tiptap/extension-subscript"; +import Superscript from "@tiptap/extension-superscript"; +import { Node, Extension, Mark } from "@tiptap/core"; +// Inlined from @tiptap/core's getStyleProperty (added after 3.20.x) so this +// package can stay on the same @tiptap/core version as the editor and avoid a +// duplicate-tiptap version split in the monorepo. Reads a single declaration +// from an element's inline `style` attribute, last-wins, case-insensitive. 
+function getStyleProperty(element, propertyName) { + const styleAttr = element.getAttribute("style"); + if (!styleAttr) { + return null; + } + const decls = styleAttr.split(";").map((decl) => decl.trim()).filter(Boolean); + const target = propertyName.toLowerCase(); + for (let i = decls.length - 1; i >= 0; i -= 1) { + const decl = decls[i]; + const colonIndex = decl.indexOf(":"); + if (colonIndex === -1) { + continue; + } + const prop = decl.slice(0, colonIndex).trim().toLowerCase(); + if (prop === target) { + return decl.slice(colonIndex + 1).trim(); + } + } + return null; +} +/** Allowed Docmost callout types; anything else falls back to "info". */ +const CALLOUT_TYPES = ["info", "warning", "danger", "success"]; +export const clampCalloutType = (value) => value && CALLOUT_TYPES.includes(value.toLowerCase()) + ? value.toLowerCase() + : "info"; +/** + * Allowlist guard for CSS color values imported from HTML. + * + * Docmost interpolates stored mark colors straight into an inline style + * attribute (e.g. style="background-color: ${color}" / "color: ${color}"). + * An unsanitized value such as `red; --x: url(...)` or `red">