diff --git a/AGENT_GUIDE.md b/AGENT_GUIDE.md index 863aafa..d6df144 100644 --- a/AGENT_GUIDE.md +++ b/AGENT_GUIDE.md @@ -68,6 +68,8 @@ Save a new document or update an existing one. | `content` | Yes | Markdown content. Use H1/H2/H3 headings -- the chunker uses them for segmentation. | | `document_id` | No | UUID of an existing document to update. When provided, updates that document directly regardless of `update_if_exists`. Returns an error if the document does not exist. Workflow: search → note the `[id: ...]` → pass here. | | `update_if_exists` | No | When `true`, updates the document with the same title (versions the old content). Default `false`. Ignored when `document_id` is provided. | +| `expected_content_hash` | **Yes, on content updates** | Optimistic-concurrency token: the `content_hash` of the version you based your edit on (returned by `cerefox_get_document`, `cerefox_search`, and `cerefox_metadata_search`). Stale → **conflict error** (re-read, merge, retry). Absent → **token-required error**. Not needed when creating. See "Concurrent writers" below. | +| `last_write_wins` | No | Explicitly skip the concurrency check (default `false`). Use ONLY when an external source of truth makes conflicts meaningless (file re-sync). Recorded in the audit log. **Never use it to silence a conflict.** | | `project_name` | No | **Single** project name (created if absent). On update: **non-destructive add** — ensures this membership exists, preserves others. See "Project membership semantics" below. | | `project_names` | No | **List** of project names (each created if absent). On update: **destructive replace** — sets the document's full project set to exactly this list. Use when you want to set multiple projects at once, or deliberately change the membership list. Wins over `project_name` when both are passed. | | `metadata` | No | Arbitrary JSON. Use at minimum: `type` and `status`. | @@ -76,15 +78,24 @@ Save a new document or update an existing one. 
**The update workflow (preferred -- ID-based)**: 1. Search for the document. Note the `[id: abc123]` in the result. -2. Call `cerefox_ingest` with `document_id: "abc123"` and the new content. -3. The old content is automatically versioned and recoverable. +2. `cerefox_get_document("abc123")` — read the current content and note its `content_hash`. +3. Call `cerefox_ingest` with `document_id: "abc123"`, the new content, and `expected_content_hash: "<hash from step 2>"`. +4. The old content is automatically versioned and recoverable. **The update workflow (fallback -- title-based)**: -1. Search for the document first. -2. Call `cerefox_ingest` with the **exact same title** and `update_if_exists: true`. +1. Search for the document first (note its hash). +2. Call `cerefox_ingest` with the **exact same title**, `update_if_exists: true`, and `expected_content_hash`. 3. If you use a different title, a **new** document is created (the old one remains). This is almost never what you want when revising. -**Deduplication**: Content is SHA-256 hashed. Identical content is skipped (no re-indexing). Metadata-only changes update metadata without creating a version. +**Deduplication**: Content is SHA-256 hashed. Identical content is skipped (no re-indexing, no concurrency check needed — identical content cannot lose data). Metadata-only changes update metadata without creating a version. + +#### Concurrent writers (optimistic concurrency) + +Cerefox is **shared** memory — another agent (or the user) may update a document between your read and your write. Content updates therefore require proof of freshness: `expected_content_hash` must equal the document's current `content_hash` at write time, checked atomically inside the database. + +- **Conflict error** ("document changed since it was read"): the document moved underneath you. `cerefox_get_document` again → **merge your changes into the latest content** → retry with the new hash. 
Never resolve a conflict by overwriting blindly — the current content includes another writer's work. +- **Token-required error**: you attempted a content update without `expected_content_hash`. Read the document first; if you already did, pass the hash you read. +- `last_write_wins: true` bypasses the check — reserved for re-sync flows where an external source of truth (e.g., files on disk) makes conflicts meaningless. It is recorded in the audit log. **What to ingest**: Distilled summaries, decisions with rationale, curated insights. Not raw dumps, logs, or transcripts. Use Markdown headings for structure. @@ -112,7 +123,7 @@ Retrieve the complete text of a document by its UUID. | `version_id` | No | UUID of an archived version (from `cerefox_list_versions`). | | `requestor` | No | Your agent name. | -Use this when search returns partial results, or to read a previous version before restoring it. +Use this when search returns partial results, or to read a previous version before restoring it. The response header includes the document's current `content_hash` — pass it back as `expected_content_hash` when updating via `cerefox_ingest`. --- @@ -268,18 +279,22 @@ Metadata is matched as **strings**, so store the flag as the string `"true"` (no ``` 1. cerefox_search("topic") -- find relevant docs, note [id: uuid] -2. cerefox_get_document(id) -- get full text if partial +2. cerefox_get_document(id) -- get full text + content_hash 3. cerefox_ingest(title, content, -- update by document ID (deterministic) - document_id="uuid") + document_id="uuid", + expected_content_hash="<hash from step 2>") +4. On a conflict error: repeat from step 2, merging your changes into + the latest content before retrying with the fresh hash. ``` ### Search then update (title-based -- fallback) ``` -1. cerefox_search("topic") -- find relevant docs -2. cerefox_get_document(id) -- get full text if partial +1. cerefox_search("topic") -- find relevant docs (note the hash) +2. 
cerefox_get_document(id) -- get full text + content_hash 3. cerefox_ingest(title, content, -- update with same title - update_if_exists=true) + update_if_exists=true, + expected_content_hash="<hash from step 2>") ``` ### Save new knowledge @@ -287,7 +302,8 @@ Metadata is matched as **strings**, so store the flag as the string `"true"` (no ``` 1. cerefox_search("topic") -- check if it already exists 2. If not found: cerefox_ingest(title, content, project_name, metadata) -3. If found: cerefox_ingest(same_title, new_content, document_id="uuid") +3. If found: cerefox_ingest(same_title, new_content, document_id="uuid", + expected_content_hash="<the hash you read>") ``` ### Catch up on recent changes @@ -309,6 +325,7 @@ Metadata is matched as **strings**, so store the flag as the string `"true"` (no 5. **Add metadata**: at minimum `type` (e.g., "research", "decision-log") and `status` ("active", "draft"). 6. **Write structured Markdown** with H1/H2/H3 headings. The chunker uses heading structure. 7. **Distill, don't dump.** Summaries > transcripts. Decisions > discussions. Insights > raw data. +8. **Prove freshness on updates.** Pass `expected_content_hash` (the hash you read) on every content update. On conflict: re-read → merge → retry. Never `last_write_wins` your way out of a conflict. 
--- @@ -418,7 +435,7 @@ The legacy Python `uv run cerefox` is a frozen husk as of v0.9 — only `uv run | MCP tool | CLI command | |---|---| | `cerefox_search(query, match_count, project_name, metadata_filter, requestor)` | `cerefox search "<query>" --match-count N --project-name <name> --metadata-filter '<json>' --requestor <name>` (also `--mode`, `--alpha`, `--min-score`, `--only-metadata` — CLI-only) | -| `cerefox_ingest(title, content, project_name, metadata, update_if_exists, document_id, source, author, author_type)` (file) | `cerefox document ingest <path> --title <title> --project-name <name> --metadata '<json>' --update-if-exists\|--document-id <uuid> --source <source> --author <name> --author-type user\|agent` | +| `cerefox_ingest(title, content, project_name, metadata, update_if_exists, document_id, expected_content_hash, last_write_wins, source, author, author_type)` (file) | `cerefox document ingest <path> --title <title> --project-name <name> --metadata '<json>' --update-if-exists\|--document-id <uuid> --expected-content-hash <hash>\|--last-write-wins --source <source> --author <name> --author-type user\|agent` | | `cerefox_ingest(...)` (paste) | `printf '%s' "<content>" \| cerefox document ingest --paste --title "<title>"` (same flags) | | `cerefox_get_document(document_id, version_id, requestor)` | `cerefox document get <document-id> --version-id <vid> --requestor <name>` | | `cerefox_list_versions(document_id, requestor)` | `cerefox document version list <document-id> --requestor <name>` | @@ -476,12 +493,18 @@ printf '# Title\n\nBody markdown with H2s for chunking.\n' \ # Step 1: search and note the [id: abc12345-...] in the result cerefox search "the exact doc" --match-count 1 --requestor "claude-code" -# Step 2: update by ID +# Step 2: read it — the header shows `content_hash:` (the concurrency token) +cerefox document get "abc12345-..." --requestor "claude-code" + +# Step 3: update by ID, proving freshness with the hash from step 2 printf '...new content...' \ | cerefox document ingest --paste \ --title "Exact Same Title" \ --document-id "abc12345-..." 
\ + --expected-content-hash "<hash from step 2>" \ --author "claude-code" --author-type "agent" + +# Conflict error? Repeat from step 2, merge into the latest content, retry. ``` **Title-based update (fallback when ID isn't available):** diff --git a/AGENT_QUICK_REFERENCE.md b/AGENT_QUICK_REFERENCE.md index 541b524..44bd79b 100644 --- a/AGENT_QUICK_REFERENCE.md +++ b/AGENT_QUICK_REFERENCE.md @@ -7,8 +7,8 @@ Cerefox is a persistent, shared knowledge base. You have **10 MCP tools** (9 of | Tool | Purpose | Key params | |------|---------|------------| | `cerefox_search` | Find documents (hybrid FTS + semantic) | `query` (required), `project_name`, `metadata_filter`, `requestor` | -| `cerefox_ingest` | Save or update a document | `title`, `content` (required), `document_id` (update by ID), `update_if_exists`, `project_name` (single, non-destructive add on update), `project_names` (list, destructive replace on update), `metadata`, `author` | -| `cerefox_get_document` | Get full document by ID | `document_id` (required) | +| `cerefox_ingest` | Save or update a document | `title`, `content` (required), `document_id` (update by ID), `expected_content_hash` (**required on content updates** — see rule 9), `last_write_wins`, `update_if_exists`, `project_name` (single, non-destructive add on update), `project_names` (list, destructive replace on update), `metadata`, `author` | +| `cerefox_get_document` | Get full document by ID (header includes `content_hash` — the update token) | `document_id` (required) | | `cerefox_list_versions` | Version history of a document | `document_id` (required) | | `cerefox_metadata_search` | Find or list docs by metadata, project, or time (no text query) | `metadata_filter`, `project_name` (list a project's docs), `updated_since`, `include_content` — **at least one** of metadata_filter/project_name/updated_since/created_since | | `cerefox_list_metadata_keys` | Discover available metadata keys | (none required) | @@ -27,20 +27,25 @@ Cerefox is a 
persistent, shared knowledge base. You have **10 MCP tools** (9 of 6. **Write structured Markdown** with H1/H2/H3 headings for good chunking and search. 7. **Deletes are soft (recoverable); purge is web-UI-only.** If you decide to delete, surface it to the user (`I soft-deleted X — recoverable from the Cerefox web UI trash`). You cannot un-do your own delete from agent code by design. 8. **Cross-doc links inside content**: **always use `[Text](document-uuid)`.** UUIDs are the only fully reliable link form — stable across title changes, never ambiguous, no encoding gotchas. Every `cerefox_search` result shows `[id: <uuid>]` after the title; grab it and use it. Title-based linking (`[Text](<Title With Spaces>)`) is fragile (breaks on colons, parens, ampersands, brackets — silently navigates to wrong page) — **don't write title-based links**; do an extra search to get the UUID instead. Repo-path forms (`[Text](docs/path.md)`) exist for repo-ingested files; don't construct manually. See `AGENT_GUIDE.md → Writing linkable content` for the full rule. -9. **Project memberships — non-destructive by default**: on `cerefox_ingest` updates, **`project_name` (singular) is a non-destructive add** (ensures membership, preserves others). Use **`project_names` (list)** when you want to set the doc's full project set in one call (destructive replace). For metadata-only project changes without writing content, use **`cerefox_set_document_projects(document_id, project_names)`** — that tool is the destructive-replace contract made explicit. Never call `cerefox_set_document_projects` with a single name when you mean "add" — that would REMOVE the doc from all other projects. When in doubt, use `cerefox_ingest` with singular `project_name`. +9. **Concurrency: content updates require `expected_content_hash`.** Pass the `content_hash` you read (shown by `cerefox_get_document`, `cerefox_search`, and `cerefox_metadata_search`) when updating a document. 
If it's stale you get a **conflict** — re-read the document, merge your changes into the latest content, retry with the new hash. **Never resolve a conflict by overwriting blindly** — the current content includes another writer's work. `last_write_wins: true` skips the check; use it ONLY when an external source of truth makes conflicts meaningless (file re-sync), never to silence a conflict. +10. **Project memberships — non-destructive by default**: on `cerefox_ingest` updates, **`project_name` (singular) is a non-destructive add** (ensures membership, preserves others). Use **`project_names` (list)** when you want to set the doc's full project set in one call (destructive replace). For metadata-only project changes without writing content, use **`cerefox_set_document_projects(document_id, project_names)`** — that tool is the destructive-replace contract made explicit. Never call `cerefox_set_document_projects` with a single name when you mean "add" — that would REMOVE the doc from all other projects. When in doubt, use `cerefox_ingest` with singular `project_name`. ## Update Workflow (ID-based -- preferred) ``` -search("topic") -> find doc [id: abc123] -> get_document(abc123) -> modify -> -ingest(title="Same Title", content="...", document_id="abc123", author="my-agent") +search("topic") -> find doc [id: abc123] -> get_document(abc123) -> note its content_hash -> modify -> +ingest(title="Same Title", content="...", document_id="abc123", + expected_content_hash="<the hash you read>", author="my-agent") ``` +On a **conflict** error: get_document again (fresh content + fresh hash) -> merge your changes -> retry with the new hash. 
+ ## Update Workflow (title-based -- fallback) ``` -search("topic") -> find doc -> modify -> -ingest(title="Same Title", content="...", update_if_exists=true, author="my-agent") +search("topic") -> find doc (note its hash) -> modify -> +ingest(title="Same Title", content="...", update_if_exists=true, + expected_content_hash="<the hash you read>", author="my-agent") ``` ## Catch-Up Workflow @@ -59,7 +64,7 @@ Same operations, same conventions. Full reference: [`docs/guides/cli.md`](docs/g |---|---| | `cerefox_search` | `cerefox search "<q>" --requestor "<your-name>"` | | `cerefox_ingest` (paste) | `printf '...' \| cerefox document ingest --paste --title "<t>" --author "<your-name>" --author-type agent` | -| `cerefox_ingest` (update by ID) | `printf '...' \| cerefox document ingest --paste --title "<t>" --document-id "<uuid>" --author "<your-name>" --author-type agent` | +| `cerefox_ingest` (update by ID) | `printf '...' \| cerefox document ingest --paste --title "<t>" --document-id "<uuid>" --expected-content-hash "<hash>" --author "<your-name>" --author-type agent` | | `cerefox_get_document` | `cerefox document get <id> --version-id <vid> --requestor "<your-name>"` | | `cerefox_list_versions` | `cerefox document version list <id> --requestor "<your-name>"` | | `cerefox_list_projects` | `cerefox project list --requestor "<your-name>"` | diff --git a/CHANGELOG.md b/CHANGELOG.md index aaf7cc1..8d294db 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,46 @@ Versioning: [Semantic Versioning](https://semver.org/spec/v2.0.0.html) — all ` ## [Unreleased] -Open roadmap. +### Changed — BREAKING + +- **Optimistic concurrency control on content updates** (design: + [`docs/specs/concurrency-control-design.md`](docs/specs/concurrency-control-design.md)). 
+ Updating a document's content (via `document_id` or `update_if_exists`) now requires + **`expected_content_hash`** — the `content_hash` of the version the edit was based on, + returned by every read surface (`cerefox_get_document`, `cerefox_search`, + `cerefox_metadata_search`, the REST EFs, `cerefox document get` / `cerefox search`, + and the web edit page). The check is atomic inside the `cerefox_ingest_document` RPC + (`SELECT … FOR UPDATE`), closing the read→embed→write race where two concurrent + writers silently last-write-wins'd each other. A stale hash fails with a **conflict** + (re-read → merge → retry; HTTP 409 on the REST path); a missing hash fails with + **token-required** (HTTP 400). `last_write_wins: true` (CLI `--last-write-wins`) + explicitly skips the check and is recorded in the audit log — `document ingest-dir` + and `guides ingest` pass it internally (the filesystem / npm package is their source + of truth), and the frozen Python fallback declares it to preserve its historical + behavior. **Breaking**: pre-v0.11 clients' content updates fail against an upgraded + server until updated (`cerefox self-update`); existing GPT Actions need the v2.0.0 + OpenAPI block re-pasted. Creates are unaffected. Schema version 0.4.0 → **0.5.0** + (RPC-only change; ships via `cerefox server deploy --schema-only`). + +### Added + +- `content_hash` returned by all document-shaped reads (MCP tool headers, CLI output, + REST EF responses, web document API) — the token for the concurrency contract above. +- CLI flags `--expected-content-hash` / `--last-write-wins` on `cerefox document ingest`. +- Web edit page detects mid-edit concurrent changes and shows a merge-needed conflict + error instead of silently overwriting. 
+ +### Fixed + +- **Web edit page could corrupt metadata keys via the key autocomplete.** The key + suggestions embedded the usage count in the option label (`status (108)`), and + Mantine's Autocomplete inserts the *label* into the field on select — so picking a + suggestion (and saving) stored the literal string `status (108)` as the metadata key, + polluting the KB taxonomy (it then showed up in the key list as `status (108) (1)`). + The dropdown now shows the count via `renderOption` ("status · 108 docs" style), + while only the bare key ever enters the field. The search filter's key Select (which + was never affected — Select keeps value/label separate) now labels the count as + "(N docs)" for clarity. --- diff --git a/CLAUDE.md b/CLAUDE.md index a42fa54..cf31190 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -362,6 +362,7 @@ authoritative release playbook; read it at the start of any release work. 6. **Edge Function per operation** — each operation has a dedicated Edge Function that is a thin HTTP adapter over a Postgres RPC; `cerefox-mcp` calls those same RPCs directly (no delegation/fan-out to other Edge Functions); single implementation principle (see above) 7. **Chunks-anchored versioning** — `version_id IS NULL` = current version; `version_id = <uuid>` = archived; partial indexes automatically exclude archived chunks from search; no separate content table 8. **Title boosting** — `cerefox_chunks.fts` is a regular `TSVECTOR` (not `GENERATED`) because `GENERATED` columns cannot cross-reference another table. The `cerefox_ingest_document` RPC computes `fts` inline using its `p_title` parameter: document title at weight A, chunk heading at weight A, body at weight B. Embeddings are similarly enriched: `# {doc_title}\n{chunk.content}` is the embedding input (stored content is unchanged). Title changes trigger `cerefox_update_chunk_fts` + re-embed of current chunks. +9. 
**Optimistic concurrency on content updates** (v0.11 / schema 0.5.0) — updates via `cerefox_ingest_document` require `p_expected_content_hash` (compare-and-swap against the document's existing `content_hash`, atomic via `SELECT … FOR UPDATE` in the RPC) or an explicit `p_last_write_wins` (audit-logged). Stale → `CEREFOX_CONFLICT` (40001 / HTTP 409); absent → `CEREFOX_TOKEN_REQUIRED` (22023 / HTTP 400). All document-shaped read RPCs return `content_hash` so writers hold the token. Filesystem-sync flows (`ingest-dir`, `guides ingest`) and the frozen Python fallback pass `last_write_wins` internally. Design: `docs/specs/concurrency-control-design.md`. ## Documentation as Source of Truth diff --git a/README.md b/README.md index d5037ce..639c6c9 100644 --- a/README.md +++ b/README.md @@ -68,6 +68,7 @@ Questions, ideas, or want to follow development? **[Join the Cerefox Discord](ht | **Markdown-first ingest** | `.md` / `.txt` / `.docx` (Markdown is the storage format; `.docx` is converted via `mammoth` on ingest, fidelity varies. PDF is not supported — convert upstream) | | **Batch ingest** | `cerefox document ingest-dir` recurses directories | | **Deduplication** | SHA-256 content hash; re-ingesting the same file is a no-op | +| **Concurrency-safe updates** | Optimistic locking on content updates (v0.11+): writers pass the `content_hash` they read; a concurrent change fails with a conflict (re-read → merge → retry) instead of silently overwriting another agent's work. Explicit `last_write_wins` opt-out for file re-sync flows | | **Backup and restore** | JSON snapshots, optional git commit | | **Small-to-big retrieval** | `cerefox_context_expand` RPC returns chunk neighbours for richer context | | **Audit log** | Immutable, append-only log of all write operations (create, update, delete, status change). Author attribution with `author_type` ('user' or 'agent'). 
Browsable via web UI, queryable via MCP tool and Edge Function | diff --git a/_shared/__tests__/mcp_tools.test.ts b/_shared/__tests__/mcp_tools.test.ts index d13fb07..3a46a66 100644 --- a/_shared/__tests__/mcp_tools.test.ts +++ b/_shared/__tests__/mcp_tools.test.ts @@ -236,6 +236,72 @@ describe("input validation throws McpInvalidParams", () => { }); }); +describe("cerefox_ingest optimistic concurrency (iter-32)", () => { + // Mock client whose document lookup returns a doc with the given hash. + // The stale-token fast-fail throws BEFORE chunking/embedding, so no + // OpenAI or RPC mocking is needed beyond the lookup chain. + function docClient(currentHash: string): SupabaseClient { + const chain = { + select: () => chain, + eq: () => chain, + is: () => chain, + order: () => chain, + limit: () => ({ data: [{ id: "doc-1", title: "T", content_hash: currentHash }], error: null }), + }; + return { + from: () => chain, + rpc: () => ({ data: null, error: null }), + } as unknown as SupabaseClient; + } + + test("stale expected_content_hash fast-fails with merge instructions", async () => { + const tool = TOOLS_BY_NAME["cerefox_ingest"]; + let err: unknown; + try { + await tool.handler( + docClient("c".repeat(64)), + { + title: "T", + content: "new body", + document_id: "doc-1", + expected_content_hash: "a".repeat(64), + }, + { ...FAKE_CTX, openaiApiKey: "test-key" }, + ); + } catch (e) { + err = e; + } + expect(err).toBeInstanceOf(Error); + const msg = (err as Error).message; + expect(msg).toContain("Conflict"); + expect(msg).toContain("cerefox_get_document"); + expect(msg).toContain("c".repeat(64)); // tells the agent the current hash + }); + + test("last_write_wins skips the fast-fail (reaches the embed stage)", async () => { + const tool = TOOLS_BY_NAME["cerefox_ingest"]; + let err: unknown; + try { + await tool.handler( + docClient("c".repeat(64)), + { + title: "T", + content: "new body", + document_id: "doc-1", + expected_content_hash: "a".repeat(64), + 
last_write_wins: true, + }, + { ...FAKE_CTX, openaiApiKey: "test-key" }, + ); + } catch (e) { + err = e; + } + // It must NOT be the conflict error — with the check bypassed the handler + // proceeds to the embedding call, which fails against the fake key. + expect(String((err as Error)?.message ?? "")).not.toContain("Conflict:"); + }); +}); + describe("cerefox_metadata_search listing (empty filter + scope)", () => { // A mock client that resolves any project name → "proj-1" and records the // params passed to the cerefox_metadata_search RPC. diff --git a/_shared/mcp-tools/get-document.ts b/_shared/mcp-tools/get-document.ts index 8e3aefc..590ca66 100644 --- a/_shared/mcp-tools/get-document.ts +++ b/_shared/mcp-tools/get-document.ts @@ -32,6 +32,7 @@ async function handler( full_content?: string; chunk_count?: number; total_chars?: number; + content_hash?: string; } | undefined; @@ -46,13 +47,16 @@ async function handler( }); const label = version_id !== null ? " (archived version)" : " (current)"; - return `# ${row.doc_title ?? "Untitled"}${label}\n\n${row.full_content ?? ""}`; + // content_hash is the optimistic-concurrency token: pass it back as + // expected_content_hash when updating this document via cerefox_ingest. + const hashLine = row.content_hash ? `content_hash: ${row.content_hash}\n\n` : ""; + return `# ${row.doc_title ?? "Untitled"}${label}\n${hashLine}${row.full_content ?? ""}`; } export const getDocumentTool: ToolDefinition = { name: "cerefox_get_document", description: - "Retrieve the full reconstructed content of a document. Pass version_id to retrieve an archived version; omit it (or pass null) for the current version. Version UUIDs are returned by cerefox_list_versions.", + "Retrieve the full reconstructed content of a document. Pass version_id to retrieve an archived version; omit it (or pass null) for the current version. Version UUIDs are returned by cerefox_list_versions. 
The response header includes the document's current content_hash — pass it back as expected_content_hash when updating via cerefox_ingest (optimistic concurrency).", inputSchema: { type: "object", required: ["document_id"], diff --git a/_shared/mcp-tools/get-help-content.ts b/_shared/mcp-tools/get-help-content.ts index 89f2d90..35e7fa6 100644 --- a/_shared/mcp-tools/get-help-content.ts +++ b/_shared/mcp-tools/get-help-content.ts @@ -11,16 +11,16 @@ * docs/specs/polish-and-distribution-design.md §10d. */ -export const HELP_FULL = "# Cerefox Knowledge Base -- Agent Quick Reference\n\nCerefox is a persistent, shared knowledge base. You have **10 MCP tools** (9 of them have CLI equivalents — `cerefox_get_help` is MCP-only). For the full guide, search Cerefox for \"How AI Agents Use Cerefox\" or call `cerefox_get_help` to retrieve this content over MCP.\n\n## Tools\n\n| Tool | Purpose | Key params |\n|------|---------|------------|\n| `cerefox_search` | Find documents (hybrid FTS + semantic) | `query` (required), `project_name`, `metadata_filter`, `requestor` |\n| `cerefox_ingest` | Save or update a document | `title`, `content` (required), `document_id` (update by ID), `update_if_exists`, `project_name` (single, non-destructive add on update), `project_names` (list, destructive replace on update), `metadata`, `author` |\n| `cerefox_get_document` | Get full document by ID | `document_id` (required) |\n| `cerefox_list_versions` | Version history of a document | `document_id` (required) |\n| `cerefox_metadata_search` | Find or list docs by metadata, project, or time (no text query) | `metadata_filter`, `project_name` (list a project's docs), `updated_since`, `include_content` — **at least one** of metadata_filter/project_name/updated_since/created_since |\n| `cerefox_list_metadata_keys` | Discover available metadata keys | (none required) |\n| `cerefox_list_projects` | List all projects | (none required) |\n| `cerefox_set_document_projects` | Set doc's project memberships 
to exactly the given list (destructive replace; metadata-only, no content change) | `document_id`, `project_names` (required) |\n| `cerefox_get_audit_log` | Query write operation history | `document_id`, `author`, `operation`, `since` |\n| `cerefox_get_help` | Retrieve Cerefox conventions (this reference) over MCP. **Call this whenever uncertain.** | `topic` (optional, case-insensitive H2 substring match) |\n\n## Essential Rules\n\n1. **Search before ingesting** -- check if the document exists first.\n2. **Prefer ID-based updates** -- pass `document_id` from search results for deterministic updates. Falls back to title-matching with `update_if_exists: true`.\n3. **Set `author`/`requestor`** to your name on every call (e.g., \"Claude Code\", \"archiver\"). On MCP, pass as parameters. On CLI, pass `--author`/`--author-type`/`--requestor` flags, or rely on `CEREFOX_AUTHOR_NAME`/`CEREFOX_AUTHOR_TYPE`/`CEREFOX_REQUESTOR_NAME` env vars set in the user's `.env`.\n4. **Use `document_id` from search results** `[id: uuid]` for get_document and list_versions.\n5. **Add metadata** -- at minimum `type` (\"decision-log\", \"research\", \"design-doc\") and `status` (\"active\", \"draft\").\n6. **Write structured Markdown** with H1/H2/H3 headings for good chunking and search.\n7. **Deletes are soft (recoverable); purge is web-UI-only.** If you decide to delete, surface it to the user (`I soft-deleted X — recoverable from the Cerefox web UI trash`). You cannot un-do your own delete from agent code by design.\n8. **Cross-doc links inside content**: **always use `[Text](document-uuid)`.** UUIDs are the only fully reliable link form — stable across title changes, never ambiguous, no encoding gotchas. Every `cerefox_search` result shows `[id: <uuid>]` after the title; grab it and use it. 
Title-based linking (`[Text](<Title With Spaces>)`) is fragile (breaks on colons, parens, ampersands, brackets — silently navigates to wrong page) — **don't write title-based links**; do an extra search to get the UUID instead. Repo-path forms (`[Text](docs/path.md)`) exist for repo-ingested files; don't construct manually. See `AGENT_GUIDE.md → Writing linkable content` for the full rule.\n9. **Project memberships — non-destructive by default**: on `cerefox_ingest` updates, **`project_name` (singular) is a non-destructive add** (ensures membership, preserves others). Use **`project_names` (list)** when you want to set the doc's full project set in one call (destructive replace). For metadata-only project changes without writing content, use **`cerefox_set_document_projects(document_id, project_names)`** — that tool is the destructive-replace contract made explicit. Never call `cerefox_set_document_projects` with a single name when you mean \"add\" — that would REMOVE the doc from all other projects. When in doubt, use `cerefox_ingest` with singular `project_name`.\n\n## Update Workflow (ID-based -- preferred)\n\n```\nsearch(\"topic\") -> find doc [id: abc123] -> get_document(abc123) -> modify ->\ningest(title=\"Same Title\", content=\"...\", document_id=\"abc123\", author=\"my-agent\")\n```\n\n## Update Workflow (title-based -- fallback)\n\n```\nsearch(\"topic\") -> find doc -> modify ->\ningest(title=\"Same Title\", content=\"...\", update_if_exists=true, author=\"my-agent\")\n```\n\n## Catch-Up Workflow\n\n```\nmetadata_search(metadata_filter={\"type\": \"decision-log\"}, updated_since=\"2026-03-28T00:00:00Z\")\n```\n\n## CLI fallback (when MCP is unavailable)\n\nIf `cerefox_search` is not in your tool list, your user has likely installed the Cerefox CLI. The canonical invocation is plain **`cerefox <subcommand>`** (the TypeScript CLI, installed via `npm install -g @cerefox/memory`). 
It uses a resource-verb shape (`cerefox document get`, `cerefox project list`, …). The legacy Python `uv run cerefox` is now a frozen husk as of v0.9 — only `uv run cerefox mcp` still works.\n\nSame operations, same conventions. Full reference: [`docs/guides/cli.md`](docs/guides/cli.md). CLI flag names match MCP parameter names exactly (e.g. `metadata_filter` ↔ `--metadata-filter`); common flags also have single-letter short forms (`-f`, `-p`, `-c`, `-m`, `-u`, `-a`, `-r`). Use the canonical long name (what `--help` shows) or its short form — there are no long-form aliases like `--filter` or `--count`.\n\n| MCP tool | CLI |\n|---|---|\n| `cerefox_search` | `cerefox search \"<q>\" --requestor \"<your-name>\"` |\n| `cerefox_ingest` (paste) | `printf '...' \\| cerefox document ingest --paste --title \"<t>\" --author \"<your-name>\" --author-type agent` |\n| `cerefox_ingest` (update by ID) | `printf '...' \\| cerefox document ingest --paste --title \"<t>\" --document-id \"<uuid>\" --author \"<your-name>\" --author-type agent` |\n| `cerefox_get_document` | `cerefox document get <id> --version-id <vid> --requestor \"<your-name>\"` |\n| `cerefox_list_versions` | `cerefox document version list <id> --requestor \"<your-name>\"` |\n| `cerefox_list_projects` | `cerefox project list --requestor \"<your-name>\"` |\n| `cerefox_list_metadata_keys` | `cerefox metadata keys` |\n| `cerefox_metadata_search` | `cerefox metadata search --metadata-filter '<json>' --requestor \"<your-name>\"` (list a project: `cerefox document list --project <name>`) |\n| `cerefox_set_document_projects` | `cerefox document set-projects <id> <name...> --author \"<your-name>\" --author-type agent` (or `--clear` to remove all) |\n| `cerefox_get_audit_log` | `cerefox audit list --requestor \"<your-name>\"` (add `--json` for scripted access) |\n| `cerefox_get_help` | `cerefox guides show agent-quick-reference` (or `cerefox guides list` for the full bundled-docs index) |\n\n**Set identity on every call**, 
exactly as you would on MCP:\n- Writes (`document ingest`, `document ingest-dir`): `--author \"<your-name>\" --author-type agent`\n- Reads: `--requestor \"<your-name>\"`\n\nOr have your user set `CEREFOX_AUTHOR_NAME` / `CEREFOX_AUTHOR_TYPE` / `CEREFOX_REQUESTOR_NAME` in their `.env` to apply defaults once.\n"; +export const HELP_FULL = "# Cerefox Knowledge Base -- Agent Quick Reference\n\nCerefox is a persistent, shared knowledge base. You have **10 MCP tools** (9 of them have CLI equivalents — `cerefox_get_help` is MCP-only). For the full guide, search Cerefox for \"How AI Agents Use Cerefox\" or call `cerefox_get_help` to retrieve this content over MCP.\n\n## Tools\n\n| Tool | Purpose | Key params |\n|------|---------|------------|\n| `cerefox_search` | Find documents (hybrid FTS + semantic) | `query` (required), `project_name`, `metadata_filter`, `requestor` |\n| `cerefox_ingest` | Save or update a document | `title`, `content` (required), `document_id` (update by ID), `expected_content_hash` (**required on content updates** — see rule 9), `last_write_wins`, `update_if_exists`, `project_name` (single, non-destructive add on update), `project_names` (list, destructive replace on update), `metadata`, `author` |\n| `cerefox_get_document` | Get full document by ID (header includes `content_hash` — the update token) | `document_id` (required) |\n| `cerefox_list_versions` | Version history of a document | `document_id` (required) |\n| `cerefox_metadata_search` | Find or list docs by metadata, project, or time (no text query) | `metadata_filter`, `project_name` (list a project's docs), `updated_since`, `include_content` — **at least one** of metadata_filter/project_name/updated_since/created_since |\n| `cerefox_list_metadata_keys` | Discover available metadata keys | (none required) |\n| `cerefox_list_projects` | List all projects | (none required) |\n| `cerefox_set_document_projects` | Set doc's project memberships to exactly the given list (destructive replace; 
metadata-only, no content change) | `document_id`, `project_names` (required) |\n| `cerefox_get_audit_log` | Query write operation history | `document_id`, `author`, `operation`, `since` |\n| `cerefox_get_help` | Retrieve Cerefox conventions (this reference) over MCP. **Call this whenever uncertain.** | `topic` (optional, case-insensitive H2 substring match) |\n\n## Essential Rules\n\n1. **Search before ingesting** -- check if the document exists first.\n2. **Prefer ID-based updates** -- pass `document_id` from search results for deterministic updates. Falls back to title-matching with `update_if_exists: true`.\n3. **Set `author`/`requestor`** to your name on every call (e.g., \"Claude Code\", \"archiver\"). On MCP, pass as parameters. On CLI, pass `--author`/`--author-type`/`--requestor` flags, or rely on `CEREFOX_AUTHOR_NAME`/`CEREFOX_AUTHOR_TYPE`/`CEREFOX_REQUESTOR_NAME` env vars set in the user's `.env`.\n4. **Use `document_id` from search results** `[id: uuid]` for get_document and list_versions.\n5. **Add metadata** -- at minimum `type` (\"decision-log\", \"research\", \"design-doc\") and `status` (\"active\", \"draft\").\n6. **Write structured Markdown** with H1/H2/H3 headings for good chunking and search.\n7. **Deletes are soft (recoverable); purge is web-UI-only.** If you decide to delete, surface it to the user (`I soft-deleted X — recoverable from the Cerefox web UI trash`). You cannot un-do your own delete from agent code by design.\n8. **Cross-doc links inside content**: **always use `[Text](document-uuid)`.** UUIDs are the only fully reliable link form — stable across title changes, never ambiguous, no encoding gotchas. Every `cerefox_search` result shows `[id: <uuid>]` after the title; grab it and use it. Title-based linking (`[Text](<Title With Spaces>)`) is fragile (breaks on colons, parens, ampersands, brackets — silently navigates to wrong page) — **don't write title-based links**; do an extra search to get the UUID instead. 
Repo-path forms (`[Text](docs/path.md)`) exist for repo-ingested files; don't construct manually. See `AGENT_GUIDE.md → Writing linkable content` for the full rule.\n9. **Concurrency: content updates require `expected_content_hash`.** Pass the `content_hash` you read (shown by `cerefox_get_document`, `cerefox_search`, and `cerefox_metadata_search`) when updating a document. If it's stale you get a **conflict** — re-read the document, merge your changes into the latest content, retry with the new hash. **Never resolve a conflict by overwriting blindly** — the current content includes another writer's work. `last_write_wins: true` skips the check; use it ONLY when an external source of truth makes conflicts meaningless (file re-sync), never to silence a conflict.\n10. **Project memberships — non-destructive by default**: on `cerefox_ingest` updates, **`project_name` (singular) is a non-destructive add** (ensures membership, preserves others). Use **`project_names` (list)** when you want to set the doc's full project set in one call (destructive replace). For metadata-only project changes without writing content, use **`cerefox_set_document_projects(document_id, project_names)`** — that tool is the destructive-replace contract made explicit. Never call `cerefox_set_document_projects` with a single name when you mean \"add\" — that would REMOVE the doc from all other projects. 
When in doubt, use `cerefox_ingest` with singular `project_name`.\n\n## Update Workflow (ID-based -- preferred)\n\n```\nsearch(\"topic\") -> find doc [id: abc123] -> get_document(abc123) -> note its content_hash -> modify ->\ningest(title=\"Same Title\", content=\"...\", document_id=\"abc123\",\n expected_content_hash=\"<the hash you read>\", author=\"my-agent\")\n```\n\nOn a **conflict** error: get_document again (fresh content + fresh hash) -> merge your changes -> retry with the new hash.\n\n## Update Workflow (title-based -- fallback)\n\n```\nsearch(\"topic\") -> find doc (note its hash) -> modify ->\ningest(title=\"Same Title\", content=\"...\", update_if_exists=true,\n expected_content_hash=\"<the hash you read>\", author=\"my-agent\")\n```\n\n## Catch-Up Workflow\n\n```\nmetadata_search(metadata_filter={\"type\": \"decision-log\"}, updated_since=\"2026-03-28T00:00:00Z\")\n```\n\n## CLI fallback (when MCP is unavailable)\n\nIf `cerefox_search` is not in your tool list, your user has likely installed the Cerefox CLI. The canonical invocation is plain **`cerefox <subcommand>`** (the TypeScript CLI, installed via `npm install -g @cerefox/memory`). It uses a resource-verb shape (`cerefox document get`, `cerefox project list`, …). The legacy Python `uv run cerefox` is now a frozen husk as of v0.9 — only `uv run cerefox mcp` still works.\n\nSame operations, same conventions. Full reference: [`docs/guides/cli.md`](docs/guides/cli.md). CLI flag names match MCP parameter names exactly (e.g. `metadata_filter` ↔ `--metadata-filter`); common flags also have single-letter short forms (`-f`, `-p`, `-c`, `-m`, `-u`, `-a`, `-r`). Use the canonical long name (what `--help` shows) or its short form — there are no long-form aliases like `--filter` or `--count`.\n\n| MCP tool | CLI |\n|---|---|\n| `cerefox_search` | `cerefox search \"<q>\" --requestor \"<your-name>\"` |\n| `cerefox_ingest` (paste) | `printf '...' 
\\| cerefox document ingest --paste --title \"<t>\" --author \"<your-name>\" --author-type agent` |\n| `cerefox_ingest` (update by ID) | `printf '...' \\| cerefox document ingest --paste --title \"<t>\" --document-id \"<uuid>\" --expected-content-hash \"<hash>\" --author \"<your-name>\" --author-type agent` |\n| `cerefox_get_document` | `cerefox document get <id> --version-id <vid> --requestor \"<your-name>\"` |\n| `cerefox_list_versions` | `cerefox document version list <id> --requestor \"<your-name>\"` |\n| `cerefox_list_projects` | `cerefox project list --requestor \"<your-name>\"` |\n| `cerefox_list_metadata_keys` | `cerefox metadata keys` |\n| `cerefox_metadata_search` | `cerefox metadata search --metadata-filter '<json>' --requestor \"<your-name>\"` (list a project: `cerefox document list --project <name>`) |\n| `cerefox_set_document_projects` | `cerefox document set-projects <id> <name...> --author \"<your-name>\" --author-type agent` (or `--clear` to remove all) |\n| `cerefox_get_audit_log` | `cerefox audit list --requestor \"<your-name>\"` (add `--json` for scripted access) |\n| `cerefox_get_help` | `cerefox guides show agent-quick-reference` (or `cerefox guides list` for the full bundled-docs index) |\n\n**Set identity on every call**, exactly as you would on MCP:\n- Writes (`document ingest`, `document ingest-dir`): `--author \"<your-name>\" --author-type agent`\n- Reads: `--requestor \"<your-name>\"`\n\nOr have your user set `CEREFOX_AUTHOR_NAME` / `CEREFOX_AUTHOR_TYPE` / `CEREFOX_REQUESTOR_NAME` in their `.env` to apply defaults once.\n"; /** Sections keyed by their H2 heading text (lower-cased for matching). 
*/ export const HELP_SECTIONS: Record<string, string> = { - "Tools": "## Tools\n\n| Tool | Purpose | Key params |\n|------|---------|------------|\n| `cerefox_search` | Find documents (hybrid FTS + semantic) | `query` (required), `project_name`, `metadata_filter`, `requestor` |\n| `cerefox_ingest` | Save or update a document | `title`, `content` (required), `document_id` (update by ID), `update_if_exists`, `project_name` (single, non-destructive add on update), `project_names` (list, destructive replace on update), `metadata`, `author` |\n| `cerefox_get_document` | Get full document by ID | `document_id` (required) |\n| `cerefox_list_versions` | Version history of a document | `document_id` (required) |\n| `cerefox_metadata_search` | Find or list docs by metadata, project, or time (no text query) | `metadata_filter`, `project_name` (list a project's docs), `updated_since`, `include_content` — **at least one** of metadata_filter/project_name/updated_since/created_since |\n| `cerefox_list_metadata_keys` | Discover available metadata keys | (none required) |\n| `cerefox_list_projects` | List all projects | (none required) |\n| `cerefox_set_document_projects` | Set doc's project memberships to exactly the given list (destructive replace; metadata-only, no content change) | `document_id`, `project_names` (required) |\n| `cerefox_get_audit_log` | Query write operation history | `document_id`, `author`, `operation`, `since` |\n| `cerefox_get_help` | Retrieve Cerefox conventions (this reference) over MCP. **Call this whenever uncertain.** | `topic` (optional, case-insensitive H2 substring match) |", - "Essential Rules": "## Essential Rules\n\n1. **Search before ingesting** -- check if the document exists first.\n2. **Prefer ID-based updates** -- pass `document_id` from search results for deterministic updates. Falls back to title-matching with `update_if_exists: true`.\n3. **Set `author`/`requestor`** to your name on every call (e.g., \"Claude Code\", \"archiver\"). 
On MCP, pass as parameters. On CLI, pass `--author`/`--author-type`/`--requestor` flags, or rely on `CEREFOX_AUTHOR_NAME`/`CEREFOX_AUTHOR_TYPE`/`CEREFOX_REQUESTOR_NAME` env vars set in the user's `.env`.\n4. **Use `document_id` from search results** `[id: uuid]` for get_document and list_versions.\n5. **Add metadata** -- at minimum `type` (\"decision-log\", \"research\", \"design-doc\") and `status` (\"active\", \"draft\").\n6. **Write structured Markdown** with H1/H2/H3 headings for good chunking and search.\n7. **Deletes are soft (recoverable); purge is web-UI-only.** If you decide to delete, surface it to the user (`I soft-deleted X — recoverable from the Cerefox web UI trash`). You cannot un-do your own delete from agent code by design.\n8. **Cross-doc links inside content**: **always use `[Text](document-uuid)`.** UUIDs are the only fully reliable link form — stable across title changes, never ambiguous, no encoding gotchas. Every `cerefox_search` result shows `[id: <uuid>]` after the title; grab it and use it. Title-based linking (`[Text](<Title With Spaces>)`) is fragile (breaks on colons, parens, ampersands, brackets — silently navigates to wrong page) — **don't write title-based links**; do an extra search to get the UUID instead. Repo-path forms (`[Text](docs/path.md)`) exist for repo-ingested files; don't construct manually. See `AGENT_GUIDE.md → Writing linkable content` for the full rule.\n9. **Project memberships — non-destructive by default**: on `cerefox_ingest` updates, **`project_name` (singular) is a non-destructive add** (ensures membership, preserves others). Use **`project_names` (list)** when you want to set the doc's full project set in one call (destructive replace). For metadata-only project changes without writing content, use **`cerefox_set_document_projects(document_id, project_names)`** — that tool is the destructive-replace contract made explicit. 
Never call `cerefox_set_document_projects` with a single name when you mean \"add\" — that would REMOVE the doc from all other projects. When in doubt, use `cerefox_ingest` with singular `project_name`.", - "Update Workflow (ID-based -- preferred)": "## Update Workflow (ID-based -- preferred)\n\n```\nsearch(\"topic\") -> find doc [id: abc123] -> get_document(abc123) -> modify ->\ningest(title=\"Same Title\", content=\"...\", document_id=\"abc123\", author=\"my-agent\")\n```", - "Update Workflow (title-based -- fallback)": "## Update Workflow (title-based -- fallback)\n\n```\nsearch(\"topic\") -> find doc -> modify ->\ningest(title=\"Same Title\", content=\"...\", update_if_exists=true, author=\"my-agent\")\n```", + "Tools": "## Tools\n\n| Tool | Purpose | Key params |\n|------|---------|------------|\n| `cerefox_search` | Find documents (hybrid FTS + semantic) | `query` (required), `project_name`, `metadata_filter`, `requestor` |\n| `cerefox_ingest` | Save or update a document | `title`, `content` (required), `document_id` (update by ID), `expected_content_hash` (**required on content updates** — see rule 9), `last_write_wins`, `update_if_exists`, `project_name` (single, non-destructive add on update), `project_names` (list, destructive replace on update), `metadata`, `author` |\n| `cerefox_get_document` | Get full document by ID (header includes `content_hash` — the update token) | `document_id` (required) |\n| `cerefox_list_versions` | Version history of a document | `document_id` (required) |\n| `cerefox_metadata_search` | Find or list docs by metadata, project, or time (no text query) | `metadata_filter`, `project_name` (list a project's docs), `updated_since`, `include_content` — **at least one** of metadata_filter/project_name/updated_since/created_since |\n| `cerefox_list_metadata_keys` | Discover available metadata keys | (none required) |\n| `cerefox_list_projects` | List all projects | (none required) |\n| `cerefox_set_document_projects` | Set doc's 
project memberships to exactly the given list (destructive replace; metadata-only, no content change) | `document_id`, `project_names` (required) |\n| `cerefox_get_audit_log` | Query write operation history | `document_id`, `author`, `operation`, `since` |\n| `cerefox_get_help` | Retrieve Cerefox conventions (this reference) over MCP. **Call this whenever uncertain.** | `topic` (optional, case-insensitive H2 substring match) |", + "Essential Rules": "## Essential Rules\n\n1. **Search before ingesting** -- check if the document exists first.\n2. **Prefer ID-based updates** -- pass `document_id` from search results for deterministic updates. Falls back to title-matching with `update_if_exists: true`.\n3. **Set `author`/`requestor`** to your name on every call (e.g., \"Claude Code\", \"archiver\"). On MCP, pass as parameters. On CLI, pass `--author`/`--author-type`/`--requestor` flags, or rely on `CEREFOX_AUTHOR_NAME`/`CEREFOX_AUTHOR_TYPE`/`CEREFOX_REQUESTOR_NAME` env vars set in the user's `.env`.\n4. **Use `document_id` from search results** `[id: uuid]` for get_document and list_versions.\n5. **Add metadata** -- at minimum `type` (\"decision-log\", \"research\", \"design-doc\") and `status` (\"active\", \"draft\").\n6. **Write structured Markdown** with H1/H2/H3 headings for good chunking and search.\n7. **Deletes are soft (recoverable); purge is web-UI-only.** If you decide to delete, surface it to the user (`I soft-deleted X — recoverable from the Cerefox web UI trash`). You cannot un-do your own delete from agent code by design.\n8. **Cross-doc links inside content**: **always use `[Text](document-uuid)`.** UUIDs are the only fully reliable link form — stable across title changes, never ambiguous, no encoding gotchas. Every `cerefox_search` result shows `[id: <uuid>]` after the title; grab it and use it. 
Title-based linking (`[Text](<Title With Spaces>)`) is fragile (breaks on colons, parens, ampersands, brackets — silently navigates to wrong page) — **don't write title-based links**; do an extra search to get the UUID instead. Repo-path forms (`[Text](docs/path.md)`) exist for repo-ingested files; don't construct manually. See `AGENT_GUIDE.md → Writing linkable content` for the full rule.\n9. **Concurrency: content updates require `expected_content_hash`.** Pass the `content_hash` you read (shown by `cerefox_get_document`, `cerefox_search`, and `cerefox_metadata_search`) when updating a document. If it's stale you get a **conflict** — re-read the document, merge your changes into the latest content, retry with the new hash. **Never resolve a conflict by overwriting blindly** — the current content includes another writer's work. `last_write_wins: true` skips the check; use it ONLY when an external source of truth makes conflicts meaningless (file re-sync), never to silence a conflict.\n10. **Project memberships — non-destructive by default**: on `cerefox_ingest` updates, **`project_name` (singular) is a non-destructive add** (ensures membership, preserves others). Use **`project_names` (list)** when you want to set the doc's full project set in one call (destructive replace). For metadata-only project changes without writing content, use **`cerefox_set_document_projects(document_id, project_names)`** — that tool is the destructive-replace contract made explicit. Never call `cerefox_set_document_projects` with a single name when you mean \"add\" — that would REMOVE the doc from all other projects. 
When in doubt, use `cerefox_ingest` with singular `project_name`.", + "Update Workflow (ID-based -- preferred)": "## Update Workflow (ID-based -- preferred)\n\n```\nsearch(\"topic\") -> find doc [id: abc123] -> get_document(abc123) -> note its content_hash -> modify ->\ningest(title=\"Same Title\", content=\"...\", document_id=\"abc123\",\n expected_content_hash=\"<the hash you read>\", author=\"my-agent\")\n```\n\nOn a **conflict** error: get_document again (fresh content + fresh hash) -> merge your changes -> retry with the new hash.", + "Update Workflow (title-based -- fallback)": "## Update Workflow (title-based -- fallback)\n\n```\nsearch(\"topic\") -> find doc (note its hash) -> modify ->\ningest(title=\"Same Title\", content=\"...\", update_if_exists=true,\n expected_content_hash=\"<the hash you read>\", author=\"my-agent\")\n```", "Catch-Up Workflow": "## Catch-Up Workflow\n\n```\nmetadata_search(metadata_filter={\"type\": \"decision-log\"}, updated_since=\"2026-03-28T00:00:00Z\")\n```", - "CLI fallback (when MCP is unavailable)": "## CLI fallback (when MCP is unavailable)\n\nIf `cerefox_search` is not in your tool list, your user has likely installed the Cerefox CLI. The canonical invocation is plain **`cerefox <subcommand>`** (the TypeScript CLI, installed via `npm install -g @cerefox/memory`). It uses a resource-verb shape (`cerefox document get`, `cerefox project list`, …). The legacy Python `uv run cerefox` is now a frozen husk as of v0.9 — only `uv run cerefox mcp` still works.\n\nSame operations, same conventions. Full reference: [`docs/guides/cli.md`](docs/guides/cli.md). CLI flag names match MCP parameter names exactly (e.g. `metadata_filter` ↔ `--metadata-filter`); common flags also have single-letter short forms (`-f`, `-p`, `-c`, `-m`, `-u`, `-a`, `-r`). 
Use the canonical long name (what `--help` shows) or its short form — there are no long-form aliases like `--filter` or `--count`.\n\n| MCP tool | CLI |\n|---|---|\n| `cerefox_search` | `cerefox search \"<q>\" --requestor \"<your-name>\"` |\n| `cerefox_ingest` (paste) | `printf '...' \\| cerefox document ingest --paste --title \"<t>\" --author \"<your-name>\" --author-type agent` |\n| `cerefox_ingest` (update by ID) | `printf '...' \\| cerefox document ingest --paste --title \"<t>\" --document-id \"<uuid>\" --author \"<your-name>\" --author-type agent` |\n| `cerefox_get_document` | `cerefox document get <id> --version-id <vid> --requestor \"<your-name>\"` |\n| `cerefox_list_versions` | `cerefox document version list <id> --requestor \"<your-name>\"` |\n| `cerefox_list_projects` | `cerefox project list --requestor \"<your-name>\"` |\n| `cerefox_list_metadata_keys` | `cerefox metadata keys` |\n| `cerefox_metadata_search` | `cerefox metadata search --metadata-filter '<json>' --requestor \"<your-name>\"` (list a project: `cerefox document list --project <name>`) |\n| `cerefox_set_document_projects` | `cerefox document set-projects <id> <name...> --author \"<your-name>\" --author-type agent` (or `--clear` to remove all) |\n| `cerefox_get_audit_log` | `cerefox audit list --requestor \"<your-name>\"` (add `--json` for scripted access) |\n| `cerefox_get_help` | `cerefox guides show agent-quick-reference` (or `cerefox guides list` for the full bundled-docs index) |\n\n**Set identity on every call**, exactly as you would on MCP:\n- Writes (`document ingest`, `document ingest-dir`): `--author \"<your-name>\" --author-type agent`\n- Reads: `--requestor \"<your-name>\"`\n\nOr have your user set `CEREFOX_AUTHOR_NAME` / `CEREFOX_AUTHOR_TYPE` / `CEREFOX_REQUESTOR_NAME` in their `.env` to apply defaults once.", + "CLI fallback (when MCP is unavailable)": "## CLI fallback (when MCP is unavailable)\n\nIf `cerefox_search` is not in your tool list, your user has likely installed the 
Cerefox CLI. The canonical invocation is plain **`cerefox <subcommand>`** (the TypeScript CLI, installed via `npm install -g @cerefox/memory`). It uses a resource-verb shape (`cerefox document get`, `cerefox project list`, …). The legacy Python `uv run cerefox` is now a frozen husk as of v0.9 — only `uv run cerefox mcp` still works.\n\nSame operations, same conventions. Full reference: [`docs/guides/cli.md`](docs/guides/cli.md). CLI flag names match MCP parameter names exactly (e.g. `metadata_filter` ↔ `--metadata-filter`); common flags also have single-letter short forms (`-f`, `-p`, `-c`, `-m`, `-u`, `-a`, `-r`). Use the canonical long name (what `--help` shows) or its short form — there are no long-form aliases like `--filter` or `--count`.\n\n| MCP tool | CLI |\n|---|---|\n| `cerefox_search` | `cerefox search \"<q>\" --requestor \"<your-name>\"` |\n| `cerefox_ingest` (paste) | `printf '...' \\| cerefox document ingest --paste --title \"<t>\" --author \"<your-name>\" --author-type agent` |\n| `cerefox_ingest` (update by ID) | `printf '...' 
\\| cerefox document ingest --paste --title \"<t>\" --document-id \"<uuid>\" --expected-content-hash \"<hash>\" --author \"<your-name>\" --author-type agent` |\n| `cerefox_get_document` | `cerefox document get <id> --version-id <vid> --requestor \"<your-name>\"` |\n| `cerefox_list_versions` | `cerefox document version list <id> --requestor \"<your-name>\"` |\n| `cerefox_list_projects` | `cerefox project list --requestor \"<your-name>\"` |\n| `cerefox_list_metadata_keys` | `cerefox metadata keys` |\n| `cerefox_metadata_search` | `cerefox metadata search --metadata-filter '<json>' --requestor \"<your-name>\"` (list a project: `cerefox document list --project <name>`) |\n| `cerefox_set_document_projects` | `cerefox document set-projects <id> <name...> --author \"<your-name>\" --author-type agent` (or `--clear` to remove all) |\n| `cerefox_get_audit_log` | `cerefox audit list --requestor \"<your-name>\"` (add `--json` for scripted access) |\n| `cerefox_get_help` | `cerefox guides show agent-quick-reference` (or `cerefox guides list` for the full bundled-docs index) |\n\n**Set identity on every call**, exactly as you would on MCP:\n- Writes (`document ingest`, `document ingest-dir`): `--author \"<your-name>\" --author-type agent`\n- Reads: `--requestor \"<your-name>\"`\n\nOr have your user set `CEREFOX_AUTHOR_NAME` / `CEREFOX_AUTHOR_TYPE` / `CEREFOX_REQUESTOR_NAME` in their `.env` to apply defaults once.", }; export const HELP_SECTION_HEADINGS: string[] = ["Tools", "Essential Rules", "Update Workflow (ID-based -- preferred)", "Update Workflow (title-based -- fallback)", "Catch-Up Workflow", "CLI fallback (when MCP is unavailable)"]; diff --git a/_shared/mcp-tools/ingest.ts b/_shared/mcp-tools/ingest.ts index e13c6fb..d6f8dfb 100644 --- a/_shared/mcp-tools/ingest.ts +++ b/_shared/mcp-tools/ingest.ts @@ -23,6 +23,43 @@ import { ensureDocumentInProject, setDocumentProjectsByName } from "./_projects. 
import { logUsage } from "./_utils.ts"; import { McpInvalidParams, type ToolContext, type ToolDefinition } from "./types.ts"; +/** + * Agent-first instructions for an optimistic-concurrency conflict (iter-32). + * Raised either by the local fast-fail (before the embedding spend) or by the + * authoritative check inside the cerefox_ingest_document RPC. + */ +function conflictError(documentId: string, expectedHash: string, currentHash: string): Error { + return new Error( + `Conflict: document ${documentId} changed since you read it ` + + `(your base hash: ${expectedHash}, current hash: ${currentHash}). ` + + `To resolve: (1) cerefox_get_document("${documentId}") to fetch the latest content ` + + `and its content_hash, (2) merge your changes into it, (3) retry cerefox_ingest ` + + `with expected_content_hash set to the new hash. Do not overwrite blindly — ` + + `the current content may include another writer's work.`, + ); +} + +/** Map RPC-side CEREFOX_CONFLICT / CEREFOX_TOKEN_REQUIRED errors to agent-first text. */ +function mapIngestRpcError(message: string, documentId: string): Error { + if (message.includes("CEREFOX_CONFLICT")) { + const current = message.match(/current hash ([0-9a-f]{64})/)?.[1] ?? "unknown"; + const expected = message.match(/expected hash ([0-9a-f]{64})/)?.[1] ?? "unknown"; + return conflictError(documentId, expected, current); + } + if (message.includes("CEREFOX_TOKEN_REQUIRED")) { + const current = message.match(/Current hash: ([0-9a-f]{64})/)?.[1]; + return new Error( + `Concurrency token required: content updates need expected_content_hash — ` + + `the content_hash of the version you based your edit on (returned by ` + + `cerefox_get_document, cerefox_search, and cerefox_metadata_search).` + + (current ? ` The document's current hash is ${current}; pass it ONLY if your edit was based on the current content.` : "") + + ` If you have not read the document, read it first. 
To deliberately overwrite ` + + `regardless of concurrent changes, pass last_write_wins=true.`, + ); + } + return new Error(`Ingest RPC failed: ${message}`); +} + async function handler( supabase: MCPSupabaseClient, args: Record<string, unknown>, @@ -38,6 +75,8 @@ async function handler( const update_if_exists = (args.update_if_exists as boolean | undefined) ?? false; const author = (args.author as string | undefined) ?? "mcp-agent"; const author_type = "agent"; // MCP path is always agent + const expected_content_hash = (args.expected_content_hash as string | undefined)?.trim() || null; + const last_write_wins = (args.last_write_wins as boolean | undefined) ?? false; if (!title || !content?.trim()) { throw new McpInvalidParams("title and content are required"); @@ -83,7 +122,13 @@ async function handler( const note = update_if_exists ? "" : " Note: update_if_exists flag was overridden by document_id."; - return `Document already up-to-date: "${existingDoc.title}" (id: ${existingDoc.id}). Content hash unchanged.${note}`; + return `Document already up-to-date: "${existingDoc.title}" (id: ${existingDoc.id}). Content hash unchanged (${contentHash}).${note}`; + } + + // Fast-fail on a stale token BEFORE paying the embedding cost. Advisory + // only — the authoritative, race-free check is inside the RPC (FOR UPDATE). 
+ if (!last_write_wins && expected_content_hash && expected_content_hash !== existingDoc.content_hash) { + throw conflictError(existingDoc.id, expected_content_hash, existingDoc.content_hash); } const chunks = chunkMarkdown(content); @@ -115,9 +160,11 @@ async function handler( p_author: author, p_author_type: author_type, p_source_label: source, + p_expected_content_hash: expected_content_hash, + p_last_write_wins: last_write_wins, }); - if (ingestErr) throw new Error(`Ingest RPC failed: ${ingestErr.message}`); + if (ingestErr) throw mapIngestRpcError(ingestErr.message, existingDoc.id); logUsage(supabase, { operation: "ingest", @@ -136,7 +183,7 @@ async function handler( const note = update_if_exists ? "" : " Note: update_if_exists flag was overridden by document_id."; - return `Document updated: "${title}" (id: ${existingDoc.id}), ${chunks.length} chunk(s), ${totalChars} chars.${note}`; + return `Document updated: "${title}" (id: ${existingDoc.id}), ${chunks.length} chunk(s), ${totalChars} chars. New content_hash: ${contentHash}.${note}`; } // ── Update-existing path ───────────────────────────────────────────────── @@ -152,7 +199,13 @@ async function handler( const existingDoc = existing[0]; if (existingDoc.content_hash === contentHash) { - return `Document already up-to-date: "${existingDoc.title}" (id: ${existingDoc.id}). Content hash unchanged.`; + return `Document already up-to-date: "${existingDoc.title}" (id: ${existingDoc.id}). Content hash unchanged (${contentHash}).`; + } + + // Fast-fail on a stale token BEFORE the embedding cost (advisory; the + // authoritative check is in the RPC). 
+ if (!last_write_wins && expected_content_hash && expected_content_hash !== existingDoc.content_hash) { + throw conflictError(existingDoc.id, expected_content_hash, existingDoc.content_hash); } const chunks = chunkMarkdown(content); @@ -184,9 +237,11 @@ async function handler( p_author: author, p_author_type: author_type, p_source_label: source, + p_expected_content_hash: expected_content_hash, + p_last_write_wins: last_write_wins, }); - if (ingestErr) throw new Error(`Ingest RPC failed: ${ingestErr.message}`); + if (ingestErr) throw mapIngestRpcError(ingestErr.message, existingDoc.id); logUsage(supabase, { operation: "ingest", @@ -202,7 +257,7 @@ async function handler( await ensureDocumentInProject(supabase, existingDoc.id, project_name); } - return `Document updated: "${existingDoc.title}" (id: ${existingDoc.id}), ${chunks.length} chunk(s), ${totalChars} chars.`; + return `Document updated: "${existingDoc.title}" (id: ${existingDoc.id}), ${chunks.length} chunk(s), ${totalChars} chars. New content_hash: ${contentHash}.`; } // Fall through to create path } @@ -303,6 +358,16 @@ export const ingestTool: ToolDefinition = { description: "When true, update an existing document with the same title instead of creating a new one (default: false). Ignored when document_id is provided.", }, + expected_content_hash: { + type: "string", + description: + "REQUIRED on content updates (optimistic concurrency): the content_hash of the document version you based your edit on, as returned by cerefox_get_document / cerefox_search / cerefox_metadata_search. If the document changed since you read it, the update fails with a conflict — re-read, merge, retry with the new hash. Not needed when creating a new document.", + }, + last_write_wins: { + type: "boolean", + description: + "Explicitly skip the concurrency check and overwrite regardless of concurrent changes (default: false). Use ONLY when an external source of truth makes conflicts meaningless (e.g. re-syncing from files). 
Recorded in the audit log.", + }, metadata: { type: "object", description: "Arbitrary JSON metadata (optional)" }, author: { type: "string", diff --git a/_shared/mcp-tools/metadata-search.ts b/_shared/mcp-tools/metadata-search.ts index 8fd28a8..790ef0e 100644 --- a/_shared/mcp-tools/metadata-search.ts +++ b/_shared/mcp-tools/metadata-search.ts @@ -78,6 +78,7 @@ async function handler( project_ids: string[]; project_names: string[]; version_count: number; + content_hash: string | null; content: string | null; }>; @@ -105,9 +106,10 @@ async function handler( const meta = Object.entries(row.doc_metadata ?? {}) .map(([k, v]) => `${k}=${v}`) .join(", "); + const hash = row.content_hash ? `\nhash: ${row.content_hash}` : ""; const header = `## ${row.title} [id: ${row.document_id}]\n` + - `${meta}${projects} | ${row.total_chars} chars | ${row.review_status} | updated ${row.updated_at?.slice(0, 10) ?? "?"}`; + `${meta}${projects} | ${row.total_chars} chars | ${row.review_status} | updated ${row.updated_at?.slice(0, 10) ?? "?"}${hash}`; if (include_content && row.content) { return `${header}\n\n${row.content}`; diff --git a/_shared/mcp-tools/search.ts b/_shared/mcp-tools/search.ts index 70770f2..3a8b53d 100644 --- a/_shared/mcp-tools/search.ts +++ b/_shared/mcp-tools/search.ts @@ -135,6 +135,7 @@ async function handler( is_partial?: boolean; chunk_count?: number; total_chars?: number; + content_hash?: string; }>; const parts: string[] = rows.map((row) => { @@ -144,7 +145,9 @@ async function handler( const partial = row.is_partial ? ` -- partial (${row.chunk_count} of ${(row.total_chars ?? 0).toLocaleString()} chars)` : ""; - return `## ${title}${docId}${score}${partial}\n\n${row.full_content ?? ""}`; + // content_hash = the concurrency token for cerefox_ingest updates (iter-32). + const hash = row.content_hash ? `\nhash: ${row.content_hash}` : ""; + return `## ${title}${docId}${score}${partial}${hash}\n\n${row.full_content ?? 
""}`; }); let output = parts.join("\n\n---\n\n"); diff --git a/docs/e2e-use-cases.md b/docs/e2e-use-cases.md index bc83c79..744da76 100644 --- a/docs/e2e-use-cases.md +++ b/docs/e2e-use-cases.md @@ -131,6 +131,27 @@ gated behind `CEREFOX_LIVE_E2E=1`). | `TestIdBasedIngestEdgeFunction` | `test_ingest_by_id_not_found_returns_404` | Primitive EF: non-existent UUID → HTTP 404 | Done | | `TestIdBasedIngestEdgeFunction` | `test_ingest_by_id_note_when_update_if_exists_false` | Primitive EF: `document_id` + `update_if_exists=false` → `note` field in response | Done | +### 6C. Optimistic Concurrency on Content Updates (iter-32, v0.11) + +TS suites: `packages/memory/test/write-commands.test.ts` (CLI flow) and +`packages/memory/test/ingestion/pipeline-update.test.ts` (pipeline). Both are +probe-and-skip on Supabase reachability **and on deployed schema ≥ 0.5.0** +(against an older server they skip with a "run `cerefox server deploy +--schema-only`" note instead of failing). + +| Suite | Test | Use Case | Status | +|-------|------|----------|--------| +| write-commands (CLI) | update-flow | Changed content without `--expected-content-hash` → exit non-zero + `CEREFOX_TOKEN_REQUIRED` | Done | +| write-commands (CLI) | update-flow | `document get --json` → grab `content_hash` → update with `--expected-content-hash` → updated | Done | +| write-commands (CLI) | update-flow | Re-using the stale hash → exit non-zero + `CEREFOX_CONFLICT` | Done | +| write-commands (CLI) | update-flow | `--last-write-wins` → update succeeds (check bypassed) | Done | +| pipeline-update | content change | Update with correct `expectedContentHash` → reindexed + version snapshot | Done | +| pipeline-update | content change | Stale token → throws `CEREFOX_CONFLICT`; no version snapshot created | Done | +| pipeline-update | content change | Missing token → throws `CEREFOX_TOKEN_REQUIRED`; no snapshot | Done | +| pipeline-update | content change | `lastWriteWins: true` → update succeeds, snapshot created | 
Done | +| `_shared` unit (mocked) | ingest handler | Stale `expected_content_hash` fast-fails before embedding, with merge instructions + current hash | Done | +| Web UI (manual / future Playwright) | edit conflict | Edit page sends loaded hash; concurrent change → 409 → "merge needed" toast | Manual | + ### 7. Governance Features (future e2e) | # | Use Case | Status | diff --git a/docs/guides/agent-coordination.md b/docs/guides/agent-coordination.md index 8b7e0e7..0bc94a6 100644 --- a/docs/guides/agent-coordination.md +++ b/docs/guides/agent-coordination.md @@ -33,6 +33,19 @@ The coordination model is **asynchronous and knowledge-based**: This is not real-time orchestration. It is persistent, searchable shared memory. +### Concurrent writers are conflict-guarded (v0.11+) + +Shared memory means two agents can hold the same document at once. Cerefox +protects content updates with **optimistic concurrency control**: every read +surface returns the document's `content_hash`, and a content update must pass +it back as `expected_content_hash`. If the document changed in between, the +write fails with a **conflict** instead of silently overwriting the other +writer's work — the losing agent re-reads, merges, and retries with the fresh +hash. (Versioning remains the recovery net; the conflict guard is the +prevention layer.) An explicit `last_write_wins: true` exists for re-sync flows +where an external source of truth makes conflicts meaningless — agents should +never use it to silence a conflict. See `AGENT_GUIDE.md → Concurrent writers`. + --- ## Coordination Patterns @@ -53,7 +66,7 @@ A living document where agents record decisions, experiment outcomes, and lesson **Example**: A coding agent working on a project records "Chose PostgreSQL RPC approach over application-level logic because..." in a decision log document. 
Next week, a different agent working on a related feature searches Cerefox, finds the decision log, and understands the rationale without re-deriving it. -**How it works**: Create a document with a structured format (date, context, decision, outcome). Use a consistent title or project tag so agents can find it. To add entries over time, re-ingest with `update_if_exists: true` (or `document_id`) — this replaces the document in place, so build the new full content by appending to the prior content you fetched. +**How it works**: Create a document with a structured format (date, context, decision, outcome). Use a consistent title or project tag so agents can find it. To add entries over time, re-ingest with `update_if_exists: true` (or `document_id`) — this replaces the document in place, so build the new full content by appending to the prior content you fetched, and pass the `content_hash` you fetched as `expected_content_hash` (two agents appending entries concurrently is exactly the conflict the guard catches). **Best for**: Project-level institutional memory, avoiding repeated decisions, onboarding new agent sessions. diff --git a/docs/guides/cli.md b/docs/guides/cli.md index c27aabe..b715ec9 100644 --- a/docs/guides/cli.md +++ b/docs/guides/cli.md @@ -46,6 +46,8 @@ cerefox document ingest --paste --title "<title>" [OPTIONS] # stdin | `--metadata` | `-m` | JSON | `{}` | Extra metadata as a JSON object, e.g. `'{"tags":["work"]}'`. | | `--update-if-exists` | `-u` | flag | off | Title/source-path-based fallback update. Mutually exclusive with `--document-id`. | | `--document-id` | `-i` | UUID | _none_ | Deterministic ID-based update. Errors if the document doesn't exist. | +| `--expected-content-hash` | — | sha256 | _none_ | **Required on content updates** (v0.11 optimistic concurrency): the `content_hash` of the version this edit is based on, shown by `cerefox document get` / `cerefox search`. Stale → conflict error (re-read, merge, retry). 
| +| `--last-write-wins` | — | flag | off | Skip the concurrency check and overwrite regardless of concurrent changes. For re-sync flows where an external source of truth makes conflicts meaningless. Recorded in the audit log. | | `--source` | — | str | `paste` / `file` | Source label recorded on the document. | | `--author` | — | str | `CEREFOX_AUTHOR_NAME` or `unknown` | Audit-log author identity. | | `--author-type` | — | `user`\|`agent` | `CEREFOX_AUTHOR_TYPE` or `user` | Caller type. Agent writes auto-routed to `pending_review`. | @@ -63,12 +65,19 @@ cerefox document ingest notes.md \ --author "claude-code" --author-type "agent" \ --project-name "research" --metadata '{"type":"design-doc"}' -# Deterministic update (preferred — agents should search → grab ID → ingest) +# Deterministic update (preferred — agents should search → grab ID + hash → ingest) cerefox document ingest --paste --title "Same Title" \ --document-id "abc12345-..." \ + --expected-content-hash "<hash from `document get`>" \ --author "claude-code" --author-type "agent" ``` +> **Concurrency (v0.11+)**: content updates require `--expected-content-hash` +> (or an explicit `--last-write-wins`). On a conflict, re-run +> `cerefox document get <id>`, merge your changes into the latest content, and +> retry with the new hash. `document ingest-dir` and `guides ingest` bypass the +> check internally (the filesystem / npm package is their source of truth). + **Output**: human-readable summary line(s) — "Ingested" or "Updated" with the document ID, chunk count, character count. **Exit codes**: `0` success, `1` on validation error (missing `--title`, invalid JSON, document-not-found, mutually-exclusive flags, etc.). @@ -183,7 +192,7 @@ cerefox document get abc12345-... --version-id <version-uuid> # archived cerefox document get abc12345-... | bat -l md # pipe to viewer ``` -**Output**: title + metadata line, blank line, then raw markdown. 
+**Output**: title + metadata line + `content_hash` line (the optimistic-concurrency token — pass back via `document ingest --expected-content-hash` when updating), blank line, then raw markdown. **MCP equivalent**: [`cerefox_get_document`](../../AGENT_GUIDE.md). @@ -633,7 +642,7 @@ Every MCP parameter has an exact-name CLI flag (kebab-cased). Short forms exist | MCP tool | CLI command | |---|---| | `cerefox_search(query, match_count, project_name, metadata_filter, requestor)` | `cerefox search "<q>" --match-count N --project-name <name> --metadata-filter '<json>' --requestor <name>` | -| `cerefox_ingest(title, content, project_name, metadata, update_if_exists, document_id, source, author, author_type)` (file) | `cerefox document ingest <path> --title <t> --project-name <n> --metadata '<json>' --update-if-exists\|--document-id <uuid> --source <s> --author <a> --author-type <t>` | +| `cerefox_ingest(title, content, project_name, metadata, update_if_exists, document_id, expected_content_hash, last_write_wins, source, author, author_type)` (file) | `cerefox document ingest <path> --title <t> --project-name <n> --metadata '<json>' --update-if-exists\|--document-id <uuid> --expected-content-hash <hash>\|--last-write-wins --source <s> --author <a> --author-type <t>` | | `cerefox_ingest(...)` (paste) | `printf '...' \| cerefox document ingest --paste --title "<t>"` (same flags) | | `cerefox_get_document(document_id, version_id, requestor)` | `cerefox document get <id> --version-id <vid> --requestor <name>` | | `cerefox_list_versions(document_id, requestor)` | `cerefox document version list <id> --requestor <name>` | @@ -709,12 +718,18 @@ cerefox document ingest-dir ./papers --extensions .md \ # Step 1: find it cerefox search "the OAuth design doc" --match-count 1 -# Step 2: copy the id from `Doc: ... 
(id: <uuid>)` line -# Step 3: update in place +# Step 2: read it — note the id AND the `content_hash:` line (the concurrency token) +cerefox document get "<uuid>" + +# Step 3: update in place, proving freshness with the hash from step 2 printf '%s' "$NEW_CONTENT" | cerefox document ingest --paste \ --title "OAuth 2.1 Design Document" \ --document-id "<uuid>" \ + --expected-content-hash "<hash>" \ --author "claude-code" --author-type "agent" + +# On a conflict error: repeat from step 2 (fresh content + fresh hash), +# merge your changes into the latest content, then retry. ``` ### Unattended sync job diff --git a/docs/guides/connect-agents.md b/docs/guides/connect-agents.md index 1beaed4..fb37c07 100644 --- a/docs/guides/connect-agents.md +++ b/docs/guides/connect-agents.md @@ -568,6 +568,11 @@ You have access to a personal knowledge base via the searchKnowledgeBase action. When the user asks a question, always search the knowledge base first using a relevant query. Present results by document title, citing the source for every claim. Use ingestNote to save any new information the user asks you to remember. +When UPDATING an existing document, first call getDocument and note its +content_hash, then pass it as expected_content_hash on ingestNote. If you get a +409 conflict, the document changed underneath you: call getDocument again, merge +your changes into the latest content, and retry with the new hash — never +overwrite blindly. ``` ### Path B verification @@ -604,7 +609,7 @@ In the action editor, paste this schema (replace `<your-project-ref>`): openapi: 3.1.0 info: title: Cerefox Knowledge Base - version: 1.9.0 + version: 2.0.0 servers: - url: https://<your-project-ref>.supabase.co/functions/v1 paths: @@ -731,6 +736,23 @@ paths: instead of creating a new one. The previous content is archived as a version. If content is unchanged, the document is skipped (no re-indexing). Ignored when document_id is provided. 
+ expected_content_hash: + type: string + description: > + REQUIRED on content updates (optimistic concurrency, v2.0.0): + the content_hash of the version this edit was based on, as + returned by getDocument / searchKnowledgeBase / metadataSearch. + If the document changed since it was read, the update fails + with HTTP 409 — re-read the document, merge your changes, + retry with the new hash. Not needed when creating. + last_write_wins: + type: boolean + default: false + description: > + Explicitly skip the concurrency check and overwrite regardless + of concurrent changes. Use ONLY when an external source of + truth makes conflicts meaningless. Recorded in the audit log. + Never use it to silence a 409 conflict. author: type: string description: > @@ -753,8 +775,19 @@ paths: project_id?, project_name?, # set when a project was assigned on create skipped?, # true when identical content was deduplicated updated?, # true when an existing doc was updated + content_hash?, # the NEW hash after an update (the next edit's token) message?, # human note on dedup/skip/update note? } # note when a flag (e.g. update_if_exists) was overridden + '400': + description: > + Missing expected_content_hash on a content update (and + last_write_wins not set). Read the document first, then retry + with its content_hash. + '409': + description: > + Conflict — the document changed since it was read. Call getDocument + for the latest content + content_hash, merge your changes, and + retry with the new hash. Do not overwrite blindly. /cerefox-metadata: post: operationId: listMetadataKeys @@ -802,7 +835,9 @@ paths: description: > Document content and metadata: { document_id, doc_title, full_content, chunk_count, total_chars, - is_archived, version_id } + is_archived, version_id, content_hash }. + content_hash is the document's CURRENT hash — pass it back as + expected_content_hash when updating via ingestNote. 
'404': description: Document not found /cerefox-list-versions: @@ -952,7 +987,9 @@ paths: Array of matching documents: [{ document_id, title, doc_metadata, review_status, source, created_at, updated_at, total_chars, chunk_count, project_ids, project_names, - version_count, content }] + version_count, content_hash, content }]. + content_hash is the concurrency token — pass it back as + expected_content_hash when updating via ingestNote. ``` **Step 3 — Configure authentication** diff --git a/docs/plan.md b/docs/plan.md index 73ec9d6..c21655a 100644 --- a/docs/plan.md +++ b/docs/plan.md @@ -3477,13 +3477,20 @@ bundler (`--use-api`, issue #84). Design of record: [`docs/research/local-cerefox-design.md`](research/local-cerefox-design.md). **Near-term tracks** (iteration numbers are planning IDs, not ship order): -1. **Iteration 31 — Local ONNX embedder** (fully-offline World B), target **v0.11.0**, - on `feat/local-embedder`. **This is the only remaining Iteration-30-family build.** +1. **Iteration 32 — Optimistic concurrency control**, target **v0.11.0**, on + `feat/optimistic-locking`. Motivated by a real two-agent last-write-wins incident. + Content updates now require `expected_content_hash` (compare-and-swap on the existing + `content_hash`, atomic in the ingest RPC via `FOR UPDATE`) or an explicit + `last_write_wins`. Design of record: + [`docs/specs/concurrency-control-design.md`](specs/concurrency-control-design.md). + Implemented across RPC + MCP + EF + CLI + web + docs; schema 0.5.0. +2. **Iteration 31 — Local ONNX embedder** (fully-offline World B), target **v0.12+** + (slid from v0.11.0 to make room for iter-32), on `feat/local-embedder`. Design committed; P0 implementation pending review. See iter-31 in the log above. -2. **Iteration 28 — v1.0**, the stability commitment (strict SemVer becomes binding) +3. **Iteration 28 — v1.0**, the stability commitment (strict SemVer becomes binding) + security audit. 
Trigger: ~2–3 months of v0.10 in the wild + an outside user installing unaided. -3. **Iteration 29 — Document Relations & Semantic Graph** (post-v1.0, target **v1.1+**), +4. **Iteration 29 — Document Relations & Semantic Graph** (post-v1.0, target **v1.1+**), pending — design only. Design of record: [`docs/research/document-relations-and-semantic-graph.md`](research/document-relations-and-semantic-graph.md). (The early semantic-graph exploration branch was already merged to main; diff --git a/docs/requirements-and-specs.md b/docs/requirements-and-specs.md index a9a66ca..21f0f0d 100644 --- a/docs/requirements-and-specs.md +++ b/docs/requirements-and-specs.md @@ -70,6 +70,7 @@ Projects and categories are created, renamed, and deleted by the user at any tim | FR-1.8 | Batch ingest (directory of files) | P1 | | FR-1.9 | Ingestion is fire-and-forget (async, non-blocking) | P0 | | FR-1.10 | Report ingestion failures via UI event/log | P0 | +| FR-1.11 | Optimistic concurrency on content updates (v0.11): updates require `expected_content_hash` (the `content_hash` the edit was based on, returned by all read surfaces), checked atomically in the ingest RPC; stale → conflict (re-read, merge, retry), absent → token-required; explicit `last_write_wins` opt-out for re-sync flows (audit-logged) | P0 | ### FR-2: Content Chunking diff --git a/docs/solution-design.md b/docs/solution-design.md index 6c3fb5f..6680a39 100644 --- a/docs/solution-design.md +++ b/docs/solution-design.md @@ -682,6 +682,18 @@ flowchart TD where the document has zero current chunks. `chunk_count` / `total_chars` are updated atomically with the chunk insert. +**Optimistic concurrency (v0.11 / schema 0.5.0)**: the update path locks the +document row (`SELECT … FOR UPDATE`) and compares the caller-supplied +`p_expected_content_hash` against the current `content_hash` before writing. +Stale → `CEREFOX_CONFLICT` (SQLSTATE 40001); absent (without +`p_last_write_wins`) → `CEREFOX_TOKEN_REQUIRED` (22023). 
This closes the +read→chunk+embed→write race in which two concurrent writers would silently +overwrite each other (last-write-wins) — the check is atomic at the one place all +transports share. Every document-shaped read RPC (`cerefox_get_document`, +`cerefox_search_docs`, `cerefox_metadata_search`) returns `content_hash` so +writers always hold the token. Design of record: +[`docs/specs/concurrency-control-design.md`](specs/concurrency-control-design.md). + **Single-implementation pattern**: chunking + embedding happen in the TypeScript caller; *all* write logic (snapshot, archive, insert, audit, cleanup) lives in the `cerefox_ingest_document` RPC. New write-side behaviour is added to the RPC, diff --git a/docs/specs/concurrency-control-design.md b/docs/specs/concurrency-control-design.md new file mode 100644 index 0000000..ebcf747 --- /dev/null +++ b/docs/specs/concurrency-control-design.md @@ -0,0 +1,126 @@ +# Optimistic Concurrency Control for Content Updates + +**Status**: Design of record — implemented on `feat/optimistic-locking` (Iteration 32, target v0.11.0). +**Date**: 2026-06-12 +**Motivation**: a real incident — two agent sessions updated the same document at nearly the +same time; the later write silently shadowed the earlier one (last-write-wins). Versioning +made the merge recoverable, but recovery is not prevention. Concurrent agents writing to +shared memory need the same conflict discipline as any distributed system. + +## 1. The race + +Every content-update path does: read document → chunk → **embed (seconds of external +API latency)** → call `cerefox_ingest_document`. The embedding window is the race: two +writers can both read the same base version, both embed, and the second RPC call +overwrites the first with no warning. The pre-RPC "already up-to-date" hash check in the +handlers does not help — it runs *before* the window, not atomically with the write. + +## 2.
Design: compare-and-swap on `content_hash` + +`cerefox_documents.content_hash` (SHA-256 of the normalized markdown, NOT NULL, +maintained on every write) is already the system's identity for "this exact content". +It becomes the optimistic-locking token — no new column, no migration, no second +versioning concept. + +**Writer contract (update paths only — `document_id` or `update_if_exists`):** + +1. Read the document; note its `content_hash` (now returned by every read surface). +2. Prepare the new content. +3. Call ingest with `expected_content_hash=<the hash you read>`. +4. The RPC, **atomically** (row locked with `SELECT … FOR UPDATE` inside the single + ingest transaction): + - hash matches current → proceed exactly as before (snapshot version, update, chunks). + - hash differs → **conflict error**: the document moved underneath you. Re-read, + merge, retry with the fresh hash. + - token absent → **token-required error** (see policy below). + +**Create path**: both parameters ignored — there is nothing to conflict with. + +### Policy: safe by default, explicit escape hatch + +Content updates **require** the token. There is no silent opt-out — the escape hatch is +an explicit `last_write_wins=true` flag that names the semantics the caller is choosing. +Rationale: opt-in safety doesn't get used (the incident happened precisely because +nothing required anyone to opt in). No `cerefox_config` knob: the flag *is* the policy, +visible per call and recorded in the audit description when used. 
+ +`last_write_wins` is intended for flows where an external source of truth makes +conflicts meaningless: + +| Caller | Behavior | +|---|---| +| `cerefox document ingest-dir` | passes `last_write_wins` internally (filesystem is the source of truth) | +| `cerefox guides ingest` (self-docs sync) | same | +| Python frozen fallback (`db/client.py`) | passes `last_write_wins` (preserves its historical behavior; explicitly unsafe — one deliberate exception to its frozen status) | +| Everything else (MCP, EF, CLI single-doc, web edit) | must supply `expected_content_hash` or explicitly pass the flag | + +### Error surface + +Two distinct failures, raised in the RPC so every transport behaves identically +(single-implementation principle): + +| Condition | SQLSTATE | Meaning | +|---|---|---| +| Token absent on update (and not `last_write_wins`) | `22023` (invalid_parameter_value) | caller didn't follow the read-before-write contract | +| Token stale | `40001` (serialization_failure) | the document changed underneath the caller | + +Messages are prefixed `CEREFOX_CONFLICT:` / `CEREFOX_TOKEN_REQUIRED:` so transport +handlers can detect them without parsing prose. Each handler maps them to an +agent-first instruction: the current hash, and the exact workflow (re-read via +`cerefox_get_document` → merge → retry with the new hash). The `cerefox-ingest` EF maps +conflict → HTTP 409, token-required → HTTP 400. + +**Interplay with the "already up-to-date" short-circuit**: handlers still return early +when the *new* content's hash equals the current hash — even if the caller's +`expected_content_hash` is stale. Correct: identical content cannot lose data, so no +conflict is surfaced. + +## 3. Read surfaces — where writers get the token + +`content_hash` added to every document-shaped read: + +- `cerefox_get_document` RPC → MCP tool header, CLI `document get` header, EF response. +- `cerefox_search_docs` RPC (docs mode) → search result headers (MCP + CLI + EF). 
+- `cerefox_metadata_search` RPC → result rows (the Decision-Log append workflow starts + here, so it must carry the token too). +- Web: `DocumentEditPage` keeps the hash from load, sends it on save; a 409 shows a + "document changed since you opened it — merge needed" error. + +The full 64-char hex is returned everywhere (it must round-trip exactly; no prefix +matching — cleverness there buys little and complicates the RPC contract). + +## 4. Compatibility and versioning + +- **RPC signature change** (`DROP FUNCTION` + `CREATE` with two new defaulted params, + plus new `content_hash` columns in three read RPCs' return tables) → + **`schema_version` 0.4.0 → 0.5.0** (both literals, lockstep). RPC-only change: ships + via `cerefox server deploy` re-apply; **no migration file** (no table change). +- **This is a deliberate breaking change for updaters**: an old client (pre-feature CLI + or local MCP) updating against an upgraded server fails with the token-required error + until updated (`self-update`); the doctor drift nudge already covers discovery. + Old *readers* are unaffected (extra returned column is ignored). +- **GPT Actions**: ingest action body gains the two fields; existing custom GPTs' + updates fail until the schema block is re-pasted → OpenAPI `info.version` → **2.0.0**. +- **Release**: behavior-changing default ⇒ a **minor** (proposed v0.11.0; the local + embedder slides to v0.12.0 — maintainer decides at cut time). + +## 5. Out of scope (explicit) + +- **Metadata/title edits** (`document edit`, `cerefox_set_document_projects`): metadata + is not versioned; guarding it is a separate (smaller) problem. Not covered. +- **Server-side merge**: on conflict the caller merges. Cerefox stores; agents think. +- **Pessimistic locks / leases**: wrong tool for occasionally-conflicting, + long-think-time writers; optimistic CAS + versioning-as-recovery is the fit. + +## 6. 
Alternatives considered + +- **`expected_version` (monotonic revision counter)**: classic, human-friendly + ("you have rev 5, server is at 7"), but needs a new column + migration, a second + version concept alongside archived `version_number` (which interacts with retention + cleanup), and false-conflicts on revert-to-identical-content. Rejected: the hash gives + the same protection with strictly less machinery and exact semantics ("did the + *content* move", not "did a write happen"). +- **`If-Unmodified-Since` on `updated_at`**: timestamps are brittle tokens + (serialization precision, equality semantics). Rejected. +- **Config-gated enforcement** (`require_concurrency_token`): more machinery for a + policy the flag already expresses per call. Rejected. diff --git a/frontend/src/api/documents.ts b/frontend/src/api/documents.ts index 9c73514..07f5fd3 100644 --- a/frontend/src/api/documents.ts +++ b/frontend/src/api/documents.ts @@ -28,6 +28,9 @@ export async function editDocument( content: string; project_ids: string[]; metadata: Record<string, string>; + /** Optimistic-concurrency token: the content_hash the document was + * loaded with. A concurrent change → HTTP 409 (iter-32). */ + expected_content_hash?: string | null; }, ): Promise<EditResponse> { return apiFetch<EditResponse>(`/documents/${documentId}/edit`, { diff --git a/frontend/src/api/types.ts b/frontend/src/api/types.ts index ef43a03..b1695e2 100644 --- a/frontend/src/api/types.ts +++ b/frontend/src/api/types.ts @@ -80,6 +80,9 @@ export interface DocumentDetail { created_at: string | null; updated_at: string | null; deleted_at: string | null; + /** Optimistic-concurrency token (iter-32): send back as + * expected_content_hash when saving a content edit. 
*/ + content_hash: string | null; versions: DocumentVersion[]; } diff --git a/frontend/src/components/SearchControls.tsx b/frontend/src/components/SearchControls.tsx index 1623927..40b85c8 100644 --- a/frontend/src/components/SearchControls.tsx +++ b/frontend/src/components/SearchControls.tsx @@ -127,7 +127,9 @@ export function SearchControls({ }; const keyOptions = - metadataKeys?.map((mk) => ({ value: mk.key, label: `${mk.key} (${mk.doc_count})` })) ?? []; + // Select keeps value/label separate (the bare key is what's stored), so + // the count is safe to embed in the label here — but label it clearly. + metadataKeys?.map((mk) => ({ value: mk.key, label: `${mk.key} (${mk.doc_count} docs)` })) ?? []; return ( <div className={`${ui.card} ${styles.searchBar} ${ui.rise}`}> diff --git a/frontend/src/pages/DocumentEditPage.tsx b/frontend/src/pages/DocumentEditPage.tsx index af2b885..db16ceb 100644 --- a/frontend/src/pages/DocumentEditPage.tsx +++ b/frontend/src/pages/DocumentEditPage.tsx @@ -17,6 +17,7 @@ import { useMutation, useQuery, useQueryClient } from "@tanstack/react-query"; import { useState } from "react"; import { useNavigate, useParams } from "react-router-dom"; +import { ApiError } from "../api/client"; import { editDocument, fetchDocument } from "../api/documents"; import { MarkdownViewer } from "../components/MarkdownViewer"; import { useMetadataKeys, useProjects } from "../hooks/useProjects"; @@ -72,6 +73,9 @@ export function DocumentEditPage() { content, project_ids: projectIds, metadata, + // The hash the document was loaded with — lets the server detect a + // concurrent change (another writer saved while we were editing). + expected_content_hash: doc?.content_hash ?? 
null, }); }, onSuccess: (result) => { @@ -85,6 +89,13 @@ export function DocumentEditPage() { } }, onError: (err) => { + if (err instanceof ApiError && err.status === 409) { + showError( + "Edit conflict", + "This document changed while you were editing it (another writer saved a newer version). Open the document in a new tab, merge your changes, then save again.", + ); + return; + } if (!showV07DeferredToast(err)) { showError("Save failed", err instanceof Error ? err.message : String(err)); } @@ -94,11 +105,13 @@ export function DocumentEditPage() { const projectOptions = projects?.map((p) => ({ value: p.id, label: p.name })) || []; - const keyOptions = - metadataKeys?.map((mk) => ({ - value: mk.key, - label: `${mk.key} (${mk.doc_count})`, - })) || []; + // Mantine Autocomplete inserts the option LABEL into the input on select, + // so the label must be exactly the key — embedding the doc count in it + // (`status (108)`) used to leak the count into the saved metadata key, + // polluting the KB taxonomy. The count is shown via renderOption instead + // (dropdown-only; never enters the field). + const keyCounts = new Map((metadataKeys ?? 
[]).map((mk) => [mk.key, mk.doc_count])); + const keyOptions = metadataKeys?.map((mk) => mk.key) || []; if (isLoading || !initialized) { return ( @@ -178,6 +191,14 @@ export function DocumentEditPage() { updated[idx] = { ...pair, key: v }; setMetaPairs(updated); }} + renderOption={({ option }) => ( + <Group justify="space-between" w="100%" wrap="nowrap"> + <span>{option.value}</span> + <Text size="xs" c="dimmed"> + {keyCounts.get(option.value)} docs + </Text> + </Group> + )} w={200} size="sm" /> diff --git a/packages/memory/src/cli/commands/get-doc.ts b/packages/memory/src/cli/commands/get-doc.ts index f27e9eb..86e9837 100644 --- a/packages/memory/src/cli/commands/get-doc.ts +++ b/packages/memory/src/cli/commands/get-doc.ts @@ -26,6 +26,7 @@ interface DocPayload { total_chars: number; is_archived: boolean; version_id: string | null; + content_hash: string | null; } async function action( @@ -81,6 +82,11 @@ async function action( (doc.version_id ? ` · version: ${doc.version_id}` : ""), ), ); + // The concurrency token: pass back via `document ingest + // --expected-content-hash` when updating this document (iter-32). + if (doc.content_hash) { + println(c.dim(`content_hash: ${doc.content_hash}`)); + } println(""); println(doc.full_content); } diff --git a/packages/memory/src/cli/commands/ingest-dir.ts b/packages/memory/src/cli/commands/ingest-dir.ts index c1341ea..8128333 100644 --- a/packages/memory/src/cli/commands/ingest-dir.ts +++ b/packages/memory/src/cli/commands/ingest-dir.ts @@ -144,6 +144,9 @@ async function action(dir: string, options: IngestDirOptions): Promise<void> { updateExisting: Boolean(options.updateIfExists), author, authorType: authorType as "user" | "agent", + // Filesystem-sync semantics: the directory IS the source of truth, so + // the optimistic-concurrency check is bypassed by design (iter-32). 
+ lastWriteWins: true, }); outcomes.push({ file, diff --git a/packages/memory/src/cli/commands/ingest.ts b/packages/memory/src/cli/commands/ingest.ts index c8672e7..7d01e22 100644 --- a/packages/memory/src/cli/commands/ingest.ts +++ b/packages/memory/src/cli/commands/ingest.ts @@ -47,6 +47,8 @@ interface IngestOptions { metadata?: string; source?: string; updateIfExists?: boolean; + expectedContentHash?: string; + lastWriteWins?: boolean; documentId?: string; author?: string; authorType?: string; @@ -155,6 +157,8 @@ async function action( documentId: options.documentId ?? null, author, authorType: authorType as "user" | "agent", + expectedContentHash: options.expectedContentHash ?? null, + lastWriteWins: Boolean(options.lastWriteWins), }) : await pipeline.ingestText({ text: content, @@ -167,6 +171,8 @@ async function action( documentId: options.documentId ?? null, author, authorType: authorType as "user" | "agent", + expectedContentHash: options.expectedContentHash ?? null, + lastWriteWins: Boolean(options.lastWriteWins), }); // Match the legacy `ingestTool.handler` output shape — users may @@ -228,6 +234,14 @@ export function registerIngest(program: Command): void { "-i, --document-id <uuid>", "Update a specific document by UUID (overrides --update-if-exists).", ) + .option( + "--expected-content-hash <sha256>", + "Optimistic-concurrency token: the content_hash of the version this edit is based on (shown by `document get` / `search`). 
Required on content updates unless --last-write-wins.", + ) + .option( + "--last-write-wins", + "Skip the concurrency check and overwrite regardless of concurrent changes (recorded in the audit log).", + ) .option("-a, --author <name>", "Caller identity (audit log).") .option( "--author-type <type>", diff --git a/packages/memory/src/cli/commands/search.ts b/packages/memory/src/cli/commands/search.ts index 57b13d0..4fcfba0 100644 --- a/packages/memory/src/cli/commands/search.ts +++ b/packages/memory/src/cli/commands/search.ts @@ -43,6 +43,7 @@ interface DocResult { doc_updated_at: string | null; version_count: number; doc_project_ids: string[] | null; + content_hash: string | null; } interface ChunkResult { @@ -226,10 +227,14 @@ async function action( ? doc.best_chunk_heading_path.join(" › ") : null; const updated = doc.doc_updated_at ? doc.doc_updated_at.slice(0, 10) : null; - if (bestMatch || updated) { + // content_hash = the concurrency token for `document ingest + // --expected-content-hash` (iter-32). + const hash = doc.content_hash ? `hash: ${doc.content_hash}` : null; + if (bestMatch || updated || hash) { const bits = [ bestMatch ? `best match: ${bestMatch}` : null, updated ? `updated ${updated}` : null, + hash, ].filter(Boolean); println(c.dim(` ${bits.join(" · ")}`)); } diff --git a/packages/memory/src/cli/commands/sync-self-docs.ts b/packages/memory/src/cli/commands/sync-self-docs.ts index b919572..3f9a65f 100644 --- a/packages/memory/src/cli/commands/sync-self-docs.ts +++ b/packages/memory/src/cli/commands/sync-self-docs.ts @@ -99,6 +99,9 @@ export async function runSyncSelfDocs(options: SyncSelfDocsOptions = {}): Promis topic: doc.topic, }, update_if_exists: true, + // Bundled-docs sync: the npm package is the source of truth, so the + // optimistic-concurrency check is bypassed by design (iter-32). 
+ last_write_wins: true, project_name: project, author, author_type: authorType, diff --git a/packages/memory/src/ingestion/client-bridge.ts b/packages/memory/src/ingestion/client-bridge.ts index 41c98d5..0ff6f57 100644 --- a/packages/memory/src/ingestion/client-bridge.ts +++ b/packages/memory/src/ingestion/client-bridge.ts @@ -29,6 +29,11 @@ import type { SupabaseClient } from "@supabase/supabase-js"; +import { + ConcurrencyConflictError, + ConcurrencyTokenRequiredError, +} from "./types.ts"; + // ── Types ─────────────────────────────────────────────────────────────────── export interface DocumentRow { @@ -191,6 +196,8 @@ export class IngestionDbBridge { sourceLabel?: string; retentionHours?: number; cleanupEnabled?: boolean; + expectedContentHash?: string | null; + lastWriteWins?: boolean; }): Promise<IngestDocumentRpcResult> { const params: Record<string, unknown> = { p_document_id: args.documentId, @@ -209,12 +216,28 @@ export class IngestionDbBridge { params.p_retention_hours = args.retentionHours; if (args.cleanupEnabled !== undefined) params.p_cleanup_enabled = args.cleanupEnabled; + // Optimistic concurrency (iter-32). Always sent on the update path so the + // RPC's token-required default applies; the RPC ignores both on create. + if (args.documentId !== null) { + params.p_expected_content_hash = args.expectedContentHash ?? null; + params.p_last_write_wins = args.lastWriteWins ?? false; + } const { data, error } = await this.supabase.rpc( "cerefox_ingest_document", params, ); - if (error) throw new Error(error.message ?? JSON.stringify(error)); + if (error) { + const msg = error.message ?? JSON.stringify(error); + if (msg.includes("CEREFOX_CONFLICT")) { + const current = msg.match(/current hash ([0-9a-f]{64})/)?.[1] ?? null; + throw new ConcurrencyConflictError(args.documentId ?? 
"", current, msg); + } + if (msg.includes("CEREFOX_TOKEN_REQUIRED")) { + throw new ConcurrencyTokenRequiredError(msg); + } + throw new Error(msg); + } // RPC returns either a single object or an array-with-one-object // depending on Supabase client version. Normalise. if (Array.isArray(data) && data.length > 0) { diff --git a/packages/memory/src/ingestion/pipeline.ts b/packages/memory/src/ingestion/pipeline.ts index 9cfba13..0cc1392 100644 --- a/packages/memory/src/ingestion/pipeline.ts +++ b/packages/memory/src/ingestion/pipeline.ts @@ -36,6 +36,7 @@ import { } from "./client-bridge.ts"; import { fileToMarkdown } from "./file-to-markdown.ts"; import { + ConcurrencyConflictError, loadPipelineSettings, type IngestResult, type IngestTextOptions, @@ -96,6 +97,8 @@ export class IngestionPipeline { documentId, author = "unknown", authorType = "user", + expectedContentHash, + lastWriteWins = false, } = opts; const listFormProvided = @@ -125,6 +128,8 @@ export class IngestionPipeline { metadata, author, authorType, + expectedContentHash, + lastWriteWins, }); if (!listFormProvided && (projectId || projectName)) { const singular = await resolveProjectIds( @@ -169,6 +174,8 @@ export class IngestionPipeline { metadata, author, authorType, + expectedContentHash, + lastWriteWins, }); if (!listFormProvided && (projectId || projectName)) { const singular = await resolveProjectIds( @@ -311,6 +318,8 @@ export class IngestionPipeline { metadata, author = "unknown", authorType = "user", + expectedContentHash, + lastWriteWins = false, } = opts; // ── (1) Verify document exists ─────────────────────────────────────── @@ -324,6 +333,25 @@ export class IngestionPipeline { const contentUnchanged = newHash === existing.content_hash; if (!contentUnchanged) { + // Optimistic-concurrency fast-fail (iter-32): a stale token fails here, + // BEFORE the embedding spend. Advisory only — the authoritative, + // race-free check is inside the cerefox_ingest_document RPC + // (SELECT … FOR UPDATE). 
Content-unchanged saves are exempt: identical + // content cannot lose data. + if ( + !lastWriteWins && + expectedContentHash && + expectedContentHash !== existing.content_hash + ) { + throw new ConcurrencyConflictError( + documentId, + existing.content_hash, + `CEREFOX_CONFLICT: document ${documentId} changed since it was read ` + + `(expected hash ${expectedContentHash}, current hash ${existing.content_hash}). ` + + `Re-read the document, merge your changes, and retry with the new hash.`, + ); + } + const collision = await this.db.getDocumentByHash(newHash); if (collision && collision.id !== documentId) { throw new Error( @@ -467,6 +495,8 @@ export class IngestionPipeline { sourceLabel: source, retentionHours: this.settings.versionRetentionHours, cleanupEnabled: this.settings.versionCleanupEnabled, + expectedContentHash: expectedContentHash ?? null, + lastWriteWins, }); // Update project membership if explicitly provided. @@ -524,5 +554,7 @@ export { type IngestTextOptions, type UpdateDocumentOptions, type PipelineSettings, + ConcurrencyConflictError, + ConcurrencyTokenRequiredError, DEFAULT_PIPELINE_SETTINGS, } from "./types.ts"; diff --git a/packages/memory/src/ingestion/types.ts b/packages/memory/src/ingestion/types.ts index cb92a62..b56a1db 100644 --- a/packages/memory/src/ingestion/types.ts +++ b/packages/memory/src/ingestion/types.ts @@ -61,6 +61,14 @@ export interface IngestTextOptions { documentId?: string | null; // explicit update; bypasses dedup author?: string; // default "unknown" authorType?: "user" | "agent"; // default "user" + /** + * Optimistic-concurrency token (iter-32): the content_hash of the document + * version this edit was based on. Required by the RPC on content updates + * unless `lastWriteWins` is set. Ignored on create. + */ + expectedContentHash?: string | null; + /** Explicitly skip the concurrency check (filesystem-sync flows). */ + lastWriteWins?: boolean; } /** Options for `updateDocument`. 
Mirrors Python's `update_document(...)`. */ @@ -74,6 +82,38 @@ export interface UpdateDocumentOptions { metadata?: Record<string, unknown> | null; author?: string; authorType?: "user" | "agent"; + /** Optimistic-concurrency token — see IngestTextOptions.expectedContentHash. */ + expectedContentHash?: string | null; + /** Explicitly skip the concurrency check (filesystem-sync flows). */ + lastWriteWins?: boolean; +} + +/** + * Thrown when a content update loses the optimistic-concurrency race + * (iter-32): the document's content_hash moved between read and write. + * Callers should re-read the document, merge, and retry with the new hash. + * The web routes map this to HTTP 409. + */ +export class ConcurrencyConflictError extends Error { + readonly documentId: string; + readonly currentHash: string | null; + constructor(documentId: string, currentHash: string | null, message: string) { + super(message); + this.name = "ConcurrencyConflictError"; + this.documentId = documentId; + this.currentHash = currentHash; + } +} + +/** + * Thrown when a content update supplies neither expectedContentHash nor + * lastWriteWins. The web routes map this to HTTP 400. + */ +export class ConcurrencyTokenRequiredError extends Error { + constructor(message: string) { + super(message); + this.name = "ConcurrencyTokenRequiredError"; + } } /** Settings the pipeline needs. Subset of the broader app settings. */ diff --git a/packages/memory/src/web/routes/documents-read.ts b/packages/memory/src/web/routes/documents-read.ts index 4581095..8c7ea13 100644 --- a/packages/memory/src/web/routes/documents-read.ts +++ b/packages/memory/src/web/routes/documents-read.ts @@ -175,6 +175,9 @@ export function registerDocumentReadRoutes(app: Hono, ctx: WebContext): void { created_at: meta ? ((meta.created_at as string | null) ?? null) : null, updated_at: meta ? ((meta.updated_at as string | null) ?? null) : null, deleted_at: meta ? ((meta.deleted_at as string | null) ?? 
null) : null, + // Optimistic-concurrency token (iter-32): the edit page sends this back + // as expected_content_hash on save. Always the CURRENT hash. + content_hash: meta ? ((meta.content_hash as string | null) ?? null) : null, versions: versions.map((v) => ({ version_id: v.version_id, version_number: v.version_number, diff --git a/packages/memory/src/web/routes/documents-write.ts b/packages/memory/src/web/routes/documents-write.ts index db38d71..7ee52c2 100644 --- a/packages/memory/src/web/routes/documents-write.ts +++ b/packages/memory/src/web/routes/documents-write.ts @@ -35,7 +35,11 @@ import { Hono } from "hono"; import { contentHash } from "../../../../../_shared/ingest/index.ts"; -import { IngestionPipeline } from "../../ingestion/pipeline.ts"; +import { + ConcurrencyConflictError, + ConcurrencyTokenRequiredError, + IngestionPipeline, +} from "../../ingestion/pipeline.ts"; import type { WebContext } from "../context.ts"; // `normaliseForHash` + `contentHash` promoted to `_shared/ingest/pipeline- @@ -162,9 +166,37 @@ export function registerDocumentWriteRoutes(app: Hono, ctx: WebContext): void { metadata: Object.keys(metadata).length > 0 ? metadata : undefined, author: "web-ui", authorType: "user", + // Optimistic concurrency (iter-32): the SPA sends the content_hash + // it loaded the document with; a concurrent change → 409 below. + expectedContentHash: + typeof body.expected_content_hash === "string" + ? body.expected_content_hash + : null, }); return c.json({ success: true, reindexed: result.reindexed }); } catch (err) { + if (err instanceof ConcurrencyConflictError) { + return c.json( + { + success: false, + error: "conflict", + message: + "This document changed while you were editing it (another writer saved a newer version). 
Open it again in a new tab, merge your changes, and save from there.", + current_hash: err.currentHash, + }, + 409, + ); + } + if (err instanceof ConcurrencyTokenRequiredError) { + return c.json( + { + success: false, + error: "expected_content_hash required", + message: err.message, + }, + 400, + ); + } return c.json( { success: false, diff --git a/packages/memory/test/ingestion/pipeline-update.test.ts b/packages/memory/test/ingestion/pipeline-update.test.ts index b7d9fc1..a6713c3 100644 --- a/packages/memory/test/ingestion/pipeline-update.test.ts +++ b/packages/memory/test/ingestion/pipeline-update.test.ts @@ -36,13 +36,33 @@ const RUN_TAG = String(Date.now()); describe("IngestionPipeline.updateDocument (live)", () => { let supabase: SupabaseClient | null = null; let pipeline: IngestionPipeline | null = null; + let schemaTooOld = false; const created: string[] = []; - beforeAll(() => { + beforeAll(async () => { if (!LIVE_OK) return; - supabase = createClient(SUPABASE_URL, SUPABASE_KEY, { + const client = createClient(SUPABASE_URL, SUPABASE_KEY, { auth: { persistSession: false }, }); + // iter-32 gate: the update path now requires the v0.5.0 schema + // (p_expected_content_hash / p_last_write_wins on the ingest RPC). + // Against an older deployed server, leave the suite skipped instead of + // failing with "function not found". + try { + const { data: ver } = await client.rpc("cerefox_schema_version"); + const [maj = 0, min = 0] = String(ver ?? 
"0.0.0").split(".").map(Number); + if (maj === 0 && min < 5) { + schemaTooOld = true; + console.log( + `(skipped: deployed schema ${ver} < 0.5.0 — run \`cerefox server deploy --schema-only\` to enable these tests)`, + ); + return; + } + } catch { + schemaTooOld = true; + return; + } + supabase = client; pipeline = new IngestionPipeline({ supabase, openAiApiKey: OPENAI_API_KEY, @@ -74,6 +94,10 @@ describe("IngestionPipeline.updateDocument (live)", () => { console.log("(skipped: Supabase + OpenAI not both available)"); return; } + if (schemaTooOld) { + // Deployed schema predates iter-32; the beforeAll left the suite off. + return; + } expect(pipeline).not.toBeNull(); }); @@ -134,23 +158,67 @@ describe("IngestionPipeline.updateDocument (live)", () => { }); created.push(v1.documentId); + // Read the current hash — the optimistic-concurrency token (iter-32). + const { data: v1Row } = await supabase + .from("cerefox_documents") + .select("content_hash") + .eq("id", v1.documentId) + .maybeSingle(); + const v1Hash = v1Row?.content_hash as string; + expect(v1Hash).toBeTruthy(); + const newText = "# Initial\n\nv2 content — totally new. Run " + RUN_TAG + ".\n"; const updated = await pipeline.updateDocument({ documentId: v1.documentId, text: newText, title, author: "pipeline-update-test", + expectedContentHash: v1Hash, }); expect(updated.action).toBe("updated"); expect(updated.reindexed).toBe(true); - // Verify a version was snapshotted. + // ── iter-32: concurrency contract ──────────────────────────────────── + // (a) Updating again with the now-STALE v1 hash → conflict. + await expect( + pipeline.updateDocument({ + documentId: v1.documentId, + text: "# Initial\n\nv3 from a stale base. Run " + RUN_TAG + ".\n", + title, + author: "pipeline-update-test", + expectedContentHash: v1Hash, + }), + ).rejects.toThrow(/CEREFOX_CONFLICT/); + + // (b) Updating with NO token and no last-write-wins → token required. 
+ await expect( + pipeline.updateDocument({ + documentId: v1.documentId, + text: "# Initial\n\nv3 tokenless. Run " + RUN_TAG + ".\n", + title, + author: "pipeline-update-test", + }), + ).rejects.toThrow(/CEREFOX_TOKEN_REQUIRED/); + + // (c) last_write_wins=true bypasses the check. + const forced = await pipeline.updateDocument({ + documentId: v1.documentId, + text: "# Initial\n\nv3 forced. Run " + RUN_TAG + ".\n", + title, + author: "pipeline-update-test", + lastWriteWins: true, + }); + expect(forced.reindexed).toBe(true); + + // Verify versions were snapshotted: one for the token-checked update, + // one for the forced (last-write-wins) update. The two failed attempts + // (stale token, missing token) must NOT have created snapshots. const { data: versions } = await supabase .from("cerefox_document_versions") .select("id, version_number") .eq("document_id", v1.documentId); - expect(versions?.length).toBe(1); - expect(versions?.[0].version_number).toBe(1); + expect(versions?.length).toBe(2); + expect(versions?.map((v) => v.version_number).sort()).toEqual([1, 2]); // Verify the new content_hash on the doc row differs. const { data: row } = await supabase diff --git a/packages/memory/test/write-commands.test.ts b/packages/memory/test/write-commands.test.ts index 56f77a4..9b3e3f6 100644 --- a/packages/memory/test/write-commands.test.ts +++ b/packages/memory/test/write-commands.test.ts @@ -81,6 +81,23 @@ async function hardPurgeE2eDocs(): Promise<void> { const probe = run(["project", "list", "--json"]); const LIVE_OK = probe.status === 0; +// iter-32 gate: content updates require the v0.5.0 schema +// (p_expected_content_hash / p_last_write_wins on cerefox_ingest_document). +// Against an older deployed server the update-flow test skips instead of +// failing with "function not found". 
+const SCHEMA_OK = await (async () => { + if (!LIVE_OK) return false; + try { + const settings = loadSettings(); + const client = createClient(settings); + const ver = await client.rpc<string>("cerefox_schema_version", {}); + const [maj = 0, min = 0] = String(ver ?? "0.0.0").split(".").map(Number); + return maj > 0 || min >= 5; + } catch { + return false; + } +})(); + // Track docs we create so we can clean them up regardless of test // success / failure. const createdIds: string[] = []; @@ -201,7 +218,16 @@ describe("cerefox write commands (live)", () => { expect(r2.status).toBe(0); expect(r2.stdout).toContain("up-to-date"); - // Re-ingest with changed content → updated. + if (!SCHEMA_OK) { + console.log( + "(update-flow steps skipped: deployed schema < 0.5.0 — run `cerefox server deploy --schema-only`)", + ); + return; + } + + // Re-ingest with changed content but NO concurrency token → rejected + // (iter-32: content updates require --expected-content-hash or + // --last-write-wins). const r3 = run( [ "document", "ingest", @@ -218,8 +244,76 @@ describe("cerefox write commands (live)", () => { ], { stdin: "# Update flow\nV2 content.\n" }, ); - expect(r3.status).toBe(0); - expect(r3.stdout).toContain("updated"); + expect(r3.status).not.toBe(0); + expect(r3.stderr + r3.stdout).toContain("CEREFOX_TOKEN_REQUIRED"); + + // Fetch the current hash (the token) via `document get --json`. + const rGet = run(["document", "get", id!, "--json"]); + expect(rGet.status).toBe(0); + const currentHash = (JSON.parse(rGet.stdout) as { content_hash?: string }) + .content_hash; + expect(currentHash).toBeTruthy(); + + // Changed content WITH the token → updated. 
+ const r4 = run( + [ + "document", "ingest", + "--paste", + "--title", + title, + "--project-name", + "_e2e-v0.5", + "--update-if-exists", + "--expected-content-hash", + currentHash!, + "--author", + "v0.5-test", + "--author-type", + "agent", + ], + { stdin: "# Update flow\nV2 content.\n" }, + ); + expect(r4.status).toBe(0); + expect(r4.stdout).toContain("updated"); + + // Re-using the now-STALE token → conflict. + const r5 = run( + [ + "document", "ingest", + "--paste", + "--title", + title, + "--update-if-exists", + "--expected-content-hash", + currentHash!, + "--author", + "v0.5-test", + "--author-type", + "agent", + ], + { stdin: "# Update flow\nV3 content.\n" }, + ); + expect(r5.status).not.toBe(0); + expect(r5.stderr + r5.stdout).toContain("CEREFOX_CONFLICT"); + + // --last-write-wins bypasses the check. + const r6 = run( + [ + "document", "ingest", + "--paste", + "--title", + title, + "--update-if-exists", + "--last-write-wins", + "--author", + "v0.5-test", + "--author-type", + "agent", + ], + { stdin: "# Update flow\nV4 content.\n" }, + ); + expect(r6.status).toBe(0); + expect(r6.stdout).toContain("updated"); }); test("ingest-dir: walks tree and ingests matching files", () => { diff --git a/src/cerefox/db/client.py b/src/cerefox/db/client.py index 9a0fda0..6691aef 100644 --- a/src/cerefox/db/client.py +++ b/src/cerefox/db/client.py @@ -763,6 +763,11 @@ def ingest_document_rpc( "p_source_label": source_label, "p_retention_hours": retention_hours, "p_cleanup_enabled": cleanup_enabled, + # iter-32 (optimistic concurrency): the frozen Python fallback + # predates the expected_content_hash contract, so it explicitly + # declares last-write-wins to preserve its historical behavior. + # This is the one deliberate exception to "Python is frozen". 
+ "p_last_write_wins": True, } rows = self.rpc("cerefox_ingest_document", params) diff --git a/src/cerefox/db/rpcs.sql b/src/cerefox/db/rpcs.sql index 2f1dee8..4f78bd8 100644 --- a/src/cerefox/db/rpcs.sql +++ b/src/cerefox/db/rpcs.sql @@ -38,6 +38,13 @@ DROP FUNCTION IF EXISTS cerefox_hybrid_search(TEXT, VECTOR(768), INT, FLOAT, BOO DROP FUNCTION IF EXISTS cerefox_fts_search(TEXT, INT, UUID); DROP FUNCTION IF EXISTS cerefox_semantic_search(VECTOR(768), INT, BOOLEAN, UUID, FLOAT); DROP FUNCTION IF EXISTS cerefox_reconstruct_doc(UUID); + +-- Iteration 32 (v0.11, optimistic concurrency): content_hash added to the return +-- types of all document-shaped reads — the writer's concurrency token must be +-- obtainable from every read surface. Drop the pre-change signatures first. +DROP FUNCTION IF EXISTS cerefox_get_document(UUID, UUID); +DROP FUNCTION IF EXISTS cerefox_search_docs(TEXT, VECTOR(768), INT, FLOAT, UUID, FLOAT, INT, INT, JSONB); +DROP FUNCTION IF EXISTS cerefox_metadata_search(JSONB, UUID, TIMESTAMPTZ, TIMESTAMPTZ, INT, BOOLEAN, INT); DROP FUNCTION IF EXISTS cerefox_search_docs(TEXT, VECTOR(768), INT, FLOAT, UUID, FLOAT); -- Iteration 13: Drop pre-metadata-filter signatures so we can add p_metadata_filter JSONB. @@ -592,7 +599,10 @@ RETURNS TABLE ( total_chars INT, doc_updated_at TIMESTAMPTZ, version_count INT, - is_partial BOOL + is_partial BOOL, + -- Optimistic-concurrency token (iter-32): the document's current + -- content_hash, to pass back as expected_content_hash on update. 
+ content_hash TEXT ) LANGUAGE sql SECURITY DEFINER @@ -625,7 +635,8 @@ AS $$ cr.doc_project_ids, cr.doc_project_names, cr.version_count, - d.updated_at AS doc_updated_at + d.updated_at AS doc_updated_at, + d.content_hash FROM chunk_results cr JOIN cerefox_documents d ON d.id = cr.document_id ORDER BY cr.document_id, cr.score DESC @@ -707,7 +718,8 @@ AS $$ ds.total_chars, -- always full document size, even for partial results td.doc_updated_at, td.version_count, - ac.is_partial + ac.is_partial, + td.content_hash FROM top_docs td JOIN doc_sizes ds ON ds.document_id = td.document_id JOIN all_content ac ON ac.document_id = td.document_id @@ -832,7 +844,11 @@ RETURNS TABLE ( full_content TEXT, chunk_count INT, total_chars INT, - created_at TIMESTAMPTZ + created_at TIMESTAMPTZ, + -- Current content_hash of the document — the optimistic-concurrency token + -- to pass back as expected_content_hash on update (iter-32). Note: always + -- the CURRENT hash, even when an archived version is being retrieved. 
+ content_hash TEXT ) LANGUAGE sql SECURITY DEFINER @@ -853,7 +869,8 @@ AS $$ STRING_AGG(c.content, E'\n\n' ORDER BY c.chunk_index) AS full_content, COUNT(*)::INT AS chunk_count, SUM(c.char_count)::INT AS total_chars, - d.created_at + d.created_at, + d.content_hash FROM cerefox_documents d JOIN cerefox_chunks c ON c.document_id = d.id WHERE d.id = p_document_id @@ -861,7 +878,7 @@ AS $$ (p_version_id IS NULL AND c.version_id IS NULL) OR (p_version_id IS NOT NULL AND c.version_id = p_version_id) ) - GROUP BY d.id, d.title, d.source, d.metadata, d.created_at; + GROUP BY d.id, d.title, d.source, d.metadata, d.created_at, d.content_hash; $$; -- ── cerefox_list_document_versions ──────────────────────────────────────────── @@ -1037,11 +1054,21 @@ $$; -- p_source_label : version source label for snapshot ('file','paste','agent','manual') -- p_retention_hours : for version cleanup (default 48) -- p_cleanup_enabled : whether version cleanup runs (default true) +-- p_expected_content_hash : optimistic-concurrency token (iter-32). On the UPDATE +-- path this must equal the document's current content_hash — +-- the caller proves they based their edit on the live version. +-- Mismatch → CEREFOX_CONFLICT (SQLSTATE 40001). Absent (NULL) +-- without p_last_write_wins → CEREFOX_TOKEN_REQUIRED (22023). +-- Ignored on the CREATE path. +-- p_last_write_wins : explicit opt-out of the concurrency check (filesystem-sync +-- flows where an external source of truth makes conflicts +-- meaningless). Recorded in the audit description when used. 
-- -- Returns: document_id, chunk_count, total_chars, operation ('create' or 'update-content'), -- version_id (UUID of snapshot, null on create) DROP FUNCTION IF EXISTS cerefox_ingest_document(UUID, TEXT, TEXT, TEXT, TEXT, JSONB, TEXT, JSONB, TEXT, TEXT, TEXT, INT, BOOLEAN); +DROP FUNCTION IF EXISTS cerefox_ingest_document(UUID, TEXT, TEXT, TEXT, TEXT, JSONB, TEXT, JSONB, TEXT, TEXT, TEXT, INT, BOOLEAN, TEXT, BOOLEAN); CREATE FUNCTION cerefox_ingest_document( p_document_id UUID DEFAULT NULL, p_title TEXT DEFAULT 'Untitled', @@ -1055,7 +1082,9 @@ CREATE FUNCTION cerefox_ingest_document( p_author_type TEXT DEFAULT 'user', p_source_label TEXT DEFAULT 'manual', p_retention_hours INT DEFAULT 48, - p_cleanup_enabled BOOLEAN DEFAULT TRUE + p_cleanup_enabled BOOLEAN DEFAULT TRUE, + p_expected_content_hash TEXT DEFAULT NULL, + p_last_write_wins BOOLEAN DEFAULT FALSE ) RETURNS TABLE ( document_id UUID, @@ -1075,6 +1104,7 @@ DECLARE v_operation TEXT; v_version_id UUID := NULL; v_old_chars INT := 0; + v_current_hash TEXT; v_chunk JSONB; v_snap RECORD; v_status TEXT; @@ -1114,9 +1144,38 @@ BEGIN v_doc_id := p_document_id; v_operation := 'update-content'; - -- Get old size for audit - SELECT COALESCE(d.total_chars, 0) INTO v_old_chars - FROM cerefox_documents d WHERE d.id = v_doc_id; + -- Lock the row and read its current state. FOR UPDATE makes the + -- concurrency check below atomic with the write: two simultaneous + -- updaters serialize here, and the second one sees the first one's + -- hash — the race window (chunk + embed latency) is closed at the + -- only place all transports share (iter-32). 
+ SELECT COALESCE(d.total_chars, 0), d.content_hash + INTO v_old_chars, v_current_hash + FROM cerefox_documents d WHERE d.id = v_doc_id + FOR UPDATE; + + IF NOT FOUND THEN + RAISE EXCEPTION 'cerefox_ingest_document: document not found: %', v_doc_id + USING ERRCODE = '22023'; -- invalid_parameter_value + END IF; + + -- ── Optimistic concurrency check (iter-32) ─────────────────── + -- Content updates must prove freshness (expected hash) or explicitly + -- choose last-write-wins. Message prefixes are machine-detectable: + -- transport handlers map them to agent-first retry instructions. + IF NOT p_last_write_wins THEN + IF p_expected_content_hash IS NULL THEN + RAISE EXCEPTION + 'CEREFOX_TOKEN_REQUIRED: content updates require expected_content_hash (the content_hash you read) or last_write_wins=true. Current hash: %', + v_current_hash + USING ERRCODE = '22023'; -- invalid_parameter_value + ELSIF p_expected_content_hash <> v_current_hash THEN + RAISE EXCEPTION + 'CEREFOX_CONFLICT: document % changed since it was read (expected hash %, current hash %). 
Re-read the document, merge your changes, and retry with the new hash.', + v_doc_id, p_expected_content_hash, v_current_hash + USING ERRCODE = '40001'; -- serialization_failure + END IF; + END IF; -- Snapshot old version (archives current chunks, runs retention cleanup) SELECT sv.version_id INTO v_version_id @@ -1183,6 +1242,8 @@ BEGIN p_size_before := CASE WHEN v_operation = 'create' THEN NULL ELSE v_old_chars END, p_size_after := v_total_chars, p_description := v_operation || ': ' || p_title || ' (' || v_chunk_count || ' chunks, ' || v_total_chars || ' chars)' + || CASE WHEN p_last_write_wins AND v_operation = 'update-content' + THEN ' [last-write-wins]' ELSE '' END ); RETURN QUERY SELECT v_doc_id, v_chunk_count, v_total_chars, v_operation, v_version_id; @@ -1407,6 +1468,9 @@ RETURNS TABLE ( project_ids UUID[], project_names TEXT[], version_count INT, + -- Optimistic-concurrency token (iter-32): pass back as + -- expected_content_hash on update. + content_hash TEXT, content TEXT ) LANGUAGE plpgsql @@ -1436,6 +1500,7 @@ BEGIN WHERE dp.document_id = d.id) AS project_names, (SELECT COUNT(*)::INT FROM cerefox_document_versions dv WHERE dv.document_id = d.id) AS version_count, + d.content_hash, CASE WHEN p_include_content THEN (SELECT STRING_AGG(c.content, E'\n\n' ORDER BY c.chunk_index) FROM cerefox_chunks c @@ -1474,6 +1539,7 @@ BEGIN project_ids := v_row.project_ids; project_names := v_row.project_names; version_count := v_row.version_count; + content_hash := v_row.content_hash; content := v_row.content; RETURN NEXT; END LOOP; @@ -1694,7 +1760,7 @@ SET search_path = public, pg_catalog AS $$ -- Keep in lockstep with the `@version:` marker in schema.sql (cut_release.ts -- enforces it). Bump whenever schema.sql OR rpcs.sql changes. 
- SELECT '0.4.0'::TEXT; + SELECT '0.5.0'::TEXT; $$; diff --git a/src/cerefox/db/schema.sql b/src/cerefox/db/schema.sql index 4c089af..f8c9ef8 100644 --- a/src/cerefox/db/schema.sql +++ b/src/cerefox/db/schema.sql @@ -5,7 +5,7 @@ -- Requires extensions: vector (pgvector), uuid-ossp -- These are enabled at the top of db_deploy.py before this file is applied. -- --- @version: 0.4.0 +-- @version: 0.5.0 -- The `@version` marker above is read by the schema-version-mismatch banner -- (see /api/v1/schema-version). Bump it whenever schema.sql OR rpcs.sql -- changes in a way that requires `cerefox server deploy` to be re-run — diff --git a/supabase/functions/cerefox-get-document/index.ts b/supabase/functions/cerefox-get-document/index.ts index a9e794d..c5305b3 100644 --- a/supabase/functions/cerefox-get-document/index.ts +++ b/supabase/functions/cerefox-get-document/index.ts @@ -17,7 +17,7 @@ import { isVersionRequest, versionResponse } from "../../../_shared/ef-meta/inde * { document_id: string, version_id?: string | null } * * Response (200): - * { document_id, doc_title, full_content, chunk_count, total_chars, is_archived, version_id } + * { document_id, doc_title, full_content, chunk_count, total_chars, is_archived, version_id, content_hash } * Response (404): * { error: "Document not found" } * Response (400): @@ -98,6 +98,7 @@ Deno.serve(async (req: Request): Promise<Response> => { full_content?: string; chunk_count?: number; total_chars?: number; + content_hash?: string; } | undefined; if (!row) { @@ -125,6 +126,9 @@ Deno.serve(async (req: Request): Promise<Response> => { total_chars: row.total_chars ?? 0, is_archived: version_id !== null, version_id, + // Optimistic-concurrency token (iter-32): always the CURRENT hash — + // pass back as expected_content_hash when updating via ingest. + content_hash: row.content_hash ?? 
null, }), { status: 200, headers: { ...CORS_HEADERS, "Content-Type": "application/json" } }, ); diff --git a/supabase/functions/cerefox-ingest/index.ts b/supabase/functions/cerefox-ingest/index.ts index e34d55a..ceeb127 100644 --- a/supabase/functions/cerefox-ingest/index.ts +++ b/supabase/functions/cerefox-ingest/index.ts @@ -40,6 +40,42 @@ interface IngestRequest { update_if_exists?: boolean; author?: string; author_type?: string; // 'user' | 'agent' + // Optimistic concurrency (iter-32): REQUIRED on content updates — the + // content_hash of the version this edit was based on. Conflict → HTTP 409. + expected_content_hash?: string; + // Explicitly skip the concurrency check (external source of truth). + last_write_wins?: boolean; +} + +// Map the RPC's CEREFOX_CONFLICT / CEREFOX_TOKEN_REQUIRED errors to HTTP +// responses (409 conflict / 400 missing token). Returns null for other errors. +function concurrencyErrorResponse( + message: string, + headers: Record<string, string>, +): Response | null { + if (message.includes("CEREFOX_CONFLICT")) { + return new Response( + JSON.stringify({ + error: "conflict", + message: + "Document changed since it was read. 
Re-read it (getDocument), merge your changes, and retry with the new expected_content_hash.", + detail: message, + }), + { status: 409, headers }, + ); + } + if (message.includes("CEREFOX_TOKEN_REQUIRED")) { + return new Response( + JSON.stringify({ + error: "expected_content_hash required", + message: + "Content updates require expected_content_hash (the content_hash returned by getDocument / searchKnowledgeBase / metadataSearch) or last_write_wins=true.", + detail: message, + }), + { status: 400, headers }, + ); + } + return null; } interface Chunk { @@ -428,7 +464,7 @@ Deno.serve(async (req: Request) => { }); } - const { title, content, document_id = null, project_name, source = "agent", metadata = {}, update_if_exists = false, author = "agent", author_type = "agent" } = body; + const { title, content, document_id = null, project_name, source = "agent", metadata = {}, update_if_exists = false, author = "agent", author_type = "agent", expected_content_hash = null, last_write_wins = false } = body; // Validate + normalize project_names if provided (full-set destructive form) let project_names: string[] | null = null; @@ -525,6 +561,16 @@ Deno.serve(async (req: Request) => { ); } + // Optimistic-concurrency fast-fail (iter-32): stale token fails BEFORE + // the embedding spend. Advisory only — the authoritative race-free check + // is inside the RPC (SELECT … FOR UPDATE). 
+ if (!last_write_wins && expected_content_hash && expected_content_hash !== existingDoc.content_hash) { + return concurrencyErrorResponse( + `CEREFOX_CONFLICT: document ${existingDoc.id} changed since it was read (expected hash ${expected_content_hash}, current hash ${existingDoc.content_hash}).`, + headers, + )!; + } + // Content changed -- re-chunk, re-embed, ingest via RPC const chunks = chunkMarkdown(content); if (chunks.length === 0) { @@ -562,9 +608,13 @@ Deno.serve(async (req: Request) => { p_author: author, p_author_type: author_type, p_source_label: source, + p_expected_content_hash: expected_content_hash, + p_last_write_wins: last_write_wins, }); if (ingestErr) { + const mapped = concurrencyErrorResponse(ingestErr.message ?? "", headers); + if (mapped) return mapped; return new Response(JSON.stringify({ error: `Ingest RPC failed: ${ingestErr.message}` }), { status: 500, headers }); } @@ -593,6 +643,7 @@ Deno.serve(async (req: Request) => { chunk_count: chunks.length, total_chars: totalChars, updated: true, + content_hash: contentHash, ...(note && { note }), }), { headers }, @@ -625,6 +676,14 @@ Deno.serve(async (req: Request) => { ); } + // Optimistic-concurrency fast-fail (iter-32) — see ID-based path. + if (!last_write_wins && expected_content_hash && expected_content_hash !== existingDoc.content_hash) { + return concurrencyErrorResponse( + `CEREFOX_CONFLICT: document ${existingDoc.id} changed since it was read (expected hash ${expected_content_hash}, current hash ${existingDoc.content_hash}).`, + headers, + )!; + } + // Content changed — re-chunk, re-embed, ingest via RPC const chunks = chunkMarkdown(content); if (chunks.length === 0) { @@ -667,9 +726,13 @@ Deno.serve(async (req: Request) => { p_author: author, p_author_type: author_type, p_source_label: source, + p_expected_content_hash: expected_content_hash, + p_last_write_wins: last_write_wins, }); if (ingestErr) { + const mapped = concurrencyErrorResponse(ingestErr.message ?? 
"", headers); + if (mapped) return mapped; return new Response( JSON.stringify({ error: `Ingest RPC failed: ${ingestErr.message}` }), { status: 500, headers }, @@ -701,6 +764,7 @@ Deno.serve(async (req: Request) => { chunk_count: chunks.length, total_chars: totalChars, updated: true, + content_hash: contentHash, }), { headers }, ); diff --git a/web/static/cerefox_local_vs_cloud.png b/web/static/cerefox_local_vs_cloud.png index 40d794c..36b3e64 100644 Binary files a/web/static/cerefox_local_vs_cloud.png and b/web/static/cerefox_local_vs_cloud.png differ