From b54537fdb02c51ffa5ebe16580d18d9a4c5a17f3 Mon Sep 17 00:00:00 2001 From: Fotis Stamatelopoulos Date: Fri, 12 Jun 2026 18:19:25 -0700 Subject: [PATCH] fix: content updates no longer wipe document metadata; CLI metadata-search parity MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two fixes surfaced by a real metadata-loss incident (a content update via the CLI silently cleared a document's tags, and metadata is unversioned so the loss was unrecoverable): 1. metadata contract: NULL = "not provided" → KEEP existing (create uses {}), enforced once in the cerefox_ingest_document RPC (metadata = COALESCE(p_metadata, metadata)). Every transport used to default an absent metadata argument to {} and the RPC applied it verbatim — wiping tags on any content update that didn't re-pass them. Callers fixed in lockstep: MCP handler, cerefox-ingest EF, CLI ingest/ingest-dir, frozen Python client. Pass {} explicitly to deliberately clear. Schema 0.5.0 → 0.6.0 (RPC-only; `cerefox server deploy` required — v0.11.1 clients send NULL, which a 0.5.0 server would reject on update via the NOT NULL column). 2. CLI parity: `cerefox metadata search` no longer hard-requires --metadata-filter — same contract as the MCP tool / EF (relaxed in v0.10.x; the CLI was missed): at least one of filter / --project-name / --updated-since / --created-since; --project-name alone lists a project. Tests: RPC-level preserve/clear contract (synthetic embedding, no OpenAI); CLI update-flow asserts tags survive four metadata-less updates; metadata search parity smoke tests; live-suite schema gates bumped to 0.6.0. Backlog: docs/research/metadata-versioning.md (recovery for the unversioned-metadata gap) + TODO entry. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- AGENT_GUIDE.md | 2 +- AGENT_QUICK_REFERENCE.md | 2 +- CHANGELOG.md | 17 +++- _shared/mcp-tools/get-help-content.ts | 4 +- _shared/mcp-tools/ingest.ts | 5 +- docs/TODO.md | 6 ++ docs/guides/cli.md | 6 +- docs/guides/connect-agents.md | 6 +- docs/plan.md | 13 ++- docs/research/metadata-versioning.md | 66 +++++++++++++ .../memory/src/cli/commands/ingest-dir.ts | 6 +- packages/memory/src/cli/commands/ingest.ts | 9 +- .../src/cli/commands/metadata-search.ts | 28 ++++-- .../test/ingestion/pipeline-update.test.ts | 92 +++++++++++++++++-- packages/memory/test/read-commands.test.ts | 6 +- packages/memory/test/write-commands.test.ts | 28 +++++- src/cerefox/db/client.py | 4 +- src/cerefox/db/rpcs.sql | 19 ++-- src/cerefox/db/schema.sql | 2 +- supabase/functions/cerefox-ingest/index.ts | 9 +- 20 files changed, 283 insertions(+), 47 deletions(-) create mode 100644 docs/research/metadata-versioning.md diff --git a/AGENT_GUIDE.md b/AGENT_GUIDE.md index d6df144..5c4faef 100644 --- a/AGENT_GUIDE.md +++ b/AGENT_GUIDE.md @@ -72,7 +72,7 @@ Save a new document or update an existing one. | `last_write_wins` | No | Explicitly skip the concurrency check (default `false`). Use ONLY when an external source of truth makes conflicts meaningless (file re-sync). Recorded in the audit log. **Never use it to silence a conflict.** | | `project_name` | No | **Single** project name (created if absent). On update: **non-destructive add** — ensures this membership exists, preserves others. See "Project membership semantics" below. | | `project_names` | No | **List** of project names (each created if absent). On update: **destructive replace** — sets the document's full project set to exactly this list. Use when you want to set multiple projects at once, or deliberately change the membership list. Wins over `project_name` when both are passed. | -| `metadata` | No | Arbitrary JSON. Use at minimum: `type` and `status`. 
| +| `metadata` | No | Arbitrary JSON. Use at minimum: `type` and `status`. **On update, omitting this keeps the document's existing metadata** (v0.11.1); pass `{}` to deliberately clear all tags. | | `author` | No | Your agent name for audit attribution. Always set this. | | `source` | No | Origin label (default "agent"). | diff --git a/AGENT_QUICK_REFERENCE.md b/AGENT_QUICK_REFERENCE.md index 44bd79b..3695e75 100644 --- a/AGENT_QUICK_REFERENCE.md +++ b/AGENT_QUICK_REFERENCE.md @@ -7,7 +7,7 @@ Cerefox is a persistent, shared knowledge base. You have **10 MCP tools** (9 of | Tool | Purpose | Key params | |------|---------|------------| | `cerefox_search` | Find documents (hybrid FTS + semantic) | `query` (required), `project_name`, `metadata_filter`, `requestor` | -| `cerefox_ingest` | Save or update a document | `title`, `content` (required), `document_id` (update by ID), `expected_content_hash` (**required on content updates** — see rule 9), `last_write_wins`, `update_if_exists`, `project_name` (single, non-destructive add on update), `project_names` (list, destructive replace on update), `metadata`, `author` | +| `cerefox_ingest` | Save or update a document | `title`, `content` (required), `document_id` (update by ID), `expected_content_hash` (**required on content updates** — see rule 9), `last_write_wins`, `update_if_exists`, `project_name` (single, non-destructive add on update), `project_names` (list, destructive replace on update), `metadata` (omit on update to keep existing tags; `{}` clears), `author` | | `cerefox_get_document` | Get full document by ID (header includes `content_hash` — the update token) | `document_id` (required) | | `cerefox_list_versions` | Version history of a document | `document_id` (required) | | `cerefox_metadata_search` | Find or list docs by metadata, project, or time (no text query) | `metadata_filter`, `project_name` (list a project's docs), `updated_since`, `include_content` — **at least one** of 
metadata_filter/project_name/updated_since/created_since | diff --git a/CHANGELOG.md b/CHANGELOG.md index 150c22f..352470b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,22 @@ Versioning: [Semantic Versioning](https://semver.org/spec/v2.0.0.html) — all ` ## [Unreleased] -Open roadmap. +### Fixed + +- **Content updates no longer wipe a document's metadata.** Every transport defaulted + an absent `metadata` argument to `{}` and the ingest RPC applied it verbatim — so any + content update that didn't re-pass the tags (CLI `document ingest` without + `--metadata`, MCP `cerefox_ingest`, the REST EF, the frozen Python fallback) silently + cleared the document's metadata. And since metadata is not versioned, the loss was + unrecoverable. The contract is now **NULL = "not provided" → keep existing** (create + uses `{}`), enforced once in the `cerefox_ingest_document` RPC; pass `{}` explicitly + to deliberately clear. Schema version 0.5.0 → **0.6.0** (RPC-only; run + `cerefox server deploy` — v0.11.1 clients sending NULL against a 0.5.0 server would + fail the NOT NULL constraint on update). +- **CLI parity: `cerefox metadata search` no longer requires `--metadata-filter`.** + Like the MCP tool / EF (relaxed in v0.10.x — the CLI was missed), at least one of + filter / `--project-name` / `--updated-since` / `--created-since` is required; + `--project-name` alone lists that project's documents. --- diff --git a/_shared/mcp-tools/get-help-content.ts b/_shared/mcp-tools/get-help-content.ts index 35e7fa6..860108b 100644 --- a/_shared/mcp-tools/get-help-content.ts +++ b/_shared/mcp-tools/get-help-content.ts @@ -11,11 +11,11 @@ * docs/specs/polish-and-distribution-design.md §10d. */ -export const HELP_FULL = "# Cerefox Knowledge Base -- Agent Quick Reference\n\nCerefox is a persistent, shared knowledge base. You have **10 MCP tools** (9 of them have CLI equivalents — `cerefox_get_help` is MCP-only). 
For the full guide, search Cerefox for \"How AI Agents Use Cerefox\" or call `cerefox_get_help` to retrieve this content over MCP.\n\n## Tools\n\n| Tool | Purpose | Key params |\n|------|---------|------------|\n| `cerefox_search` | Find documents (hybrid FTS + semantic) | `query` (required), `project_name`, `metadata_filter`, `requestor` |\n| `cerefox_ingest` | Save or update a document | `title`, `content` (required), `document_id` (update by ID), `expected_content_hash` (**required on content updates** — see rule 9), `last_write_wins`, `update_if_exists`, `project_name` (single, non-destructive add on update), `project_names` (list, destructive replace on update), `metadata`, `author` |\n| `cerefox_get_document` | Get full document by ID (header includes `content_hash` — the update token) | `document_id` (required) |\n| `cerefox_list_versions` | Version history of a document | `document_id` (required) |\n| `cerefox_metadata_search` | Find or list docs by metadata, project, or time (no text query) | `metadata_filter`, `project_name` (list a project's docs), `updated_since`, `include_content` — **at least one** of metadata_filter/project_name/updated_since/created_since |\n| `cerefox_list_metadata_keys` | Discover available metadata keys | (none required) |\n| `cerefox_list_projects` | List all projects | (none required) |\n| `cerefox_set_document_projects` | Set doc's project memberships to exactly the given list (destructive replace; metadata-only, no content change) | `document_id`, `project_names` (required) |\n| `cerefox_get_audit_log` | Query write operation history | `document_id`, `author`, `operation`, `since` |\n| `cerefox_get_help` | Retrieve Cerefox conventions (this reference) over MCP. **Call this whenever uncertain.** | `topic` (optional, case-insensitive H2 substring match) |\n\n## Essential Rules\n\n1. **Search before ingesting** -- check if the document exists first.\n2. 
**Prefer ID-based updates** -- pass `document_id` from search results for deterministic updates. Falls back to title-matching with `update_if_exists: true`.\n3. **Set `author`/`requestor`** to your name on every call (e.g., \"Claude Code\", \"archiver\"). On MCP, pass as parameters. On CLI, pass `--author`/`--author-type`/`--requestor` flags, or rely on `CEREFOX_AUTHOR_NAME`/`CEREFOX_AUTHOR_TYPE`/`CEREFOX_REQUESTOR_NAME` env vars set in the user's `.env`.\n4. **Use `document_id` from search results** `[id: uuid]` for get_document and list_versions.\n5. **Add metadata** -- at minimum `type` (\"decision-log\", \"research\", \"design-doc\") and `status` (\"active\", \"draft\").\n6. **Write structured Markdown** with H1/H2/H3 headings for good chunking and search.\n7. **Deletes are soft (recoverable); purge is web-UI-only.** If you decide to delete, surface it to the user (`I soft-deleted X — recoverable from the Cerefox web UI trash`). You cannot un-do your own delete from agent code by design.\n8. **Cross-doc links inside content**: **always use `[Text](document-uuid)`.** UUIDs are the only fully reliable link form — stable across title changes, never ambiguous, no encoding gotchas. Every `cerefox_search` result shows `[id: ]` after the title; grab it and use it. Title-based linking (`[Text]()`) is fragile (breaks on colons, parens, ampersands, brackets — silently navigates to wrong page) — **don't write title-based links**; do an extra search to get the UUID instead. Repo-path forms (`[Text](docs/path.md)`) exist for repo-ingested files; don't construct manually. See `AGENT_GUIDE.md → Writing linkable content` for the full rule.\n9. **Concurrency: content updates require `expected_content_hash`.** Pass the `content_hash` you read (shown by `cerefox_get_document`, `cerefox_search`, and `cerefox_metadata_search`) when updating a document. If it's stale you get a **conflict** — re-read the document, merge your changes into the latest content, retry with the new hash. 
**Never resolve a conflict by overwriting blindly** — the current content includes another writer's work. `last_write_wins: true` skips the check; use it ONLY when an external source of truth makes conflicts meaningless (file re-sync), never to silence a conflict.\n10. **Project memberships — non-destructive by default**: on `cerefox_ingest` updates, **`project_name` (singular) is a non-destructive add** (ensures membership, preserves others). Use **`project_names` (list)** when you want to set the doc's full project set in one call (destructive replace). For metadata-only project changes without writing content, use **`cerefox_set_document_projects(document_id, project_names)`** — that tool is the destructive-replace contract made explicit. Never call `cerefox_set_document_projects` with a single name when you mean \"add\" — that would REMOVE the doc from all other projects. When in doubt, use `cerefox_ingest` with singular `project_name`.\n\n## Update Workflow (ID-based -- preferred)\n\n```\nsearch(\"topic\") -> find doc [id: abc123] -> get_document(abc123) -> note its content_hash -> modify ->\ningest(title=\"Same Title\", content=\"...\", document_id=\"abc123\",\n expected_content_hash=\"<the hash you read>\", author=\"my-agent\")\n```\n\nOn a **conflict** error: get_document again (fresh content + fresh hash) -> merge your changes -> retry with the new hash.\n\n## Update Workflow (title-based -- fallback)\n\n```\nsearch(\"topic\") -> find doc (note its hash) -> modify ->\ningest(title=\"Same Title\", content=\"...\", update_if_exists=true,\n expected_content_hash=\"<the hash you read>\", author=\"my-agent\")\n```\n\n## Catch-Up Workflow\n\n```\nmetadata_search(metadata_filter={\"type\": \"decision-log\"}, updated_since=\"2026-03-28T00:00:00Z\")\n```\n\n## CLI fallback (when MCP is unavailable)\n\nIf `cerefox_search` is not in your tool list, your user has likely installed the Cerefox CLI. 
The canonical invocation is plain **`cerefox <subcommand>`** (the TypeScript CLI, installed via `npm install -g @cerefox/memory`). It uses a resource-verb shape (`cerefox document get`, `cerefox project list`, …). The legacy Python `uv run cerefox` is now a frozen husk as of v0.9 — only `uv run cerefox mcp` still works.\n\nSame operations, same conventions. Full reference: [`docs/guides/cli.md`](docs/guides/cli.md). CLI flag names match MCP parameter names exactly (e.g. `metadata_filter` ↔ `--metadata-filter`); common flags also have single-letter short forms (`-f`, `-p`, `-c`, `-m`, `-u`, `-a`, `-r`). Use the canonical long name (what `--help` shows) or its short form — there are no long-form aliases like `--filter` or `--count`.\n\n| MCP tool | CLI |\n|---|---|\n| `cerefox_search` | `cerefox search \"<q>\" --requestor \"<your-name>\"` |\n| `cerefox_ingest` (paste) | `printf '...' \\| cerefox document ingest --paste --title \"<t>\" --author \"<your-name>\" --author-type agent` |\n| `cerefox_ingest` (update by ID) | `printf '...' 
\\| cerefox document ingest --paste --title \"<t>\" --document-id \"<uuid>\" --expected-content-hash \"<hash>\" --author \"<your-name>\" --author-type agent` |\n| `cerefox_get_document` | `cerefox document get <id> --version-id <vid> --requestor \"<your-name>\"` |\n| `cerefox_list_versions` | `cerefox document version list <id> --requestor \"<your-name>\"` |\n| `cerefox_list_projects` | `cerefox project list --requestor \"<your-name>\"` |\n| `cerefox_list_metadata_keys` | `cerefox metadata keys` |\n| `cerefox_metadata_search` | `cerefox metadata search --metadata-filter '<json>' --requestor \"<your-name>\"` (list a project: `cerefox document list --project <name>`) |\n| `cerefox_set_document_projects` | `cerefox document set-projects <id> <name...> --author \"<your-name>\" --author-type agent` (or `--clear` to remove all) |\n| `cerefox_get_audit_log` | `cerefox audit list --requestor \"<your-name>\"` (add `--json` for scripted access) |\n| `cerefox_get_help` | `cerefox guides show agent-quick-reference` (or `cerefox guides list` for the full bundled-docs index) |\n\n**Set identity on every call**, exactly as you would on MCP:\n- Writes (`document ingest`, `document ingest-dir`): `--author \"<your-name>\" --author-type agent`\n- Reads: `--requestor \"<your-name>\"`\n\nOr have your user set `CEREFOX_AUTHOR_NAME` / `CEREFOX_AUTHOR_TYPE` / `CEREFOX_REQUESTOR_NAME` in their `.env` to apply defaults once.\n"; +export const HELP_FULL = "# Cerefox Knowledge Base -- Agent Quick Reference\n\nCerefox is a persistent, shared knowledge base. You have **10 MCP tools** (9 of them have CLI equivalents — `cerefox_get_help` is MCP-only). 
For the full guide, search Cerefox for \"How AI Agents Use Cerefox\" or call `cerefox_get_help` to retrieve this content over MCP.\n\n## Tools\n\n| Tool | Purpose | Key params |\n|------|---------|------------|\n| `cerefox_search` | Find documents (hybrid FTS + semantic) | `query` (required), `project_name`, `metadata_filter`, `requestor` |\n| `cerefox_ingest` | Save or update a document | `title`, `content` (required), `document_id` (update by ID), `expected_content_hash` (**required on content updates** — see rule 9), `last_write_wins`, `update_if_exists`, `project_name` (single, non-destructive add on update), `project_names` (list, destructive replace on update), `metadata` (omit on update to keep existing tags; `{}` clears), `author` |\n| `cerefox_get_document` | Get full document by ID (header includes `content_hash` — the update token) | `document_id` (required) |\n| `cerefox_list_versions` | Version history of a document | `document_id` (required) |\n| `cerefox_metadata_search` | Find or list docs by metadata, project, or time (no text query) | `metadata_filter`, `project_name` (list a project's docs), `updated_since`, `include_content` — **at least one** of metadata_filter/project_name/updated_since/created_since |\n| `cerefox_list_metadata_keys` | Discover available metadata keys | (none required) |\n| `cerefox_list_projects` | List all projects | (none required) |\n| `cerefox_set_document_projects` | Set doc's project memberships to exactly the given list (destructive replace; metadata-only, no content change) | `document_id`, `project_names` (required) |\n| `cerefox_get_audit_log` | Query write operation history | `document_id`, `author`, `operation`, `since` |\n| `cerefox_get_help` | Retrieve Cerefox conventions (this reference) over MCP. **Call this whenever uncertain.** | `topic` (optional, case-insensitive H2 substring match) |\n\n## Essential Rules\n\n1. **Search before ingesting** -- check if the document exists first.\n2. 
**Prefer ID-based updates** -- pass `document_id` from search results for deterministic updates. Falls back to title-matching with `update_if_exists: true`.\n3. **Set `author`/`requestor`** to your name on every call (e.g., \"Claude Code\", \"archiver\"). On MCP, pass as parameters. On CLI, pass `--author`/`--author-type`/`--requestor` flags, or rely on `CEREFOX_AUTHOR_NAME`/`CEREFOX_AUTHOR_TYPE`/`CEREFOX_REQUESTOR_NAME` env vars set in the user's `.env`.\n4. **Use `document_id` from search results** `[id: uuid]` for get_document and list_versions.\n5. **Add metadata** -- at minimum `type` (\"decision-log\", \"research\", \"design-doc\") and `status` (\"active\", \"draft\").\n6. **Write structured Markdown** with H1/H2/H3 headings for good chunking and search.\n7. **Deletes are soft (recoverable); purge is web-UI-only.** If you decide to delete, surface it to the user (`I soft-deleted X — recoverable from the Cerefox web UI trash`). You cannot un-do your own delete from agent code by design.\n8. **Cross-doc links inside content**: **always use `[Text](document-uuid)`.** UUIDs are the only fully reliable link form — stable across title changes, never ambiguous, no encoding gotchas. Every `cerefox_search` result shows `[id: <uuid>]` after the title; grab it and use it. Title-based linking (`[Text](<Title With Spaces>)`) is fragile (breaks on colons, parens, ampersands, brackets — silently navigates to wrong page) — **don't write title-based links**; do an extra search to get the UUID instead. Repo-path forms (`[Text](docs/path.md)`) exist for repo-ingested files; don't construct manually. See `AGENT_GUIDE.md → Writing linkable content` for the full rule.\n9. **Concurrency: content updates require `expected_content_hash`.** Pass the `content_hash` you read (shown by `cerefox_get_document`, `cerefox_search`, and `cerefox_metadata_search`) when updating a document. 
If it's stale you get a **conflict** — re-read the document, merge your changes into the latest content, retry with the new hash. **Never resolve a conflict by overwriting blindly** — the current content includes another writer's work. `last_write_wins: true` skips the check; use it ONLY when an external source of truth makes conflicts meaningless (file re-sync), never to silence a conflict.\n10. **Project memberships — non-destructive by default**: on `cerefox_ingest` updates, **`project_name` (singular) is a non-destructive add** (ensures membership, preserves others). Use **`project_names` (list)** when you want to set the doc's full project set in one call (destructive replace). For metadata-only project changes without writing content, use **`cerefox_set_document_projects(document_id, project_names)`** — that tool is the destructive-replace contract made explicit. Never call `cerefox_set_document_projects` with a single name when you mean \"add\" — that would REMOVE the doc from all other projects. 
When in doubt, use `cerefox_ingest` with singular `project_name`.\n\n## Update Workflow (ID-based -- preferred)\n\n```\nsearch(\"topic\") -> find doc [id: abc123] -> get_document(abc123) -> note its content_hash -> modify ->\ningest(title=\"Same Title\", content=\"...\", document_id=\"abc123\",\n expected_content_hash=\"<the hash you read>\", author=\"my-agent\")\n```\n\nOn a **conflict** error: get_document again (fresh content + fresh hash) -> merge your changes -> retry with the new hash.\n\n## Update Workflow (title-based -- fallback)\n\n```\nsearch(\"topic\") -> find doc (note its hash) -> modify ->\ningest(title=\"Same Title\", content=\"...\", update_if_exists=true,\n expected_content_hash=\"<the hash you read>\", author=\"my-agent\")\n```\n\n## Catch-Up Workflow\n\n```\nmetadata_search(metadata_filter={\"type\": \"decision-log\"}, updated_since=\"2026-03-28T00:00:00Z\")\n```\n\n## CLI fallback (when MCP is unavailable)\n\nIf `cerefox_search` is not in your tool list, your user has likely installed the Cerefox CLI. The canonical invocation is plain **`cerefox <subcommand>`** (the TypeScript CLI, installed via `npm install -g @cerefox/memory`). It uses a resource-verb shape (`cerefox document get`, `cerefox project list`, …). The legacy Python `uv run cerefox` is now a frozen husk as of v0.9 — only `uv run cerefox mcp` still works.\n\nSame operations, same conventions. Full reference: [`docs/guides/cli.md`](docs/guides/cli.md). CLI flag names match MCP parameter names exactly (e.g. `metadata_filter` ↔ `--metadata-filter`); common flags also have single-letter short forms (`-f`, `-p`, `-c`, `-m`, `-u`, `-a`, `-r`). Use the canonical long name (what `--help` shows) or its short form — there are no long-form aliases like `--filter` or `--count`.\n\n| MCP tool | CLI |\n|---|---|\n| `cerefox_search` | `cerefox search \"<q>\" --requestor \"<your-name>\"` |\n| `cerefox_ingest` (paste) | `printf '...' 
\\| cerefox document ingest --paste --title \"<t>\" --author \"<your-name>\" --author-type agent` |\n| `cerefox_ingest` (update by ID) | `printf '...' \\| cerefox document ingest --paste --title \"<t>\" --document-id \"<uuid>\" --expected-content-hash \"<hash>\" --author \"<your-name>\" --author-type agent` |\n| `cerefox_get_document` | `cerefox document get <id> --version-id <vid> --requestor \"<your-name>\"` |\n| `cerefox_list_versions` | `cerefox document version list <id> --requestor \"<your-name>\"` |\n| `cerefox_list_projects` | `cerefox project list --requestor \"<your-name>\"` |\n| `cerefox_list_metadata_keys` | `cerefox metadata keys` |\n| `cerefox_metadata_search` | `cerefox metadata search --metadata-filter '<json>' --requestor \"<your-name>\"` (list a project: `cerefox document list --project <name>`) |\n| `cerefox_set_document_projects` | `cerefox document set-projects <id> <name...> --author \"<your-name>\" --author-type agent` (or `--clear` to remove all) |\n| `cerefox_get_audit_log` | `cerefox audit list --requestor \"<your-name>\"` (add `--json` for scripted access) |\n| `cerefox_get_help` | `cerefox guides show agent-quick-reference` (or `cerefox guides list` for the full bundled-docs index) |\n\n**Set identity on every call**, exactly as you would on MCP:\n- Writes (`document ingest`, `document ingest-dir`): `--author \"<your-name>\" --author-type agent`\n- Reads: `--requestor \"<your-name>\"`\n\nOr have your user set `CEREFOX_AUTHOR_NAME` / `CEREFOX_AUTHOR_TYPE` / `CEREFOX_REQUESTOR_NAME` in their `.env` to apply defaults once.\n"; /** Sections keyed by their H2 heading text (lower-cased for matching). 
*/ export const HELP_SECTIONS: Record<string, string> = { - "Tools": "## Tools\n\n| Tool | Purpose | Key params |\n|------|---------|------------|\n| `cerefox_search` | Find documents (hybrid FTS + semantic) | `query` (required), `project_name`, `metadata_filter`, `requestor` |\n| `cerefox_ingest` | Save or update a document | `title`, `content` (required), `document_id` (update by ID), `expected_content_hash` (**required on content updates** — see rule 9), `last_write_wins`, `update_if_exists`, `project_name` (single, non-destructive add on update), `project_names` (list, destructive replace on update), `metadata`, `author` |\n| `cerefox_get_document` | Get full document by ID (header includes `content_hash` — the update token) | `document_id` (required) |\n| `cerefox_list_versions` | Version history of a document | `document_id` (required) |\n| `cerefox_metadata_search` | Find or list docs by metadata, project, or time (no text query) | `metadata_filter`, `project_name` (list a project's docs), `updated_since`, `include_content` — **at least one** of metadata_filter/project_name/updated_since/created_since |\n| `cerefox_list_metadata_keys` | Discover available metadata keys | (none required) |\n| `cerefox_list_projects` | List all projects | (none required) |\n| `cerefox_set_document_projects` | Set doc's project memberships to exactly the given list (destructive replace; metadata-only, no content change) | `document_id`, `project_names` (required) |\n| `cerefox_get_audit_log` | Query write operation history | `document_id`, `author`, `operation`, `since` |\n| `cerefox_get_help` | Retrieve Cerefox conventions (this reference) over MCP. 
**Call this whenever uncertain.** | `topic` (optional, case-insensitive H2 substring match) |", + "Tools": "## Tools\n\n| Tool | Purpose | Key params |\n|------|---------|------------|\n| `cerefox_search` | Find documents (hybrid FTS + semantic) | `query` (required), `project_name`, `metadata_filter`, `requestor` |\n| `cerefox_ingest` | Save or update a document | `title`, `content` (required), `document_id` (update by ID), `expected_content_hash` (**required on content updates** — see rule 9), `last_write_wins`, `update_if_exists`, `project_name` (single, non-destructive add on update), `project_names` (list, destructive replace on update), `metadata` (omit on update to keep existing tags; `{}` clears), `author` |\n| `cerefox_get_document` | Get full document by ID (header includes `content_hash` — the update token) | `document_id` (required) |\n| `cerefox_list_versions` | Version history of a document | `document_id` (required) |\n| `cerefox_metadata_search` | Find or list docs by metadata, project, or time (no text query) | `metadata_filter`, `project_name` (list a project's docs), `updated_since`, `include_content` — **at least one** of metadata_filter/project_name/updated_since/created_since |\n| `cerefox_list_metadata_keys` | Discover available metadata keys | (none required) |\n| `cerefox_list_projects` | List all projects | (none required) |\n| `cerefox_set_document_projects` | Set doc's project memberships to exactly the given list (destructive replace; metadata-only, no content change) | `document_id`, `project_names` (required) |\n| `cerefox_get_audit_log` | Query write operation history | `document_id`, `author`, `operation`, `since` |\n| `cerefox_get_help` | Retrieve Cerefox conventions (this reference) over MCP. **Call this whenever uncertain.** | `topic` (optional, case-insensitive H2 substring match) |", "Essential Rules": "## Essential Rules\n\n1. **Search before ingesting** -- check if the document exists first.\n2. 
**Prefer ID-based updates** -- pass `document_id` from search results for deterministic updates. Falls back to title-matching with `update_if_exists: true`.\n3. **Set `author`/`requestor`** to your name on every call (e.g., \"Claude Code\", \"archiver\"). On MCP, pass as parameters. On CLI, pass `--author`/`--author-type`/`--requestor` flags, or rely on `CEREFOX_AUTHOR_NAME`/`CEREFOX_AUTHOR_TYPE`/`CEREFOX_REQUESTOR_NAME` env vars set in the user's `.env`.\n4. **Use `document_id` from search results** `[id: uuid]` for get_document and list_versions.\n5. **Add metadata** -- at minimum `type` (\"decision-log\", \"research\", \"design-doc\") and `status` (\"active\", \"draft\").\n6. **Write structured Markdown** with H1/H2/H3 headings for good chunking and search.\n7. **Deletes are soft (recoverable); purge is web-UI-only.** If you decide to delete, surface it to the user (`I soft-deleted X — recoverable from the Cerefox web UI trash`). You cannot un-do your own delete from agent code by design.\n8. **Cross-doc links inside content**: **always use `[Text](document-uuid)`.** UUIDs are the only fully reliable link form — stable across title changes, never ambiguous, no encoding gotchas. Every `cerefox_search` result shows `[id: <uuid>]` after the title; grab it and use it. Title-based linking (`[Text](<Title With Spaces>)`) is fragile (breaks on colons, parens, ampersands, brackets — silently navigates to wrong page) — **don't write title-based links**; do an extra search to get the UUID instead. Repo-path forms (`[Text](docs/path.md)`) exist for repo-ingested files; don't construct manually. See `AGENT_GUIDE.md → Writing linkable content` for the full rule.\n9. **Concurrency: content updates require `expected_content_hash`.** Pass the `content_hash` you read (shown by `cerefox_get_document`, `cerefox_search`, and `cerefox_metadata_search`) when updating a document. 
If it's stale you get a **conflict** — re-read the document, merge your changes into the latest content, retry with the new hash. **Never resolve a conflict by overwriting blindly** — the current content includes another writer's work. `last_write_wins: true` skips the check; use it ONLY when an external source of truth makes conflicts meaningless (file re-sync), never to silence a conflict.\n10. **Project memberships — non-destructive by default**: on `cerefox_ingest` updates, **`project_name` (singular) is a non-destructive add** (ensures membership, preserves others). Use **`project_names` (list)** when you want to set the doc's full project set in one call (destructive replace). For metadata-only project changes without writing content, use **`cerefox_set_document_projects(document_id, project_names)`** — that tool is the destructive-replace contract made explicit. Never call `cerefox_set_document_projects` with a single name when you mean \"add\" — that would REMOVE the doc from all other projects. 
When in doubt, use `cerefox_ingest` with singular `project_name`.", "Update Workflow (ID-based -- preferred)": "## Update Workflow (ID-based -- preferred)\n\n```\nsearch(\"topic\") -> find doc [id: abc123] -> get_document(abc123) -> note its content_hash -> modify ->\ningest(title=\"Same Title\", content=\"...\", document_id=\"abc123\",\n expected_content_hash=\"<the hash you read>\", author=\"my-agent\")\n```\n\nOn a **conflict** error: get_document again (fresh content + fresh hash) -> merge your changes -> retry with the new hash.", "Update Workflow (title-based -- fallback)": "## Update Workflow (title-based -- fallback)\n\n```\nsearch(\"topic\") -> find doc (note its hash) -> modify ->\ningest(title=\"Same Title\", content=\"...\", update_if_exists=true,\n expected_content_hash=\"<the hash you read>\", author=\"my-agent\")\n```", diff --git a/_shared/mcp-tools/ingest.ts b/_shared/mcp-tools/ingest.ts index d6f8dfb..4c027fa 100644 --- a/_shared/mcp-tools/ingest.ts +++ b/_shared/mcp-tools/ingest.ts @@ -71,7 +71,10 @@ async function handler( const project_name = args.project_name as string | undefined; const project_names_raw = args.project_names; const source = (args.source as string | undefined) ?? "agent"; - const metadata = (args.metadata as Record<string, unknown> | undefined) ?? {}; + // null = "not provided": the RPC keeps existing metadata on update and uses + // {} on create (v0.11.1 — defaulting to {} here used to wipe a document's + // tags on every content update that didn't re-pass them). + const metadata = (args.metadata as Record<string, unknown> | undefined) ?? null; const update_if_exists = (args.update_if_exists as boolean | undefined) ?? false; const author = (args.author as string | undefined) ?? 
"mcp-agent"; const author_type = "agent"; // MCP path is always agent diff --git a/docs/TODO.md b/docs/TODO.md index db769b3..b5fd9dd 100644 --- a/docs/TODO.md +++ b/docs/TODO.md @@ -13,6 +13,12 @@ ## Known Tasks (Not Yet Scheduled) +### Data Safety +- [ ] **Metadata versioning / recovery** — version snapshots capture content only; + a metadata wipe is unrecoverable (bit us in the v0.11.1 incident). Proposal with + options (audit-log before/after values, and/or metadata on version rows): + [`docs/research/metadata-versioning.md`](research/metadata-versioning.md). + ### Search & Ranking - [ ] Reciprocal Rank Fusion (RRF) for hybrid search instead of linear alpha blending - [ ] True BM25 ranking via pg_textsearch or ParadeDB extension diff --git a/docs/guides/cli.md b/docs/guides/cli.md index b715ec9..32198ea 100644 --- a/docs/guides/cli.md +++ b/docs/guides/cli.md @@ -43,7 +43,7 @@ cerefox document ingest --paste --title "<title>" [OPTIONS] # stdin | `--title` | `-t` | str | filename stem | Document title. Required with `--paste`. | | `--project-name` | `--project`, `-p` | str | _none_ | Project name to assign the document to (created if missing). | | `--paste` | — | flag | off | Read markdown from stdin. Requires `--title`. | -| `--metadata` | `-m` | JSON | `{}` | Extra metadata as a JSON object, e.g. `'{"tags":["work"]}'`. | +| `--metadata` | `-m` | JSON | _not provided_ | Extra metadata as a JSON object, e.g. `'{"tags":["work"]}'`. **On update, omitting this keeps the document's existing metadata** (v0.11.1); pass `'{}'` to deliberately clear all metadata. | | `--update-if-exists` | `-u` | flag | off | Title/source-path-based fallback update. Mutually exclusive with `--document-id`. | | `--document-id` | `-i` | UUID | _none_ | Deterministic ID-based update. Errors if the document doesn't exist. 
| | `--expected-content-hash` | — | sha256 | _none_ | **Required on content updates** (v0.11 optimistic concurrency): the `content_hash` of the version this edit is based on, shown by `cerefox document get` / `cerefox search`. Stale → conflict error (re-read, merge, retry). | @@ -410,8 +410,8 @@ cerefox metadata search --metadata-filter '<json>' [OPTIONS] | Flag | Type | Default | Description | |---|---|---|---| -| `--metadata-filter <json>` (`-f`) | JSON | **required** | Metadata filter, e.g. `'{"type":"decision-log"}'`. | -| `--project-name <name>` (`-p`) | str | _none_ | Filter by project name. | +| `--metadata-filter <json>` (`-f`) | JSON | _none_ | Metadata filter, e.g. `'{"type":"decision-log"}'`. Optional since v0.11.1 — at least one of filter / `--project-name` / `--updated-since` / `--created-since` is required (parity with the MCP tool). | +| `--project-name <name>` (`-p`) | str | _none_ | Filter by project name. Sufficient on its own to list that project's documents. | | `--updated-since TEXT` | ISO-8601 | _none_ | Documents updated after this timestamp. | | `--created-since TEXT` | ISO-8601 | _none_ | Documents created after this timestamp. | | `--limit INTEGER` | int | `10` | Max results. | diff --git a/docs/guides/connect-agents.md b/docs/guides/connect-agents.md index fb37c07..fc4f6a0 100644 --- a/docs/guides/connect-agents.md +++ b/docs/guides/connect-agents.md @@ -609,7 +609,7 @@ In the action editor, paste this schema (replace `<your-project-ref>`): openapi: 3.1.0 info: title: Cerefox Knowledge Base - version: 2.0.0 + version: 2.1.0 servers: - url: https://<your-project-ref>.supabase.co/functions/v1 paths: @@ -728,6 +728,10 @@ paths: default: agent metadata: type: object + description: > + Arbitrary JSON metadata. On an UPDATE, omitting this keeps + the document's existing metadata (v2.1.0); pass {} to + deliberately clear all metadata. 
update_if_exists: type: boolean default: false diff --git a/docs/plan.md b/docs/plan.md index c21655a..17f6f35 100644 --- a/docs/plan.md +++ b/docs/plan.md @@ -3477,13 +3477,18 @@ bundler (`--use-api`, issue #84). Design of record: [`docs/research/local-cerefox-design.md`](research/local-cerefox-design.md). **Near-term tracks** (iteration numbers are planning IDs, not ship order): -1. **Iteration 32 — Optimistic concurrency control**, target **v0.11.0**, on - `feat/optimistic-locking`. Motivated by a real two-agent last-write-wins incident. - Content updates now require `expected_content_hash` (compare-and-swap on the existing +1. **Iteration 32 — Optimistic concurrency control**: ✅ **SHIPPED v0.11.0** + (2026-06-12; schema 0.5.0; deployed + live-validated on the maintainer cloud). + Content updates require `expected_content_hash` (compare-and-swap on the existing `content_hash`, atomic in the ingest RPC via `FOR UPDATE`) or an explicit `last_write_wins`. Design of record: [`docs/specs/concurrency-control-design.md`](specs/concurrency-control-design.md). - Implemented across RPC + MCP + EF + CLI + web + docs; schema 0.5.0. + **v0.11.1 follow-up** (on `fix/metadata-preserve-on-update`, schema 0.6.0): + content updates without metadata no longer wipe a document's tags + (`p_metadata` NULL = keep existing), plus CLI `metadata search` parity (filter + optional with another scope). The wipe incident also spawned the + **metadata-versioning** backlog proposal: + [`docs/research/metadata-versioning.md`](research/metadata-versioning.md). 2. **Iteration 31 — Local ONNX embedder** (fully-offline World B), target **v0.12+** (slid from v0.11.0 to make room for iter-32), on `feat/local-embedder`. Design committed; P0 implementation pending review. See iter-31 in the log above. 
diff --git a/docs/research/metadata-versioning.md b/docs/research/metadata-versioning.md new file mode 100644 index 0000000..0ed6976 --- /dev/null +++ b/docs/research/metadata-versioning.md @@ -0,0 +1,66 @@ +# Metadata Versioning — proposal (backlog, unscheduled) + +**Status**: Proposal / design sketch — NOT scheduled. Lives in `research/` until +it graduates to an iteration (then a design-of-record in `specs/`). +**Date**: 2026-06-13 +**Motivation**: the v0.11.1 incident — a metadata-wipe bug destroyed document +tags with **no recovery path**, because version snapshots capture content only. +Content enjoys two safety layers (optimistic locking for prevention, versioning +for recovery); metadata now has prevention-adjacent protection (absent ≠ clear, +v0.11.1) but still **zero recovery**. + +## Current state + +- `cerefox_document_versions` + archived chunks snapshot **content only**. +- Metadata lives solely on the live `cerefox_documents.metadata` JSONB column. +- Metadata writes (ingest-with-metadata, `document edit --set-meta/--unset-meta`, + web edit, `cerefox_set_document_projects` for memberships) produce audit + entries, but the audit log records **descriptions, not values** — you can see + *that* metadata changed, not *what it was*. +- This was a deliberate simplicity choice (two-table design, lean version rows). + +## Options + +### Option A — snapshot metadata into version rows (smallest) + +Add `metadata JSONB` to `cerefox_document_versions`; `cerefox_snapshot_version` +copies the document's metadata at snapshot time. + +- Pros: one column + one line in the snapshot RPC; restore-from-version can + optionally restore tags; zero new tables. +- Cons: only captures metadata at **content-update** moments — metadata-only + edits between content updates still vanish without a trace; retention + cleanup expires it with the version. 
+ +### Option B — audit log records metadata before/after values + +Add `meta_before` / `meta_after` JSONB columns to `cerefox_audit_log`, +populated on `update-metadata` and `update-content` operations. + +- Pros: covers **every** metadata change (including metadata-only edits); + audit log is immutable and survives version cleanup; recovery = read the + last good `meta_before`. +- Cons: grows the audit table (metadata is small; likely fine); recovery is + manual-ish (no one-click restore), though a `document edit --restore-meta + <audit-id>` verb could be added later. + +### Option C — full metadata version table + +A dedicated `cerefox_metadata_versions` table, one row per metadata change. + +- Cons: a third versioning concept for marginal benefit over B. Rejected + unless A+B prove insufficient. + +## Leaning + +**B, possibly A+B together** (they're independent and both cheap). B is the +real recovery net because metadata-only edits are the common case; A makes +"restore this version" semantically complete. Both are additive schema changes +(migration + `schema_version` bump). Revisit when the pain recurs or before +v1.0's stability commitment freezes the schema surface. + +## Non-goals + +- Optimistic locking for metadata-only edits (separate, smaller discussion — + the v0.11.0 design doc scoped it out deliberately). +- Project-membership versioning (memberships are M2M rows, not metadata). diff --git a/packages/memory/src/cli/commands/ingest-dir.ts b/packages/memory/src/cli/commands/ingest-dir.ts index 8128333..a390410 100644 --- a/packages/memory/src/cli/commands/ingest-dir.ts +++ b/packages/memory/src/cli/commands/ingest-dir.ts @@ -95,7 +95,9 @@ async function action(dir: string, options: IngestDirOptions): Promise<void> { "No --author / CEREFOX_AUTHOR_NAME set — audit log will record these writes as 'unknown'.", ); } - const metadata = parseJsonObjectArg(options.metadata, "--metadata") ?? 
{}; + // undefined = "not provided": re-ingesting existing files keeps their + // current metadata (v0.11.1). Pass --metadata '{}' to clear on update. + const metadata = parseJsonObjectArg(options.metadata, "--metadata"); const settings = loadSettings(); if (!settings.supabaseUrl || !settings.supabaseKey) { @@ -140,7 +142,7 @@ async function action(dir: string, options: IngestDirOptions): Promise<void> { title: basename(file, extname(file)), source: options.source ?? "cli", projectName: options.projectName ?? null, - metadata: metadata as Record<string, unknown>, + metadata: metadata ?? null, updateExisting: Boolean(options.updateIfExists), author, authorType: authorType as "user" | "agent", diff --git a/packages/memory/src/cli/commands/ingest.ts b/packages/memory/src/cli/commands/ingest.ts index 7d01e22..e5c9452 100644 --- a/packages/memory/src/cli/commands/ingest.ts +++ b/packages/memory/src/cli/commands/ingest.ts @@ -114,7 +114,10 @@ async function action( "No --author / CEREFOX_AUTHOR_NAME set — audit log will record this write as 'unknown'.", ); } - const metadata = parseJsonObjectArg(options.metadata, "--metadata") ?? {}; + // undefined = "not provided": on update the existing metadata is KEPT + // (v0.11.1 — the old `?? {}` default wiped a document's tags on every + // content update without --metadata). Pass --metadata '{}' to clear. + const metadata = parseJsonObjectArg(options.metadata, "--metadata"); let projectNames: string[] | undefined; if (options.projectNames) { @@ -152,7 +155,7 @@ async function action( source: options.source ?? "cli", projectName: options.projectName ?? null, projectNames: projectNames ?? null, - metadata: metadata as Record<string, unknown>, + metadata: metadata ?? null, updateExisting: Boolean(options.updateIfExists), documentId: options.documentId ?? null, author, @@ -166,7 +169,7 @@ async function action( source: options.source ?? "cli", projectName: options.projectName ?? null, projectNames: projectNames ?? 
null, - metadata: metadata as Record<string, unknown>, + metadata: metadata ?? null, updateExisting: Boolean(options.updateIfExists), documentId: options.documentId ?? null, author, diff --git a/packages/memory/src/cli/commands/metadata-search.ts b/packages/memory/src/cli/commands/metadata-search.ts index a0857e6..a0be653 100644 --- a/packages/memory/src/cli/commands/metadata-search.ts +++ b/packages/memory/src/cli/commands/metadata-search.ts @@ -38,7 +38,7 @@ interface MetadataSearchRow { } async function action(options: { - metadataFilter: string; + metadataFilter?: string; projectName?: string; updatedSince?: string; createdSince?: string; @@ -48,11 +48,21 @@ async function action(options: { requestor?: string; json?: boolean; }): Promise<void> { - const metadataFilter = parseJsonObjectArg(options.metadataFilter, "--metadata-filter"); - if (!metadataFilter || Object.keys(metadataFilter).length === 0) { + // Parity with the MCP tool / EF (v0.10.x relaxation, CLI caught up in + // v0.11.1): the filter is optional, but at least one narrowing criterion is + // required so this never becomes an unbounded whole-KB dump. An empty filter + // + --project-name lists that project's documents. + const metadataFilter = + parseJsonObjectArg(options.metadataFilter, "--metadata-filter") ?? 
{}; + if ( + Object.keys(metadataFilter).length === 0 && + !options.projectName && + !options.updatedSince && + !options.createdSince + ) { throw userError( - "--metadata-filter is required and must be a non-empty JSON object.", - `Example: --metadata-filter '{"type":"decision-log"}'.`, + "Provide at least one of: --metadata-filter, --project-name, --updated-since, or --created-since.", + `Examples: --metadata-filter '{"type":"decision-log"}' · --project-name "research" (lists that project's docs).`, ); } @@ -136,10 +146,12 @@ async function action(options: { export function registerMetadataSearch(program: Command): void { program .command("metadata-search") - .description("Find documents by metadata criteria (no text query).") - .requiredOption( + .description( + "Find or list documents by metadata, project, or time criteria (no text query).", + ) + .option( "-f, --metadata-filter <json>", - "JSON object; only docs whose metadata contains ALL pairs are returned.", + "JSON object; only docs whose metadata contains ALL pairs are returned. Optional — omit to list by --project-name / time range alone (at least one criterion is required).", ) .option("-p, --project-name <name>", "Filter to a specific project.") .option("--updated-since <iso>", "Only docs updated on/after this ISO timestamp.") diff --git a/packages/memory/test/ingestion/pipeline-update.test.ts b/packages/memory/test/ingestion/pipeline-update.test.ts index a6713c3..877c434 100644 --- a/packages/memory/test/ingestion/pipeline-update.test.ts +++ b/packages/memory/test/ingestion/pipeline-update.test.ts @@ -44,17 +44,17 @@ describe("IngestionPipeline.updateDocument (live)", () => { const client = createClient(SUPABASE_URL, SUPABASE_KEY, { auth: { persistSession: false }, }); - // iter-32 gate: the update path now requires the v0.5.0 schema - // (p_expected_content_hash / p_last_write_wins on the ingest RPC). 
- // Against an older deployed server, leave the suite skipped instead of - // failing with "function not found". + // Schema gate: the update path requires the v0.5.0 schema (iter-32 + // concurrency params) and the metadata-preserve assertions require + // v0.6.0 (p_metadata NULL = keep existing). Against an older deployed + // server, leave the suite skipped instead of failing. try { const { data: ver } = await client.rpc("cerefox_schema_version"); const [maj = 0, min = 0] = String(ver ?? "0.0.0").split(".").map(Number); - if (maj === 0 && min < 5) { + if (maj === 0 && min < 6) { schemaTooOld = true; console.log( - `(skipped: deployed schema ${ver} < 0.5.0 — run \`cerefox server deploy --schema-only\` to enable these tests)`, + `(skipped: deployed schema ${ver} < 0.6.0 — run \`cerefox server deploy --schema-only\` to enable these tests)`, ); return; } @@ -260,6 +260,86 @@ describe("IngestionPipeline.updateDocument (live)", () => { ).rejects.toThrow(/Identical content already exists/); }); + test("RPC: p_metadata NULL on update keeps existing metadata (v0.11.1)", async () => { + if (!pipeline || !supabase) return; + + // Create a doc WITH metadata via the pipeline. + const title = `${TITLE_PREFIX} meta-preserve-${RUN_TAG}`; + const createdDoc = await pipeline.ingestText({ + text: "# Meta preserve\n\nBody v1. Run " + RUN_TAG + ".\n", + title, + metadata: { type: "e2e-meta", keep: "me" }, + author: "pipeline-update-test", + }); + created.push(createdDoc.documentId); + + // Direct RPC content update WITHOUT p_metadata (and with a synthetic + // embedding — no OpenAI needed): the v0.6.0 contract is NULL = keep. 
+ const { error } = await supabase.rpc("cerefox_ingest_document", { + p_document_id: createdDoc.documentId, + p_title: title, + p_source: "agent", + p_content_hash: "e2e-meta-preserve-" + RUN_TAG, + p_review_status: "approved", + p_chunks: [ + { + chunk_index: 0, + heading_path: ["Meta preserve"], + heading_level: 1, + title: "Meta preserve", + content: "Body v2. Run " + RUN_TAG + ".", + char_count: 24, + embedding: new Array(768).fill(0.001), + embedder: "e2e-test", + }, + ], + p_author: "pipeline-update-test", + p_author_type: "agent", + p_last_write_wins: true, + // p_metadata deliberately omitted → NULL → keep existing + }); + expect(error).toBeNull(); + + const { data: row } = await supabase + .from("cerefox_documents") + .select("metadata") + .eq("id", createdDoc.documentId) + .maybeSingle(); + expect(row?.metadata).toEqual({ type: "e2e-meta", keep: "me" }); + + // And an explicit '{}' DOES clear (the deliberate-clear contract). + const { error: clearErr } = await supabase.rpc("cerefox_ingest_document", { + p_document_id: createdDoc.documentId, + p_title: title, + p_source: "agent", + p_content_hash: "e2e-meta-clear-" + RUN_TAG, + p_metadata: {}, + p_review_status: "approved", + p_chunks: [ + { + chunk_index: 0, + heading_path: ["Meta preserve"], + heading_level: 1, + title: "Meta preserve", + content: "Body v3. 
Run " + RUN_TAG + ".", + char_count: 24, + embedding: new Array(768).fill(0.001), + embedder: "e2e-test", + }, + ], + p_author: "pipeline-update-test", + p_author_type: "agent", + p_last_write_wins: true, + }); + expect(clearErr).toBeNull(); + const { data: cleared } = await supabase + .from("cerefox_documents") + .select("metadata") + .eq("id", createdDoc.documentId) + .maybeSingle(); + expect(cleared?.metadata).toEqual({}); + }); + test("update non-existent document → throws", async () => { if (!pipeline) return; const fakeId = "00000000-0000-0000-0000-000000000000"; diff --git a/packages/memory/test/read-commands.test.ts b/packages/memory/test/read-commands.test.ts index 623b662..cb469eb 100644 --- a/packages/memory/test/read-commands.test.ts +++ b/packages/memory/test/read-commands.test.ts @@ -139,10 +139,12 @@ describe("cerefox read commands (live)", () => { expect(stderr).toContain("not valid JSON"); }); - test("metadata-search: empty object → exit 1", () => { + test("metadata-search: empty object with no other criteria → exit 1", () => { + // v0.11.1: the filter alone may be empty, but at least one narrowing + // criterion (filter / project / time) is required. const { status, stderr } = run(["metadata", "search", "--metadata-filter", "{}"]); expect(status).toBe(1); - expect(stderr).toContain("non-empty"); + expect(stderr).toContain("at least one of"); }); test("search: empty query → exit 1", () => { diff --git a/packages/memory/test/write-commands.test.ts b/packages/memory/test/write-commands.test.ts index 9b3e3f6..370d2e9 100644 --- a/packages/memory/test/write-commands.test.ts +++ b/packages/memory/test/write-commands.test.ts @@ -92,7 +92,7 @@ const SCHEMA_OK = await (async () => { const client = createClient(settings); const ver = await client.rpc<string>("cerefox_schema_version", {}); const [maj = 0, min = 0] = String(ver ?? 
"0.0.0").split(".").map(Number); - return maj > 0 || min >= 5; + return maj > 0 || min >= 6; } catch { return false; } @@ -185,6 +185,8 @@ describe("cerefox write commands (live)", () => { title, "--project-name", "_e2e-v0.5", + "--metadata", + '{"type":"e2e-flow","keep":"me"}', "--update-if-exists", "--author", "v0.5-test", @@ -220,7 +222,7 @@ describe("cerefox write commands (live)", () => { if (!SCHEMA_OK) { console.log( - "(update-flow steps skipped: deployed schema < 0.5.0 — run `cerefox server deploy --schema-only`)", + "(update-flow steps skipped: deployed schema < 0.6.0 — run `cerefox server deploy --schema-only`)", ); return; } @@ -314,6 +316,28 @@ describe("cerefox write commands (live)", () => { ); expect(r6.status).toBe(0); expect(r6.stdout).toContain("updated"); + + // v0.11.1: none of the four content updates above passed --metadata, so + // the metadata set at creation must have survived all of them (the old + // `?? {}` default wiped tags on every update). + const rList = run(["document", "list", "--project", "_e2e-v0.5", "--json"]); + expect(rList.status).toBe(0); + const row = (JSON.parse(rList.stdout) as Array<{ id: string; metadata: Record<string, unknown> }>) + .find((d) => d.id === id); + expect(row?.metadata).toEqual({ type: "e2e-flow", keep: "me" }); + }); + + test("metadata search: --project-name alone lists docs (v0.11.1 parity)", () => { + const r = run(["metadata", "search", "--project-name", "_e2e-v0.5", "--json"]); + expect(r.status).toBe(0); + const rows = JSON.parse(r.stdout) as unknown[]; + expect(rows.length).toBeGreaterThan(0); + }); + + test("metadata search: no criteria at all → exit 1 with guidance", () => { + const r = run(["metadata", "search"]); + expect(r.status).toBe(1); + expect(r.stderr).toContain("at least one of"); }); test("ingest-dir: walks tree and ingests matching files", () => { diff --git a/src/cerefox/db/client.py b/src/cerefox/db/client.py index 6691aef..fada438 100644 --- a/src/cerefox/db/client.py +++ 
b/src/cerefox/db/client.py @@ -755,7 +755,9 @@ def ingest_document_rpc( "p_source": source, "p_source_path": source_path, "p_content_hash": content_hash, - "p_metadata": metadata or {}, + # v0.11.1: None = "not provided" — the RPC keeps existing metadata + # on update ({} on create). `metadata or {}` used to wipe tags. + "p_metadata": metadata, "p_review_status": review_status, "p_chunks": chunk_data, "p_author": author, diff --git a/src/cerefox/db/rpcs.sql b/src/cerefox/db/rpcs.sql index 4f78bd8..809ae82 100644 --- a/src/cerefox/db/rpcs.sql +++ b/src/cerefox/db/rpcs.sql @@ -1045,7 +1045,10 @@ $$; -- -- Parameters: -- p_document_id : NULL for create, UUID for update --- p_title, p_source, p_source_path, p_content_hash, p_metadata : document fields +-- p_title, p_source, p_source_path, p_content_hash : document fields +-- p_metadata : JSONB metadata. NULL = "not provided" → create uses '{}', +-- update keeps the existing metadata (v0.11.1). Pass '{}' +-- explicitly to clear all metadata. -- p_review_status : 'approved' or 'pending_review' (based on author_type) -- p_chunks : JSONB array of chunk objects, each with: -- chunk_index, heading_path, heading_level, title, @@ -1075,7 +1078,10 @@ CREATE FUNCTION cerefox_ingest_document( p_source TEXT DEFAULT 'agent', p_source_path TEXT DEFAULT NULL, p_content_hash TEXT DEFAULT '', - p_metadata JSONB DEFAULT '{}', + -- NULL = "not provided": create uses '{}', update KEEPS existing metadata + -- (v0.11.1 fix — content updates without metadata used to wipe tags). + -- Pass '{}' explicitly to deliberately clear all metadata. + p_metadata JSONB DEFAULT NULL, p_review_status TEXT DEFAULT 'approved', p_chunks JSONB DEFAULT '[]', p_author TEXT DEFAULT 'unknown', @@ -1181,13 +1187,14 @@ BEGIN SELECT sv.version_id INTO v_version_id FROM cerefox_snapshot_version(v_doc_id, p_source_label, p_retention_hours, p_cleanup_enabled) sv; - -- Update document record + -- Update document record. 
metadata: NULL = keep existing (v0.11.1 — + -- a content update without metadata must not wipe the document's tags). UPDATE cerefox_documents SET title = p_title, source = p_source, source_path = COALESCE(p_source_path, source_path), content_hash = p_content_hash, - metadata = p_metadata, + metadata = COALESCE(p_metadata, metadata), chunk_count = v_chunk_count, total_chars = v_total_chars, review_status = v_status, @@ -1202,7 +1209,7 @@ BEGIN title, source, source_path, content_hash, metadata, chunk_count, total_chars, review_status ) VALUES ( - p_title, p_source, p_source_path, p_content_hash, p_metadata, + p_title, p_source, p_source_path, p_content_hash, COALESCE(p_metadata, '{}'::JSONB), v_chunk_count, v_total_chars, v_status ) RETURNING id INTO v_doc_id; @@ -1760,7 +1767,7 @@ SET search_path = public, pg_catalog AS $$ -- Keep in lockstep with the `@version:` marker in schema.sql (cut_release.ts -- enforces it). Bump whenever schema.sql OR rpcs.sql changes. - SELECT '0.5.0'::TEXT; + SELECT '0.6.0'::TEXT; $$; diff --git a/src/cerefox/db/schema.sql b/src/cerefox/db/schema.sql index f8c9ef8..12c4d1f 100644 --- a/src/cerefox/db/schema.sql +++ b/src/cerefox/db/schema.sql @@ -5,7 +5,7 @@ -- Requires extensions: vector (pgvector), uuid-ossp -- These are enabled at the top of db_deploy.py before this file is applied. -- --- @version: 0.5.0 +-- @version: 0.6.0 -- The `@version` marker above is read by the schema-version-mismatch banner -- (see /api/v1/schema-version). 
Bump it whenever schema.sql OR rpcs.sql -- changes in a way that requires `cerefox server deploy` to be re-run — diff --git a/supabase/functions/cerefox-ingest/index.ts b/supabase/functions/cerefox-ingest/index.ts index ceeb127..984c6c6 100644 --- a/supabase/functions/cerefox-ingest/index.ts +++ b/supabase/functions/cerefox-ingest/index.ts @@ -17,7 +17,9 @@ import { isVersionRequest, versionResponse } from "../../../_shared/ef-meta/inde * content string required Markdown content * project_name string optional Project to assign to (looked up by name, created if absent) * source string optional Origin label (default: "agent") - * metadata object optional Arbitrary JSONB metadata + * metadata object optional Arbitrary JSONB metadata. Omitted on an + * update → existing metadata is KEPT + * (v0.11.1); pass {} explicitly to clear. * * Response: { document_id, title, chunk_count, project_id? } */ @@ -464,7 +466,10 @@ Deno.serve(async (req: Request) => { }); } - const { title, content, document_id = null, project_name, source = "agent", metadata = {}, update_if_exists = false, author = "agent", author_type = "agent", expected_content_hash = null, last_write_wins = false } = body; + // metadata: null = "not provided" — the RPC keeps existing metadata on + // update and uses {} on create (v0.11.1; a `= {}` default here used to wipe + // a document's tags on every content update that didn't re-pass them). + const { title, content, document_id = null, project_name, source = "agent", metadata = null, update_if_exists = false, author = "agent", author_type = "agent", expected_content_hash = null, last_write_wins = false } = body; // Validate + normalize project_names if provided (full-set destructive form) let project_names: string[] | null = null;