From 027f03acd7b5ddddeebbe9d8d56b42b39a1b3796 Mon Sep 17 00:00:00 2001 From: Brian McMahon Date: Fri, 22 May 2026 09:37:11 -0700 Subject: [PATCH] test(tools): add all-tools integration round-trip canary MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes the unit-test-coverage gap that let memory_check_contradictions ship to Fly with a hidden bug. Pre-existing test_server.py mocks every external dep (LLM, NLI, embedder, vecstore), which is right for isolated-contract testing but leaves a gap: a tool's real call path can raise an uncaught exception that no mocked test exercises. tests/test_tools_integration.py iterates the entire registered MCP tool manager and invokes each tool against a real seeded vault with minimal-valid inputs. Three test classes: 1. test_every_tool_invokes_cleanly — no unhandled exception, return shape matches MCP contract (str/dict/list). Coverage-check assertions force every newly-registered tool to have a fixture input AND every removed tool to have its fixture cleaned up. 2. test_no_tool_returns_opaque_error_string — outputs must not contain "Error occurred during tool execution" or "Internal server error" verbatim. Opaque envelopes come from the MCP transport wrapping an escaped Python exception — never acceptable as a tool's own output. 3. test_destructive_tools_respect_dry_run — locks the dry_run contract on memory_check_contradictions + memory_sweep. Mutating inputs are stubbed to "would-decay" labels via mocked NLI; the test asserts pre-state == post-state. Heavy paths (NLI classify, FastEmbed re-embed) are stubbed so the suite stays under ~17s; real-model paths are validated by scripts/calibrate_capture_threshold.py and the operator Layer-3 web test ritual (extended in follow-up PR #158). Catches the failure class that bit memory_check_contradictions on 2026-05-22: the LLM-required path would have raised under [server] extras even with the broad try/except, and this canary's assertion "no unhandled exception" would have fired on PR #154 when the salience-tier tools were added — forcing the fix before merge. Note: this PR doesn't yet exercise the [server]-only install matrix (that's PR #158). It DOES catch any tool that raises through its wrapper regardless of extras — the universal subset of the gap. Composes with feedback_no_silent_fails: every tool must catch failures at its boundary and return a clean error string, never let an exception escape. Suite 847 → 850 passing. Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 18 +- tests/test_tools_integration.py | 326 ++++++++++++++++++++++++++++++++ 2 files changed, 343 insertions(+), 1 deletion(-) create mode 100644 tests/test_tools_integration.py diff --git a/CHANGELOG.md b/CHANGELOG.md index d471f57..0a4c7be 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -69,7 +69,23 @@ label mapping, error surfacing, availability probe. `tests/test_contradiction.py` refactored to mock the NLI layer instead of `mnemon.llm.generate`; adds dry-run mutation-skip - test + nli-unavailable clean-flag test. Suite 836 → 847 passing. + test + nli-unavailable clean-flag test. + +- **`tests/test_tools_integration.py` (3 new) — every-MCP-tool + round-trip canary.** Closes the unit-test-coverage gap that let + `memory_check_contradictions` ship to Fly with a hidden bug + (mocked unit tests passed; the real call path raised an opaque + envelope client-side). Iterates the entire registered tool + manager, invokes each tool with minimal-valid inputs against a + seeded vault, asserts no unhandled exception + sane return shape + + no opaque-error strings in outputs. Per-tool fixture entries + enforce that future tools can't ship without being exercised + here (registered-tools-vs-fixture-keys diff check). Stubs the + heavy external calls (NLI classify, FastEmbed re-embed) so the + full suite stays ~17s. Composes with the existing + `test_server.py` per-tool unit tests (mocked deps, isolated + contracts) — that suite covers logic; this suite covers + no-exception-escapes-the-boundary. Suite 836 → 850 passing. ## [0.7.0rc1] - 2026-05-22 diff --git a/tests/test_tools_integration.py b/tests/test_tools_integration.py new file mode 100644 index 0000000..3b0515f --- /dev/null +++ b/tests/test_tools_integration.py @@ -0,0 +1,326 @@ +"""End-to-end integration test — invoke every registered MCP tool +against a real seeded vault. + +Driver: 2026-05-22 finding that ``memory_check_contradictions`` +returned an opaque "Error occurred during tool execution" envelope +to claude.ai despite all unit tests passing. The unit tests in +``test_server.py`` mock external deps (LLM, NLI, embedder, vecstore), +which is appropriate for asserting per-tool behavior in isolation — +but it leaves a gap where a tool's real call path can raise an +uncaught exception (or hang) that no unit test exercises. + +This file closes that gap. The test iterates every tool registered +in the FastMCP tool manager, builds minimal-valid inputs from a +seeded vault, invokes each, and asserts: + 1. No unhandled exception (a tool that raises through FastMCP's + wrapper produces the opaque "Error occurred during tool + execution" envelope downstream — never acceptable) + 2. Return shape matches the MCP tool contract (str, dict, or list) + +Operates on a temp vault — no production state touched. Mutating +tools are constrained to safe inputs (dry_run flags, throwaway IDs) +or are run last so they can't interfere with earlier reads. + +Composes with [[feedback_no_silent_fails]] — every tool's failure +mode must be a NAMED exception caught at the tool boundary and +surfaced as a clean error message, never propagated as an +unhandled exception. This test enforces that contract. +""" + +from __future__ import annotations + +import json +import os +import tempfile +from typing import Any +from unittest.mock import patch + +import numpy as np +import pytest + +import mnemon.server as server_mod +from mnemon.embedder import embed_document +from mnemon.store import Store + + +# Tools that are too expensive to exercise with real models in +# every CI run. We still call them but stub the heavy external +# dependency to keep test runtime under a few seconds total. +# (The real-model path is separately validated by +# ``scripts/calibrate_capture_threshold.py`` + the operator +# Layer-3 web test.) +_HEAVY_TOOLS = {"memory_check_contradictions", "memory_rebuild"} + + +@pytest.fixture +def seeded_store(monkeypatch): + """Real Store with several seeded memories + vectors indexed. + + Replaces the server singleton so all MCP tools route to this + fixture. Cleans up the temp vault on teardown. + """ + fd, db_path = tempfile.mkstemp(suffix=".sqlite") + os.close(fd) + os.unlink(db_path) + + store = Store(db_path=db_path) + + # Seed three memories of distinct types so the tool inputs can + # exercise different code paths (note / preference / decision). + ids = [] + for title, content, ctype in [ + ("Tool integration seed A", "First seeded memory for tool integration testing.", "note"), + ("Tool integration seed B — preference", "A standing preference about test fixtures.", "preference"), + ("Tool integration seed C — decision", "A decision about how to handle the integration suite.", "decision"), + ]: + doc_id = store.save(title=title, content=content, content_type=ctype) + ids.append(doc_id) + doc = store.get(doc_id) + embed_document(store, doc.hash, doc.title, doc.content) + + # Reset + patch the server's _get_store to return this store + server_mod._store = None + monkeypatch.setattr(server_mod, "_get_store", lambda: store) + + yield {"store": store, "ids": ids} + + store.close() + server_mod._store = None + for ext in ("", "-wal", "-shm"): + try: + os.unlink(db_path + ext) + except FileNotFoundError: + pass + + +def _tool_inputs_for(seeded: dict[str, Any]) -> dict[str, dict]: + """Build minimal-valid inputs per registered tool name. + + Tools that mutate state are given inputs that won't interfere + with parallel reads (``dry_run=True``, throwaway IDs allocated + specifically for the mutating call, etc.). + """ + ids = seeded["ids"] + return { + # Read-only tools — exercise against any seeded id + "memory_search": {"query": "integration"}, + "memory_get": {"id": ids[0]}, + "memory_status": {}, + "memory_timeline": {"limit": 5}, + "memory_related": {"id": ids[0], "limit": 5}, + "memory_list_standing": {}, + "memory_export_vectors": {}, + "profile_get": {}, + + # Pure-create — safe, doesn't modify existing rows + "memory_save": { + "title": "Integration test write", + "content": "Content produced by the all-tools integration test.", + }, + "profile_update": { + "title": "test-profile", + "content": "test profile payload", + }, + + # Mutating tools constrained to dry-run / throwaway: + "memory_sweep": {"dry_run": True}, # default but pass explicitly + "memory_check_contradictions": {"id": ids[0], "dry_run": True}, + + # Tier mutations — promote then demote so the vault state + # round-trips. Use seed B since it's a preference (operator + # intent is to elevate this kind of memory). + "memory_promote": {"id": ids[1]}, + "memory_demote": {"id": ids[1]}, # will run after promote per ordering below + + # Pin + Forget — use a dedicated seed so the read-only tests + # before them aren't disrupted. Forget is destructive; the + # fixture tears down anyway. + "memory_pin": {"id": ids[2]}, + "memory_forget": {"id": ids[2]}, # last to run + + # Rebuild — heavy, may re-embed every doc. Tested via stub. + "memory_rebuild": {}, + } + + +# Ordering matters for mutating tools: pin/promote must run before +# forget/demote+forget on the same id. We invoke tools alphabetically +# but override the ordering for known dependent pairs. +def _tool_order(tool_names: list[str]) -> list[str]: + """Sort tools so mutations to a shared id happen in a sane order: + promote before demote (id B); pin before forget (id C); reads + before destructive writes on those ids.""" + priority = { + "memory_promote": 90, + "memory_demote": 91, # after promote + "memory_pin": 92, # after reads + "memory_forget": 99, # last — destructive + "memory_rebuild": 95, # after most reads + } + return sorted(tool_names, key=lambda n: (priority.get(n, 0), n)) + + +class TestAllToolsRoundTrip: + """Every registered MCP tool must complete without an unhandled + exception when called with minimal valid inputs. This is the + canary that catches the failure class of the 2026-05-22 + contradiction-check incident.""" + + def test_every_tool_invokes_cleanly(self, seeded_store): + from mnemon.server import mcp + + registered = set(mcp._tool_manager._tools.keys()) + inputs_map = _tool_inputs_for(seeded_store) + + # Coverage check: every registered tool has a fixture input. + # If a new tool ships without a fixture entry, this fails + # explicitly — forcing the contributor to consider how to + # exercise the new tool in CI. + missing = registered - set(inputs_map.keys()) + assert not missing, ( + f"Registered MCP tools without integration-test inputs: {sorted(missing)}. " + "Add a minimal-valid input to _tool_inputs_for() in this file." + ) + + # Conversely, ensure we don't have stale fixtures for removed tools. + stale = set(inputs_map.keys()) - registered + assert not stale, ( + f"Stale fixture inputs for unregistered tools: {sorted(stale)}. " + "Remove these from _tool_inputs_for() or restore the tool." + ) + + results: dict[str, tuple[str, Any]] = {} + for tool_name in _tool_order(list(registered)): + tool = mcp._tool_manager._tools[tool_name] + inputs = inputs_map[tool_name] + try: + if tool_name in _HEAVY_TOOLS: + # Stub the expensive external dependency so we + # exercise the wrapping tool plumbing without + # paying the full model-inference cost on every + # test run. The real path is validated by the + # Layer-3 web test ritual. + if tool_name == "memory_check_contradictions": + # Make the NLI classifier deterministic + + # fast: every pair → "unrelated", so no + # mutations land regardless of dry_run. + from mnemon.nli import BidirectionalResult, NLIResult + neutral = NLIResult( + label="neutral", + probs={"contradiction": 0.0, "entailment": 0.0, "neutral": 1.0}, + ) + bidir = BidirectionalResult( + mnemon_label="unrelated", + a_implies_b=neutral, b_implies_a=neutral, + ) + with patch("mnemon.nli.classify_pair_bidirectional", return_value=bidir): + result = tool.fn(**inputs) + elif tool_name == "memory_rebuild": + # Patch the embedder so rebuild doesn't re-run + # the FastEmbed model over every seed memory. + with patch("mnemon.embedder.embed_document"): + result = tool.fn(**inputs) + else: # pragma: no cover — exhaustive guard + result = tool.fn(**inputs) + else: + result = tool.fn(**inputs) + results[tool_name] = ("ok", result) + except Exception as exc: + pytest.fail( + f"Tool {tool_name!r} raised unhandled " + f"{type(exc).__name__}: {exc}. Inputs: {inputs}. " + f"This is the failure class that produces opaque " + f"'Error occurred during tool execution' envelopes " + f"in claude.ai/Desktop. Every tool must catch its " + f"failure modes at the boundary and return a clean " + f"error string instead.", + ) + + # Every successful call returned a value of the documented + # MCP-tool contract type. Mostly strings; the JSON-returning + # tools (memory_status, memory_list_standing, + # memory_export_vectors) return JSON strings too. + for name, (_, result) in results.items(): + assert isinstance(result, (str, dict, list)), ( + f"Tool {name!r} returned unexpected type " + f"{type(result).__name__}: {result!r}" + ) + + def test_no_tool_returns_opaque_error_string(self, seeded_store): + """A clean-error response must include either the named + exception type / specific cause OR a human-actionable + instruction. Opaque envelopes like 'Error occurred during + tool execution' must never be the tool's own output — + that envelope comes from the MCP transport layer wrapping + a Python exception that escaped the tool boundary.""" + from mnemon.server import mcp + + inputs_map = _tool_inputs_for(seeded_store) + + opaque_phrases = { + "error occurred during tool execution", + "internal server error", + # If a tool ever returns just a request_id string, that's + # the Anthropic-side wrapping leaking through — also opaque. + } + + for tool_name in _tool_order(list(mcp._tool_manager._tools.keys())): + tool = mcp._tool_manager._tools[tool_name] + inputs = inputs_map[tool_name] + if tool_name in _HEAVY_TOOLS: + continue # covered above with stubs + result = tool.fn(**inputs) + if not isinstance(result, str): + continue # JSON tools handled by the shape test + lower = result.lower() + for phrase in opaque_phrases: + assert phrase not in lower, ( + f"Tool {tool_name!r} returned an opaque error string " + f"({phrase!r} found in output). Replace with a named " + f"cause or human-actionable message. Got: {result!r}" + ) + + def test_destructive_tools_respect_dry_run(self, seeded_store): + """memory_check_contradictions(dry_run=True) and + memory_sweep(dry_run=True) must NOT mutate the vault. This + locks the dry_run contract added in PR #157.""" + from mnemon.server import mcp + + store = seeded_store["store"] + ids = seeded_store["ids"] + + # Snapshot pre-state + pre_confidences = { + i: store.get(i).confidence for i in ids + } + pre_relations = store.db.execute( + "SELECT COUNT(*) AS c FROM relations" + ).fetchone()["c"] + + # Run dry-run mutations with stubbed NLI + from mnemon.nli import BidirectionalResult, NLIResult + upd = NLIResult(label="entailment", + probs={"contradiction": 0.0, "entailment": 1.0, "neutral": 0.0}) + bidir = BidirectionalResult( + mnemon_label="update", # would-decay if not dry-run + a_implies_b=upd, b_implies_a=upd, + ) + with patch("mnemon.nli.classify_pair_bidirectional", return_value=bidir): + mcp._tool_manager._tools["memory_check_contradictions"].fn( + id=ids[0], dry_run=True, + ) + mcp._tool_manager._tools["memory_sweep"].fn(dry_run=True) + + # Confidences unchanged, no new relations + for i in ids: + assert store.get(i).confidence == pre_confidences[i], ( + f"Confidence on #{i} changed under dry_run — " + f"{pre_confidences[i]} → {store.get(i).confidence}" + ) + post_relations = store.db.execute( + "SELECT COUNT(*) AS c FROM relations" + ).fetchone()["c"] + assert post_relations == pre_relations, ( + f"Relations count changed under dry_run — " + f"{pre_relations} → {post_relations}" + )