From 7e30af180104bd67fcb634feca50b06246895e5d Mon Sep 17 00:00:00 2001 From: Brian McMahon Date: Fri, 22 May 2026 10:16:54 -0700 Subject: [PATCH] ci(server-extras): [server]-only matrix + layer3 --exercise-all-tools MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follow-up to PR #158 — closes the [server]-extras gap that the local integration canary can't see + extends the operator Layer-3 web test ritual to probe every MCP tool against the live Fly app. Two artifacts: 1. .github/workflows/ci-server-extras.yml — installs mnemon-memory[server] ONLY (the Fly Docker install) + pytest as a separate test runner. Runs the full suite under that minimal install. Includes a guard that asserts llama-cpp-python is NOT installed under [server] — so future PRs can't accidentally drag the LLM dep into the production path. This is the workflow that would have caught memory_check_contradictions's LLM hard-dependency on PR #154 when the salience-tier tools were first added; ci.yml passed because [dev] installs everything. 2. scripts/promote_stable.sh layer3 --exercise-all-tools — opt-in flag that, after the test Fly app is up but before downgrade, iterates every registered MCP tool against the remote and asserts each returns cleanly. Catches Fly-specific breakage (missing baked models, Anthropic MCP proxy timeouts, transport regressions) that the local Python-level canary in tests/test_tools_integration.py can't see. Tool list resolved dynamically from mcp._tool_manager._tools, so tools added in future PRs are exercised automatically — no per- release maintenance burden. Per-tool inputs mirror the integration- test fixture; destructive tools (memory_forget, memory_rebuild) skipped; mutating tools constrained to dry_run / round-trip. scripts/_layer3_remote_helper.py gains an exercise-all-tools subcommand wired through the FastMCP tool manager. 
Two regression- lock tests added to tests/test_promote_stable.sh harness (13 → 15 passing) covering helper dispatch + flag plumbing through the bash dispatcher (cmd_layer3 "$@" forwarding + EXERCISE_ALL_TOOLS=1 set). Full Python suite still 850 passing. Driver: Brian's 2026-05-22 ask after the memory_check_contradictions incident — "given the difficulty of checking each individual mnemon tool available, are we properly using unit tests to confirm that everything works as expected?" PR #158 addressed the Python-level canary; this PR addresses the deployment-environment + Fly-level canary. Together they form the test trio for catching the 2026-05-22 failure class on the next PR rather than on the next operator MCP call. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/ci-server-extras.yml | 65 +++++++++++++ CHANGELOG.md | 35 +++++++ scripts/_layer3_remote_helper.py | 127 ++++++++++++++++++++++++- scripts/promote_stable.sh | 42 +++++++- tests/test_promote_stable.sh | 41 ++++++++ 5 files changed, 307 insertions(+), 3 deletions(-) create mode 100644 .github/workflows/ci-server-extras.yml diff --git a/.github/workflows/ci-server-extras.yml b/.github/workflows/ci-server-extras.yml new file mode 100644 index 0000000..a705356 --- /dev/null +++ b/.github/workflows/ci-server-extras.yml @@ -0,0 +1,65 @@ +name: CI — [server] extras only + +# Purpose: verify mnemon's production install (Fly Docker image) — which +# uses `mnemon-memory[server]` and NOTHING else — passes the suite that +# matters for production behavior. +# +# Driver: 2026-05-22 finding that `memory_check_contradictions` was +# silently broken on Fly because the LLM code path imported deps only +# available in the `[llm]` extra (NOT in `[server]`). All unit tests +# passed because they ran under `[dev]` extras, which include `[llm]` +# transitively in some operators' setups. 
This workflow runs the suite
+# against ONLY the `[server]` extras + a minimal test framework — if a
+# test fails here that passes in `ci.yml`, an optional dep has leaked
+# into a production path.
+#
+# Composes with the all-tools integration canary in
+# `tests/test_tools_integration.py` (PR #158): the canary catches
+# tools that raise through their wrapper at the Python level; this
+# workflow catches tools that work under `[dev]` but break under
+# `[server]`.
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+
+jobs:
+  test-server-extras-only:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python 3.13
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.13"
+
+      # Install production-equivalent extras. The Fly Dockerfile runs
+      # `pip install ".[server]"`; same extras here, but editable (-e).
+      - name: Install [server] extras only (production-equivalent)
+        run: pip install -e ".[server]"
+
+      # Test framework installed SEPARATELY (not via `[dev]` extra) so
+      # we don't accidentally pull in anything via mnemon's own
+      # imports that would mask a server-only break. pytest itself is
+      # a test runner, not a mnemon runtime dep.
+      - name: Install test framework
+        run: pip install "pytest>=8.0" "pytest-asyncio>=0.23" "httpx>=0.27"
+
+      # Verify the [llm] extras are NOT installed. If a future PR
+      # accidentally moves llama-cpp-python into [server], this assertion
+      # catches it. (Only llama-cpp-python is probed, not huggingface-hub.)
+      - name: Assert [llm] extras NOT installed
+        run: |
+          ! pip show llama-cpp-python > /dev/null 2>&1 \
+            || (echo "::error::llama-cpp-python is installed under [server] — it must remain in [llm]"; exit 1)
+
+      # Run the full test suite under [server]-only install. Any test
+      # that imports something only available in [llm]/[ui] will fail
+      # here — that's a real bug worth catching at PR time, not after
+      # a Fly redeploy surfaces it client-side.
+ - name: Run tests under [server]-only install + run: pytest tests/ -v --tb=short diff --git a/CHANGELOG.md b/CHANGELOG.md index 0a4c7be..54ddfaf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,40 @@ # Changelog +## [0.7.0] - Unreleased + +### CI / release tooling + +- **New `.github/workflows/ci-server-extras.yml` workflow.** Installs + `mnemon-memory[server]` ONLY (the production-equivalent install + used by the Fly Docker image) plus pytest as a separate test + runner, and runs the full suite under that minimal install. Catches + the failure class that bit `memory_check_contradictions` on + 2026-05-22 — production code that imports something from `[llm]` / + `[ui]` would pass `ci.yml` (full `[dev]` extras installed) but + fail this workflow. Includes a guard assertion that + `llama-cpp-python` is NOT installed under `[server]` — so a future + PR can't accidentally move it across without flipping the + intentional "mnemon is LLM-free by default" posture. + +- **`scripts/promote_stable.sh layer3 --exercise-all-tools`.** New + opt-in flag that, after the test Fly app is up but before the + downgrade step, iterates every registered MCP tool against the + remote and asserts each returns cleanly (no opaque error envelope, + no unhandled exception, no NLI/embedder/baked-model breakage). + Composes with `tests/test_tools_integration.py` (PR #158, local- + process Python-level canary): this Fly-level probe catches the + failure modes the local canary can't see (missing baked models, + Anthropic MCP proxy timeouts, transport regressions). Tool list + resolved dynamically from `mcp._tool_manager._tools` so tools + added in future PRs are exercised automatically. Adds ~30-60s to + the layer3 run; opt-in so non-NLI-touching releases aren't taxed. + +- **`scripts/_layer3_remote_helper.py`** gains an `exercise-all-tools` + subcommand wired through the FastMCP tool manager. 
Two regression- + lock tests added to `tests/test_promote_stable.sh` harness (15 + passing, was 13) covering helper dispatch + flag plumbing through + the bash dispatcher. + ## [0.7.0rc2] - 2026-05-22 ### Features diff --git a/scripts/_layer3_remote_helper.py b/scripts/_layer3_remote_helper.py index 09a2e93..700c479 100644 --- a/scripts/_layer3_remote_helper.py +++ b/scripts/_layer3_remote_helper.py @@ -20,12 +20,25 @@ Usage (from promote_stable.sh): .venv/bin/python scripts/_layer3_remote_helper.py status .venv/bin/python scripts/_layer3_remote_helper.py save TITLE CONTENT CONTENT_TYPE + .venv/bin/python scripts/_layer3_remote_helper.py exercise-all-tools + +The ``exercise-all-tools`` subcommand was added 2026-05-22 alongside +the all-tools integration canary (``tests/test_tools_integration.py``). +It calls every read-only-safe MCP tool against the running remote +server and asserts each returns cleanly — surfaces Fly-specific +breakage (model not in image, Anthropic proxy timeout, etc.) that +the local Python-level integration test can't see. + +Tools that mutate state are constrained to ``dry_run=True`` invocations. +Destructive tools (``memory_forget``) are SKIPPED — the layer3 downgrade +path validates end-to-end cleanup anyway. """ from __future__ import annotations import json import sys +import time from mnemon.hooks._remote_client import call_tool_sync @@ -44,9 +57,114 @@ def _save(title: str, content: str, content_type: str) -> str: return result +# ── exercise-all-tools ───────────────────────────────────────────── +# Catches Fly-specific failures the local integration test can't see: +# missing baked models in the image, Anthropic MCP proxy timeouts, +# auth/transport regressions, etc. Composes with +# tests/test_tools_integration.py (local-process canary). + +# Tools the layer3 sequence already exercises elsewhere (skip to avoid +# double-counting + to keep this scoped to "all OTHER tools"). 
+_TOOLS_EXERCISED_ELSEWHERE = {
+    "memory_save",  # exercised at "Step 4" already
+    "memory_status",  # exercised at "Step 3" / "Step 4"
+}
+
+# Destructive tools — skip in the layer3 read-mostly path. Downgrade
+# step verifies state integrity afterwards.
+_DESTRUCTIVE_TOOLS = {
+    "memory_forget",
+    "memory_rebuild",  # heavy, re-embeds every doc
+}
+
+
+def _exercise_all_tools() -> int:
+    """Iterate the registered tool manager and call each remote-safe
+    tool. Returns 0 if all pass, 1 if any failed.
+
+    Resolves tool list dynamically from the local mnemon install — so
+    a tool added to ``server.py`` is automatically exercised on the
+    next layer3 run without editing this helper.
+    """
+    from mnemon.server import mcp
+
+    # Use the most recent live document as the target for id-requiring
+    # tools. Falls back to id=1 if the remote vault somehow has no docs
+    # (shouldn't happen post-seed in layer3, but defensive).
+    timeline_raw, _ = call_tool_sync("memory_timeline", {"limit": 1})
+    timeline = json.loads(timeline_raw)
+    target_id = timeline[0]["doc_id"] if timeline else 1
+
+    # Per-tool argument builder — mirrors tests/test_tools_integration.py
+    # _tool_inputs_for(). If a new tool ships without an entry here,
+    # the all-tools check will fail with a clear "no fixture" error.
+    inputs = {
+        "memory_search": {"query": "layer3 exercise"},
+        "memory_get": {"id": target_id},
+        "memory_timeline": {"limit": 3},
+        "memory_related": {"id": target_id, "limit": 3},
+        "memory_list_standing": {},
+        "memory_export_vectors": {},
+        "profile_get": {},
+        "memory_sweep": {"dry_run": True},
+        "memory_check_contradictions": {"id": target_id, "dry_run": True},
+        # promote/demote round-trip — order matters: the loop below walks
+        # this dict in insertion order, so keep promote ahead of demote.
+        "memory_promote": {"id": target_id},
+        "memory_demote": {"id": target_id},
+        "memory_pin": {"id": target_id},
+        # profile_update needs both args
+        "profile_update": {"title": "layer3-test", "content": "layer3 exercise probe"},
+    }
+
+    registered = set(mcp._tool_manager._tools.keys())
+    candidates = registered - _TOOLS_EXERCISED_ELSEWHERE - _DESTRUCTIVE_TOOLS
+    # Fixture order first (dicts keep insertion order) so memory_promote
+    # precedes memory_demote — sorted() ran demote first. Unfixtured tools follow.
+    to_exercise = [name for name in inputs if name in candidates]
+    to_exercise += sorted(candidates - inputs.keys())
+
+    failures: list[tuple[str, str, str]] = []
+    for tool_name in to_exercise:
+        args = inputs.get(tool_name)
+        if args is None:
+            failures.append((
+                tool_name, "NoFixture",
+                "no input fixture in _exercise_all_tools; add one to inputs dict",
+            ))
+            print(f" ✗ {tool_name}: NO FIXTURE")
+            continue
+
+        t0 = time.time()  # wall clock for the failure path, where `elapsed` is never returned
+        try:
+            result, elapsed = call_tool_sync(tool_name, args)
+            n_chars = len(result) if isinstance(result, str) else 0
+            # Catch opaque envelopes leaking through as clean results
+            if isinstance(result, str) and "Error occurred during tool execution" in result:
+                failures.append((tool_name, "OpaqueError", result[:200]))
+                print(f" ✗ {tool_name}: OPAQUE ERROR LEAK ({elapsed:.2f}s)")
+            else:
+                print(f" ✓ {tool_name}: {n_chars} chars in {elapsed:.2f}s")
+        except Exception as e:
+            failures.append((tool_name, type(e).__name__, str(e)[:200]))
+            print(f" ✗ {tool_name}: {type(e).__name__} after {time.time() - t0:.2f}s: {e}")
+
+    if failures:
+        print(f"\nFAILED: {len(failures)}/{len(to_exercise)} tools failed", file=sys.stderr)
+        for name, exc_type, msg in failures:
+            print(f" {name}: {exc_type}: {msg}", file=sys.stderr)
+        return 1
+
+    print(f"\nPASSED: all {len(to_exercise)} exercised tools returned cleanly")
+    return 0
+
+
 def main() -> None:
     if len(sys.argv) < 2:
-        print("usage: _layer3_remote_helper.py [args...]", file=sys.stderr)
+        print(
+            "usage: _layer3_remote_helper.py {status,save,exercise-all-tools} [args...]",
+            file=sys.stderr,
+        )
         sys.exit(2)
 
     cmd = sys.argv[1]
@@ -54,9 +172,14 @@ def main() -> None:
         print(_total_documents())
     elif cmd == "save":
         if len(sys.argv) < 5:
-            print("usage: _layer3_remote_helper.py save TITLE CONTENT
CONTENT_TYPE", file=sys.stderr) + print( + "usage: _layer3_remote_helper.py save TITLE CONTENT CONTENT_TYPE", + file=sys.stderr, + ) sys.exit(2) print(_save(sys.argv[2], sys.argv[3], sys.argv[4])) + elif cmd == "exercise-all-tools": + sys.exit(_exercise_all_tools()) else: print(f"unknown command: {cmd}", file=sys.stderr) sys.exit(2) diff --git a/scripts/promote_stable.sh b/scripts/promote_stable.sh index cf46c03..4a93c07 100755 --- a/scripts/promote_stable.sh +++ b/scripts/promote_stable.sh @@ -21,6 +21,7 @@ # Usage: # scripts/promote_stable.sh preflight # read-only, runs before everything else # scripts/promote_stable.sh layer3 # E2E web test (creates+destroys test Fly app, ~15 min) +# scripts/promote_stable.sh layer3 --exercise-all-tools # also probe every MCP tool (~30-60s extra) # # pins mnemon upgrade web --mnemon-version to TARGET_VERSION # # if it's on PyPI, otherwise the latest published version # # as proxy (LAYER3_VERSION_OVERRIDE= to override). @@ -223,7 +224,32 @@ _layer3_cleanup() { } cmd_layer3() { + # Parse layer3-specific flags. Currently: + # --exercise-all-tools After upgrade, iterate every registered + # MCP tool against the test Fly app and + # assert each returns cleanly. Catches + # Fly-specific breakage (missing baked + # models, MCP proxy timeouts) that the + # local-process integration canary + # tests/test_tools_integration.py can't + # see. Added 2026-05-22. 
+ local EXERCISE_ALL_TOOLS=0 + while [ $# -gt 0 ]; do + case "$1" in + --exercise-all-tools) + EXERCISE_ALL_TOOLS=1 + shift + ;; + *) + die "unknown layer3 flag: $1" + ;; + esac + done + echo_step "Layer-3 web test — $TARGET_VERSION E2E against test-scoped Fly app" + if [ "$EXERCISE_ALL_TOOLS" = "1" ]; then + echo " (--exercise-all-tools: every MCP tool will be invoked against the test app)" + fi flyctl auth whoami >/dev/null 2>&1 || die "flyctl not logged in" aws sts get-caller-identity >/dev/null 2>&1 || die "aws creds not configured" @@ -363,6 +389,20 @@ cmd_layer3() { [[ "$remote_count" == "4" ]] || die "expected 4 docs on remote, got '$remote_count'" echo_ok "4 docs on remote" + # Step 4.5 — exercise every MCP tool against the test app. Opt-in + # via --exercise-all-tools because it adds ~30-60s to the layer3 + # run (one HTTP round-trip per tool). Catches Fly-specific failures + # the local Python integration canary can't surface — missing + # baked models in the Docker image, Anthropic MCP proxy timeouts, + # transport regressions. Composes with tests/test_tools_integration.py + # (PR #158). 
+    if [ "$EXERCISE_ALL_TOOLS" = "1" ]; then
+        echo_step "Step 4.5 — exercise all MCP tools against the test app"
+        "$MNEMON_VENV_BIN/python" "$REMOTE_HELPER" exercise-all-tools \
+            || die "all-tools exercise failed against test app — see output above"
+        echo_ok "every MCP tool returned cleanly"
+    fi
+
     echo_step "Step 5 — downgrade local + destroy fly app"
     "$M" downgrade local --destroy-fly-app
     local local_count
@@ -544,7 +584,7 @@ fi
 
 case "${1:-}" in
     preflight) cmd_preflight ;;
-    layer3) cmd_layer3 ;;
+    layer3) shift; cmd_layer3 "$@" ;;
     publish) cmd_publish ;;
     verify) cmd_verify ;;
     *)
diff --git a/tests/test_promote_stable.sh b/tests/test_promote_stable.sh
index cae4375..e250f5d 100755
--- a/tests/test_promote_stable.sh
+++ b/tests/test_promote_stable.sh
@@ -287,6 +287,45 @@ test_layer3_uses_remote_helper_not_mnemon_status() {
     return 0
 }
 
+test_helper_exposes_exercise_all_tools_subcommand() {
+    # The 2026-05-22 layer3 extension added an exercise-all-tools
+    # subcommand to _layer3_remote_helper.py. Regression-lock the
+    # dispatcher so the entry-point doesn't silently disappear in a
+    # future edit. Doesn't run the actual exercise (would need a live
+    # Fly app); just verifies the subcommand is registered + named.
+    local helper="$REPO_ROOT/scripts/_layer3_remote_helper.py"
+    [ -f "$helper" ] || { echo " helper not found at $helper" >&2; return 1; }
+
+    # The usage string in the bare-invocation output should mention
+    # the new subcommand. (Bare invocation exits non-zero with the
+    # usage on stderr; a zero exit is itself a failure, so say so.)
+    local out
+    out="$("$REPO_ROOT/.venv/bin/python" "$helper" 2>&1)" && { echo " bare helper invocation unexpectedly exited 0" >&2; return 1; }
+    grep -qF -- "exercise-all-tools" <<<"$out" \
+        || { echo " usage text doesn't advertise exercise-all-tools: $out" >&2; return 1; }
+    return 0
+}
+
+test_layer3_passes_through_exercise_all_tools_flag() {
+    # The 2026-05-22 layer3 extension added a --exercise-all-tools
+    # flag wired through the dispatcher (cmd_layer3 "$@"). Regression-
+    # lock so a future refactor doesn't drop the $@ forwarding.
+    local script="$REPO_ROOT/scripts/promote_stable.sh"
+
+    # Dispatcher line forwards args (-q: keep the match out of test output)
+    grep -qE 'layer3\)[[:space:]]+shift;[[:space:]]+cmd_layer3 "\$@"' "$script" \
+        || { echo " layer3 case doesn't forward args to cmd_layer3" >&2; return 1; }
+
+    # cmd_layer3 parses --exercise-all-tools into EXERCISE_ALL_TOOLS=1
+    grep -q 'EXERCISE_ALL_TOOLS=1' "$script" \
+        || { echo " cmd_layer3 doesn't set EXERCISE_ALL_TOOLS=1" >&2; return 1; }
+
+    # And the gate tests the flag (a bare name match is implied by the check above)
+    grep -qF -- 'if [ "$EXERCISE_ALL_TOOLS" = "1" ]' "$script" \
+        || { echo " Step 4.5 doesn't gate on EXERCISE_ALL_TOOLS" >&2; return 1; }
+    return 0
+}
+
 test_publish_extracts_changelog_section_correctly() {
     # Regression for the 2026-05-21 publish-step bug: the naive awk
     # range `/^## \[VER\]/,/^## \[/` extracted ZERO bytes because
@@ -405,6 +444,8 @@ run_test test_sourcing_does_not_dispatch
 run_test test_step2_seed_contents_are_unique
 run_test test_remote_helper_exists_and_is_invokable
 run_test test_layer3_uses_remote_helper_not_mnemon_status
+run_test test_helper_exposes_exercise_all_tools_subcommand
+run_test test_layer3_passes_through_exercise_all_tools_flag
 run_test test_publish_extracts_changelog_section_correctly
 
 echo ""