cipher813 · cipher813 · May 22, 2026 · May 22, 2026
diff --git a/.github/workflows/ci-server-extras.yml b/.github/workflows/ci-server-extras.yml
@@ -0,0 +1,65 @@
+name: "CI — [server] extras only"
+
+# Purpose: verify mnemon's production install (Fly Docker image) — which
+# uses `mnemon-memory[server]` and NOTHING else — passes the suite that
+# matters for production behavior.
+#
+# Driver: 2026-05-22 finding that `memory_check_contradictions` was
+# silently broken on Fly because the LLM code path imported deps only
+# available in the `[llm]` extra (NOT in `[server]`). All unit tests
+# passed because they ran under `[dev]` extras, which include `[llm]`
+# transitively in some operators' setups. This workflow runs the suite
+# against ONLY the `[server]` extras + a minimal test framework — if a
+# test fails here that passes in `ci.yml`, an optional dep has leaked
+# into a production path.
+#
+# Composes with the all-tools integration canary in
+# `tests/test_tools_integration.py` (PR #158): the canary catches
+# tools that raise through their wrapper at the Python level; this
+# workflow catches tools that work under `[dev]` but break under
+# `[server]`.
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+
+# Least privilege: this workflow only needs to read the repository.
+permissions:
+  contents: read
+
+jobs:
+  test-server-extras-only:
+    runs-on: ubuntu-latest
+    # Fail fast instead of riding the 360-minute default if a test hangs.
+    timeout-minutes: 30
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python 3.13
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.13"
+
+      # Install production-equivalent extras. The Fly Dockerfile runs
+      # `pip install ".[server]"`; this resolves the same extras, but as
+      # an editable (`-e`) install of the checkout.
+      - name: Install [server] extras only (production-equivalent)
+        run: python -m pip install -e ".[server]"
+
+      # Test framework installed SEPARATELY (not via the `[dev]` extra)
+      # so `[dev]` cannot pull in optional deps that would mask a
+      # server-only break. pytest itself is a test runner, not a mnemon
+      # runtime dep.
+      - name: Install test framework
+        run: python -m pip install "pytest>=8.0" "pytest-asyncio>=0.23" "httpx>=0.27"
+
+      # Verify the [llm] extras are NOT installed, using llama-cpp-python
+      # as the sentinel package. If a future PR accidentally moves it
+      # into [server], this assertion catches it. Only llama-cpp-python
+      # is checked; other [llm] packages are not asserted here.
+      - name: Assert [llm] extras NOT installed
+        run: |
+          if python -m pip show llama-cpp-python > /dev/null 2>&1; then
+            echo "::error::llama-cpp-python is installed under [server] — it must remain in [llm]"
+            exit 1
+          fi
+
+      # Run the full test suite under [server]-only install. Any test
+      # that imports something only available in [llm]/[ui] will fail
+      # here — that's a real bug worth catching at PR time, not after
+      # a Fly redeploy surfaces it client-side.
+      - name: Run tests under [server]-only install
+        run: pytest tests/ -v --tb=short
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,40 @@
 # Changelog
 
+## [0.7.0] - Unreleased
+
+### CI / release tooling
+
+- **New `.github/workflows/ci-server-extras.yml` workflow.** Installs
+  `mnemon-memory[server]` ONLY (the production-equivalent install
+  used by the Fly Docker image) plus pytest as a separate test
+  runner, and runs the full suite under that minimal install. Catches
+  the failure class that bit `memory_check_contradictions` on
+  2026-05-22 — production code that imports something from `[llm]` /
+  `[ui]` would pass `ci.yml` (full `[dev]` extras installed) but
+  fail this workflow. Includes a guard assertion that
+  `llama-cpp-python` is NOT installed under `[server]` — so a future
+  PR can't accidentally move it across without flipping the
+  intentional "mnemon is LLM-free by default" posture.
+
+- **`scripts/promote_stable.sh layer3 --exercise-all-tools`.** New
+  opt-in flag that, after the test Fly app is up but before the
+  downgrade step, invokes each registered MCP tool against the
+  remote — except `memory_save`/`memory_status` (already exercised by
+  earlier layer3 steps) and `memory_forget`/`memory_rebuild`
+  (destructive/heavy) — and asserts each returns cleanly (no opaque
+  error envelope, no unhandled exception, no NLI/embedder/baked-model
+  breakage).
+  Composes with `tests/test_tools_integration.py` (PR #158, local-
+  process Python-level canary): this Fly-level probe catches the
+  failure modes the local canary can't see (missing baked models,
+  Anthropic MCP proxy timeouts, transport regressions). Tool list
+  resolved dynamically from `mcp._tool_manager._tools` so tools
+  added in future PRs are exercised automatically. Adds ~30-60s to
+  the layer3 run; opt-in so non-NLI-touching releases aren't taxed.
+
+- **`scripts/_layer3_remote_helper.py`** gains an `exercise-all-tools`
+  subcommand wired through the FastMCP tool manager. Two regression-
+  lock tests added to `tests/test_promote_stable.sh` harness (15
+  passing, was 13) covering helper dispatch + flag plumbing through
+  the bash dispatcher.
+
 ## [0.7.0rc2] - 2026-05-22
 
 ### Features

diff --git a/scripts/_layer3_remote_helper.py b/scripts/_layer3_remote_helper.py
@@ -20,12 +20,25 @@
 Usage (from promote_stable.sh):
     .venv/bin/python scripts/_layer3_remote_helper.py status
     .venv/bin/python scripts/_layer3_remote_helper.py save TITLE CONTENT CONTENT_TYPE
+    .venv/bin/python scripts/_layer3_remote_helper.py exercise-all-tools
+
+The ``exercise-all-tools`` subcommand was added 2026-05-22 alongside
+the all-tools integration canary (``tests/test_tools_integration.py``).
+It calls every read-only-safe MCP tool against the running remote
+server and asserts each returns cleanly — surfaces Fly-specific
+breakage (model not in image, Anthropic proxy timeout, etc.) that
+the local Python-level integration test can't see.
+
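+Tools that accept ``dry_run`` (``memory_sweep``,
+``memory_check_contradictions``) are invoked with ``dry_run=True``; the
+promote/demote/pin/profile_update probes are invoked without a dry-run
+guard against the throwaway test app. Destructive or heavy tools
+(``memory_forget``, ``memory_rebuild``) are SKIPPED — the layer3
+downgrade path validates end-to-end cleanup anyway.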
 """
 
 from __future__ import annotations
 
 import json
 import sys
+import time
 
 from mnemon.hooks._remote_client import call_tool_sync
 
@@ -44,19 +57,129 @@ def _save(title: str, content: str, content_type: str) -> str:
     return result
 
 
+# ── exercise-all-tools ─────────────────────────────────────────────
+# Catches Fly-specific failures the local integration test can't see:
+# missing baked models in the image, Anthropic MCP proxy timeouts,
+# auth/transport regressions, etc. Composes with
+# tests/test_tools_integration.py (local-process canary).
+
+# Tools the layer3 sequence already exercises in other steps (skipped to
+# avoid double-counting + to keep this scoped to "all OTHER tools").
+_TOOLS_EXERCISED_ELSEWHERE = {
+    "memory_save",     # exercised at "Step 4" already
+    "memory_status",   # exercised at "Step 3" / "Step 4"
+}
+
+# Destructive or heavy tools — skip in the layer3 read-mostly path.
+# Downgrade step verifies state integrity afterwards.
+_DESTRUCTIVE_TOOLS = {
+    "memory_forget",
+    "memory_rebuild",  # heavy, re-embeds every doc
+}
+
+
+def _exercise_all_tools() -> int:
+    """Iterate the registered tool manager and call each remote-safe
+    tool. Returns 0 if all pass, 1 if any failed.
+
+    Resolves tool list dynamically from the local mnemon install — so
+    a tool added to ``server.py`` is automatically exercised on the
+    next layer3 run without editing this helper.
+    """
+    from mnemon.server import mcp
+
+    # Use the most recent live document as the target for id-requiring
+    # tools. Falls back to id=1 if the remote vault somehow has no docs
+    # (shouldn't happen post-seed in layer3, but defensive).
+    timeline_raw, _ = call_tool_sync("memory_timeline", {"limit": 1})
+    timeline = json.loads(timeline_raw)
+    target_id = timeline[0]["doc_id"] if timeline else 1
+
+    # Per-tool argument builder — mirrors tests/test_tools_integration.py
+    # _tool_inputs_for(). If a new tool ships without an entry here,
+    # the all-tools check will fail with a clear "no fixture" error.
+    inputs = {
+        "memory_search": {"query": "layer3 exercise"},
+        "memory_get": {"id": target_id},
+        "memory_timeline": {"limit": 3},
+        "memory_related": {"id": target_id, "limit": 3},
+        "memory_list_standing": {},
+        "memory_export_vectors": {},
+        "profile_get": {},
+        "memory_sweep": {"dry_run": True},
+        "memory_check_contradictions": {"id": target_id, "dry_run": True},
+        # promote/demote round-trip — operator gesture, no destructive
+        # effect on the test app's state when paired.
+        "memory_promote": {"id": target_id},
+        "memory_demote": {"id": target_id},
+        "memory_pin": {"id": target_id},
+        # profile_update needs both args
+        "profile_update": {"title": "layer3-test", "content": "layer3 exercise probe"},
+    }
+
+    registered = set(mcp._tool_manager._tools.keys())
+    to_exercise = sorted(
+        registered
+        - _TOOLS_EXERCISED_ELSEWHERE
+        - _DESTRUCTIVE_TOOLS
+    )
+
+    failures: list[tuple[str, str, str]] = []
+    for tool_name in to_exercise:
+        args = inputs.get(tool_name)
+        if args is None:
+            failures.append((
+                tool_name, "NoFixture",
+                f"no input fixture in _exercise_all_tools; add one to inputs dict",
+            ))
+            print(f"  ✗ {tool_name}: NO FIXTURE")
+            continue
+
+        t0 = time.time()
+        try:
+            result, elapsed = call_tool_sync(tool_name, args)
+            n_chars = len(result) if isinstance(result, str) else 0
+            # Catch opaque envelopes leaking through as clean results
+            if isinstance(result, str) and "Error occurred during tool execution" in result:
+                failures.append((tool_name, "OpaqueError", result[:200]))
+                print(f"  ✗ {tool_name}: OPAQUE ERROR LEAK ({elapsed:.2f}s)")
+            else:
+                print(f"  ✓ {tool_name}: {n_chars} chars in {elapsed:.2f}s")
+        except Exception as e:
+            failures.append((tool_name, type(e).__name__, str(e)[:200]))
+            print(f"  ✗ {tool_name}: {type(e).__name__}: {e}")
+
+    if failures:
+        print(f"\nFAILED: {len(failures)}/{len(to_exercise)} tools failed", file=sys.stderr)
+        for name, exc_type, msg in failures:
+            print(f"  {name}: {exc_type}: {msg}", file=sys.stderr)
+        return 1
+
+    print(f"\nPASSED: all {len(to_exercise)} exercised tools returned cleanly")
+    return 0
+
+
 def main() -> None:
     if len(sys.argv) < 2:
-        print("usage: _layer3_remote_helper.py <status|save> [args...]", file=sys.stderr)
+        print(
+            "usage: _layer3_remote_helper.py <status|save|exercise-all-tools> [args...]",
+            file=sys.stderr,
+        )
         sys.exit(2)
 
     cmd = sys.argv[1]
     if cmd == "status":
         print(_total_documents())
     elif cmd == "save":
         if len(sys.argv) < 5:
-            print("usage: _layer3_remote_helper.py save TITLE CONTENT CONTENT_TYPE", file=sys.stderr)
+            print(
+                "usage: _layer3_remote_helper.py save TITLE CONTENT CONTENT_TYPE",
+                file=sys.stderr,
+            )
             sys.exit(2)
         print(_save(sys.argv[2], sys.argv[3], sys.argv[4]))
+    elif cmd == "exercise-all-tools":
+        sys.exit(_exercise_all_tools())
     else:
         print(f"unknown command: {cmd}", file=sys.stderr)
         sys.exit(2)

diff --git a/scripts/promote_stable.sh b/scripts/promote_stable.sh
@@ -21,6 +21,7 @@
 # Usage:
 #   scripts/promote_stable.sh preflight   # read-only, runs before everything else
 #   scripts/promote_stable.sh layer3      # E2E web test (creates+destroys test Fly app, ~15 min)
+#   scripts/promote_stable.sh layer3 --exercise-all-tools   # also probe every MCP tool (~30-60s extra)
 #                                         # pins mnemon upgrade web --mnemon-version to TARGET_VERSION
 #                                         # if it's on PyPI, otherwise the latest published version
 #                                         # as proxy (LAYER3_VERSION_OVERRIDE=<ver> to override).
@@ -223,7 +224,32 @@ _layer3_cleanup() {
 }
 
 cmd_layer3() {
+    # Parse layer3-specific flags. Currently:
+    #   --exercise-all-tools  After upgrade, iterate every registered
+    #                         MCP tool against the test Fly app and
+    #                         assert each returns cleanly. Catches
+    #                         Fly-specific breakage (missing baked
+    #                         models, MCP proxy timeouts) that the
+    #                         local-process integration canary
+    #                         tests/test_tools_integration.py can't
+    #                         see. Added 2026-05-22.
+    local EXERCISE_ALL_TOOLS=0
+    while [ $# -gt 0 ]; do
+        case "$1" in
+            --exercise-all-tools)
+                EXERCISE_ALL_TOOLS=1
+                shift
+                ;;
+            *)
+                die "unknown layer3 flag: $1"
+                ;;
+        esac
+    done
+
     echo_step "Layer-3 web test — $TARGET_VERSION E2E against test-scoped Fly app"
+    if [ "$EXERCISE_ALL_TOOLS" = "1" ]; then
+        echo "  (--exercise-all-tools: every MCP tool will be invoked against the test app)"
+    fi
 
     flyctl auth whoami >/dev/null 2>&1 || die "flyctl not logged in"
     aws sts get-caller-identity >/dev/null 2>&1 || die "aws creds not configured"
@@ -363,6 +389,20 @@ cmd_layer3() {
     [[ "$remote_count" == "4" ]] || die "expected 4 docs on remote, got '$remote_count'"
     echo_ok "4 docs on remote"
 
+    # Step 4.5 — exercise every MCP tool against the test app. Opt-in
+    # via --exercise-all-tools because it adds ~30-60s to the layer3
+    # run (one HTTP round-trip per tool). Catches Fly-specific failures
+    # the local Python integration canary can't surface — missing
+    # baked models in the Docker image, Anthropic MCP proxy timeouts,
+    # transport regressions. Composes with tests/test_tools_integration.py
+    # (PR #158).
+    if [ "$EXERCISE_ALL_TOOLS" = "1" ]; then
+        echo_step "Step 4.5 — exercise all MCP tools against the test app"
+        "$MNEMON_VENV_BIN/python" "$REMOTE_HELPER" exercise-all-tools \
+            || die "all-tools exercise failed against test app — see output above"
+        echo_ok "every MCP tool returned cleanly"
+    fi
+
     echo_step "Step 5 — downgrade local + destroy fly app"
     "$M" downgrade local --destroy-fly-app
     local local_count
@@ -544,7 +584,7 @@ fi
 
 case "${1:-}" in
     preflight) cmd_preflight ;;
-    layer3)    cmd_layer3 ;;
+    layer3)    shift; cmd_layer3 "$@" ;;
     publish)   cmd_publish ;;
     verify)    cmd_verify ;;
     *)

diff --git a/tests/test_promote_stable.sh b/tests/test_promote_stable.sh
@@ -287,6 +287,45 @@ test_layer3_uses_remote_helper_not_mnemon_status() {
     return 0
 }
 
+test_helper_exposes_exercise_all_tools_subcommand() {
+    # The 2026-05-22 layer3 extension added an exercise-all-tools
+    # subcommand to _layer3_remote_helper.py. Regression-lock the
+    # dispatcher so the entry-point doesn't silently disappear in a
+    # future edit. Doesn't run the actual exercise (would need a live
+    # Fly app); just verifies the subcommand is registered + named.
+    local helper="$REPO_ROOT/scripts/_layer3_remote_helper.py"
+    [ -f "$helper" ] || { echo "    helper not found at $helper" >&2; return 1; }
+
+    # The usage string in the bare-invocation output should mention
+    # the new subcommand. (Bare invocation exits non-zero with the
+    # usage on stderr.)
+    local out
+    out="$("$REPO_ROOT/.venv/bin/python" "$helper" 2>&1)" && return 1
+    echo "$out" | grep -q "exercise-all-tools" \
+        || { echo "    usage text doesn't advertise exercise-all-tools: $out" >&2; return 1; }
+    return 0
+}
+
+test_layer3_passes_through_exercise_all_tools_flag() {
+    # The 2026-05-22 layer3 extension added a --exercise-all-tools
+    # flag wired through the dispatcher (cmd_layer3 "$@"). Regression-
+    # lock so a future refactor doesn't drop the $@ forwarding, the
+    # flag parsing, or the Step 4.5 gate. Static text checks only; the
+    # script is never executed.
+    local script="$REPO_ROOT/scripts/promote_stable.sh"
+
+    # Dispatcher line forwards args. -q keeps the matched line out of
+    # the test output; [[:space:]] is the portable spelling of \s.
+    grep -qE 'layer3\)[[:space:]]+shift;[[:space:]]+cmd_layer3 "\$@"' "$script" \
+        || { echo "    layer3 case doesn't forward args to cmd_layer3" >&2; return 1; }
+
+    # cmd_layer3 parses --exercise-all-tools into EXERCISE_ALL_TOOLS=1
+    grep -q 'EXERCISE_ALL_TOOLS=1' "$script" \
+        || { echo "    cmd_layer3 doesn't set EXERCISE_ALL_TOOLS=1" >&2; return 1; }
+
+    # And the Step 4.5 gate reads it. Match the comparison itself: a
+    # bare 'EXERCISE_ALL_TOOLS' is already implied by the assignment
+    # check above and would prove nothing.
+    grep -qF '"$EXERCISE_ALL_TOOLS" = "1"' "$script" \
+        || { echo "    Step 4.5 doesn't gate on EXERCISE_ALL_TOOLS" >&2; return 1; }
+
+    # The gated step must actually invoke the helper subcommand.
+    grep -qF '"$REMOTE_HELPER" exercise-all-tools' "$script" \
+        || { echo "    Step 4.5 doesn't invoke the helper's exercise-all-tools subcommand" >&2; return 1; }
+    return 0
+}
+
 test_publish_extracts_changelog_section_correctly() {
     # Regression for the 2026-05-21 publish-step bug: the naive awk
     # range `/^## \[VER\]/,/^## \[/` extracted ZERO bytes because
@@ -405,6 +444,8 @@ run_test test_sourcing_does_not_dispatch
 run_test test_step2_seed_contents_are_unique
 run_test test_remote_helper_exists_and_is_invokable
 run_test test_layer3_uses_remote_helper_not_mnemon_status
+run_test test_helper_exposes_exercise_all_tools_subcommand
+run_test test_layer3_passes_through_exercise_all_tools_flag
 run_test test_publish_extracts_changelog_section_correctly
 
 echo ""