From d4c04ba6b75342a7f9d1149603ee5c8173761df0 Mon Sep 17 00:00:00 2001
From: Brian McMahon <brian@nousergon.ai>
Date: Mon, 18 May 2026 15:02:54 -0700
Subject: [PATCH] feat(backtester): shell-run dry path for replay concordance +
 counterfactual (closes SF skip-exceptions)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes the last two Saturday-SF shell-run keystone skip-exceptions
(ReplayConcordance / Counterfactual) by giving both replay Image-Lambda
handlers a verified clean no-write dry path so the keystone can route
them dry instead of pure-skipping them.

Dry event-key (verbatim): "dry_run_llm" (boolean)
  Reuses the canonical keystone key established verbatim by the
  Research Lambda in alpha-engine-data step_function.json
  (`"dry_run_llm.$": "$.research_dry"`). No invented key. Distinct from
  the handlers' pre-existing `dry_run` event key, whose
  compute-but-don't-emit-metrics semantic is intentionally left
  untouched (backward compatible — proven by retained legacy tests).
  The SF rewire (ReplayConcordance/Counterfactual states must pass
  "dry_run_llm": true under shell_run) is a SEPARATE alpha-engine-data
  follow-on PR; not in this repo.

Shared helper (canonical, no copy-paste): replay/__init__.py
  - SHELL_RUN_DRY_EVENT_KEY = "dry_run_llm"
  - is_shell_run_dry(event)      — tolerant bool/str/None coercion
  - shell_run_dry_response(...)  — benign {"status": "DRY_RUN", ...}
  Both handlers import and call the single implementation.

Per-handler short-circuit proof:
  lambda_concordance/handler.py — the dry check sits AFTER
  _ensure_init() and AFTER the deferred `from replay.batch import
  compute_and_emit_concordance` (boot + module imports run for real —
  the keystone's whole point), but BEFORE the compute_and_emit_
  concordance call. That call is the sole entry to the replay.batch
  decision_artifacts S3 scan, the langchain_anthropic / target-model
  replay calls, the CloudWatch agent_cheap_model_concordance emit, and
  the S3 summary persist. Dry path returns before all of them: zero
  external/LLM calls, zero S3/CW writes.

  lambda_counterfactual/handler.py — symmetric: dry check after
  _ensure_init() + `from replay.counterfactual import compute_and_emit`,
  before that call (the sole entry to the decision_artifacts S3 scan +
  sklearn fit + CloudWatch agent_counterfactual_rule_fit emit + S3
  per-agent persist). No LLM on this path regardless.

  Tests assert compute_and_emit[_concordance] is never called under
  dry, _ensure_init still runs, and the SF (Catch-wrapped,
  non-blocking) gets a success status.

Known SEPARATE issue (OUT OF SCOPE here — do not conflate):
  Counterfactual times out at 600s on real Saturday runs from corpus
  growth (~32,740 artifacts; last success 2026-05-13; the SF Catch
  swallows it). This PR does not touch the scan logic and does not
  attempt to fix that timeout. Aside: because the dry path skips the
  scan entirely, it incidentally avoids the timeout under shell_run —
  but the real-Saturday timeout remains a distinct bug to be tracked
  separately.

Tests: +15 handler tests (TestShellRunDryPath x2: dry short-circuits
  before scan, string-true coercion, dry=false / absent take the real
  path, legacy `dry_run` still takes the real path) + a new
  tests/test_replay_shell_run_dry.py for the shared helper. Full
  backtester suite: 1684 passed, 5 skipped, 1 deselected (parity
  deselected as usual), 0 failed. No new deps, no secrets.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 lambda_concordance/handler.py               | 15 +++++
 lambda_counterfactual/handler.py            | 20 +++++++
 replay/__init__.py                          | 62 +++++++++++++++++++++
 tests/test_lambda_concordance_handler.py    | 61 ++++++++++++++++++++
 tests/test_lambda_counterfactual_handler.py | 58 +++++++++++++++++++
 tests/test_replay_shell_run_dry.py          | 59 ++++++++++++++++++++
 6 files changed, 275 insertions(+)
 create mode 100644 tests/test_replay_shell_run_dry.py

diff --git a/lambda_concordance/handler.py b/lambda_concordance/handler.py
index 02a0131..cd7265b 100644
--- a/lambda_concordance/handler.py
+++ b/lambda_concordance/handler.py
@@ -114,12 +114,27 @@ def handler(event: dict, context) -> dict:
 
     # Imports deferred until after _ensure_init so SSM-loaded secrets
     # are available for any module-level init that consults them.
+    from replay import is_shell_run_dry, shell_run_dry_response
     from replay.batch import (
         DEFAULT_MAX_ARTIFACTS,
         compute_and_emit_concordance,
     )
 
     t0 = time.time()
+
+    # Shell-run dry path (Saturday-SF keystone). Boot + module imports
+    # above have already run for real (the keystone's whole point —
+    # exercise bootstrap/import/lib-pin/transport). Return a benign
+    # success BEFORE the replay.batch scan (decision_artifacts S3
+    # discovery), BEFORE any langchain_anthropic / target-model call,
+    # and BEFORE any CloudWatch metric emit or S3 summary persist.
+    if is_shell_run_dry(event):
+        logger.info(
+            "[lambda_concordance] shell-run dry path: boot+imports OK, "
+            "skipping replay scan + Anthropic + S3/CW writes"
+        )
+        return shell_run_dry_response("lambda_concordance", t0)
+
     bucket = os.environ.get("S3_BUCKET", "alpha-engine-research")
 
     target_models = event.get("target_models") or ["claude-haiku-4-5"]
diff --git a/lambda_counterfactual/handler.py b/lambda_counterfactual/handler.py
index 7e5a8f4..08d60b5 100644
--- a/lambda_counterfactual/handler.py
+++ b/lambda_counterfactual/handler.py
@@ -94,9 +94,29 @@ def handler(event: dict, context) -> dict:
     """
     _ensure_init()
 
+    from replay import is_shell_run_dry, shell_run_dry_response
     from replay.counterfactual import compute_and_emit
 
     t0 = time.time()
+
+    # Shell-run dry path (Saturday-SF keystone). Boot + module imports
+    # above have already run for real. Return a benign success BEFORE
+    # the replay.counterfactual scan (decision_artifacts S3 discovery +
+    # sklearn fit), and BEFORE any CloudWatch metric emit or S3
+    # per-agent analysis persist. No LLM calls exist on this path.
+    #
+    # Side benefit (NOT the contract): because the corpus scan is
+    # skipped, this also sidesteps the known separate production
+    # Counterfactual 600s-timeout-on-corpus-growth bug under shell_run
+    # — that real-Saturday timeout remains a distinct out-of-scope
+    # issue tracked separately; the scan logic is untouched here.
+    if is_shell_run_dry(event):
+        logger.info(
+            "[lambda_counterfactual] shell-run dry path: boot+imports "
+            "OK, skipping replay scan + sklearn fit + S3/CW writes"
+        )
+        return shell_run_dry_response("lambda_counterfactual", t0)
+
     bucket = os.environ.get("S3_BUCKET", "alpha-engine-research")
 
     end_time_iso = event.get("end_time_iso")
diff --git a/replay/__init__.py b/replay/__init__.py
index 115f18b..5715141 100644
--- a/replay/__init__.py
+++ b/replay/__init__.py
@@ -29,3 +29,65 @@
   model would emit the same*. Together they cover the agent-
   justification triple alongside the counterfactual-rule-fit signal.
 """
+
+from __future__ import annotations
+
+import time
+from typing import Any
+
+# Canonical Saturday-SF shell-run dry-path event key. Established
+# verbatim by the shell-run keystone (alpha-engine-data
+# step_function.json) for the Research Lambda
+# (``"dry_run_llm.$": "$.research_dry"``); reused here so the
+# ReplayConcordance + Counterfactual states can be routed dry (boot +
+# imports for real, return a benign success before any scan / external
+# call / S3 / CloudWatch write) instead of pure-skipped. Distinct from
+# the handlers' pre-existing ``dry_run`` event key, which has a
+# different (compute-but-do-not-emit-metrics) semantic and is left
+# untouched for backward compatibility.
+SHELL_RUN_DRY_EVENT_KEY = "dry_run_llm"
+
+
+def is_shell_run_dry(event: dict | None) -> bool:
+    """True when the SF shell-run keystone routed this Lambda dry.
+
+    Reads the canonical ``dry_run_llm`` boolean off the invocation
+    event. Tolerates a missing/None event and string ``"true"``/``"1"``
+    forms (Step Functions string-parameter convenience), mirroring the
+    coercion the handlers already apply to ``agents``/``target_models``.
+    """
+    if not event:
+        return False
+    raw = event.get(SHELL_RUN_DRY_EVENT_KEY, False)
+    if isinstance(raw, str):
+        return raw.strip().lower() in {"true", "1", "yes"}
+    return bool(raw)
+
+
+def shell_run_dry_response(handler_name: str, t0: float) -> dict:
+    """Benign success envelope returned BEFORE the replay scan.
+
+    Returned by both replay Lambdas when ``is_shell_run_dry`` is true.
+    Hard invariant at the call site: zero external/LLM calls, zero
+    S3/CloudWatch writes, no decision_artifacts discovery — boot +
+    module imports have already run for real by the time this is
+    called. ``status`` is a recognised value the SF (Catch-wrapped,
+    non-blocking) treats as success.
+    """
+    return {
+        "status": "DRY_RUN",
+        "dry_run": True,
+        "handler": handler_name,
+        "note": (
+            "shell-run dry path: boot + imports executed; replay scan, "
+            "external/LLM calls, and all S3/CloudWatch writes skipped"
+        ),
+        "duration_seconds": round(time.time() - t0, 1),
+    }
+
+
+__all__ = [
+    "SHELL_RUN_DRY_EVENT_KEY",
+    "is_shell_run_dry",
+    "shell_run_dry_response",
+]
diff --git a/tests/test_lambda_concordance_handler.py b/tests/test_lambda_concordance_handler.py
index 0d1fc8f..35d95c4 100644
--- a/tests/test_lambda_concordance_handler.py
+++ b/tests/test_lambda_concordance_handler.py
@@ -261,3 +261,64 @@ def fake_compute(**kwargs):
                 context=None,
             )
         assert captured["agent_filter"] == ["sector_quant", "ic_cio"]
+
+
+# ── Shell-run dry path (Saturday-SF keystone) ────────────────────────────
+
+
+class TestShellRunDryPath:
+    """`dry_run_llm: true` (the canonical keystone shell-run key) must
+    short-circuit BEFORE the replay scan: no compute_and_emit_concordance
+    call (so no decision_artifacts S3 discovery, no langchain_anthropic /
+    target-model call, no CloudWatch metric emit, no S3 summary
+    persist), boot + module imports still run, and a benign success
+    envelope is returned."""
+
+    def test_dry_run_llm_short_circuits_before_scan(self, handler_mod):
+        with patch.object(handler_mod, "_ensure_init") as m_init, \
+             patch("replay.batch.compute_and_emit_concordance") as m_compute:
+            result = handler_mod.handler({"dry_run_llm": True}, context=None)
+
+        # Boot/init still ran for real (the keystone's whole point).
+        m_init.assert_called_once()
+        # The replay scan / Anthropic / S3+CW path was never entered.
+        m_compute.assert_not_called()
+        # SF (Catch-wrapped, non-blocking) treats this as success.
+        assert result["status"] == "DRY_RUN"
+        assert result["dry_run"] is True
+        assert result["handler"] == "lambda_concordance"
+        assert "duration_seconds" in result
+
+    def test_dry_run_llm_string_true_coerced(self, handler_mod):
+        with patch.object(handler_mod, "_ensure_init"), \
+             patch("replay.batch.compute_and_emit_concordance") as m_compute:
+            result = handler_mod.handler({"dry_run_llm": "true"}, context=None)
+        m_compute.assert_not_called()
+        assert result["status"] == "DRY_RUN"
+
+    def test_dry_run_llm_false_takes_real_path(self, handler_mod):
+        with patch.object(handler_mod, "_ensure_init"), \
+             patch("replay.batch.compute_and_emit_concordance",
+                   return_value=_ok_summary()) as m_compute:
+            result = handler_mod.handler({"dry_run_llm": False}, context=None)
+        m_compute.assert_called_once()
+        assert result["status"] == "OK"
+
+    def test_absent_dry_run_llm_takes_real_path(self, handler_mod):
+        with patch.object(handler_mod, "_ensure_init"), \
+             patch("replay.batch.compute_and_emit_concordance",
+                   return_value=_ok_summary()) as m_compute:
+            result = handler_mod.handler({}, context=None)
+        m_compute.assert_called_once()
+        assert result["status"] == "OK"
+
+    def test_legacy_dry_run_key_still_takes_real_path(self, handler_mod):
+        """The pre-existing `dry_run` (compute-but-don't-emit-metrics)
+        semantic is preserved — it must NOT short-circuit the scan."""
+        with patch.object(handler_mod, "_ensure_init"), \
+             patch("replay.batch.compute_and_emit_concordance",
+                   return_value=_ok_summary()) as m_compute:
+            result = handler_mod.handler({"dry_run": True}, context=None)
+        m_compute.assert_called_once()
+        assert m_compute.call_args.kwargs["emit_metrics"] is False
+        assert result["status"] == "OK"
diff --git a/tests/test_lambda_counterfactual_handler.py b/tests/test_lambda_counterfactual_handler.py
index c4651c4..3a2ae1c 100644
--- a/tests/test_lambda_counterfactual_handler.py
+++ b/tests/test_lambda_counterfactual_handler.py
@@ -208,3 +208,61 @@ def fake_compute(**kwargs):
                 context=None,
             )
         assert captured["agent_filter"] == ["ic_cio"]
+
+
+# ── Shell-run dry path (Saturday-SF keystone) ────────────────────────────
+
+
+class TestShellRunDryPath:
+    """`dry_run_llm: true` (the canonical keystone shell-run key) must
+    short-circuit BEFORE the replay scan: no compute_and_emit call (so
+    no decision_artifacts S3 discovery, no sklearn fit, no CloudWatch
+    metric emit, no S3 per-agent persist), boot + module imports still
+    run, and a benign success envelope is returned. No LLM calls exist
+    on this handler's path regardless."""
+
+    def test_dry_run_llm_short_circuits_before_scan(self, handler_mod):
+        with patch.object(handler_mod, "_ensure_init") as m_init, \
+             patch("replay.counterfactual.compute_and_emit") as m_compute:
+            result = handler_mod.handler({"dry_run_llm": True}, context=None)
+
+        m_init.assert_called_once()
+        m_compute.assert_not_called()
+        assert result["status"] == "DRY_RUN"
+        assert result["dry_run"] is True
+        assert result["handler"] == "lambda_counterfactual"
+        assert "duration_seconds" in result
+
+    def test_dry_run_llm_string_true_coerced(self, handler_mod):
+        with patch.object(handler_mod, "_ensure_init"), \
+             patch("replay.counterfactual.compute_and_emit") as m_compute:
+            result = handler_mod.handler({"dry_run_llm": "1"}, context=None)
+        m_compute.assert_not_called()
+        assert result["status"] == "DRY_RUN"
+
+    def test_dry_run_llm_false_takes_real_path(self, handler_mod):
+        with patch.object(handler_mod, "_ensure_init"), \
+             patch("replay.counterfactual.compute_and_emit",
+                   return_value=_ok_summary()) as m_compute:
+            result = handler_mod.handler({"dry_run_llm": False}, context=None)
+        m_compute.assert_called_once()
+        assert result["status"] == "OK"
+
+    def test_absent_dry_run_llm_takes_real_path(self, handler_mod):
+        with patch.object(handler_mod, "_ensure_init"), \
+             patch("replay.counterfactual.compute_and_emit",
+                   return_value=_ok_summary()) as m_compute:
+            result = handler_mod.handler({}, context=None)
+        m_compute.assert_called_once()
+        assert result["status"] == "OK"
+
+    def test_legacy_dry_run_key_still_takes_real_path(self, handler_mod):
+        """The pre-existing `dry_run` (compute-but-don't-emit-metrics)
+        semantic is preserved — it must NOT short-circuit the scan."""
+        with patch.object(handler_mod, "_ensure_init"), \
+             patch("replay.counterfactual.compute_and_emit",
+                   return_value=_ok_summary()) as m_compute:
+            result = handler_mod.handler({"dry_run": True}, context=None)
+        m_compute.assert_called_once()
+        assert m_compute.call_args.kwargs["emit_metrics"] is False
+        assert result["status"] == "OK"
diff --git a/tests/test_replay_shell_run_dry.py b/tests/test_replay_shell_run_dry.py
new file mode 100644
index 0000000..33bbe27
--- /dev/null
+++ b/tests/test_replay_shell_run_dry.py
@@ -0,0 +1,59 @@
+"""Unit tests for the shared shell-run dry helper in replay/__init__.py.
+
+The helper is the single canonical (no-copy-paste) implementation used
+by BOTH lambda_concordance/handler.py and lambda_counterfactual/
+handler.py to short-circuit the Saturday-SF shell-run dry path before
+any replay scan / external call / S3 / CloudWatch write.
+"""
+
+from __future__ import annotations
+
+from replay import (
+    SHELL_RUN_DRY_EVENT_KEY,
+    is_shell_run_dry,
+    shell_run_dry_response,
+)
+
+
+class TestEventKey:
+    def test_canonical_key_is_dry_run_llm(self):
+        # Verbatim match with the keystone's Research-Lambda key
+        # (`"dry_run_llm.$": "$.research_dry"` in step_function.json).
+        assert SHELL_RUN_DRY_EVENT_KEY == "dry_run_llm"
+
+
+class TestIsShellRunDry:
+    def test_true_bool(self):
+        assert is_shell_run_dry({"dry_run_llm": True}) is True
+
+    def test_false_bool(self):
+        assert is_shell_run_dry({"dry_run_llm": False}) is False
+
+    def test_absent_key(self):
+        assert is_shell_run_dry({}) is False
+
+    def test_none_event(self):
+        assert is_shell_run_dry(None) is False
+
+    def test_string_true_forms(self):
+        for v in ("true", "True", "TRUE", "1", "yes", " true "):
+            assert is_shell_run_dry({"dry_run_llm": v}) is True
+
+    def test_string_false_forms(self):
+        for v in ("false", "0", "no", ""):
+            assert is_shell_run_dry({"dry_run_llm": v}) is False
+
+    def test_legacy_dry_run_key_does_not_trigger(self):
+        # The pre-existing `dry_run` (compute-but-don't-emit) key must
+        # NOT be interpreted as the shell-run short-circuit signal.
+        assert is_shell_run_dry({"dry_run": True}) is False
+
+
+class TestShellRunDryResponse:
+    def test_envelope_shape(self):
+        resp = shell_run_dry_response("lambda_concordance", 0.0)
+        assert resp["status"] == "DRY_RUN"
+        assert resp["dry_run"] is True
+        assert resp["handler"] == "lambda_concordance"
+        assert "note" in resp
+        assert isinstance(resp["duration_seconds"], float)