From d4c04ba6b75342a7f9d1149603ee5c8173761df0 Mon Sep 17 00:00:00 2001 From: Brian McMahon Date: Mon, 18 May 2026 15:02:54 -0700 Subject: [PATCH] feat(backtester): shell-run dry path for replay concordance + counterfactual (closes SF skip-exceptions) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes the last two Saturday-SF shell-run keystone skip-exceptions (ReplayConcordance / Counterfactual) by giving both replay Image-Lambda handlers a verified clean no-write dry path so the keystone can route them dry instead of pure-skipping them. Dry event-key (verbatim): "dry_run_llm" (boolean) Reuses the canonical keystone key established verbatim by the Research Lambda in alpha-engine-data step_function.json (`"dry_run_llm.$": "$.research_dry"`). No invented key. Distinct from the handlers' pre-existing `dry_run` event key, whose compute-but-don't-emit-metrics semantic is intentionally left untouched (backward compatible — proven by retained legacy tests). The SF rewire (ReplayConcordance/Counterfactual states must pass "dry_run_llm": true under shell_run) is a SEPARATE alpha-engine-data follow-on PR; not in this repo. Shared helper (canonical, no copy-paste): replay/__init__.py - SHELL_RUN_DRY_EVENT_KEY = "dry_run_llm" - is_shell_run_dry(event) — tolerant bool/str/None coercion - shell_run_dry_response(...) — benign {"status": "DRY_RUN", ...} Both handlers import and call the single implementation. Per-handler short-circuit proof: lambda_concordance/handler.py — the dry check sits AFTER _ensure_init() and AFTER the deferred `from replay.batch import compute_and_emit_concordance` (boot + module imports run for real — the keystone's whole point), but BEFORE the compute_and_emit_concordance call. That call is the sole entry to the replay.batch decision_artifacts S3 scan, the langchain_anthropic / target-model replay calls, the CloudWatch agent_cheap_model_concordance emit, and the S3 summary persist. 
Dry path returns before all of them: zero external/LLM calls, zero S3/CW writes. lambda_counterfactual/handler.py — symmetric: dry check after _ensure_init() + `from replay.counterfactual import compute_and_emit`, before that call (the sole entry to the decision_artifacts S3 scan + sklearn fit + CloudWatch agent_counterfactual_rule_fit emit + S3 per-agent persist). No LLM on this path regardless. Tests assert compute_and_emit[_concordance] is never called under dry, _ensure_init still runs, and the SF (Catch-wrapped, non-blocking) gets a success status. Known SEPARATE issue (OUT OF SCOPE here — do not conflate): Counterfactual times out at 600s on real Saturday runs from corpus growth (~32,740 artifacts; last success 2026-05-13; the SF Catch swallows it). This PR does not touch the scan logic and does not attempt to fix that timeout. Aside: because the dry path skips the scan entirely, it incidentally avoids the timeout under shell_run — but the real-Saturday timeout remains a distinct bug to be tracked separately. Tests: +10 handler tests (TestShellRunDryPath x2, 5 per handler: dry short-circuits before scan, string-true coercion, dry=false / absent take the real path, legacy `dry_run` still takes the real path) + a new tests/test_replay_shell_run_dry.py for the shared helper. Full backtester suite: 1684 passed, 5 skipped, 1 deselected (parity deselected as usual), 0 failed. No new deps, no secrets. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- lambda_concordance/handler.py | 15 +++++ lambda_counterfactual/handler.py | 20 +++++++ replay/__init__.py | 62 +++++++++++++++++++++ tests/test_lambda_concordance_handler.py | 61 ++++++++++++++++++++ tests/test_lambda_counterfactual_handler.py | 58 +++++++++++++++++++ tests/test_replay_shell_run_dry.py | 59 ++++++++++++++++++++ 6 files changed, 275 insertions(+) create mode 100644 tests/test_replay_shell_run_dry.py diff --git a/lambda_concordance/handler.py b/lambda_concordance/handler.py index 02a0131..cd7265b 100644 --- a/lambda_concordance/handler.py +++ b/lambda_concordance/handler.py @@ -114,12 +114,27 @@ def handler(event: dict, context) -> dict: # Imports deferred until after _ensure_init so SSM-loaded secrets # are available for any module-level init that consults them. + from replay import is_shell_run_dry, shell_run_dry_response from replay.batch import ( DEFAULT_MAX_ARTIFACTS, compute_and_emit_concordance, ) t0 = time.time() + + # Shell-run dry path (Saturday-SF keystone). Boot + module imports + # above have already run for real (the keystone's whole point — + # exercise bootstrap/import/lib-pin/transport). Return a benign + # success BEFORE the replay.batch scan (decision_artifacts S3 + # discovery), BEFORE any langchain_anthropic / target-model call, + # and BEFORE any CloudWatch metric emit or S3 summary persist. 
+ if is_shell_run_dry(event): + logger.info( + "[lambda_concordance] shell-run dry path: boot+imports OK, " + "skipping replay scan + Anthropic + S3/CW writes" + ) + return shell_run_dry_response("lambda_concordance", t0) + bucket = os.environ.get("S3_BUCKET", "alpha-engine-research") target_models = event.get("target_models") or ["claude-haiku-4-5"] diff --git a/lambda_counterfactual/handler.py b/lambda_counterfactual/handler.py index 7e5a8f4..08d60b5 100644 --- a/lambda_counterfactual/handler.py +++ b/lambda_counterfactual/handler.py @@ -94,9 +94,29 @@ def handler(event: dict, context) -> dict: """ _ensure_init() + from replay import is_shell_run_dry, shell_run_dry_response from replay.counterfactual import compute_and_emit t0 = time.time() + + # Shell-run dry path (Saturday-SF keystone). Boot + module imports + # above have already run for real. Return a benign success BEFORE + # the replay.counterfactual scan (decision_artifacts S3 discovery + + # sklearn fit), and BEFORE any CloudWatch metric emit or S3 + # per-agent analysis persist. No LLM calls exist on this path. + # + # Side benefit (NOT the contract): because the corpus scan is + # skipped, this also sidesteps the known separate production + # Counterfactual 600s-timeout-on-corpus-growth bug under shell_run + # — that real-Saturday timeout remains a distinct out-of-scope + # issue tracked separately; the scan logic is untouched here. + if is_shell_run_dry(event): + logger.info( + "[lambda_counterfactual] shell-run dry path: boot+imports " + "OK, skipping replay scan + sklearn fit + S3/CW writes" + ) + return shell_run_dry_response("lambda_counterfactual", t0) + bucket = os.environ.get("S3_BUCKET", "alpha-engine-research") end_time_iso = event.get("end_time_iso") diff --git a/replay/__init__.py b/replay/__init__.py index 115f18b..5715141 100644 --- a/replay/__init__.py +++ b/replay/__init__.py @@ -29,3 +29,65 @@ model would emit the same*. 
Together they cover the agent- justification triple alongside the counterfactual-rule-fit signal. """ + +from __future__ import annotations + +import time +from typing import Any + +# Canonical Saturday-SF shell-run dry-path event key. Established +# verbatim by the shell-run keystone (alpha-engine-data +# step_function.json) for the Research Lambda +# (``"dry_run_llm.$": "$.research_dry"``); reused here so the +# ReplayConcordance + Counterfactual states can be routed dry (boot + +# imports for real, return a benign success before any scan / external +# call / S3 / CloudWatch write) instead of pure-skipped. Distinct from +# the handlers' pre-existing ``dry_run`` event key, which has a +# different (compute-but-do-not-emit-metrics) semantic and is left +# untouched for backward compatibility. +SHELL_RUN_DRY_EVENT_KEY = "dry_run_llm" + + +def is_shell_run_dry(event: dict | None) -> bool: + """True when the SF shell-run keystone routed this Lambda dry. + + Reads the canonical ``dry_run_llm`` boolean off the invocation + event. Tolerates a missing/None event and string ``"true"``/``"1"`` + forms (Step Functions string-parameter convenience), mirroring the + coercion the handlers already apply to ``agents``/``target_models``. + """ + if not event: + return False + raw = event.get(SHELL_RUN_DRY_EVENT_KEY, False) + if isinstance(raw, str): + return raw.strip().lower() in {"true", "1", "yes"} + return bool(raw) + + +def shell_run_dry_response(handler_name: str, t0: float) -> dict: + """Benign success envelope returned BEFORE the replay scan. + + Returned by both replay Lambdas when ``is_shell_run_dry`` is true. + Hard invariant at the call site: zero external/LLM calls, zero + S3/CloudWatch writes, no decision_artifacts discovery — boot + + module imports have already run for real by the time this is + called. ``status`` is a recognised value the SF (Catch-wrapped, + non-blocking) treats as success. 
+ """ + return { + "status": "DRY_RUN", + "dry_run": True, + "handler": handler_name, + "note": ( + "shell-run dry path: boot + imports executed; replay scan, " + "external/LLM calls, and all S3/CloudWatch writes skipped" + ), + "duration_seconds": round(time.time() - t0, 1), + } + + +__all__ = [ + "SHELL_RUN_DRY_EVENT_KEY", + "is_shell_run_dry", + "shell_run_dry_response", +] diff --git a/tests/test_lambda_concordance_handler.py b/tests/test_lambda_concordance_handler.py index 0d1fc8f..35d95c4 100644 --- a/tests/test_lambda_concordance_handler.py +++ b/tests/test_lambda_concordance_handler.py @@ -261,3 +261,64 @@ def fake_compute(**kwargs): context=None, ) assert captured["agent_filter"] == ["sector_quant", "ic_cio"] + + +# ── Shell-run dry path (Saturday-SF keystone) ──────────────────────────── + + +class TestShellRunDryPath: + """`dry_run_llm: true` (the canonical keystone shell-run key) must + short-circuit BEFORE the replay scan: no compute_and_emit_concordance + call (so no decision_artifacts S3 discovery, no langchain_anthropic / + target-model call, no CloudWatch metric emit, no S3 summary + persist), boot + module imports still run, and a benign success + envelope is returned.""" + + def test_dry_run_llm_short_circuits_before_scan(self, handler_mod): + with patch.object(handler_mod, "_ensure_init") as m_init, \ + patch("replay.batch.compute_and_emit_concordance") as m_compute: + result = handler_mod.handler({"dry_run_llm": True}, context=None) + + # Boot/init still ran for real (the keystone's whole point). + m_init.assert_called_once() + # The replay scan / Anthropic / S3+CW path was never entered. + m_compute.assert_not_called() + # SF (Catch-wrapped, non-blocking) treats this as success. 
+ assert result["status"] == "DRY_RUN" + assert result["dry_run"] is True + assert result["handler"] == "lambda_concordance" + assert "duration_seconds" in result + + def test_dry_run_llm_string_true_coerced(self, handler_mod): + with patch.object(handler_mod, "_ensure_init"), \ + patch("replay.batch.compute_and_emit_concordance") as m_compute: + result = handler_mod.handler({"dry_run_llm": "true"}, context=None) + m_compute.assert_not_called() + assert result["status"] == "DRY_RUN" + + def test_dry_run_llm_false_takes_real_path(self, handler_mod): + with patch.object(handler_mod, "_ensure_init"), \ + patch("replay.batch.compute_and_emit_concordance", + return_value=_ok_summary()) as m_compute: + result = handler_mod.handler({"dry_run_llm": False}, context=None) + m_compute.assert_called_once() + assert result["status"] == "OK" + + def test_absent_dry_run_llm_takes_real_path(self, handler_mod): + with patch.object(handler_mod, "_ensure_init"), \ + patch("replay.batch.compute_and_emit_concordance", + return_value=_ok_summary()) as m_compute: + result = handler_mod.handler({}, context=None) + m_compute.assert_called_once() + assert result["status"] == "OK" + + def test_legacy_dry_run_key_still_takes_real_path(self, handler_mod): + """The pre-existing `dry_run` (compute-but-don't-emit-metrics) + semantic is preserved — it must NOT short-circuit the scan.""" + with patch.object(handler_mod, "_ensure_init"), \ + patch("replay.batch.compute_and_emit_concordance", + return_value=_ok_summary()) as m_compute: + result = handler_mod.handler({"dry_run": True}, context=None) + m_compute.assert_called_once() + assert m_compute.call_args.kwargs["emit_metrics"] is False + assert result["status"] == "OK" diff --git a/tests/test_lambda_counterfactual_handler.py b/tests/test_lambda_counterfactual_handler.py index c4651c4..3a2ae1c 100644 --- a/tests/test_lambda_counterfactual_handler.py +++ b/tests/test_lambda_counterfactual_handler.py @@ -208,3 +208,61 @@ def fake_compute(**kwargs): 
context=None, ) assert captured["agent_filter"] == ["ic_cio"] + + +# ── Shell-run dry path (Saturday-SF keystone) ──────────────────────────── + + +class TestShellRunDryPath: + """`dry_run_llm: true` (the canonical keystone shell-run key) must + short-circuit BEFORE the replay scan: no compute_and_emit call (so + no decision_artifacts S3 discovery, no sklearn fit, no CloudWatch + metric emit, no S3 per-agent persist), boot + module imports still + run, and a benign success envelope is returned. No LLM calls exist + on this handler's path regardless.""" + + def test_dry_run_llm_short_circuits_before_scan(self, handler_mod): + with patch.object(handler_mod, "_ensure_init") as m_init, \ + patch("replay.counterfactual.compute_and_emit") as m_compute: + result = handler_mod.handler({"dry_run_llm": True}, context=None) + + m_init.assert_called_once() + m_compute.assert_not_called() + assert result["status"] == "DRY_RUN" + assert result["dry_run"] is True + assert result["handler"] == "lambda_counterfactual" + assert "duration_seconds" in result + + def test_dry_run_llm_string_true_coerced(self, handler_mod): + with patch.object(handler_mod, "_ensure_init"), \ + patch("replay.counterfactual.compute_and_emit") as m_compute: + result = handler_mod.handler({"dry_run_llm": "1"}, context=None) + m_compute.assert_not_called() + assert result["status"] == "DRY_RUN" + + def test_dry_run_llm_false_takes_real_path(self, handler_mod): + with patch.object(handler_mod, "_ensure_init"), \ + patch("replay.counterfactual.compute_and_emit", + return_value=_ok_summary()) as m_compute: + result = handler_mod.handler({"dry_run_llm": False}, context=None) + m_compute.assert_called_once() + assert result["status"] == "OK" + + def test_absent_dry_run_llm_takes_real_path(self, handler_mod): + with patch.object(handler_mod, "_ensure_init"), \ + patch("replay.counterfactual.compute_and_emit", + return_value=_ok_summary()) as m_compute: + result = handler_mod.handler({}, context=None) + 
m_compute.assert_called_once() + assert result["status"] == "OK" + + def test_legacy_dry_run_key_still_takes_real_path(self, handler_mod): + """The pre-existing `dry_run` (compute-but-don't-emit-metrics) + semantic is preserved — it must NOT short-circuit the scan.""" + with patch.object(handler_mod, "_ensure_init"), \ + patch("replay.counterfactual.compute_and_emit", + return_value=_ok_summary()) as m_compute: + result = handler_mod.handler({"dry_run": True}, context=None) + m_compute.assert_called_once() + assert m_compute.call_args.kwargs["emit_metrics"] is False + assert result["status"] == "OK" diff --git a/tests/test_replay_shell_run_dry.py b/tests/test_replay_shell_run_dry.py new file mode 100644 index 0000000..33bbe27 --- /dev/null +++ b/tests/test_replay_shell_run_dry.py @@ -0,0 +1,59 @@ +"""Unit tests for the shared shell-run dry helper in replay/__init__.py. + +The helper is the single canonical (no-copy-paste) implementation used +by BOTH lambda_concordance/handler.py and lambda_counterfactual/ +handler.py to short-circuit the Saturday-SF shell-run dry path before +any replay scan / external call / S3 / CloudWatch write. +""" + +from __future__ import annotations + +from replay import ( + SHELL_RUN_DRY_EVENT_KEY, + is_shell_run_dry, + shell_run_dry_response, +) + + +class TestEventKey: + def test_canonical_key_is_dry_run_llm(self): + # Verbatim match with the keystone's Research-Lambda key + # (`"dry_run_llm.$": "$.research_dry"` in step_function.json). 
+ assert SHELL_RUN_DRY_EVENT_KEY == "dry_run_llm" + + +class TestIsShellRunDry: + def test_true_bool(self): + assert is_shell_run_dry({"dry_run_llm": True}) is True + + def test_false_bool(self): + assert is_shell_run_dry({"dry_run_llm": False}) is False + + def test_absent_key(self): + assert is_shell_run_dry({}) is False + + def test_none_event(self): + assert is_shell_run_dry(None) is False + + def test_string_true_forms(self): + for v in ("true", "True", "TRUE", "1", "yes", " true "): + assert is_shell_run_dry({"dry_run_llm": v}) is True + + def test_string_false_forms(self): + for v in ("false", "0", "no", ""): + assert is_shell_run_dry({"dry_run_llm": v}) is False + + def test_legacy_dry_run_key_does_not_trigger(self): + # The pre-existing `dry_run` (compute-but-don't-emit) key must + # NOT be interpreted as the shell-run short-circuit signal. + assert is_shell_run_dry({"dry_run": True}) is False + + +class TestShellRunDryResponse: + def test_envelope_shape(self): + resp = shell_run_dry_response("lambda_concordance", 0.0) + assert resp["status"] == "DRY_RUN" + assert resp["dry_run"] is True + assert resp["handler"] == "lambda_concordance" + assert "note" in resp + assert isinstance(resp["duration_seconds"], float)