cipher813 · cipher813 · May 3, 2026 · May 3, 2026
diff --git a/infrastructure/deploy_step_function.sh b/infrastructure/deploy_step_function.sh
@@ -7,7 +7,9 @@
 #   2. SSM agent installed on the always-on EC2 instance
 #   3. Research Lambda (alpha-engine-research-runner) deployed
 #   4. Data Phase 2 Lambda (alpha-engine-data-collector) deployed
-#   5. Repos cloned on always-on EC2: alpha-engine-data, alpha-engine-predictor,
+#   5. Eval-judge Lambda (alpha-engine-research-eval-judge) deployed via
+#      `infrastructure/deploy.sh eval_judge` from alpha-engine-research
+#   6. Repos cloned on always-on EC2: alpha-engine-data, alpha-engine-predictor,
 #      alpha-engine-backtester
 #
 # Usage:
@@ -99,6 +101,7 @@ POLICY='{
       "Action": ["lambda:InvokeFunction"],
       "Resource": [
         "arn:aws:lambda:'"$REGION"':'"$ACCOUNT_ID"':function:alpha-engine-research-runner*",
+        "arn:aws:lambda:'"$REGION"':'"$ACCOUNT_ID"':function:alpha-engine-research-eval-judge*",
         "arn:aws:lambda:'"$REGION"':'"$ACCOUNT_ID"':function:alpha-engine-data-collector*",
         "arn:aws:lambda:'"$REGION"':'"$ACCOUNT_ID"':function:alpha-engine-predictor-inference*"
       ]

diff --git a/infrastructure/deploy_step_function_daily.sh b/infrastructure/deploy_step_function_daily.sh
@@ -47,6 +47,7 @@ POLICY='{
       "Action": ["lambda:InvokeFunction"],
       "Resource": [
         "arn:aws:lambda:'"$REGION"':'"$ACCOUNT_ID"':function:alpha-engine-research-runner*",
+        "arn:aws:lambda:'"$REGION"':'"$ACCOUNT_ID"':function:alpha-engine-research-eval-judge*",
         "arn:aws:lambda:'"$REGION"':'"$ACCOUNT_ID"':function:alpha-engine-data-collector*",
         "arn:aws:lambda:'"$REGION"':'"$ACCOUNT_ID"':function:alpha-engine-predictor-inference*",
         "arn:aws:lambda:'"$REGION"':'"$ACCOUNT_ID"':function:alpha-engine-predictor-health-check*"

diff --git a/infrastructure/step_function.json b/infrastructure/step_function.json
@@ -480,7 +480,7 @@
         {
           "Variable": "$.backtester_poll.Status",
           "StringEquals": "Success",
-          "Next": "SaturdayHealthCheck"
+          "Next": "CheckSkipEvalJudge"
         },
         {
           "Variable": "$.backtester_poll.Status",
@@ -502,6 +502,109 @@
       "Next": "WaitForBacktester"
     },
 
+    "CheckSkipEvalJudge": {
+      "Type": "Choice",
+      "Comment": "Skip-gate. {\"skip_eval_judge\": true} bypasses the LLM-as-judge eval step (typically used for ad-hoc reruns where eval cost is unwanted).",
+      "Choices": [
+        {
+          "And": [
+            {"Variable": "$.skip_eval_judge", "IsPresent": true},
+            {"Variable": "$.skip_eval_judge", "BooleanEquals": true}
+          ],
+          "Next": "SaturdayHealthCheck"
+        }
+      ],
+      "Default": "ComputeEvalCadence"
+    },
+
+    "ComputeEvalCadence": {
+      "Type": "Pass",
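+      "Comment": "Extract the zero-padded day-of-month and the ISO date (YYYY-MM-DD) from the SF execution start time ($$.Execution.StartTime, a UTC timestamp) so the next Choice state can branch the LLM-as-judge two-tier sampling. day_of_month <= 07 ⇒ first Saturday of the month (this assumes the execution starts on a Saturday in UTC; an ad-hoc run on any other day 01-07 takes the same branch) ⇒ force_sonnet_pass=true (monthly Sonnet sweep per ROADMAP §1626). All other Saturdays run Haiku-only with the per-artifact <3 escalation gate inside the Lambda.",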
+      "Parameters": {
+        "day_of_month.$": "States.ArrayGetItem(States.StringSplit(States.ArrayGetItem(States.StringSplit($$.Execution.StartTime, 'T'), 0), '-'), 2)",
+        "eval_date.$": "States.ArrayGetItem(States.StringSplit($$.Execution.StartTime, 'T'), 0)"
+      },
+      "ResultPath": "$.eval_cadence",
+      "Next": "CheckMonthlyCadence"
+    },
+
+    "CheckMonthlyCadence": {
+      "Type": "Choice",
+      "Comment": "First-Saturday-of-the-month detection via lexicographic compare on the zero-padded day-of-month string ('01'..'07' < '08'). Lexicographic ordering is correct here because all values are zero-padded 2-char strings.",
+      "Choices": [
+        {
+          "Variable": "$.eval_cadence.day_of_month",
+          "StringLessThan": "08",
+          "Next": "EvalJudgeFirstSaturday"
+        }
+      ],
+      "Default": "EvalJudgeWeekly"
+    },
+
+    "EvalJudgeFirstSaturday": {
+      "Type": "Task",
+      "Comment": "Monthly Sonnet sweep — force_sonnet_pass=true causes the Lambda to evaluate every captured artifact with both Haiku and Sonnet (regardless of Haiku's per-artifact <3 escalation gate). Calibration insurance: catches the case where Haiku consistently scores 5/5 and would otherwise hide a real regression.",
+      "Resource": "arn:aws:states:::lambda:invoke",
+      "Parameters": {
+        "FunctionName": "alpha-engine-research-eval-judge:live",
+        "Payload": {
+          "force_sonnet_pass": true,
+          "date.$": "$.eval_cadence.eval_date"
+        }
+      },
+      "TimeoutSeconds": 900,
+      "Retry": [
+        {
+          "ErrorEquals": ["Lambda.ServiceException", "Lambda.AWSLambdaException", "Lambda.SdkClientException", "Lambda.TooManyRequestsException"],
+          "MaxAttempts": 1,
+          "IntervalSeconds": 60,
+          "BackoffRate": 1.0
+        }
+      ],
+      "Catch": [
+        {
+          "ErrorEquals": ["States.ALL"],
+          "Comment": "Eval is observability — failures must NOT halt the pipeline. The handler already returns OK/PARTIAL/ERROR rather than throwing on per-artifact issues; this Catch covers infra-level failures (Lambda timeout, transient AWS errors).",
+          "Next": "SaturdayHealthCheck",
+          "ResultPath": "$.eval_judge_error"
+        }
+      ],
+      "ResultPath": "$.eval_judge_result",
+      "Next": "SaturdayHealthCheck"
+    },
+
+    "EvalJudgeWeekly": {
+      "Type": "Task",
+      "Comment": "Weekly Haiku-only judge run with per-artifact <3 escalation to Sonnet for borderline outputs. Per-artifact rather than batch-level escalation keeps weekly judging cost bounded while preserving diagnostic depth where it matters.",
+      "Resource": "arn:aws:states:::lambda:invoke",
+      "Parameters": {
+        "FunctionName": "alpha-engine-research-eval-judge:live",
+        "Payload": {
+          "force_sonnet_pass": false,
+          "date.$": "$.eval_cadence.eval_date"
+        }
+      },
+      "TimeoutSeconds": 900,
+      "Retry": [
+        {
+          "ErrorEquals": ["Lambda.ServiceException", "Lambda.AWSLambdaException", "Lambda.SdkClientException", "Lambda.TooManyRequestsException"],
+          "MaxAttempts": 1,
+          "IntervalSeconds": 60,
+          "BackoffRate": 1.0
+        }
+      ],
+      "Catch": [
+        {
+          "ErrorEquals": ["States.ALL"],
+          "Comment": "Eval is observability — failures must NOT halt the pipeline.",
+          "Next": "SaturdayHealthCheck",
+          "ResultPath": "$.eval_judge_error"
+        }
+      ],
+      "ResultPath": "$.eval_judge_result",
+      "Next": "SaturdayHealthCheck"
+    },
+
     "SaturdayHealthCheck": {
       "Type": "Task",
       "Comment": "Check data freshness after full pipeline — non-blocking (alerts on failure but does not halt). Runs from alpha-engine-dashboard post-2026-04-16 health_checker migration.",

diff --git a/tests/test_sf_eval_judge_wiring.py b/tests/test_sf_eval_judge_wiring.py
@@ -0,0 +1,192 @@
+"""Pins the LLM-as-judge wiring in the Saturday Step Functions JSON.
+
+Catches regressions like: someone re-routes CheckBacktesterStatus.Success
+back to SaturdayHealthCheck and accidentally drops the eval state, or
+flips the Default branch of the cadence Choice and ships every Saturday
+on the (more expensive) monthly Sonnet sweep.
+
+The corresponding alpha-engine-research Lambda
+(``alpha-engine-research-eval-judge:live``) is in PR #91; this test only
+asserts the SF wiring, not the handler shape.
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+import pytest
+
+
+_REPO_ROOT = Path(__file__).resolve().parent.parent
+_SF_PATH = _REPO_ROOT / "infrastructure" / "step_function.json"
+
+
+@pytest.fixture(scope="module")
+def sf() -> dict:
+    return json.loads(_SF_PATH.read_text(encoding="utf-8"))  # Comment fields hold non-ASCII text
+
+
+@pytest.fixture(scope="module")
+def states(sf) -> dict:
+    return sf["States"]  # maps state name -> state definition
+
+
+# ── State presence ────────────────────────────────────────────────────────
+
+
+class TestStatesPresent:
+    """Every state added by the eval-judge wiring must exist in the SF JSON."""
+    def test_all_eval_judge_states_exist(self, states):
+        required = [
+            "CheckSkipEvalJudge", "ComputeEvalCadence", "CheckMonthlyCadence",
+            "EvalJudgeFirstSaturday", "EvalJudgeWeekly",
+        ]
+        # Check one at a time so the failure message names the missing state.
+        for state_name in required:
+            assert state_name in states, f"missing SF state: {state_name}"
+
+
+# ── Backtester success → eval-judge skip-gate ─────────────────────────────
+
+
+class TestBacktesterTransition:
+    def test_success_routes_to_eval_skip_gate(self, states):
+        rules = states["CheckBacktesterStatus"]["Choices"]
+        # None default: a missing Success rule fails as an assertion, not StopIteration.
+        success = next((r for r in rules if r.get("StringEquals") == "Success"), None)
+        assert success is not None, "CheckBacktesterStatus has no Success rule"
+        assert success["Next"] == "CheckSkipEvalJudge"
+
+
+# ── Skip gate ─────────────────────────────────────────────────────────────
+
+
+class TestSkipEvalJudge:
+    """Pins the ``skip_eval_judge`` bypass rule and the gate's Default route."""
+    def test_skip_flag_bypasses_to_health_check(self, states):
+        choice = states["CheckSkipEvalJudge"]["Choices"][0]
+        # Both presence + boolean equality must be checked (matches
+        # other skip gates like CheckSkipResearch).
+        flag_rules = [
+            c for c in choice["And"] if c.get("Variable") == "$.skip_eval_judge"
+        ]
+        assert any(c.get("IsPresent") is True for c in flag_rules)
+        assert any(c.get("BooleanEquals") is True for c in flag_rules)
+        assert choice["Next"] == "SaturdayHealthCheck"
+
+    def test_default_runs_eval(self, states):
+        # No flag (or a false one) falls through to the cadence computation.
+        assert states["CheckSkipEvalJudge"]["Default"] == "ComputeEvalCadence"
+
+
+# ── Cadence computation ───────────────────────────────────────────────────
+
+
+class TestComputeEvalCadence:
+    def test_extracts_day_of_month_and_eval_date(self, states):
+        params = states["ComputeEvalCadence"]["Parameters"]
+        # The downstream Choice and the Lambda Payload each read one of
+        # these keys, so both intrinsic-function expressions must exist.
+        for key in ("day_of_month.$", "eval_date.$"):
+            assert key in params
+        # Each expression must derive from the execution start time; a
+        # renamed JSONPath would leave the Choice state matching nothing.
+        for key in ("day_of_month.$", "eval_date.$"):
+            assert "$$.Execution.StartTime" in params[key]
+
+    def test_writes_to_eval_cadence_path(self, states):
+        assert states["ComputeEvalCadence"]["ResultPath"] == "$.eval_cadence"
+
+    def test_routes_to_cadence_choice(self, states):
+        assert states["ComputeEvalCadence"]["Next"] == "CheckMonthlyCadence"
+
+
+# ── Monthly cadence Choice ────────────────────────────────────────────────
+
+
+class TestCheckMonthlyCadence:
+    def test_default_is_weekly(self, states):
+        # The Default branch is the common path (every Saturday but the
+        # first). Pointing it at EvalJudgeFirstSaturday would run the
+        # expensive monthly Sonnet sweep every week.
+        assert states["CheckMonthlyCadence"]["Default"] == "EvalJudgeWeekly"
+
+    def test_first_saturday_branch_uses_lex_compare_under_08(self, states):
+        rule = states["CheckMonthlyCadence"]["Choices"][0]
+        assert rule["Variable"] == "$.eval_cadence.day_of_month"
+        assert rule["StringLessThan"] == "08"
+        assert rule["Next"] == "EvalJudgeFirstSaturday"
+
+
+# ── Lambda invocation contract ────────────────────────────────────────────
+
+
+class TestEvalJudgeLambdaContract:
+    @pytest.mark.parametrize(
+        "state_name,expected_force_sonnet",
+        [
+            ("EvalJudgeFirstSaturday", True),
+            ("EvalJudgeWeekly", False),
+        ],
+    )
+    def test_payload_carries_correct_force_sonnet_flag(
+        self, states, state_name, expected_force_sonnet,
+    ):
+        payload = states[state_name]["Parameters"]["Payload"]
+        assert payload["force_sonnet_pass"] is expected_force_sonnet
+
+    @pytest.mark.parametrize(
+        "state_name",
+        ["EvalJudgeFirstSaturday", "EvalJudgeWeekly"],
+    )
+    def test_payload_passes_eval_date(self, states, state_name):
+        payload = states[state_name]["Parameters"]["Payload"]
+        # SF passes the SF-execution-start-date so the Lambda evaluates
+        # the same partition the captures landed in (avoids UTC-rollover
+        # edge cases where the Lambda starts on day X+1).
+        assert payload["date.$"] == "$.eval_cadence.eval_date"
+
+    @pytest.mark.parametrize(
+        "state_name",
+        ["EvalJudgeFirstSaturday", "EvalJudgeWeekly"],
+    )
+    def test_invokes_live_alias(self, states, state_name):
+        params = states[state_name]["Parameters"]
+        assert params["FunctionName"] == "alpha-engine-research-eval-judge:live"
+
+    @pytest.mark.parametrize(
+        "state_name",
+        ["EvalJudgeFirstSaturday", "EvalJudgeWeekly"],
+    )
+    def test_timeout_matches_lambda_max(self, states, state_name):
+        # Lambda's hard timeout is 900s (set in alpha-engine-research
+        # infrastructure/deploy.sh). Pin the SF TimeoutSeconds to exactly
+        # that — any lower and SF would kill an in-progress eval early.
+        assert states[state_name]["TimeoutSeconds"] == 900
+
+
+# ── Non-blocking failure semantics ────────────────────────────────────────
+
+
+class TestEvalJudgeNonBlocking:
+    @pytest.mark.parametrize(
+        "state_name",
+        ["EvalJudgeFirstSaturday", "EvalJudgeWeekly"],
+    )
+    def test_success_continues_to_health_check(self, states, state_name):
+        assert states[state_name]["Next"] == "SaturdayHealthCheck"
+
+    @pytest.mark.parametrize(
+        "state_name",
+        ["EvalJudgeFirstSaturday", "EvalJudgeWeekly"],
+    )
+    def test_catch_routes_to_health_check_not_failure(self, states, state_name):
+        # Eval is observability per ROADMAP §1635 — a failure (e.g. a 5xx
+        # from Anthropic) must route on to SaturdayHealthCheck, never to
+        # HandleFailure. ResultPath is pinned too: without it the error
+        # output would replace the whole state passed to the next step.
+        catch = states[state_name]["Catch"][0]
+        assert catch["ErrorEquals"] == ["States.ALL"]
+        assert catch["Next"] == "SaturdayHealthCheck"
+        assert catch["ResultPath"] == "$.eval_judge_error"