diff --git a/infrastructure/deploy_step_function.sh b/infrastructure/deploy_step_function.sh index 15c7e2c..060dd6a 100755 --- a/infrastructure/deploy_step_function.sh +++ b/infrastructure/deploy_step_function.sh @@ -7,7 +7,9 @@ # 2. SSM agent installed on the always-on EC2 instance # 3. Research Lambda (alpha-engine-research-runner) deployed # 4. Data Phase 2 Lambda (alpha-engine-data-collector) deployed -# 5. Repos cloned on always-on EC2: alpha-engine-data, alpha-engine-predictor, +# 5. Eval-judge Lambda (alpha-engine-research-eval-judge) deployed via +# `infrastructure/deploy.sh eval_judge` from alpha-engine-research +# 6. Repos cloned on always-on EC2: alpha-engine-data, alpha-engine-predictor, # alpha-engine-backtester # # Usage: @@ -99,6 +101,7 @@ POLICY='{ "Action": ["lambda:InvokeFunction"], "Resource": [ "arn:aws:lambda:'"$REGION"':'"$ACCOUNT_ID"':function:alpha-engine-research-runner*", + "arn:aws:lambda:'"$REGION"':'"$ACCOUNT_ID"':function:alpha-engine-research-eval-judge*", "arn:aws:lambda:'"$REGION"':'"$ACCOUNT_ID"':function:alpha-engine-data-collector*", "arn:aws:lambda:'"$REGION"':'"$ACCOUNT_ID"':function:alpha-engine-predictor-inference*" ] diff --git a/infrastructure/deploy_step_function_daily.sh b/infrastructure/deploy_step_function_daily.sh index 21db40f..a32f5af 100755 --- a/infrastructure/deploy_step_function_daily.sh +++ b/infrastructure/deploy_step_function_daily.sh @@ -47,6 +47,7 @@ POLICY='{ "Action": ["lambda:InvokeFunction"], "Resource": [ "arn:aws:lambda:'"$REGION"':'"$ACCOUNT_ID"':function:alpha-engine-research-runner*", + "arn:aws:lambda:'"$REGION"':'"$ACCOUNT_ID"':function:alpha-engine-research-eval-judge*", "arn:aws:lambda:'"$REGION"':'"$ACCOUNT_ID"':function:alpha-engine-data-collector*", "arn:aws:lambda:'"$REGION"':'"$ACCOUNT_ID"':function:alpha-engine-predictor-inference*", "arn:aws:lambda:'"$REGION"':'"$ACCOUNT_ID"':function:alpha-engine-predictor-health-check*" diff --git a/infrastructure/step_function.json 
b/infrastructure/step_function.json
index 3709f62..7b490ac 100644
--- a/infrastructure/step_function.json
+++ b/infrastructure/step_function.json
@@ -480,7 +480,7 @@
         {
           "Variable": "$.backtester_poll.Status",
           "StringEquals": "Success",
-          "Next": "SaturdayHealthCheck"
+          "Next": "CheckSkipEvalJudge"
         },
         {
           "Variable": "$.backtester_poll.Status",
@@ -502,6 +502,109 @@
       "Next": "WaitForBacktester"
     },
 
+    "CheckSkipEvalJudge": {
+      "Type": "Choice",
+      "Comment": "Skip-gate. {\"skip_eval_judge\": true} bypasses the LLM-as-judge eval step (typically used for ad-hoc reruns where eval cost is unwanted).",
+      "Choices": [
+        {
+          "And": [
+            {"Variable": "$.skip_eval_judge", "IsPresent": true},
+            {"Variable": "$.skip_eval_judge", "BooleanEquals": true}
+          ],
+          "Next": "SaturdayHealthCheck"
+        }
+      ],
+      "Default": "ComputeEvalCadence"
+    },
+
+    "ComputeEvalCadence": {
+      "Type": "Pass",
+      "Comment": "Extract day-of-month + ISO date from the SF execution start time so the next Choice state can branch the LLM-as-judge two-tier sampling. day_of_month <= 07 ⇒ first Saturday of the month ⇒ force_sonnet_pass=true (monthly Sonnet sweep per ROADMAP §1626). All other Saturdays run Haiku-only with the per-artifact <3 escalation gate inside the Lambda.",
+      "Parameters": {
+        "day_of_month.$": "States.ArrayGetItem(States.StringSplit(States.ArrayGetItem(States.StringSplit($$.Execution.StartTime, 'T'), 0), '-'), 2)",
+        "eval_date.$": "States.ArrayGetItem(States.StringSplit($$.Execution.StartTime, 'T'), 0)"
+      },
+      "ResultPath": "$.eval_cadence",
+      "Next": "CheckMonthlyCadence"
+    },
+
+    "CheckMonthlyCadence": {
+      "Type": "Choice",
+      "Comment": "First-Saturday-of-the-month detection via lexicographic compare on the zero-padded day-of-month string ('01'..'07' < '08'). Lexicographic ordering is correct here because all values are zero-padded 2-char strings.",
+      "Choices": [
+        {
+          "Variable": "$.eval_cadence.day_of_month",
+          "StringLessThan": "08",
+          "Next": "EvalJudgeFirstSaturday"
+        }
+      ],
+      "Default": "EvalJudgeWeekly"
+    },
+
+    "EvalJudgeFirstSaturday": {
+      "Type": "Task",
+      "Comment": "Monthly Sonnet sweep — force_sonnet_pass=true causes the Lambda to evaluate every captured artifact with both Haiku and Sonnet (regardless of Haiku's per-artifact <3 escalation gate). Calibration insurance: catches the case where Haiku consistently scores 5/5 and would otherwise hide a real regression.",
+      "Resource": "arn:aws:states:::lambda:invoke",
+      "Parameters": {
+        "FunctionName": "alpha-engine-research-eval-judge:live",
+        "Payload": {
+          "force_sonnet_pass": true,
+          "date.$": "$.eval_cadence.eval_date"
+        }
+      },
+      "TimeoutSeconds": 900,
+      "Retry": [
+        {
+          "ErrorEquals": ["Lambda.ServiceException", "Lambda.AWSLambdaException", "Lambda.SdkClientException", "Lambda.TooManyRequestsException"],
+          "MaxAttempts": 1,
+          "IntervalSeconds": 60,
+          "BackoffRate": 1.0
+        }
+      ],
+      "Catch": [
+        {
+          "ErrorEquals": ["States.ALL"],
+          "Comment": "Eval is observability — failures must NOT halt the pipeline. The handler already returns OK/PARTIAL/ERROR rather than throwing on per-artifact issues; this Catch covers infra-level failures (Lambda timeout, transient AWS errors).",
+          "Next": "SaturdayHealthCheck",
+          "ResultPath": "$.eval_judge_error"
+        }
+      ],
+      "ResultPath": "$.eval_judge_result",
+      "Next": "SaturdayHealthCheck"
+    },
+
+    "EvalJudgeWeekly": {
+      "Type": "Task",
+      "Comment": "Weekly Haiku-only judge run with per-artifact <3 escalation to Sonnet for borderline outputs. Per-artifact rather than batch-level escalation keeps weekly judging cost bounded while preserving diagnostic depth where it matters.",
+      "Resource": "arn:aws:states:::lambda:invoke",
+      "Parameters": {
+        "FunctionName": "alpha-engine-research-eval-judge:live",
+        "Payload": {
+          "force_sonnet_pass": false,
+          "date.$": "$.eval_cadence.eval_date"
+        }
+      },
+      "TimeoutSeconds": 900,
+      "Retry": [
+        {
+          "ErrorEquals": ["Lambda.ServiceException", "Lambda.AWSLambdaException", "Lambda.SdkClientException", "Lambda.TooManyRequestsException"],
+          "MaxAttempts": 1,
+          "IntervalSeconds": 60,
+          "BackoffRate": 1.0
+        }
+      ],
+      "Catch": [
+        {
+          "ErrorEquals": ["States.ALL"],
+          "Comment": "Eval is observability — failures must NOT halt the pipeline.",
+          "Next": "SaturdayHealthCheck",
+          "ResultPath": "$.eval_judge_error"
+        }
+      ],
+      "ResultPath": "$.eval_judge_result",
+      "Next": "SaturdayHealthCheck"
+    },
+
     "SaturdayHealthCheck": {
       "Type": "Task",
       "Comment": "Check data freshness after full pipeline — non-blocking (alerts on failure but does not halt). Runs from alpha-engine-dashboard post-2026-04-16 health_checker migration.",
diff --git a/tests/test_sf_eval_judge_wiring.py b/tests/test_sf_eval_judge_wiring.py
new file mode 100644
index 0000000..bfea06c
--- /dev/null
+++ b/tests/test_sf_eval_judge_wiring.py
@@ -0,0 +1,192 @@
+"""Pins the LLM-as-judge wiring in the Saturday Step Functions JSON.
+
+Catches regressions like: someone re-routes CheckBacktesterStatus.Success
+back to SaturdayHealthCheck and accidentally drops the eval state, or
+flips the Default branch of the cadence Choice and ships every Saturday
+on the (more expensive) monthly Sonnet sweep.
+
+The corresponding alpha-engine-research Lambda
+(``alpha-engine-research-eval-judge:live``) is in PR #91; this test only
+asserts the SF wiring, not the handler shape.
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+import pytest
+
+
+_REPO_ROOT = Path(__file__).resolve().parent.parent
+_SF_PATH = _REPO_ROOT / "infrastructure" / "step_function.json"
+
+
+@pytest.fixture(scope="module")
+def sf() -> dict:
+    return json.loads(_SF_PATH.read_text(encoding="utf-8"))
+
+
+@pytest.fixture(scope="module")
+def states(sf) -> dict:
+    return sf["States"]
+
+
+# ── State presence ────────────────────────────────────────────────────────
+
+
+class TestStatesPresent:
+    def test_all_eval_judge_states_exist(self, states):
+        for name in (
+            "CheckSkipEvalJudge",
+            "ComputeEvalCadence",
+            "CheckMonthlyCadence",
+            "EvalJudgeFirstSaturday",
+            "EvalJudgeWeekly",
+        ):
+            assert name in states, f"missing SF state: {name}"
+
+
+# ── Backtester success → eval-judge skip-gate ─────────────────────────────
+
+
+class TestBacktesterTransition:
+    def test_success_routes_to_eval_skip_gate(self, states):
+        bt = states["CheckBacktesterStatus"]
+        success_choice = next(
+            c for c in bt["Choices"] if c.get("StringEquals") == "Success"
+        )
+        assert success_choice["Next"] == "CheckSkipEvalJudge"
+
+
+# ── Skip gate ─────────────────────────────────────────────────────────────
+
+
+class TestSkipEvalJudge:
+    def test_skip_flag_bypasses_to_health_check(self, states):
+        skip = states["CheckSkipEvalJudge"]
+        choice = skip["Choices"][0]
+        # Both presence + boolean equality must be checked (matches
+        # other skip gates like CheckSkipResearch).
+        and_clauses = choice["And"]
+        for op in ("IsPresent", "BooleanEquals"):
+            assert any(
+                c.get("Variable") == "$.skip_eval_judge" and c.get(op) is True
+                for c in and_clauses
+            ), f"skip gate missing {op} clause"
+        assert choice["Next"] == "SaturdayHealthCheck"
+
+    def test_default_runs_eval(self, states):
+        assert states["CheckSkipEvalJudge"]["Default"] == "ComputeEvalCadence"
+
+
+# ── Cadence computation ───────────────────────────────────────────────────
+
+
+class TestComputeEvalCadence:
+    def test_extracts_day_of_month_and_eval_date(self, states):
+        params = states["ComputeEvalCadence"]["Parameters"]
+        # Both intrinsic-function expressions must be present so the
+        # downstream Choice + Payload can reference them.
+        assert "day_of_month.$" in params
+        assert "eval_date.$" in params
+        # Reference shape — protect against accidental rename of either
+        # JSONPath that would leave the Choice state matching nothing.
+        assert "$$.Execution.StartTime" in params["day_of_month.$"]
+        assert "$$.Execution.StartTime" in params["eval_date.$"]
+
+    def test_writes_to_eval_cadence_path(self, states):
+        assert states["ComputeEvalCadence"]["ResultPath"] == "$.eval_cadence"
+
+    def test_routes_to_cadence_choice(self, states):
+        assert states["ComputeEvalCadence"]["Next"] == "CheckMonthlyCadence"
+
+
+# ── Monthly cadence Choice ────────────────────────────────────────────────
+
+
+class TestCheckMonthlyCadence:
+    def test_default_is_weekly(self, states):
+        # Default = the COMMON path (every other Saturday). Must NOT
+        # be EvalJudgeFirstSaturday — that would ship every weekly run
+        # on the expensive monthly Sonnet sweep.
+        assert states["CheckMonthlyCadence"]["Default"] == "EvalJudgeWeekly"
+
+    def test_first_saturday_branch_uses_lex_compare_under_08(self, states):
+        choice = states["CheckMonthlyCadence"]["Choices"][0]
+        assert choice["Variable"] == "$.eval_cadence.day_of_month"
+        assert choice["StringLessThan"] == "08"
+        assert choice["Next"] == "EvalJudgeFirstSaturday"
+
+
+# ── Lambda invocation contract ────────────────────────────────────────────
+
+
+class TestEvalJudgeLambdaContract:
+    @pytest.mark.parametrize(
+        "state_name,expected_force_sonnet",
+        [
+            ("EvalJudgeFirstSaturday", True),
+            ("EvalJudgeWeekly", False),
+        ],
+    )
+    def test_payload_carries_correct_force_sonnet_flag(
+        self, states, state_name, expected_force_sonnet,
+    ):
+        payload = states[state_name]["Parameters"]["Payload"]
+        assert payload["force_sonnet_pass"] is expected_force_sonnet
+
+    @pytest.mark.parametrize(
+        "state_name",
+        ["EvalJudgeFirstSaturday", "EvalJudgeWeekly"],
+    )
+    def test_payload_passes_eval_date(self, states, state_name):
+        payload = states[state_name]["Parameters"]["Payload"]
+        # SF passes the SF-execution-start-date so the Lambda evaluates
+        # the same partition the captures landed in (avoids UTC-rollover
+        # edge cases where the Lambda starts on day X+1).
+        assert payload["date.$"] == "$.eval_cadence.eval_date"
+
+    @pytest.mark.parametrize(
+        "state_name",
+        ["EvalJudgeFirstSaturday", "EvalJudgeWeekly"],
+    )
+    def test_invokes_live_alias(self, states, state_name):
+        params = states[state_name]["Parameters"]
+        assert params["FunctionName"] == "alpha-engine-research-eval-judge:live"
+
+    @pytest.mark.parametrize(
+        "state_name",
+        ["EvalJudgeFirstSaturday", "EvalJudgeWeekly"],
+    )
+    def test_timeout_matches_lambda_max(self, states, state_name):
+        # Lambda's hard timeout is 900s (set in alpha-engine-research
+        # infrastructure/deploy.sh). SF state TimeoutSeconds must not be
+        # less — otherwise SF would kill an in-progress eval prematurely.
+        assert states[state_name]["TimeoutSeconds"] == 900
+
+
+# ── Non-blocking failure semantics ────────────────────────────────────────
+
+
+class TestEvalJudgeNonBlocking:
+    @pytest.mark.parametrize(
+        "state_name",
+        ["EvalJudgeFirstSaturday", "EvalJudgeWeekly"],
+    )
+    def test_success_continues_to_health_check(self, states, state_name):
+        assert states[state_name]["Next"] == "SaturdayHealthCheck"
+
+    @pytest.mark.parametrize(
+        "state_name",
+        ["EvalJudgeFirstSaturday", "EvalJudgeWeekly"],
+    )
+    def test_catch_routes_to_health_check_not_failure(self, states, state_name):
+        # Eval is observability per ROADMAP §1635 — failures must NOT
+        # halt the pipeline. Routing to HandleFailure here would be a
+        # regression that shoots the whole Saturday run on a 5xx from
+        # Anthropic on the eval Lambda specifically.
+        catch = states[state_name]["Catch"][0]
+        assert catch["ErrorEquals"] == ["States.ALL"]
+        assert catch["Next"] == "SaturdayHealthCheck"
+        assert catch["Next"] != "HandleFailure"