Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion infrastructure/deploy_step_function.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@
# 2. SSM agent installed on the always-on EC2 instance
# 3. Research Lambda (alpha-engine-research-runner) deployed
# 4. Data Phase 2 Lambda (alpha-engine-data-collector) deployed
# 5. Repos cloned on always-on EC2: alpha-engine-data, alpha-engine-predictor,
# 5. Eval-judge Lambda (alpha-engine-research-eval-judge) deployed via
# `infrastructure/deploy.sh eval_judge` from alpha-engine-research
# 6. Repos cloned on always-on EC2: alpha-engine-data, alpha-engine-predictor,
# alpha-engine-backtester
#
# Usage:
Expand Down Expand Up @@ -99,6 +101,7 @@ POLICY='{
"Action": ["lambda:InvokeFunction"],
"Resource": [
"arn:aws:lambda:'"$REGION"':'"$ACCOUNT_ID"':function:alpha-engine-research-runner*",
"arn:aws:lambda:'"$REGION"':'"$ACCOUNT_ID"':function:alpha-engine-research-eval-judge*",
"arn:aws:lambda:'"$REGION"':'"$ACCOUNT_ID"':function:alpha-engine-data-collector*",
"arn:aws:lambda:'"$REGION"':'"$ACCOUNT_ID"':function:alpha-engine-predictor-inference*"
]
Expand Down
1 change: 1 addition & 0 deletions infrastructure/deploy_step_function_daily.sh
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ POLICY='{
"Action": ["lambda:InvokeFunction"],
"Resource": [
"arn:aws:lambda:'"$REGION"':'"$ACCOUNT_ID"':function:alpha-engine-research-runner*",
"arn:aws:lambda:'"$REGION"':'"$ACCOUNT_ID"':function:alpha-engine-research-eval-judge*",
"arn:aws:lambda:'"$REGION"':'"$ACCOUNT_ID"':function:alpha-engine-data-collector*",
"arn:aws:lambda:'"$REGION"':'"$ACCOUNT_ID"':function:alpha-engine-predictor-inference*",
"arn:aws:lambda:'"$REGION"':'"$ACCOUNT_ID"':function:alpha-engine-predictor-health-check*"
Expand Down
105 changes: 104 additions & 1 deletion infrastructure/step_function.json
Original file line number Diff line number Diff line change
Expand Up @@ -480,7 +480,7 @@
{
"Variable": "$.backtester_poll.Status",
"StringEquals": "Success",
"Next": "SaturdayHealthCheck"
"Next": "CheckSkipEvalJudge"
},
{
"Variable": "$.backtester_poll.Status",
Expand All @@ -502,6 +502,109 @@
"Next": "WaitForBacktester"
},

"CheckSkipEvalJudge": {
  "Comment": "Skip-gate. {\"skip_eval_judge\": true} bypasses the LLM-as-judge eval step (typically used for ad-hoc reruns where eval cost is unwanted).",
  "Type": "Choice",
  "Default": "ComputeEvalCadence",
  "Choices": [
    {
      "Next": "SaturdayHealthCheck",
      "And": [
        {
          "Variable": "$.skip_eval_judge",
          "IsPresent": true
        },
        {
          "Variable": "$.skip_eval_judge",
          "BooleanEquals": true
        }
      ]
    }
  ]
},

"ComputeEvalCadence": {
  "Comment": "Extract day-of-month + ISO date from the SF execution start time so the next Choice state can branch the LLM-as-judge two-tier sampling. day_of_month <= 07 ⇒ first Saturday of the month ⇒ force_sonnet_pass=true (monthly Sonnet sweep per ROADMAP §1626). All other Saturdays run Haiku-only with the per-artifact <3 escalation gate inside the Lambda.",
  "Type": "Pass",
  "ResultPath": "$.eval_cadence",
  "Parameters": {
    "eval_date.$": "States.ArrayGetItem(States.StringSplit($$.Execution.StartTime, 'T'), 0)",
    "day_of_month.$": "States.ArrayGetItem(States.StringSplit(States.ArrayGetItem(States.StringSplit($$.Execution.StartTime, 'T'), 0), '-'), 2)"
  },
  "Next": "CheckMonthlyCadence"
},

"CheckMonthlyCadence": {
  "Comment": "First-Saturday-of-the-month detection via lexicographic compare on the zero-padded day-of-month string ('01'..'07' < '08'). Lexicographic ordering is correct here because all values are zero-padded 2-char strings.",
  "Type": "Choice",
  "Default": "EvalJudgeWeekly",
  "Choices": [
    {
      "Next": "EvalJudgeFirstSaturday",
      "Variable": "$.eval_cadence.day_of_month",
      "StringLessThan": "08"
    }
  ]
},

"EvalJudgeFirstSaturday": {
  "Type": "Task",
  "Comment": "Monthly Sonnet sweep — force_sonnet_pass=true causes the Lambda to evaluate every captured artifact with both Haiku and Sonnet (regardless of Haiku's per-artifact <3 escalation gate). Calibration insurance: catches the case where Haiku consistently scores 5/5 and would otherwise hide a real regression.",
  "Resource": "arn:aws:states:::lambda:invoke",
  "Parameters": {
    "FunctionName": "alpha-engine-research-eval-judge:live",
    "Payload": {
      "force_sonnet_pass": true,
      "date.$": "$.eval_cadence.eval_date"
    }
  },
  "TimeoutSeconds": 900,
  "Retry": [
    {
      "ErrorEquals": [
        "Lambda.ServiceException",
        "Lambda.AWSLambdaException",
        "Lambda.SdkClientException",
        "Lambda.TooManyRequestsException"
      ],
      "MaxAttempts": 1,
      "IntervalSeconds": 60,
      "BackoffRate": 1.0
    }
  ],
  "Catch": [
    {
      "ErrorEquals": ["States.ALL"],
      "Comment": "Eval is observability — failures must NOT halt the pipeline. The handler already returns OK/PARTIAL/ERROR rather than throwing on per-artifact issues; this Catch covers infra-level failures (Lambda timeout, transient AWS errors).",
      "Next": "SaturdayHealthCheck",
      "ResultPath": "$.eval_judge_error"
    }
  ],
  "ResultPath": "$.eval_judge_result",
  "Next": "SaturdayHealthCheck"
},

"EvalJudgeWeekly": {
  "Type": "Task",
  "Comment": "Weekly Haiku-only judge run with per-artifact <3 escalation to Sonnet for borderline outputs. Per-artifact rather than batch-level escalation keeps weekly judging cost bounded while preserving diagnostic depth where it matters.",
  "Resource": "arn:aws:states:::lambda:invoke",
  "Parameters": {
    "FunctionName": "alpha-engine-research-eval-judge:live",
    "Payload": {
      "force_sonnet_pass": false,
      "date.$": "$.eval_cadence.eval_date"
    }
  },
  "TimeoutSeconds": 900,
  "Retry": [
    {
      "ErrorEquals": [
        "Lambda.ServiceException",
        "Lambda.AWSLambdaException",
        "Lambda.SdkClientException",
        "Lambda.TooManyRequestsException"
      ],
      "MaxAttempts": 1,
      "IntervalSeconds": 60,
      "BackoffRate": 1.0
    }
  ],
  "Catch": [
    {
      "ErrorEquals": ["States.ALL"],
      "Comment": "Eval is observability — failures must NOT halt the pipeline.",
      "Next": "SaturdayHealthCheck",
      "ResultPath": "$.eval_judge_error"
    }
  ],
  "ResultPath": "$.eval_judge_result",
  "Next": "SaturdayHealthCheck"
},

"SaturdayHealthCheck": {
"Type": "Task",
"Comment": "Check data freshness after full pipeline — non-blocking (alerts on failure but does not halt). Runs from alpha-engine-dashboard post-2026-04-16 health_checker migration.",
Expand Down
192 changes: 192 additions & 0 deletions tests/test_sf_eval_judge_wiring.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
"""Pins the LLM-as-judge wiring in the Saturday Step Functions JSON.

Catches regressions like: someone re-routes CheckBacktesterStatus.Success
back to SaturdayHealthCheck and accidentally drops the eval state, or
flips the Default branch of the cadence Choice and ships every Saturday
on the (more expensive) monthly Sonnet sweep.

The corresponding alpha-engine-research Lambda
(``alpha-engine-research-eval-judge:live``) is in PR #91; this test only
asserts the SF wiring, not the handler shape.
"""

from __future__ import annotations

import json
from pathlib import Path

import pytest


_REPO_ROOT = Path(__file__).resolve().parent.parent
_SF_PATH = _REPO_ROOT / "infrastructure" / "step_function.json"


@pytest.fixture(scope="module")
def sf() -> dict:
    """Parse the Saturday Step Functions definition once per test module.

    The file is decoded as UTF-8 explicitly: JSON interchange is UTF-8 and the
    definition's ``Comment`` strings contain non-ASCII characters, so relying
    on the platform's locale-dependent default encoding could mis-decode or
    reject the file on hosts whose default is not UTF-8.

    Returns:
        The whole state machine definition as a dict.
    """
    return json.loads(_SF_PATH.read_text(encoding="utf-8"))


@pytest.fixture(scope="module")
def states(sf) -> dict:
    """Expose the ``States`` mapping of the parsed state machine definition."""
    state_map = sf["States"]
    return state_map


# ── State presence ────────────────────────────────────────────────────────


class TestStatesPresent:
def test_all_eval_judge_states_exist(self, states):
for name in (
"CheckSkipEvalJudge",
"ComputeEvalCadence",
"CheckMonthlyCadence",
"EvalJudgeFirstSaturday",
"EvalJudgeWeekly",
):
assert name in states, f"missing SF state: {name}"


# ── Backtester success → eval-judge skip-gate ─────────────────────────────


class TestBacktesterTransition:
def test_success_routes_to_eval_skip_gate(self, states):
bt = states["CheckBacktesterStatus"]
success_choice = next(
c for c in bt["Choices"] if c.get("StringEquals") == "Success"
)
assert success_choice["Next"] == "CheckSkipEvalJudge"


# ── Skip gate ─────────────────────────────────────────────────────────────


class TestSkipEvalJudge:
def test_skip_flag_bypasses_to_health_check(self, states):
skip = states["CheckSkipEvalJudge"]
choice = skip["Choices"][0]
# Both presence + boolean equality must be checked (matches
# other skip gates like CheckSkipResearch).
and_clauses = choice["And"]
assert any(
c.get("Variable") == "$.skip_eval_judge"
and c.get("BooleanEquals") is True
for c in and_clauses
)
assert choice["Next"] == "SaturdayHealthCheck"

def test_default_runs_eval(self, states):
assert states["CheckSkipEvalJudge"]["Default"] == "ComputeEvalCadence"


# ── Cadence computation ───────────────────────────────────────────────────


class TestComputeEvalCadence:
def test_extracts_day_of_month_and_eval_date(self, states):
params = states["ComputeEvalCadence"]["Parameters"]
# Both intrinsic-function expressions must be present so the
# downstream Choice + Payload can reference them.
assert "day_of_month.$" in params
assert "eval_date.$" in params
# Reference shape — protect against accidental rename of either
# JSONPath that would leave the Choice state matching nothing.
assert "$$.Execution.StartTime" in params["day_of_month.$"]
assert "$$.Execution.StartTime" in params["eval_date.$"]

def test_writes_to_eval_cadence_path(self, states):
assert states["ComputeEvalCadence"]["ResultPath"] == "$.eval_cadence"

def test_routes_to_cadence_choice(self, states):
assert states["ComputeEvalCadence"]["Next"] == "CheckMonthlyCadence"


# ── Monthly cadence Choice ────────────────────────────────────────────────


class TestCheckMonthlyCadence:
def test_default_is_weekly(self, states):
# Default = the COMMON path (every other Saturday). Must NOT
# be EvalJudgeFirstSaturday — that would ship every weekly run
# on the expensive monthly Sonnet sweep.
assert states["CheckMonthlyCadence"]["Default"] == "EvalJudgeWeekly"

def test_first_saturday_branch_uses_lex_compare_under_08(self, states):
choice = states["CheckMonthlyCadence"]["Choices"][0]
assert choice["Variable"] == "$.eval_cadence.day_of_month"
assert choice["StringLessThan"] == "08"
assert choice["Next"] == "EvalJudgeFirstSaturday"


# ── Lambda invocation contract ────────────────────────────────────────────


class TestEvalJudgeLambdaContract:
    """Pins what the two eval-judge Task states send to the Lambda."""

    # Both Task states that invoke the eval-judge Lambda.
    _JUDGE_STATES = ["EvalJudgeFirstSaturday", "EvalJudgeWeekly"]

    @pytest.mark.parametrize(
        "state_name,expected_force_sonnet",
        [("EvalJudgeFirstSaturday", True), ("EvalJudgeWeekly", False)],
    )
    def test_payload_carries_correct_force_sonnet_flag(
        self, states, state_name, expected_force_sonnet,
    ):
        # Identity comparison: the flag must be a real JSON boolean, not
        # merely a truthy/falsy value.
        flag = states[state_name]["Parameters"]["Payload"]["force_sonnet_pass"]
        assert flag is expected_force_sonnet

    @pytest.mark.parametrize("state_name", _JUDGE_STATES)
    def test_payload_passes_eval_date(self, states, state_name):
        # SF passes the SF-execution-start-date so the Lambda evaluates
        # the same partition the captures landed in (avoids UTC-rollover
        # edge cases where the Lambda starts on day X+1).
        date_ref = states[state_name]["Parameters"]["Payload"]["date.$"]
        assert date_ref == "$.eval_cadence.eval_date"

    @pytest.mark.parametrize("state_name", _JUDGE_STATES)
    def test_invokes_live_alias(self, states, state_name):
        target = states[state_name]["Parameters"]["FunctionName"]
        assert target == "alpha-engine-research-eval-judge:live"

    @pytest.mark.parametrize("state_name", _JUDGE_STATES)
    def test_timeout_matches_lambda_max(self, states, state_name):
        # Lambda's hard timeout is 900s (set in alpha-engine-research
        # infrastructure/deploy.sh). SF state TimeoutSeconds must not be
        # less — otherwise SF would kill an in-progress eval prematurely.
        task_timeout = states[state_name]["TimeoutSeconds"]
        assert task_timeout == 900


# ── Non-blocking failure semantics ────────────────────────────────────────


class TestEvalJudgeNonBlocking:
    """Pins that neither eval-judge outcome can halt the Saturday pipeline."""

    # Both Task states that invoke the eval-judge Lambda.
    _JUDGE_STATES = ["EvalJudgeFirstSaturday", "EvalJudgeWeekly"]

    @pytest.mark.parametrize("state_name", _JUDGE_STATES)
    def test_success_continues_to_health_check(self, states, state_name):
        successor = states[state_name]["Next"]
        assert successor == "SaturdayHealthCheck"

    @pytest.mark.parametrize("state_name", _JUDGE_STATES)
    def test_catch_routes_to_health_check_not_failure(self, states, state_name):
        # Eval is observability per ROADMAP §1635 — failures must NOT
        # halt the pipeline. Routing to HandleFailure here would be a
        # regression that shoots the whole Saturday run on a 5xx from
        # Anthropic on the eval Lambda specifically.
        first_catcher = states[state_name]["Catch"][0]
        fallback_target = first_catcher["Next"]
        assert first_catcher["ErrorEquals"] == ["States.ALL"]
        assert fallback_target == "SaturdayHealthCheck"
        assert fallback_target != "HandleFailure"
Loading