diff --git a/infrastructure/deploy_step_function.sh b/infrastructure/deploy_step_function.sh index 895f8a4..57671e1 100755 --- a/infrastructure/deploy_step_function.sh +++ b/infrastructure/deploy_step_function.sh @@ -106,6 +106,7 @@ POLICY='{ "arn:aws:lambda:'"$REGION"':'"$ACCOUNT_ID"':function:alpha-engine-research-eval-judge*", "arn:aws:lambda:'"$REGION"':'"$ACCOUNT_ID"':function:alpha-engine-research-eval-rolling-mean*", "arn:aws:lambda:'"$REGION"':'"$ACCOUNT_ID"':function:alpha-engine-research-rationale-clustering*", + "arn:aws:lambda:'"$REGION"':'"$ACCOUNT_ID"':function:alpha-engine-replay-concordance*", "arn:aws:lambda:'"$REGION"':'"$ACCOUNT_ID"':function:alpha-engine-data-collector*", "arn:aws:lambda:'"$REGION"':'"$ACCOUNT_ID"':function:alpha-engine-predictor-inference*" ] diff --git a/infrastructure/iam/github-actions-lambda-deploy.json b/infrastructure/iam/github-actions-lambda-deploy.json index 395a3ca..c7dad2f 100644 --- a/infrastructure/iam/github-actions-lambda-deploy.json +++ b/infrastructure/iam/github-actions-lambda-deploy.json @@ -47,6 +47,7 @@ "arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-research-eval-judge", "arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-research-eval-rolling-mean", "arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-research-rationale-clustering", + "arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-replay-concordance", "arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-predictor-inference", "arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-data-collector", "arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-health-check" @@ -69,6 +70,8 @@ "arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-research-eval-rolling-mean:*", "arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-research-rationale-clustering", "arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-research-rationale-clustering:*", + "arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-replay-concordance", + "arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-replay-concordance:*", "arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-predictor-inference", "arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-predictor-inference:*", "arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-data-collector", diff --git a/infrastructure/step_function.json b/infrastructure/step_function.json index ac4184d..c173072 100644 --- a/infrastructure/step_function.json +++ b/infrastructure/step_function.json @@ -645,7 +645,7 @@ {"Variable": "$.skip_rationale_clustering", "IsPresent": true}, {"Variable": "$.skip_rationale_clustering", "BooleanEquals": true} ], - "Next": "SaturdayHealthCheck" + "Next": "CheckSkipReplayConcordance" } ], "Default": "RationaleClustering" @@ -674,11 +674,60 @@ { "ErrorEquals": ["States.ALL"], "Comment": "Eval observability layer — failures must NOT halt the pipeline.", - "Next": "SaturdayHealthCheck", + "Next": "CheckSkipReplayConcordance", "ResultPath": "$.rationale_clustering_error" } ], "ResultPath": "$.rationale_clustering_result", + "Next": "CheckSkipReplayConcordance" + }, + + "CheckSkipReplayConcordance": { + "Type": "Choice", + "Comment": "Skip-gate. {\"skip_replay_concordance\": true} bypasses the cheap-model concordance Lambda (used for ad-hoc reruns where Anthropic spend is unwanted, or while iterating on the comparison scorers in alpha-engine-backtester replay/comparison.py). Independent of skip_rationale_clustering — the two are different agent-justification signals (clustering = cross-week templating; concordance = same-input cross-model agreement). Standalone invocation of the Lambda is always available regardless of this flag.", + "Choices": [ + { + "And": [ + {"Variable": "$.skip_replay_concordance", "IsPresent": true}, + {"Variable": "$.skip_replay_concordance", "BooleanEquals": true} + ], + "Next": "SaturdayHealthCheck" + } + ], + "Default": "ReplayConcordance" + }, + + "ReplayConcordance": { + "Type": "Task", + "Comment": "Weekly cheap-model concordance Lambda (ROADMAP P0, agent-justification gate signal #3). Replays each captured DecisionArtifact in the trailing 8-week window under the configured target model(s) via langchain_anthropic.with_structured_output against the canonical agent_schemas Pydantic contract; aggregates agreement_score per (agent_id_base, target_model); emits CW metric agent_cheap_model_concordance; persists per-target summary JSON to decision_artifacts/_replay_summary/{YYYY-MM-DD}/{target_model}.json. Default target: claude-haiku-4-5 (Sonnet→Haiku concordance ≈ \"is the larger model earning its cost?\"). Runs every Saturday after RationaleClustering.", + "Resource": "arn:aws:states:::lambda:invoke", + "Parameters": { + "FunctionName": "alpha-engine-replay-concordance:live", + "Payload": { + "end_time_iso.$": "$$.Execution.StartTime", + "target_models": ["claude-haiku-4-5"], + "window_days": 56, + "max_artifacts": 150 + } + }, + "TimeoutSeconds": 900, + "Retry": [ + { + "ErrorEquals": ["Lambda.ServiceException", "Lambda.TooManyRequestsException"], + "MaxAttempts": 1, + "IntervalSeconds": 60, + "BackoffRate": 1.0 + } + ], + "Catch": [ + { + "ErrorEquals": ["States.ALL"], + "Comment": "Eval observability layer — concordance failure must NOT halt the pipeline. Same Catch pattern as eval-judge / eval-rolling-mean / rationale-clustering.", + "Next": "SaturdayHealthCheck", + "ResultPath": "$.replay_concordance_error" + } + ], + "ResultPath": "$.replay_concordance_result", "Next": "SaturdayHealthCheck" }, diff --git a/tests/test_sf_eval_judge_wiring.py b/tests/test_sf_eval_judge_wiring.py index 19ec9b1..fe63cc6 100644 --- a/tests/test_sf_eval_judge_wiring.py +++ b/tests/test_sf_eval_judge_wiring.py @@ -46,6 +46,8 @@ def test_all_eval_judge_states_exist(self, states): "EvalRollingMean", "CheckSkipRationaleClustering", "RationaleClustering", + "CheckSkipReplayConcordance", + "ReplayConcordance", ): assert name in states, f"missing SF state: {name}" @@ -276,7 +278,13 @@ def test_retries_on_transient_lambda_errors(self, states): class TestSkipRationaleClustering: - def test_skip_flag_bypasses_to_health_check(self, states): + def test_skip_flag_bypasses_to_concordance_gate(self, states): + """Skipping clustering must NOT also skip concordance — they + are independent agent-justification signals (clustering = cross- + week templating; concordance = same-input cross-model agreement). + The skip path lands on CheckSkipReplayConcordance rather than + SaturdayHealthCheck so the concordance Lambda still fires + unless its own skip flag is set.""" skip = states["CheckSkipRationaleClustering"] choice = skip["Choices"][0] and_clauses = choice["And"] @@ -285,7 +293,10 @@ def test_skip_flag_bypasses_to_health_check(self, states): and c.get("BooleanEquals") is True for c in and_clauses ) - assert choice["Next"] == "SaturdayHealthCheck" + assert choice["Next"] == "CheckSkipReplayConcordance" + # Critically NOT routed directly to SaturdayHealthCheck — that + # would bundle-skip both observability paths. + assert choice["Next"] != "SaturdayHealthCheck" def test_default_runs_clustering(self, states): assert states["CheckSkipRationaleClustering"]["Default"] == "RationaleClustering" @@ -297,31 +308,88 @@ def test_invokes_live_alias(self, states): assert params["FunctionName"] == "alpha-engine-research-rationale-clustering:live" def test_payload_passes_execution_start_time(self, states): - # SF passes its own start time so the clustering window aligns - # with the SF execution date — same alignment principle as the - # rolling-mean state above. payload = states["RationaleClustering"]["Parameters"]["Payload"] assert payload["end_time_iso.$"] == "$$.Execution.StartTime" def test_timeout_matches_lambda_cap(self, states): - # Clustering Lambda is configured with timeout=600s - # (alpha-engine-research infrastructure/deploy.sh) — SF state - # TimeoutSeconds must equal that ceiling. assert states["RationaleClustering"]["TimeoutSeconds"] == 600 + def test_success_continues_to_concordance_gate(self, states): + # Clustering converges to CheckSkipReplayConcordance (the gate + # in front of the cheap-model concordance Lambda) rather than + # directly to SaturdayHealthCheck. + assert states["RationaleClustering"]["Next"] == "CheckSkipReplayConcordance" + + def test_catch_routes_to_concordance_gate_not_failure(self, states): + catch = states["RationaleClustering"]["Catch"][0] + assert catch["ErrorEquals"] == ["States.ALL"] + assert catch["Next"] == "CheckSkipReplayConcordance" + assert catch["Next"] != "HandleFailure" + + def test_retries_on_transient_lambda_errors(self, states): + retry = states["RationaleClustering"]["Retry"][0] + assert "Lambda.ServiceException" in retry["ErrorEquals"] + assert "Lambda.TooManyRequestsException" in retry["ErrorEquals"] + assert retry["MaxAttempts"] == 1 + + +# ── Replay concordance skip-gate + state ───────────────────────────────── + + +class TestSkipReplayConcordance: + def test_skip_flag_bypasses_to_health_check(self, states): + skip = states["CheckSkipReplayConcordance"] + choice = skip["Choices"][0] + and_clauses = choice["And"] + assert any( + c.get("Variable") == "$.skip_replay_concordance" + and c.get("BooleanEquals") is True + for c in and_clauses + ) + assert choice["Next"] == "SaturdayHealthCheck" + + def test_default_runs_concordance(self, states): + assert states["CheckSkipReplayConcordance"]["Default"] == "ReplayConcordance" + + +class TestReplayConcordance: + def test_invokes_live_alias(self, states): + params = states["ReplayConcordance"]["Parameters"] + assert params["FunctionName"] == "alpha-engine-replay-concordance:live" + + def test_payload_carries_required_fields(self, states): + payload = states["ReplayConcordance"]["Parameters"]["Payload"] + # SF execution start time aligns the window with the SF run date. + assert payload["end_time_iso.$"] == "$$.Execution.StartTime" + # Default target_models pinned at the Saturday SF level so the + # production cadence is reproducible — operator overrides via + # SF input parameters when running ad-hoc against a different + # target model. + assert payload["target_models"] == ["claude-haiku-4-5"] + assert payload["window_days"] == 56 # 8 weeks + # max_artifacts cap fits the 900s Lambda timeout at ~3-5 sec + # per replay call (150 × 5 = 750s, comfortable). + assert payload["max_artifacts"] == 150 + + def test_timeout_matches_lambda_cap(self, states): + # Concordance Lambda is configured with timeout=900s + # (alpha-engine-backtester infrastructure/deploy_concordance.sh). + assert states["ReplayConcordance"]["TimeoutSeconds"] == 900 + def test_success_continues_to_health_check(self, states): - assert states["RationaleClustering"]["Next"] == "SaturdayHealthCheck" + assert states["ReplayConcordance"]["Next"] == "SaturdayHealthCheck" def test_catch_routes_to_health_check_not_failure(self, states): - # Eval is observability — clustering failures must NOT halt - # the pipeline (matches eval-judge + eval-rolling-mean posture). - catch = states["RationaleClustering"]["Catch"][0] + # Eval is observability — concordance failures must NOT halt + # the pipeline (matches eval-judge + eval-rolling-mean + + # rationale-clustering posture). + catch = states["ReplayConcordance"]["Catch"][0] assert catch["ErrorEquals"] == ["States.ALL"] assert catch["Next"] == "SaturdayHealthCheck" assert catch["Next"] != "HandleFailure" def test_retries_on_transient_lambda_errors(self, states): - retry = states["RationaleClustering"]["Retry"][0] + retry = states["ReplayConcordance"]["Retry"][0] assert "Lambda.ServiceException" in retry["ErrorEquals"] assert "Lambda.TooManyRequestsException" in retry["ErrorEquals"] assert retry["MaxAttempts"] == 1