From b6916bac0798480453d1c29a6acd769de31e387e Mon Sep 17 00:00:00 2001 From: Brian McMahon Date: Sat, 30 May 2026 08:59:17 -0700 Subject: [PATCH] =?UTF-8?q?feat(sf):=20research=20perf=20=E2=80=94=20skip?= =?UTF-8?q?=5Fdry=5Frun=5Fgate=20in=20scheduled=20path=20+=20named=20timeo?= =?UTF-8?q?ut=20alarm=20(L4464)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two safe, additive parts of the L4464 fix (the load-bearing universe-reduction ships in alpha-engine-research #256): 1. skip_dry_run_gate=true on the Saturday SF Research Lambda payload. The in-handler stub-LLM dry-run gate ran a FULL second graph pass — including a real ~4-min fetch_data — before the real pass (~8 min of the 900s budget). Wiring is validated by CI + the Friday shell-run preflight; the gate stays available for manual/dev invokes. test_sf_payload_uniqueness registry updated + a value-pin test added. 2. setup_research_runner_timeout_alarm.sh — CloudWatch alarm on the research-runner Lambda Duration Maximum >= 870000 ms (30s below the 900s ceiling). A hard Lambda timeout runs no in-process code and does NOT hit the Errors metric, so the existing -errors alarm missed it (operator saw only a generic PipelineFailure). [Review note: AWS documents that function timeouts ARE counted in the Lambda Errors metric, so the reason the -errors alarm stayed silent should be confirmed before relying on this explanation — e.g. the States.Timeout may have come from the Step Functions task timeout rather than the Lambda ceiling, or the existing alarm's dimensions/period may not cover this invocation.] This names the timeout cause and gives an early warning on near-miss overruns. Routes to alpha-engine-alerts. NOT included: the Predictor-∥-Scanner topology move (an 11-state restructure of the Scanner/RAG/regime-substrate chain into the Research parallel branch). That's optimization-only and warrants its own fully-wiring-tested PR — filed as a follow-up. Suite 1707 passing. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../setup_research_runner_timeout_alarm.sh | 70 ++++++++++++++++ infrastructure/step_function.json | 3 +- tests/test_sf_payload_uniqueness.py | 2 +- ...test_sf_research_perf_and_timeout_alarm.py | 81 +++++++++++++++++++ 4 files changed, 154 insertions(+), 2 deletions(-) create mode 100755 infrastructure/setup_research_runner_timeout_alarm.sh create mode 100644 tests/test_sf_research_perf_and_timeout_alarm.py diff --git a/infrastructure/setup_research_runner_timeout_alarm.sh b/infrastructure/setup_research_runner_timeout_alarm.sh new file mode 100755 index 0000000..3a501f1 --- /dev/null +++ b/infrastructure/setup_research_runner_timeout_alarm.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash +# setup_research_runner_timeout_alarm.sh — One-shot CloudWatch alarm for the +# alpha-engine-research-runner Lambda approaching its 900s timeout (L4464). +# +# Why this exists: the 2026-05-30 Research run hit States.Timeout at the 900s +# Lambda hard ceiling and was SIGKILL'd before writing signals.json. A hard +# Lambda timeout runs NO in-process code, so it cannot self-alert; and it does +# NOT increment the Lambda Errors metric, so the existing +# alpha-engine-research-runner-errors alarm does not catch it. The operator +# only saw a generic SF PipelineFailure. This alarm names the timeout cause. +# +# Mechanism: Lambda emits a Duration datapoint (~900000 ms) even for a +# timed-out invocation (the billed duration). We alarm on Duration Maximum +# >= 870000 ms (30s below the ceiling) so it fires on a timeout AND on a +# near-miss overrun — an early warning that the run is creeping toward the +# budget even before it fails. The L1995 Phase 5 universe reduction +# (research #256) should keep real runs at ~10 min; this is the regression +# backstop, not the fix. +# +# Idempotent: safe to re-run. Notification target reuses alpha-engine-alerts +# (the pipeline-failure inbox), mirroring setup_eval_quality_alarm.sh. 
+# +# Usage: ./infrastructure/setup_research_runner_timeout_alarm.sh + +set -euo pipefail + +REGION="${AWS_REGION:-us-east-1}" +ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text --region "$REGION") +SNS_TOPIC_ARN="arn:aws:sns:${REGION}:${ACCOUNT_ID}:alpha-engine-alerts" +ALARM_NAME="alpha-engine-research-runner-timeout" +FUNCTION_NAME="alpha-engine-research-runner" +THRESHOLD="870000" # ms — 30s below the 900s Lambda ceiling + +echo "Configuring CloudWatch alarm: $ALARM_NAME" +echo " Region: $REGION" +echo " SNS topic: $SNS_TOPIC_ARN" +echo " Threshold: Duration Maximum >= ${THRESHOLD} ms (function $FUNCTION_NAME)" + +# Verify the SNS topic exists — fail fast rather than create an alarm with a +# broken target. +if ! aws sns get-topic-attributes \ + --topic-arn "$SNS_TOPIC_ARN" \ + --region "$REGION" > /dev/null 2>&1; then + echo "ERROR: SNS topic $SNS_TOPIC_ARN not found. Run deploy_step_function.sh first." >&2 + exit 1 +fi + +# Period 86400 (24h) Maximum with EvaluationPeriods=1: Research runs weekly +# (Saturday), so a 24h window contains at most one run; its Duration Maximum +# is evaluated directly. treat-missing-data=notBreaching keeps the alarm +# quiet on the ~6 days/week with no invocation. +aws cloudwatch put-metric-alarm \ + --region "$REGION" \ + --alarm-name "$ALARM_NAME" \ + --alarm-description "Fires when the alpha-engine-research-runner Lambda Duration approaches its 900s ceiling (>= ${THRESHOLD} ms) — a timeout or near-miss overrun. A hard Lambda timeout runs no in-process code and does NOT hit the Errors metric, so this is the only timeout-specific signal. Backstop for the L4464 / L1995-Phase-5 regression class (signals.json went stale 8 days when this fired silently). Names the cause; does not gate deploy." 
\ + --comparison-operator "GreaterThanOrEqualToThreshold" \ + --evaluation-periods 1 \ + --period 86400 \ + --statistic Maximum \ + --threshold "$THRESHOLD" \ + --treat-missing-data "notBreaching" \ + --namespace "AWS/Lambda" \ + --metric-name "Duration" \ + --dimensions "Name=FunctionName,Value=${FUNCTION_NAME}" \ + --alarm-actions "$SNS_TOPIC_ARN" \ + --ok-actions "$SNS_TOPIC_ARN" + +echo "" +echo "Alarm $ALARM_NAME configured." +echo "Validation: aws cloudwatch describe-alarms --alarm-names $ALARM_NAME --region $REGION --query 'MetricAlarms[0].StateValue' --output text" diff --git a/infrastructure/step_function.json b/infrastructure/step_function.json index 42372e9..26f8b2f 100644 --- a/infrastructure/step_function.json +++ b/infrastructure/step_function.json @@ -618,13 +618,14 @@ }, "Research": { "Type": "Task", - "Comment": "Research Lambda \u2014 generates signals.json", + "Comment": "Research Lambda \u2014 generates signals.json. skip_dry_run_gate=true (L4464 perf): the scheduled production path skips the in-handler stub-LLM dry-run gate, which ran a FULL second graph pass (incl. a real ~4-min fetch_data) before the real pass \u2014 ~8 min of the 900s budget. Wiring is validated by CI + the Friday shell-run preflight; the gate stays available for manual/dev invokes (default off-skip). 
Composes with the L1995 Phase 5 universe reduction (research #256) which is the load-bearing timeout fix.", "Resource": "arn:aws:states:::lambda:invoke", "Parameters": { "FunctionName": "alpha-engine-research-runner:live", "Payload": { "weekly_run": true, "force": true, + "skip_dry_run_gate": true, "dry_run_llm.$": "$.research_dry" } }, diff --git a/tests/test_sf_payload_uniqueness.py b/tests/test_sf_payload_uniqueness.py index 41ddb49..5404e12 100644 --- a/tests/test_sf_payload_uniqueness.py +++ b/tests/test_sf_payload_uniqueness.py @@ -74,7 +74,7 @@ def _flatten_states(sf_doc: dict) -> dict: "Scanner": frozenset({"dry_run_llm.$", "run_date.$"}), "RegimeSubstrate": frozenset({"action.$"}), "RegimeRetrospectiveEval": frozenset({"action.$"}), - "Research": frozenset({"dry_run_llm.$", "force", "weekly_run"}), + "Research": frozenset({"dry_run_llm.$", "force", "weekly_run", "skip_dry_run_gate"}), "DataPhase2": frozenset({"dry_run.$", "phase"}), "EvalJudgeSubmitFirstSaturday": frozenset( {"date.$", "dry_run_llm.$", "force_sonnet_pass"} diff --git a/tests/test_sf_research_perf_and_timeout_alarm.py b/tests/test_sf_research_perf_and_timeout_alarm.py new file mode 100644 index 0000000..c0b8508 --- /dev/null +++ b/tests/test_sf_research_perf_and_timeout_alarm.py @@ -0,0 +1,81 @@ +"""L4464 — Research-stage perf cleanup + named timeout alarm. + +Pins: + 1. The Saturday SF Research Lambda payload sets skip_dry_run_gate=true so + the scheduled production path skips the in-handler stub-LLM dry-run gate + (a full second graph pass + a redundant ~4-min fetch_data — ~8 min of + the 900s budget). The gate's wiring validation lives in CI + the Friday + shell-run preflight, not the hot path. + 2. The research-runner timeout alarm script exists and alarms on the + Lambda Duration approaching the 900s ceiling (a timeout-specific signal + the existing -errors alarm misses, since a hard timeout doesn't hit the + Errors metric and runs no in-process code). 
+""" + +from __future__ import annotations + +import json +import re +from pathlib import Path + +import pytest + +_REPO = Path(__file__).resolve().parent.parent +_SF = _REPO / "infrastructure" / "step_function.json" +_ALARM = _REPO / "infrastructure" / "setup_research_runner_timeout_alarm.sh" + + +def _find_state(states: dict, name: str) -> dict | None: + """Recursively locate a state by name (it lives inside a Parallel branch).""" + if name in states: + return states[name] + for s in states.values(): + for br in s.get("Branches", []) or []: + found = _find_state(br.get("States", {}), name) + if found: + return found + return None + + +@pytest.fixture(scope="module") +def research_payload() -> dict: + sf = json.loads(_SF.read_text()) + research = _find_state(sf["States"], "Research") + assert research is not None, "Research state not found in SF" + return research["Parameters"]["Payload"] + + +class TestSkipDryRunGate: + def test_skip_dry_run_gate_present_and_true(self, research_payload): + assert research_payload.get("skip_dry_run_gate") is True, ( + "Research payload must set skip_dry_run_gate=true so the scheduled " + "production path skips the redundant stub graph pass + double " + "fetch_data (L4464 perf)." + ) + + def test_research_dry_path_preserved(self, research_payload): + # The shell-run dry signal must still thread through (Friday preflight). + assert research_payload.get("dry_run_llm.$") == "$.research_dry" + + +class TestTimeoutAlarm: + @pytest.fixture(scope="class") + def alarm_src(self) -> str: + assert _ALARM.exists(), f"{_ALARM.name} must exist (L4464 named timeout alarm)" + return _ALARM.read_text() + + def test_alarms_on_lambda_duration(self, alarm_src): + assert '--namespace "AWS/Lambda"' in alarm_src + assert '--metric-name "Duration"' in alarm_src + assert "alpha-engine-research-runner" in alarm_src + + def test_threshold_near_900s_ceiling(self, alarm_src): + # 30s below the 900000ms ceiling — fires on timeout AND near-miss. 
+ assert re.search(r'THRESHOLD="8[0-9]{5}"', alarm_src), ( + "threshold should be just below the 900000ms Lambda ceiling" + ) + assert "GreaterThanOrEqualToThreshold" in alarm_src + assert "Maximum" in alarm_src + + def test_routes_to_alerts_topic(self, alarm_src): + assert "alpha-engine-alerts" in alarm_src