Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 70 additions & 0 deletions infrastructure/setup_research_runner_timeout_alarm.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
#!/usr/bin/env bash
# setup_research_runner_timeout_alarm.sh — One-shot CloudWatch alarm for the
# alpha-engine-research-runner Lambda approaching its 900s timeout (L4464).
#
# Why this exists: the 2026-05-30 Research run hit States.Timeout at the 900s
# Lambda hard ceiling and was SIGKILL'd before writing signals.json. A hard
# Lambda timeout runs NO in-process code, so it cannot self-alert; and it does
# NOT increment the Lambda Errors metric, so the existing
# alpha-engine-research-runner-errors alarm does not catch it. The operator
# only saw a generic SF PipelineFailure. This alarm names the timeout cause.
#
# Mechanism: Lambda emits a Duration datapoint (~900000 ms) even for a
# timed-out invocation (the billed duration). We alarm on Duration Maximum
# >= 870000 ms (30s below the ceiling) so it fires on a timeout AND on a
# near-miss overrun — an early warning that the run is creeping toward the
# budget even before it fails. The L1995 Phase 5 universe reduction
# (research #256) should keep real runs at ~10 min; this is the regression
# backstop, not the fix.
#
# Idempotent: safe to re-run. Notification target reuses alpha-engine-alerts
# (the pipeline-failure inbox), mirroring setup_eval_quality_alarm.sh.
#
# Usage: ./infrastructure/setup_research_runner_timeout_alarm.sh

set -euo pipefail

REGION="${AWS_REGION:-us-east-1}"
ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text --region "$REGION")
SNS_TOPIC_ARN="arn:aws:sns:${REGION}:${ACCOUNT_ID}:alpha-engine-alerts"
ALARM_NAME="alpha-engine-research-runner-timeout"
FUNCTION_NAME="alpha-engine-research-runner"
THRESHOLD="870000"  # ms — 30s below the 900s Lambda ceiling

echo "Configuring CloudWatch alarm: $ALARM_NAME"
echo "  Region:     $REGION"
echo "  SNS topic:  $SNS_TOPIC_ARN"
echo "  Threshold:  Duration Maximum >= ${THRESHOLD} ms (function $FUNCTION_NAME)"

# Verify the SNS topic exists — fail fast rather than create an alarm with a
# broken target. The CLI's stderr is captured (stdout is discarded) and echoed
# on failure, because this call also fails for reasons other than a missing
# topic (expired credentials, missing sns:GetTopicAttributes permission, wrong
# region) and a bare "not found" would send the operator down the wrong path.
if ! sns_check_err=$(aws sns get-topic-attributes \
        --topic-arn "$SNS_TOPIC_ARN" \
        --region "$REGION" 2>&1 > /dev/null); then
    echo "ERROR: SNS topic $SNS_TOPIC_ARN not found. Run deploy_step_function.sh first." >&2
    echo "  (aws sns get-topic-attributes reported: ${sns_check_err:-no error output})" >&2
    exit 1
fi

# Period 86400 (24h) Maximum with EvaluationPeriods=1: Research runs weekly
# (Saturday), so a 24h window contains at most one run; its Duration Maximum
# is evaluated directly. treat-missing-data=notBreaching keeps the alarm
# quiet on the ~6 days/week with no invocation.
aws cloudwatch put-metric-alarm \
    --region "$REGION" \
    --alarm-name "$ALARM_NAME" \
    --alarm-description "Fires when the alpha-engine-research-runner Lambda Duration approaches its 900s ceiling (>= ${THRESHOLD} ms) — a timeout or near-miss overrun. A hard Lambda timeout runs no in-process code and does NOT hit the Errors metric, so this is the only timeout-specific signal. Backstop for the L4464 / L1995-Phase-5 regression class (signals.json went stale 8 days when this fired silently). Names the cause; does not gate deploy." \
    --comparison-operator "GreaterThanOrEqualToThreshold" \
    --evaluation-periods 1 \
    --period 86400 \
    --statistic Maximum \
    --threshold "$THRESHOLD" \
    --treat-missing-data "notBreaching" \
    --namespace "AWS/Lambda" \
    --metric-name "Duration" \
    --dimensions "Name=FunctionName,Value=${FUNCTION_NAME}" \
    --alarm-actions "$SNS_TOPIC_ARN" \
    --ok-actions "$SNS_TOPIC_ARN"

echo ""
echo "Alarm $ALARM_NAME configured."
echo "Validation: aws cloudwatch describe-alarms --alarm-names $ALARM_NAME --region $REGION --query 'MetricAlarms[0].StateValue' --output text"
3 changes: 2 additions & 1 deletion infrastructure/step_function.json
Original file line number Diff line number Diff line change
Expand Up @@ -618,13 +618,14 @@
},
"Research": {
"Type": "Task",
"Comment": "Research Lambda \u2014 generates signals.json",
"Comment": "Research Lambda \u2014 generates signals.json. skip_dry_run_gate=true (L4464 perf): the scheduled production path skips the in-handler stub-LLM dry-run gate, which ran a FULL second graph pass (incl. a real ~4-min fetch_data) before the real pass \u2014 ~8 min of the 900s budget. Wiring is validated by CI + the Friday shell-run preflight; the gate stays available for manual/dev invokes (default off-skip). Composes with the L1995 Phase 5 universe reduction (research #256) which is the load-bearing timeout fix.",
"Resource": "arn:aws:states:::lambda:invoke",
"Parameters": {
"FunctionName": "alpha-engine-research-runner:live",
"Payload": {
"weekly_run": true,
"force": true,
"skip_dry_run_gate": true,
"dry_run_llm.$": "$.research_dry"
}
},
Expand Down
2 changes: 1 addition & 1 deletion tests/test_sf_payload_uniqueness.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def _flatten_states(sf_doc: dict) -> dict:
"Scanner": frozenset({"dry_run_llm.$", "run_date.$"}),
"RegimeSubstrate": frozenset({"action.$"}),
"RegimeRetrospectiveEval": frozenset({"action.$"}),
"Research": frozenset({"dry_run_llm.$", "force", "weekly_run"}),
"Research": frozenset({"dry_run_llm.$", "force", "weekly_run", "skip_dry_run_gate"}),
"DataPhase2": frozenset({"dry_run.$", "phase"}),
"EvalJudgeSubmitFirstSaturday": frozenset(
{"date.$", "dry_run_llm.$", "force_sonnet_pass"}
Expand Down
81 changes: 81 additions & 0 deletions tests/test_sf_research_perf_and_timeout_alarm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
"""L4464 — Research-stage perf cleanup + named timeout alarm.

Pins:
1. The Saturday SF Research Lambda payload sets skip_dry_run_gate=true so
the scheduled production path skips the in-handler stub-LLM dry-run gate
(a full second graph pass + a redundant ~4-min fetch_data — ~8 min of
the 900s budget). The gate's wiring validation lives in CI + the Friday
shell-run preflight, not the hot path.
2. The research-runner timeout alarm script exists and alarms on the
Lambda Duration approaching the 900s ceiling (a timeout-specific signal
the existing -errors alarm misses, since a hard timeout doesn't hit the
Errors metric and runs no in-process code).
"""

from __future__ import annotations

import json
import re
from pathlib import Path

import pytest

_REPO = Path(__file__).resolve().parent.parent
_SF = _REPO / "infrastructure" / "step_function.json"
_ALARM = _REPO / "infrastructure" / "setup_research_runner_timeout_alarm.sh"


def _find_state(states: dict, name: str) -> dict | None:
    """Recursively locate a state by name anywhere in a state machine.

    ``states`` itself is checked first. Otherwise the search descends into
    every nested sub-workflow: the ``Branches`` of Parallel states and the
    ``ItemProcessor`` (or legacy ``Iterator``) of Map states.

    Args:
        states: Mapping of state name to state definition (a ``States`` object).
        name: Name of the state to look for.

    Returns:
        The matching state definition, or ``None`` when no state has that name.
    """
    if name in states:
        return states[name]
    for state in states.values():
        # ``or []`` tolerates an explicit ``"Branches": null``.
        sub_workflows = list(state.get("Branches") or [])
        sub_workflows.extend(
            state[key] for key in ("ItemProcessor", "Iterator") if state.get(key)
        )
        for sub_workflow in sub_workflows:
            found = _find_state(sub_workflow.get("States", {}), name)
            # Compare against None rather than truthiness: a matched state
            # whose definition is an empty dict is still a match.
            if found is not None:
                return found
    return None


@pytest.fixture(scope="module")
def research_payload() -> dict:
    """Return the Lambda ``Payload`` mapping of the ``Research`` state.

    Parses ``infrastructure/step_function.json`` once per test module and
    fails with an explicit message when no state named ``Research`` exists.
    """
    # JSON is UTF-8 by specification; decode explicitly instead of relying on
    # the platform's locale default.
    sf = json.loads(_SF.read_text(encoding="utf-8"))
    research = _find_state(sf["States"], "Research")
    assert research is not None, "Research state not found in SF"
    return research["Parameters"]["Payload"]


class TestSkipDryRunGate:
    """Pins the Research payload keys that steer the in-handler dry-run gate."""

    def test_skip_dry_run_gate_present_and_true(self, research_payload):
        """The flag must be the JSON boolean ``true``, not merely truthy."""
        gate_flag = research_payload.get("skip_dry_run_gate")
        failure_hint = (
            "Research payload must set skip_dry_run_gate=true so the scheduled "
            "production path skips the redundant stub graph pass + double "
            "fetch_data (L4464 perf)."
        )
        assert gate_flag is True, failure_hint

    def test_research_dry_path_preserved(self, research_payload):
        """The shell-run dry signal must still thread through (Friday preflight)."""
        dry_run_binding = research_payload.get("dry_run_llm.$")
        assert dry_run_binding == "$.research_dry"


class TestTimeoutAlarm:
    """Static checks on the source text of the research-runner timeout alarm script."""

    @pytest.fixture(scope="class")
    def alarm_src(self) -> str:
        """Return the alarm script's source text, asserting the file exists."""
        assert _ALARM.exists(), f"{_ALARM.name} must exist (L4464 named timeout alarm)"
        # The script contains non-ASCII characters (em dashes), so decode it
        # explicitly as UTF-8 rather than with the platform's locale default.
        return _ALARM.read_text(encoding="utf-8")

    def test_alarms_on_lambda_duration(self, alarm_src):
        assert '--namespace "AWS/Lambda"' in alarm_src
        assert '--metric-name "Duration"' in alarm_src
        assert "alpha-engine-research-runner" in alarm_src

    def test_threshold_near_900s_ceiling(self, alarm_src):
        # 30s below the 900000ms ceiling — fires on timeout AND near-miss.
        assert re.search(r'THRESHOLD="8[0-9]{5}"', alarm_src), (
            "threshold should be just below the 900000ms Lambda ceiling"
        )
        assert "GreaterThanOrEqualToThreshold" in alarm_src
        # Pin the actual CLI flag: the bare word "Maximum" also appears in the
        # script's comments and echo text, so a substring check proves nothing.
        assert re.search(r'--statistic\s+"?Maximum"?', alarm_src), (
            "alarm must evaluate the Maximum statistic of Duration"
        )

    def test_routes_to_alerts_topic(self, alarm_src):
        assert "alpha-engine-alerts" in alarm_src
Loading