cipher813 · cipher813 · May 30, 2026 · May 30, 2026
diff --git a/infrastructure/setup_research_runner_timeout_alarm.sh b/infrastructure/setup_research_runner_timeout_alarm.sh
@@ -0,0 +1,70 @@
+#!/usr/bin/env bash
+# setup_research_runner_timeout_alarm.sh — One-shot CloudWatch alarm for the
+# alpha-engine-research-runner Lambda approaching its 900s timeout (L4464).
+#
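+# Why this exists: the 2026-05-30 Research run hit States.Timeout at the 900s
+# Lambda hard ceiling and was SIGKILL'd before writing signals.json. A hard
+# Lambda timeout runs NO in-process code, so it cannot self-alert; and it does
+# NOT increment the Lambda Errors metric, so the existing
+# alpha-engine-research-runner-errors alarm does not catch it. The operator
+# only saw a generic SF PipelineFailure. This alarm names the timeout cause.
+# NOTE(review): AWS's Lambda metrics documentation lists timeouts among the
+# runtime errors counted by the Errors metric. If the -errors alarm really
+# stayed quiet, the cause may instead be its dimensions (function vs. alias),
+# period, or threshold — confirm before relying on the explanation above.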
+#
+# Mechanism: Lambda emits a Duration datapoint (~900000 ms) even for a
+# timed-out invocation (the billed duration). We alarm on Duration Maximum
+# >= 870000 ms (30s below the ceiling) so it fires on a timeout AND on a
+# near-miss overrun — an early warning that the run is creeping toward the
+# budget even before it fails. The L1995 Phase 5 universe reduction
+# (research #256) should keep real runs at ~10 min; this is the regression
+# backstop, not the fix.
+#
+# Idempotent: safe to re-run. Notification target reuses alpha-engine-alerts
+# (the pipeline-failure inbox), mirroring setup_eval_quality_alarm.sh.
+#
+# Usage: ./infrastructure/setup_research_runner_timeout_alarm.sh
+
+set -euo pipefail
+
+REGION="${AWS_REGION:-us-east-1}"
+ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text --region "$REGION")
+SNS_TOPIC_ARN="arn:aws:sns:${REGION}:${ACCOUNT_ID}:alpha-engine-alerts"
+ALARM_NAME="alpha-engine-research-runner-timeout"
+FUNCTION_NAME="alpha-engine-research-runner"
+THRESHOLD="870000"   # ms — 30s below the 900s Lambda ceiling
+
+echo "Configuring CloudWatch alarm: $ALARM_NAME"
+echo "  Region:     $REGION"
+echo "  SNS topic:  $SNS_TOPIC_ARN"
+echo "  Threshold:  Duration Maximum >= ${THRESHOLD} ms (function $FUNCTION_NAME)"
+
+# Verify the SNS topic exists — fail fast rather than create an alarm with a
+# broken target.
+if ! aws sns get-topic-attributes \
+    --topic-arn "$SNS_TOPIC_ARN" \
+    --region "$REGION" > /dev/null 2>&1; then
+  echo "ERROR: SNS topic $SNS_TOPIC_ARN not found. Run deploy_step_function.sh first." >&2
+  exit 1
+fi
+
+# Period 86400 (24h) Maximum with EvaluationPeriods=1: Research runs weekly
+# (Saturday), so a 24h window contains at most one run; its Duration Maximum
+# is evaluated directly. treat-missing-data=notBreaching keeps the alarm
+# quiet on the ~6 days/week with no invocation.
+aws cloudwatch put-metric-alarm \
+  --region "$REGION" \
+  --alarm-name "$ALARM_NAME" \
+  --alarm-description "Fires when the alpha-engine-research-runner Lambda Duration approaches its 900s ceiling (>= ${THRESHOLD} ms) — a timeout or near-miss overrun. A hard Lambda timeout runs no in-process code and does NOT hit the Errors metric, so this is the only timeout-specific signal. Backstop for the L4464 / L1995-Phase-5 regression class (signals.json went stale 8 days when this fired silently). Names the cause; does not gate deploy." \
+  --comparison-operator "GreaterThanOrEqualToThreshold" \
+  --evaluation-periods 1 \
+  --period 86400 \
+  --statistic Maximum \
+  --threshold "$THRESHOLD" \
+  --treat-missing-data "notBreaching" \
+  --namespace "AWS/Lambda" \
+  --metric-name "Duration" \
+  --dimensions "Name=FunctionName,Value=${FUNCTION_NAME}" \
+  --alarm-actions "$SNS_TOPIC_ARN" \
+  --ok-actions "$SNS_TOPIC_ARN"
+
+echo ""
+echo "Alarm $ALARM_NAME configured."
+echo "Validation: aws cloudwatch describe-alarms --alarm-names $ALARM_NAME --region $REGION --query 'MetricAlarms[0].StateValue' --output text"
diff --git a/infrastructure/step_function.json b/infrastructure/step_function.json
@@ -618,13 +618,14 @@
             },
             "Research": {
               "Type": "Task",
-              "Comment": "Research Lambda \u2014 generates signals.json",
+              "Comment": "Research Lambda \u2014 generates signals.json. skip_dry_run_gate=true (L4464 perf): the scheduled production path skips the in-handler stub-LLM dry-run gate, which ran a FULL second graph pass (incl. a real ~4-min fetch_data) before the real pass \u2014 ~8 min of the 900s budget. Wiring is validated by CI + the Friday shell-run preflight; the gate stays available for manual/dev invokes (default off-skip). Composes with the L1995 Phase 5 universe reduction (research #256) which is the load-bearing timeout fix.",
               "Resource": "arn:aws:states:::lambda:invoke",
               "Parameters": {
                 "FunctionName": "alpha-engine-research-runner:live",
                 "Payload": {
                   "weekly_run": true,
                   "force": true,
+                  "skip_dry_run_gate": true,
                   "dry_run_llm.$": "$.research_dry"
                 }
               },

diff --git a/tests/test_sf_payload_uniqueness.py b/tests/test_sf_payload_uniqueness.py
@@ -74,7 +74,7 @@ def _flatten_states(sf_doc: dict) -> dict:
     "Scanner": frozenset({"dry_run_llm.$", "run_date.$"}),
     "RegimeSubstrate": frozenset({"action.$"}),
     "RegimeRetrospectiveEval": frozenset({"action.$"}),
-    "Research": frozenset({"dry_run_llm.$", "force", "weekly_run"}),
+    "Research": frozenset({"dry_run_llm.$", "force", "weekly_run", "skip_dry_run_gate"}),
     "DataPhase2": frozenset({"dry_run.$", "phase"}),
     "EvalJudgeSubmitFirstSaturday": frozenset(
         {"date.$", "dry_run_llm.$", "force_sonnet_pass"}

diff --git a/tests/test_sf_research_perf_and_timeout_alarm.py b/tests/test_sf_research_perf_and_timeout_alarm.py
@@ -0,0 +1,81 @@
+"""L4464 — Research-stage perf cleanup + named timeout alarm.
+
+Pins:
+  1. The Saturday SF Research Lambda payload sets skip_dry_run_gate=true so
+     the scheduled production path skips the in-handler stub-LLM dry-run gate
+     (a full second graph pass + a redundant ~4-min fetch_data — ~8 min of
+     the 900s budget). The gate's wiring validation lives in CI + the Friday
+     shell-run preflight, not the hot path.
+  2. The research-runner timeout alarm script exists and alarms on the
+     Lambda Duration approaching the 900s ceiling (a timeout-specific signal
+     the existing -errors alarm misses, since a hard timeout doesn't hit the
+     Errors metric and runs no in-process code).
+"""
+
+from __future__ import annotations
+
+import json
+import re
+from pathlib import Path
+
+import pytest
+
+_REPO = Path(__file__).resolve().parent.parent
+_SF = _REPO / "infrastructure" / "step_function.json"
+_ALARM = _REPO / "infrastructure" / "setup_research_runner_timeout_alarm.sh"
+
+
+def _find_state(states: dict, name: str) -> dict | None:
+    """Recursively locate a state by name (it lives inside a Parallel branch)."""
+    if name in states:
+        return states[name]
+    for s in states.values():
+        for br in s.get("Branches", []) or []:
+            found = _find_state(br.get("States", {}), name)
+            if found:
+                return found
+    return None
+
+
+@pytest.fixture(scope="module")
+def research_payload() -> dict:
+    sf = json.loads(_SF.read_text())
+    research = _find_state(sf["States"], "Research")
+    assert research is not None, "Research state not found in SF"
+    return research["Parameters"]["Payload"]
+
+
+class TestSkipDryRunGate:
+    def test_skip_dry_run_gate_present_and_true(self, research_payload):
+        assert research_payload.get("skip_dry_run_gate") is True, (
+            "Research payload must set skip_dry_run_gate=true so the scheduled "
+            "production path skips the redundant stub graph pass + double "
+            "fetch_data (L4464 perf)."
+        )
+
+    def test_research_dry_path_preserved(self, research_payload):
+        # The shell-run dry signal must still thread through (Friday preflight).
+        assert research_payload.get("dry_run_llm.$") == "$.research_dry"
+
+
+class TestTimeoutAlarm:
+    @pytest.fixture(scope="class")
+    def alarm_src(self) -> str:
+        assert _ALARM.exists(), f"{_ALARM.name} must exist (L4464 named timeout alarm)"
+        return _ALARM.read_text()
+
+    def test_alarms_on_lambda_duration(self, alarm_src):
+        assert '--namespace "AWS/Lambda"' in alarm_src
+        assert '--metric-name "Duration"' in alarm_src
+        assert "alpha-engine-research-runner" in alarm_src
+
+    def test_threshold_near_900s_ceiling(self, alarm_src):
+        # 30s below the 900000ms ceiling — fires on timeout AND near-miss.
+        assert re.search(r'THRESHOLD="8[0-9]{5}"', alarm_src), (
+            "threshold should be just below the 900000ms Lambda ceiling"
+        )
+        assert "GreaterThanOrEqualToThreshold" in alarm_src
+        assert "Maximum" in alarm_src
+
+    def test_routes_to_alerts_topic(self, alarm_src):
+        assert "alpha-engine-alerts" in alarm_src