From b6916bac0798480453d1c29a6acd769de31e387e Mon Sep 17 00:00:00 2001
From: Brian McMahon <brian@nousergon.ai>
Date: Sat, 30 May 2026 08:59:17 -0700
Subject: [PATCH] =?UTF-8?q?feat(sf):=20research=20perf=20=E2=80=94=20skip?=
 =?UTF-8?q?=5Fdry=5Frun=5Fgate=20in=20scheduled=20path=20+=20named=20timeo?=
 =?UTF-8?q?ut=20alarm=20(L4464)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two safe, additive parts of the L4464 fix (the load-bearing universe-reduction
ships in alpha-engine-research #256):

1. skip_dry_run_gate=true on the Saturday SF Research Lambda payload. The
   in-handler stub-LLM dry-run gate ran a FULL second graph pass — including a
   real ~4-min fetch_data — before the real pass (~8 min of the 900s budget).
   Wiring is validated by CI + the Friday shell-run preflight; the gate stays
   available for manual/dev invokes. test_sf_payload_uniqueness registry
   updated + a value-pin test added.

2. setup_research_runner_timeout_alarm.sh — CloudWatch alarm on the
   research-runner Lambda Duration Maximum >= 870000 ms (30s below the 900s
   ceiling). A hard Lambda timeout runs no in-process code and does NOT hit
   the Errors metric, so the existing -errors alarm missed it (operator saw
   only a generic PipelineFailure). This names the timeout cause and gives an
   early warning of near-miss overruns. Routes to alpha-engine-alerts.

NOT included: the Predictor-∥-Scanner topology move (an 11-state restructure
of the Scanner/RAG/regime-substrate chain into the Research parallel branch).
That's optimization-only and warrants its own fully-wiring-tested PR — filed
as a follow-up. Suite 1707 passing.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../setup_research_runner_timeout_alarm.sh    | 70 ++++++++++++++++
 infrastructure/step_function.json             |  3 +-
 tests/test_sf_payload_uniqueness.py           |  2 +-
 ...test_sf_research_perf_and_timeout_alarm.py | 81 +++++++++++++++++++
 4 files changed, 154 insertions(+), 2 deletions(-)
 create mode 100755 infrastructure/setup_research_runner_timeout_alarm.sh
 create mode 100644 tests/test_sf_research_perf_and_timeout_alarm.py

diff --git a/infrastructure/setup_research_runner_timeout_alarm.sh b/infrastructure/setup_research_runner_timeout_alarm.sh
new file mode 100755
index 0000000..3a501f1
--- /dev/null
+++ b/infrastructure/setup_research_runner_timeout_alarm.sh
@@ -0,0 +1,70 @@
+#!/usr/bin/env bash
+# setup_research_runner_timeout_alarm.sh — One-shot CloudWatch alarm for the
+# alpha-engine-research-runner Lambda approaching its 900s timeout (L4464).
+#
+# Why this exists: the 2026-05-30 Research run hit States.Timeout at the 900s
+# Lambda hard ceiling and was SIGKILL'd before writing signals.json. A hard
+# Lambda timeout runs NO in-process code, so it cannot self-alert; and it does
+# NOT increment the Lambda Errors metric, so the existing
+# alpha-engine-research-runner-errors alarm does not catch it. The operator
+# only saw a generic SF PipelineFailure. This alarm names the timeout cause.
+#
+# Mechanism: Lambda emits a Duration datapoint (~900000 ms) even for a
+# timed-out invocation (the billed duration). We alarm on Duration Maximum
+# >= 870000 ms (30s below the ceiling) so it fires on a timeout AND on a
+# near-miss overrun — an early warning that the run is creeping toward the
+# budget even before it fails. The L1995 Phase 5 universe reduction
+# (research #256) should keep real runs at ~10 min; this is the regression
+# backstop, not the fix.
+#
+# Idempotent: safe to re-run. Notification target reuses alpha-engine-alerts
+# (the pipeline-failure inbox), mirroring setup_eval_quality_alarm.sh.
+#
+# Usage: ./infrastructure/setup_research_runner_timeout_alarm.sh
+
+set -euo pipefail  # abort on any error, unset variable, or failed pipeline stage
+
+REGION="${AWS_REGION:-us-east-1}"  # AWS_REGION overrides the us-east-1 default
+ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text --region "$REGION")
+SNS_TOPIC_ARN="arn:aws:sns:${REGION}:${ACCOUNT_ID}:alpha-engine-alerts"  # built from region + caller account
+ALARM_NAME="alpha-engine-research-runner-timeout"
+FUNCTION_NAME="alpha-engine-research-runner"  # used as the FunctionName metric dimension
+THRESHOLD="870000"   # ms — 30s below the 900s Lambda ceiling
+
+echo "Configuring CloudWatch alarm: $ALARM_NAME"  # banner: print resolved settings first
+echo "  Region:     $REGION"
+echo "  SNS topic:  $SNS_TOPIC_ARN"
+echo "  Threshold:  Duration Maximum >= ${THRESHOLD} ms (function $FUNCTION_NAME)"
+
+# Verify the SNS topic is reachable — fail fast rather than create an alarm
+# with a broken target. stderr stays visible so the real AWS error is shown.
+if ! aws sns get-topic-attributes \
+    --topic-arn "$SNS_TOPIC_ARN" \
+    --region "$REGION" > /dev/null; then
+  echo "ERROR: SNS topic $SNS_TOPIC_ARN not found or not accessible (see AWS CLI error above). If it is missing, run deploy_step_function.sh first." >&2
+  exit 1
+fi
+
+# Period 86400 (24h) Maximum with EvaluationPeriods=1: Research runs weekly
+# (Saturday), so a 24h window normally holds a single run and its Duration
+# Maximum is evaluated directly. treat-missing-data=notBreaching keeps the
+# alarm quiet on no-invocation days; ok-actions also notifies on recovery.
+aws cloudwatch put-metric-alarm \
+  --region "$REGION" \
+  --alarm-name "$ALARM_NAME" \
+  --alarm-description "Fires when the alpha-engine-research-runner Lambda Duration approaches its 900s ceiling (>= ${THRESHOLD} ms) — a timeout or near-miss overrun. A hard Lambda timeout runs no in-process code and does NOT hit the Errors metric, so this is the only timeout-specific signal. Backstop for the L4464 / L1995-Phase-5 regression class (signals.json went stale 8 days when this fired silently). Names the cause; does not gate deploy." \
+  --comparison-operator "GreaterThanOrEqualToThreshold" \
+  --evaluation-periods 1 \
+  --period 86400 \
+  --statistic Maximum \
+  --threshold "$THRESHOLD" \
+  --treat-missing-data "notBreaching" \
+  --namespace "AWS/Lambda" \
+  --metric-name "Duration" \
+  --dimensions "Name=FunctionName,Value=${FUNCTION_NAME}" \
+  --alarm-actions "$SNS_TOPIC_ARN" \
+  --ok-actions "$SNS_TOPIC_ARN"
+
+echo ""
+echo "Alarm $ALARM_NAME configured."
+echo "Validation: aws cloudwatch describe-alarms --alarm-names $ALARM_NAME --region $REGION --query 'MetricAlarms[0].StateValue' --output text"
diff --git a/infrastructure/step_function.json b/infrastructure/step_function.json
index 42372e9..26f8b2f 100644
--- a/infrastructure/step_function.json
+++ b/infrastructure/step_function.json
@@ -618,13 +618,14 @@
             },
             "Research": {
               "Type": "Task",
-              "Comment": "Research Lambda \u2014 generates signals.json",
+              "Comment": "Research Lambda \u2014 generates signals.json. skip_dry_run_gate=true (L4464 perf): the scheduled production path skips the in-handler stub-LLM dry-run gate, which ran a FULL second graph pass (incl. a real ~4-min fetch_data) before the real pass \u2014 ~8 min of the 900s budget. Wiring is validated by CI + the Friday shell-run preflight; the gate stays available for manual/dev invokes (skip_dry_run_gate defaults to off). Composes with the L1995 Phase 5 universe reduction (research #256) which is the load-bearing timeout fix.",
               "Resource": "arn:aws:states:::lambda:invoke",
               "Parameters": {
                 "FunctionName": "alpha-engine-research-runner:live",
                 "Payload": {
                   "weekly_run": true,
                   "force": true,
+                  "skip_dry_run_gate": true,
                   "dry_run_llm.$": "$.research_dry"
                 }
               },
diff --git a/tests/test_sf_payload_uniqueness.py b/tests/test_sf_payload_uniqueness.py
index 41ddb49..5404e12 100644
--- a/tests/test_sf_payload_uniqueness.py
+++ b/tests/test_sf_payload_uniqueness.py
@@ -74,7 +74,7 @@ def _flatten_states(sf_doc: dict) -> dict:
     "Scanner": frozenset({"dry_run_llm.$", "run_date.$"}),
     "RegimeSubstrate": frozenset({"action.$"}),
     "RegimeRetrospectiveEval": frozenset({"action.$"}),
-    "Research": frozenset({"dry_run_llm.$", "force", "weekly_run"}),
+    "Research": frozenset({"dry_run_llm.$", "force", "weekly_run", "skip_dry_run_gate"}),
     "DataPhase2": frozenset({"dry_run.$", "phase"}),
     "EvalJudgeSubmitFirstSaturday": frozenset(
         {"date.$", "dry_run_llm.$", "force_sonnet_pass"}
diff --git a/tests/test_sf_research_perf_and_timeout_alarm.py b/tests/test_sf_research_perf_and_timeout_alarm.py
new file mode 100644
index 0000000..c0b8508
--- /dev/null
+++ b/tests/test_sf_research_perf_and_timeout_alarm.py
@@ -0,0 +1,81 @@
+"""L4464 — Research-stage perf cleanup + named timeout alarm.
+
+Pins:
+  1. The Saturday SF Research Lambda payload sets skip_dry_run_gate=true so
+     the scheduled production path skips the in-handler stub-LLM dry-run gate
+     (a full second graph pass + a redundant ~4-min fetch_data — ~8 min of
+     the 900s budget). The gate's wiring validation lives in CI + the Friday
+     shell-run preflight, not the hot path.
+  2. The research-runner timeout alarm script exists and alarms on the
+     Lambda Duration approaching the 900s ceiling (a timeout-specific signal
+     the existing -errors alarm misses, since a hard timeout doesn't hit the
+     Errors metric and runs no in-process code).
+"""
+
+from __future__ import annotations
+
+import json
+import re
+from pathlib import Path
+
+import pytest
+
+_REPO = Path(__file__).resolve().parent.parent
+_SF = _REPO / "infrastructure" / "step_function.json"
+_ALARM = _REPO / "infrastructure" / "setup_research_runner_timeout_alarm.sh"
+
+
+def _find_state(states: dict, name: str) -> dict | None:
+    """Depth-first lookup of state *name*, descending into Parallel branches; None if absent."""
+    if name in states:
+        return states[name]
+    for s in states.values():
+        for br in s.get("Branches", []) or []:  # `or []` tolerates an explicit null Branches
+            found = _find_state(br.get("States", {}), name)
+            if found is not None:  # a matching state may be an empty (falsy) dict
+                return found
+    return None
+
+
+@pytest.fixture(scope="module")
+def research_payload() -> dict:
+    """Lambda Payload mapping of the Research state parsed from step_function.json."""
+    state = _find_state(json.loads(_SF.read_text())["States"], "Research")
+    assert state is not None, "Research state not found in SF"
+    return state["Parameters"]["Payload"]
+
+
+class TestSkipDryRunGate:  # pins keys of the Research payload read from step_function.json
+    def test_skip_dry_run_gate_present_and_true(self, research_payload):
+        assert research_payload.get("skip_dry_run_gate") is True, (  # identity check: only JSON true passes
+            "Research payload must set skip_dry_run_gate=true so the scheduled "
+            "production path skips the redundant stub graph pass + double "
+            "fetch_data (L4464 perf)."
+        )
+
+    def test_research_dry_path_preserved(self, research_payload):
+        # dry_run_llm must stay bound to $.research_dry (shell-run dry signal, Friday preflight).
+        assert research_payload.get("dry_run_llm.$") == "$.research_dry"
+
+
+class TestTimeoutAlarm:  # static checks on the alarm script text; AWS is never called
+    @pytest.fixture(scope="class")
+    def alarm_src(self) -> str:
+        assert _ALARM.exists(), f"{_ALARM.name} must exist (L4464 named timeout alarm)"
+        return _ALARM.read_text()
+
+    def test_alarms_on_lambda_duration(self, alarm_src):
+        assert '--namespace "AWS/Lambda"' in alarm_src
+        assert '--metric-name "Duration"' in alarm_src
+        assert 'FUNCTION_NAME="alpha-engine-research-runner"' in alarm_src  # the assignment, not a comment mention
+
+    def test_threshold_near_900s_ceiling(self, alarm_src):
+        # 30s below the 900000ms ceiling — fires on timeout AND near-miss.
+        assert re.search(r'THRESHOLD="8[0-9]{5}"', alarm_src), (
+            "threshold should be just below the 900000ms Lambda ceiling"
+        )
+        assert '--comparison-operator "GreaterThanOrEqualToThreshold"' in alarm_src
+        assert "--statistic Maximum" in alarm_src  # bare "Maximum" also occurs in the script's comments
+
+    def test_routes_to_alerts_topic(self, alarm_src):
+        assert ':alpha-engine-alerts"' in alarm_src and '--alarm-actions "$SNS_TOPIC_ARN"' in alarm_src