diff --git a/infrastructure/setup_substrate_alarms.sh b/infrastructure/setup_substrate_alarms.sh
new file mode 100755
index 0000000..8349026
--- /dev/null
+++ b/infrastructure/setup_substrate_alarms.sh
@@ -0,0 +1,126 @@
+#!/usr/bin/env bash
+# setup_substrate_alarms.sh — One-shot CloudWatch alarm setup for the
+# Phase 2 → 3 transparency-substrate health checker (alpha-engine-lib
+# transparency.py + transparency_inventory.yaml).
+#
+# Idempotent: safe to re-run after threshold tweaks. Creates one alarm
+# per inventory row plus one aggregate failure alarm. All point to the
+# existing alpha-engine-alerts SNS topic.
+#
+# Per-row alarm:
+#   alpha-engine-substrate-${row_id}
+#   Fires when SubstrateRowOK metric for that row drops below 1
+#   (the substrate checker emits 1 for ok/not_yet_effective, 0 for fail).
+#
+# Aggregate alarm:
+#   alpha-engine-substrate-aggregate-failures
+#   Fires when SubstrateChecksFailed > 0 in the trailing 24h window —
+#   safety net in case a row alarm gets accidentally deleted.
+#
+# Period 86400 (24h) with EvaluationPeriods=1 means each alarm checks
+# the most recent 24h window. treat-missing-data=notBreaching keeps
+# weekly-cadence rows quiet between emissions (a row that emits once
+# per Sat SF only has a datapoint every 7 days; missing days are not
+# alarms, only emitted-and-failed days are).
+#
+# Row enumeration is sourced from the lib's transparency_inventory.yaml
+# so this script stays in sync with the inventory automatically — no
+# hardcoded row list to drift.
+#
+# Usage:
+#   pip install alpha-engine-lib==0.5.0  # (or activate a venv with it)
+#   ./infrastructure/setup_substrate_alarms.sh
+
+set -euo pipefail
+
+REGION="${AWS_REGION:-us-east-1}"
+ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text --region "$REGION")
+SNS_TOPIC_ARN="arn:aws:sns:${REGION}:${ACCOUNT_ID}:alpha-engine-alerts"
+NAMESPACE="AlphaEngine/Substrate"
+PER_ROW_METRIC="SubstrateRowOK"
+AGGREGATE_METRIC="SubstrateChecksFailed"
+
+echo "Configuring CloudWatch alarms for transparency substrate"
+echo " Region: $REGION"
+echo " SNS topic: $SNS_TOPIC_ARN"
+echo " Namespace: $NAMESPACE"
+
+# Verify the SNS topic exists — fail fast rather than create alarms
+# with broken targets.
+if ! aws sns get-topic-attributes \
+  --topic-arn "$SNS_TOPIC_ARN" \
+  --region "$REGION" > /dev/null 2>&1; then
+  echo "ERROR: SNS topic $SNS_TOPIC_ARN not found. Run deploy_step_function.sh first." >&2
+  exit 1
+fi
+
+# Pull row IDs from the lib's inventory YAML — single source of truth.
+# Re-running after a row is added creates its alarm; a removed row leaves a
+# stale alarm idling in OK (missing data is notBreaching), never deleted here.
+# `|| ROW_IDS=""` keeps set -e from aborting before the friendly guard below.
+ROW_IDS=$(python3 -c "
+from alpha_engine_lib.transparency import load_inventory
+print(' '.join(r['id'] for r in load_inventory()['inventory']))
+") || ROW_IDS=""
+
+if [[ -z "$ROW_IDS" ]]; then
+  echo "ERROR: could not enumerate inventory rows. Is alpha-engine-lib installed?" >&2
+  exit 1
+fi
+
+echo ""
+echo "Creating per-row alarms for: $ROW_IDS"
+echo ""
+
+# --- Per-row alarms ---------------------------------------------------------
+
+for row_id in $ROW_IDS; do
+  alarm_name="alpha-engine-substrate-${row_id}"
+  echo "==> $alarm_name"
+
+  aws cloudwatch put-metric-alarm \
+    --region "$REGION" \
+    --alarm-name "$alarm_name" \
+    --alarm-description "Fires when the transparency-substrate row '$row_id' fails to emit a passing measurement. The check is row-driven (alpha_engine_lib.transparency); the SF Sat pipeline runs --cadence weekly which sweeps weekly + daily rows. This alarm decrements the Phase 2 → 3 observation gate denominator for this row when it fires. treat-missing-data=notBreaching keeps weekly-cadence rows quiet between Sat-SF emissions." \
+    --comparison-operator "LessThanThreshold" \
+    --evaluation-periods 1 \
+    --period 86400 \
+    --statistic "Minimum" \
+    --threshold 1 \
+    --treat-missing-data "notBreaching" \
+    --namespace "$NAMESPACE" \
+    --metric-name "$PER_ROW_METRIC" \
+    --dimensions "Name=RowID,Value=$row_id" \
+    --alarm-actions "$SNS_TOPIC_ARN" \
+    --ok-actions "$SNS_TOPIC_ARN" > /dev/null
+done
+
+# --- Aggregate failure alarm ------------------------------------------------
+
+aggregate_name="alpha-engine-substrate-aggregate-failures"
+echo "==> $aggregate_name"
+
+aws cloudwatch put-metric-alarm \
+  --region "$REGION" \
+  --alarm-name "$aggregate_name" \
+  --alarm-description "Aggregate safety-net alarm for the transparency substrate. Fires when SubstrateChecksFailed > 0 in any 24h window. Catches the case where a per-row alarm has been accidentally deleted — the per-row alarms are authoritative for which row failed; this alarm only confirms the substrate is observing failures. treat-missing-data=notBreaching means a substrate run with all rows passing emits zero failures and the alarm stays OK." \
+  --comparison-operator "GreaterThanThreshold" \
+  --evaluation-periods 1 \
+  --period 86400 \
+  --statistic "Maximum" \
+  --threshold 0 \
+  --treat-missing-data "notBreaching" \
+  --namespace "$NAMESPACE" \
+  --metric-name "$AGGREGATE_METRIC" \
+  --alarm-actions "$SNS_TOPIC_ARN" \
+  --ok-actions "$SNS_TOPIC_ARN" > /dev/null
+
+echo ""
+echo "All substrate alarms configured."
+echo ""
+echo "Validation:"
+echo " aws cloudwatch describe-alarms --region $REGION \\"
+echo " --alarm-name-prefix alpha-engine-substrate- \\"
+echo " --query 'MetricAlarms[].[AlarmName,StateValue]' --output table"
+echo ""
+echo "First firing eligibility: per-row alarms settle in OK (treat-missing-data=notBreaching counts missing data as healthy) until the first weekly substrate run emits the metric (Sat SF) — they will not page during the gap. Rows with effective_date > today emit value=1 (counted as healthy) so they stay quiet until their grace period expires."
diff --git a/requirements.txt b/requirements.txt
index 448eb83..94b35d1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -19,4 +19,4 @@ arcticdb>=6.11
 # previously listed above as direct deps; kept those direct lines for now to
 # avoid breaking pinning during the migration. Drop the duplicate direct
 # pgvector/psycopg2-binary pins once the migration soaks.
-alpha-engine-lib[arcticdb,flow_doctor,rag] @ git+https://github.com/cipher813/alpha-engine-lib@v0.3.0
+alpha-engine-lib[arcticdb,flow_doctor,rag] @ git+https://github.com/cipher813/alpha-engine-lib@v0.5.0
diff --git a/tests/test_substrate_alarms_script.py b/tests/test_substrate_alarms_script.py
new file mode 100644
index 0000000..b709e90
--- /dev/null
+++ b/tests/test_substrate_alarms_script.py
@@ -0,0 +1,141 @@
+"""Pins the substrate alarm setup script to the lib's metric namespace.
+
+The setup_substrate_alarms.sh script is idempotent and run once per
+threshold change, but its alarms are useless if the namespace + metric
+name don't match what alpha_engine_lib.transparency.emit_cloudwatch_metrics
+publishes.
+
+These tests catch that drift class:
+- Namespace mismatch → alarms never fire
+- Metric name mismatch → alarms never fire
+- SNS topic typo → alarms fire but no one is paged
+- Row enumeration syntactically broken → script aborts before creating
+  per-row alarms but still creates the aggregate (deceptive partial
+  success)
+"""
+
+from __future__ import annotations
+
+import re
+import shutil
+import subprocess
+from pathlib import Path
+
+import pytest
+
+
+_REPO_ROOT = Path(__file__).resolve().parent.parent
+_SCRIPT = _REPO_ROOT / "infrastructure" / "setup_substrate_alarms.sh"
+
+
+@pytest.fixture(scope="module")
+def script_text() -> str:
+    return _SCRIPT.read_text()
+
+
+def test_script_exists_and_is_executable():
+    assert _SCRIPT.is_file()
+    # Must be executable so the operator can run it directly.
+    assert _SCRIPT.stat().st_mode & 0o111
+
+
+def test_bash_syntax_is_valid():
+    bash = shutil.which("bash")
+    if bash is None:
+        pytest.skip("bash not available")
+    result = subprocess.run([bash, "-n", str(_SCRIPT)], capture_output=True)
+    assert result.returncode == 0, result.stderr.decode()
+
+
+class TestNamespaceAlignmentWithLib:
+    """The script's namespace + metric names must match what the lib emits."""
+
+    def test_namespace_matches_lib_constant(self, script_text):
+        from alpha_engine_lib.transparency import DEFAULT_NAMESPACE_OUT
+
+        assert f'NAMESPACE="{DEFAULT_NAMESPACE_OUT}"' in script_text, (
+            f"Script namespace must match alpha_engine_lib.transparency."
+            f"DEFAULT_NAMESPACE_OUT={DEFAULT_NAMESPACE_OUT!r} — otherwise "
+            f"alarms attach to a namespace nothing emits to."
+        )
+
+    def test_per_row_metric_matches_lib(self, script_text):
+        # The lib emits per-row metrics named "SubstrateRowOK" with a
+        # RowID dimension; the alarm must reference the exact same
+        # metric name.
+        assert 'PER_ROW_METRIC="SubstrateRowOK"' in script_text
+
+    def test_aggregate_metric_matches_lib(self, script_text):
+        assert 'AGGREGATE_METRIC="SubstrateChecksFailed"' in script_text
+
+
+class TestSNSTarget:
+    """The alarm target must be the existing alpha-engine-alerts topic."""
+
+    def test_sns_topic_is_alpha_engine_alerts(self, script_text):
+        # Matches the topic created by deploy_step_function.sh, reused
+        # across pipeline alerts.
+        assert "alpha-engine-alerts" in script_text
+
+    def test_topic_existence_check_runs_before_alarm_creation(self, script_text):
+        # Pattern: get-topic-attributes ... exit 1 must appear before
+        # any put-metric-alarm call. This avoids creating alarms with
+        # broken SNS targets (silent paging failures).
+        topic_check_pos = script_text.find("get-topic-attributes")
+        first_alarm_pos = script_text.find("put-metric-alarm")
+        assert topic_check_pos != -1
+        assert first_alarm_pos != -1
+        assert topic_check_pos < first_alarm_pos
+
+
+class TestRowEnumeration:
+    """Row IDs come from the lib's inventory — no hardcoded list to drift."""
+
+    def test_row_enumeration_uses_lib(self, script_text):
+        assert "from alpha_engine_lib.transparency import load_inventory" in script_text
+
+    def test_row_enumeration_iterates_inventory(self, script_text):
+        # Sanity-check the comprehension still iterates the inventory key.
+        assert re.search(
+            r"r\['id'\] for r in load_inventory\(\)\['inventory'\]",
+            script_text,
+        )
+
+    def test_aborts_when_enumeration_returns_empty(self, script_text):
+        # The script must hard-exit if ROW_IDS is empty — otherwise it
+        # silently skips per-row alarms and creates only the aggregate.
+        assert "could not enumerate inventory rows" in script_text
+
+
+class TestAlarmSemantics:
+    """Per-row + aggregate alarms must use the right comparison + statistic."""
+
+    def test_per_row_alarm_fires_below_one(self, script_text):
+        # The lib emits 1=ok/pending, 0=fail. Alarm must fire when the
+        # value drops below 1 (regex rejects look-alikes such as 10 or 1.5).
+        assert '--comparison-operator "LessThanThreshold"' in script_text
+        assert re.search(r"--threshold 1(?![\d.])", script_text)
+
+    def test_per_row_alarm_uses_minimum_statistic(self, script_text):
+        # Minimum across the period — if any datapoint is 0, the alarm
+        # fires. Average would let a single fail get diluted.
+        assert '--statistic "Minimum"' in script_text
+
+    def test_aggregate_alarm_fires_above_zero(self, script_text):
+        assert '--comparison-operator "GreaterThanThreshold"' in script_text
+        # Aggregate threshold lives after the aggregate_name marker (must exist).
+        _, marker, agg_block = script_text.partition("aggregate_name")
+        assert marker and re.search(r"--threshold 0(?![\d.])", agg_block)
+
+    def test_treat_missing_data_is_not_breaching(self, script_text):
+        # Weekly-cadence rows emit once per Sat SF — between emissions,
+        # CloudWatch sees missing data. notBreaching means missing days
+        # don't fire alarms; only emitted-and-failed days fire.
+        assert '--treat-missing-data "notBreaching"' in script_text
+
+
+class TestRegionDefault:
+    def test_region_defaults_to_us_east_1(self, script_text):
+        # All Alpha Engine infra lives in us-east-1; matches the
+        # eval-quality alarm script.
+        assert 'REGION="${AWS_REGION:-us-east-1}"' in script_text