Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
126 changes: 126 additions & 0 deletions infrastructure/setup_substrate_alarms.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
#!/usr/bin/env bash
# setup_substrate_alarms.sh — One-shot CloudWatch alarm setup for the
# Phase 2 → 3 transparency-substrate health checker (alpha-engine-lib
# transparency.py + transparency_inventory.yaml).
#
# Idempotent: safe to re-run after threshold tweaks. Creates one alarm
# per inventory row plus one aggregate failure alarm. All point to the
# existing alpha-engine-alerts SNS topic.
#
# Per-row alarm:
# alpha-engine-substrate-<row_id>
# Fires when SubstrateRowOK metric for that row drops below 1
# (the substrate checker emits 1 for ok/not_yet_effective, 0 for fail).
#
# Aggregate alarm:
# alpha-engine-substrate-aggregate-failures
# Fires when SubstrateChecksFailed > 0 in the trailing 24h window —
# safety net in case a row alarm gets accidentally deleted.
#
# Period 86400 (24h) with EvaluationPeriods=1 means each alarm checks
# the most recent 24h window. treat-missing-data=notBreaching keeps
# weekly-cadence rows quiet between emissions (a row that emits once
# per Sat SF only has a datapoint every 7 days; missing days are not
# alarms, only emitted-and-failed days are).
#
# Row enumeration is sourced from the lib's transparency_inventory.yaml
# so this script stays in sync with the inventory automatically — no
# hardcoded row list to drift.
#
# Usage:
# pip install alpha-engine-lib==0.5.0 # (or activate a venv with it)
# ./infrastructure/setup_substrate_alarms.sh

set -euo pipefail

# --- Configuration ----------------------------------------------------------
# REGION honours AWS_REGION and otherwise falls back to us-east-1. The account
# ID is resolved from the caller's credentials so the topic ARN is never
# hardcoded; under `set -e` a failed STS call aborts the script right here.
REGION="${AWS_REGION:-us-east-1}"
ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text --region "$REGION")
SNS_TOPIC_ARN="arn:aws:sns:${REGION}:${ACCOUNT_ID}:alpha-engine-alerts"
NAMESPACE="AlphaEngine/Substrate"
PER_ROW_METRIC="SubstrateRowOK"
AGGREGATE_METRIC="SubstrateChecksFailed"

echo "Configuring CloudWatch alarms for transparency substrate"
echo " Region: $REGION"
echo " SNS topic: $SNS_TOPIC_ARN"
echo " Namespace: $NAMESPACE"

# Verify the SNS topic exists — fail fast rather than create alarms
# with broken targets. The CLI's stderr is captured (stdout is discarded) so
# that a failure with another cause — denied permission, wrong region — is
# shown to the operator instead of being reported only as a missing topic.
if ! topic_check_err=$(aws sns get-topic-attributes \
    --topic-arn "$SNS_TOPIC_ARN" \
    --region "$REGION" 2>&1 > /dev/null); then
  echo "ERROR: SNS topic $SNS_TOPIC_ARN not found. Run deploy_step_function.sh first." >&2
  if [[ -n "$topic_check_err" ]]; then
    echo "AWS CLI reported: $topic_check_err" >&2
  fi
  exit 1
fi

# Pull row IDs from the lib's inventory YAML — single source of truth.
# Adding a row to the YAML and re-running this script automatically
# adds the corresponding alarm. Removing a row leaves its alarm in place
# (this script never deletes alarms) — safer than silently deleting; stale
# alarms have to be removed by hand.
#
# The assignment sits inside the `if` condition on purpose: with `set -e`, a
# bare ROW_IDS=$(...) would abort the script the moment python3 exits
# non-zero (e.g. the lib is not installed), before the error message below
# could be printed. A successful run that prints nothing is rejected too, so
# the script never continues with an empty row list.
if ! ROW_IDS=$(python3 -c "
from alpha_engine_lib.transparency import load_inventory
print(' '.join(r['id'] for r in load_inventory()['inventory']))
") || [[ -z "$ROW_IDS" ]]; then
  echo "ERROR: could not enumerate inventory rows. Is alpha-engine-lib installed?" >&2
  exit 1
fi

echo ""
echo "Creating per-row alarms for: $ROW_IDS"
echo ""

# --- Per-row alarms ---------------------------------------------------------

# Split the space-separated ID list into an array once instead of expanding
# $ROW_IDS unquoted in the loop header: an unquoted expansion is also
# glob-expanded, so an ID containing *, ? or [ could turn into file names
# from the current directory.
read -r -a row_ids <<< "$ROW_IDS"

# One alarm per inventory row, keyed on the RowID dimension. Minimum < 1 over a
# single 24h period means any failing (0) datapoint in the window trips the
# alarm; periods without datapoints are treated as not breaching.
for row_id in "${row_ids[@]}"; do
  alarm_name="alpha-engine-substrate-${row_id}"
  echo "==> $alarm_name"

  aws cloudwatch put-metric-alarm \
    --region "$REGION" \
    --alarm-name "$alarm_name" \
    --alarm-description "Fires when the transparency-substrate row '$row_id' fails to emit a passing measurement. The check is row-driven (alpha_engine_lib.transparency); the SF Sat pipeline runs --cadence weekly which sweeps weekly + daily rows. This alarm decrements the Phase 2 → 3 observation gate denominator for this row when it fires. treat-missing-data=notBreaching keeps weekly-cadence rows quiet between Sat-SF emissions." \
    --comparison-operator "LessThanThreshold" \
    --evaluation-periods 1 \
    --period 86400 \
    --statistic "Minimum" \
    --threshold 1 \
    --treat-missing-data "notBreaching" \
    --namespace "$NAMESPACE" \
    --metric-name "$PER_ROW_METRIC" \
    --dimensions "Name=RowID,Value=$row_id" \
    --alarm-actions "$SNS_TOPIC_ARN" \
    --ok-actions "$SNS_TOPIC_ARN" > /dev/null
done

# --- Aggregate failure alarm ------------------------------------------------

# Single alarm on the dimensionless aggregate metric: Maximum > 0 over one 24h
# period, missing data treated as not breaching. The arguments are collected
# in an array so each flag sits on its own line without backslash
# continuations.
aggregate_name="alpha-engine-substrate-aggregate-failures"
printf '==> %s\n' "$aggregate_name"

aggregate_description="Aggregate safety-net alarm for the transparency substrate. Fires when SubstrateChecksFailed > 0 in any 24h window. Catches the case where a per-row alarm has been accidentally deleted — the per-row alarms are authoritative for which row failed; this alarm only confirms the substrate is observing failures. treat-missing-data=notBreaching means a substrate run with all rows passing emits zero failures and the alarm stays OK."

aggregate_alarm_args=(
  --region "$REGION"
  --alarm-name "$aggregate_name"
  --alarm-description "$aggregate_description"
  --comparison-operator "GreaterThanThreshold"
  --evaluation-periods 1
  --period 86400
  --statistic "Maximum"
  --threshold 0
  --treat-missing-data "notBreaching"
  --namespace "$NAMESPACE"
  --metric-name "$AGGREGATE_METRIC"
  --alarm-actions "$SNS_TOPIC_ARN"
  --ok-actions "$SNS_TOPIC_ARN"
)

aws cloudwatch put-metric-alarm "${aggregate_alarm_args[@]}" > /dev/null

# Closing summary: print a ready-to-paste validation command and a note on
# what the new alarms do before the first metric emission. With
# treat-missing-data=notBreaching, CloudWatch evaluates a window with no
# datapoints as OK rather than INSUFFICIENT_DATA, so the note says OK.
echo ""
echo "All substrate alarms configured."
echo ""
echo "Validation:"
echo " aws cloudwatch describe-alarms --region $REGION \\"
echo " --alarm-name-prefix alpha-engine-substrate- \\"
echo " --query 'MetricAlarms[].[AlarmName,StateValue]' --output table"
echo ""
echo "First firing eligibility: per-row alarms have no datapoints until the first weekly substrate run emits the metric (Sat SF); with treat-missing-data=notBreaching CloudWatch evaluates that gap as OK, so they will not page during the gap (the topic may receive a one-time OK notification per alarm). Rows with effective_date > today emit value=1 (counted as healthy) so they stay quiet until their grace period expires."
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,4 @@ arcticdb>=6.11
# previously listed above as direct deps; kept those direct lines for now to
# avoid breaking pinning during the migration. Drop the duplicate direct
# pgvector/psycopg2-binary pins once the migration soaks.
alpha-engine-lib[arcticdb,flow_doctor,rag] @ git+https://github.com/cipher813/alpha-engine-lib@v0.3.0
alpha-engine-lib[arcticdb,flow_doctor,rag] @ git+https://github.com/cipher813/alpha-engine-lib@v0.5.0
141 changes: 141 additions & 0 deletions tests/test_substrate_alarms_script.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
"""Pins the substrate alarm setup script to the lib's metric namespace.

The setup_substrate_alarms.sh script is idempotent and run once per
threshold change, but its alarms are useless if the namespace + metric
name don't match what alpha_engine_lib.transparency.emit_cloudwatch_metrics
publishes.

These tests catch that drift class:
- Namespace mismatch → alarms never fire
- Metric name mismatch → alarms never fire
- SNS topic typo → alarms fire but no one is paged
- Row enumeration broken or empty → without the empty-ROW_IDS guard the
  script would skip every per-row alarm yet still create the aggregate
  (deceptive partial success)
"""

from __future__ import annotations

import re
import shutil
import subprocess
from pathlib import Path

import pytest


# Repository root: this module lives in <repo>/tests/, so go up two levels.
_REPO_ROOT = Path(__file__).resolve().parent.parent
# The shell script whose text the tests below inspect.
_SCRIPT = _REPO_ROOT / "infrastructure" / "setup_substrate_alarms.sh"


@pytest.fixture(scope="module")
def script_text() -> str:
    """Return the full text of the alarm setup script (read once per module).

    The script contains non-ASCII characters (em dashes, arrows), so the
    encoding is pinned to UTF-8 rather than left to the platform locale,
    which could otherwise fail to decode the file or decode it incorrectly.
    """
    return _SCRIPT.read_text(encoding="utf-8")


def test_script_exists_and_is_executable():
    """The setup script is a regular file with at least one execute bit set."""
    assert _SCRIPT.is_file()
    # Operators invoke the script directly, so any of the user/group/other
    # execute bits (mask 0o111) must be present.
    permission_bits = _SCRIPT.stat().st_mode
    assert permission_bits & 0o111


def test_bash_syntax_is_valid():
    """`bash -n` (parse only, no execution) accepts the script."""
    bash_path = shutil.which("bash")
    if bash_path is None:
        pytest.skip("bash not available")
    syntax_check = subprocess.run(
        [bash_path, "-n", str(_SCRIPT)],
        capture_output=True,
    )
    # On failure, surface bash's own diagnostics as the assertion message.
    assert syntax_check.returncode == 0, syntax_check.stderr.decode()


class TestNamespaceAlignmentWithLib:
"""The script's namespace + metric names must match what the lib emits."""

def test_namespace_matches_lib_constant(self, script_text):
from alpha_engine_lib.transparency import DEFAULT_NAMESPACE_OUT

assert f'NAMESPACE="{DEFAULT_NAMESPACE_OUT}"' in script_text, (
f"Script namespace must match alpha_engine_lib.transparency."
f"DEFAULT_NAMESPACE_OUT={DEFAULT_NAMESPACE_OUT!r} — otherwise "
f"alarms attach to a namespace nothing emits to."
)

def test_per_row_metric_matches_lib(self, script_text):
# The lib emits per-row metrics named "SubstrateRowOK" with a
# RowID dimension; the alarm must reference the exact same
# metric name.
assert 'PER_ROW_METRIC="SubstrateRowOK"' in script_text

def test_aggregate_metric_matches_lib(self, script_text):
assert 'AGGREGATE_METRIC="SubstrateChecksFailed"' in script_text


class TestSNSTarget:
"""The alarm target must be the existing alpha-engine-alerts topic."""

def test_sns_topic_is_alpha_engine_alerts(self, script_text):
# Matches the topic created by deploy_step_function.sh, reused
# across pipeline alerts.
assert "alpha-engine-alerts" in script_text

def test_topic_existence_check_runs_before_alarm_creation(self, script_text):
# Pattern: get-topic-attributes ... exit 1 must appear before
# any put-metric-alarm call. This avoids creating alarms with
# broken SNS targets (silent paging failures).
topic_check_pos = script_text.find("get-topic-attributes")
first_alarm_pos = script_text.find("put-metric-alarm")
assert topic_check_pos != -1
assert first_alarm_pos != -1
assert topic_check_pos < first_alarm_pos


class TestRowEnumeration:
"""Row IDs come from the lib's inventory — no hardcoded list to drift."""

def test_row_enumeration_uses_lib(self, script_text):
assert "from alpha_engine_lib.transparency import load_inventory" in script_text

def test_row_enumeration_iterates_inventory(self, script_text):
# Sanity-check the comprehension still iterates the inventory key.
assert re.search(
r"r\['id'\] for r in load_inventory\(\)\['inventory'\]",
script_text,
)

def test_aborts_when_enumeration_returns_empty(self, script_text):
# The script must hard-exit if ROW_IDS is empty — otherwise it
# silently skips per-row alarms and creates only the aggregate.
assert "could not enumerate inventory rows" in script_text


class TestAlarmSemantics:
"""Per-row + aggregate alarms must use the right comparison + statistic."""

def test_per_row_alarm_fires_below_one(self, script_text):
# The lib emits 1=ok/pending, 0=fail. Alarm must fire when the
# value drops below 1.
assert '--comparison-operator "LessThanThreshold"' in script_text
assert "--threshold 1" in script_text

def test_per_row_alarm_uses_minimum_statistic(self, script_text):
# Minimum across the period — if any datapoint is 0, the alarm
# fires. Average would let a single fail get diluted.
assert '--statistic "Minimum"' in script_text

def test_aggregate_alarm_fires_above_zero(self, script_text):
assert '--comparison-operator "GreaterThanThreshold"' in script_text
# Aggregate threshold lives near the aggregate alarm definition.
agg_block = script_text[script_text.find("aggregate_name"):]
assert "--threshold 0" in agg_block

def test_treat_missing_data_is_not_breaching(self, script_text):
# Weekly-cadence rows emit once per Sat SF — between emissions,
# CloudWatch sees missing data. notBreaching means missing days
# don't fire alarms; only emitted-and-failed days fire.
assert '--treat-missing-data "notBreaching"' in script_text


class TestRegionDefault:
def test_region_defaults_to_us_east_1(self, script_text):
# All Alpha Engine infra lives in us-east-1; matches the
# eval-quality alarm script.
assert 'REGION="${AWS_REGION:-us-east-1}"' in script_text
Loading