From 012beb3f25133069a0006dc02eee4d0d9667d54d Mon Sep 17 00:00:00 2001 From: Brian McMahon Date: Thu, 21 May 2026 16:38:46 -0700 Subject: [PATCH] feat(deploy): Telegram + SNS alert on canary rollback (L221) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Independent-channel surveillance on the canary-rollback path, which (per the #274 retrospective) fired silently 10 consecutive times across 2 days. Best-effort alpha_engine_lib.alerts publish before exit 1; the trailing || true keeps an alert failure from changing the deploy's exit code. The 4 sub-Lambda deploys (spot-orphan-reaper / changelog-cloudwatch-mirror / eod-success-friday-shell-trigger / sf-telegram-notifier) don't have canary/rollback paths — bootstrap-style deploys without a gate — so no edit needed there. The changelog-incident-mirror already uses lib alerts (per the L143/L146 fleet pass). Co-Authored-By: Claude Opus 4.7 (1M context) --- infrastructure/deploy.sh | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/infrastructure/deploy.sh b/infrastructure/deploy.sh index 97f9067..f986711 100755 --- a/infrastructure/deploy.sh +++ b/infrastructure/deploy.sh @@ -196,6 +196,16 @@ if [ "$CANARY_STATUS" != "OK" ] && [ "$CANARY_STATUS" != "SKIPPED" ]; then --region "$REGION" 2>/dev/null || true echo " Rolled back to version $PREV_VERSION" fi + # Independent-channel surveillance per ROADMAP L221 — this exact + # rollback chain fired silently 10 consecutive times across 2 days + # (alpha-engine-data #274 retrospective) before Brian noticed the + # GitHub Actions red-icon. Best-effort; trailing || true never + # overrides the deploy's exit 1. + python3 -m alpha_engine_lib.alerts publish \ + --severity error \ + --source "alpha-engine-data/infrastructure/deploy.sh" \ + --message "Canary rolled back: ${FUNCTION_NAME} canary returned status='${CANARY_STATUS}', live alias reverted v${VERSION}→v${PREV_VERSION}. See CloudWatch /aws/lambda/${FUNCTION_NAME} for payload." 
\ + || true exit 1 fi echo " Canary passed (status=$CANARY_STATUS)"