From 60125514763bafa4e5aff6ff452ac4a67bc58655 Mon Sep 17 00:00:00 2001
From: Brian McMahon
Date: Fri, 10 Apr 2026 20:16:30 -0700
Subject: [PATCH] Version-control IAM policies; add lambda:InvokeFunction to deploy role
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixes the canary IAM gap that has caused every auto-deploy since #12 to
report red even when the Lambda update itself succeeded:

    AccessDeniedException: User: .../github-actions-lambda-deploy/... is
    not authorized to perform: lambda:InvokeFunction on resource:
    arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-data-collector:live

The github-actions-lambda-deploy OIDC role was created ad-hoc when the
GitHub Actions auto-deploy workflow was introduced in #12. It had ECR
push + Lambda UpdateFunctionCode/UpdateAlias, but not InvokeFunction — so
infrastructure/deploy.sh's post-update canary step (aws lambda invoke
with dry_run=true) failed with AccessDenied, the rollback then also
failed silently because the script's || true swallowed the error, and
every deploy since has been leaving the alias stranded on whatever
version just got published with no safety net. Three deploys in a row
(#13, #14, #16) all looked like failures despite the underlying Lambda
being updated.

This PR does two things:

1. Adds infrastructure/iam/ as the new home for version-controlled IAM
   policies. It's intentionally low-ceremony — flat JSON files, one per
   role, applied via a small idempotent shell script. No CloudFormation,
   no Terraform. For a 5-module infra-light project, a flat directory is
   the right amount of rigor. Migrate to CFN later if the blast radius
   grows.

2. Adds a new LambdaInvokeCanary statement to the existing deploy-role
   policy, granting lambda:InvokeFunction on all 5 alpha-engine Lambdas
   and their aliases/versions.
Scoped narrowly to the same functions the role already has
   UpdateFunctionCode on, so the blast radius is unchanged: an attacker
   with ECR push + UpdateFunctionCode can already run arbitrary code in
   these Lambdas.

Applied live via `infrastructure/iam/apply.sh github-actions-lambda-deploy`
before committing — so the next deploy workflow run actually passes the
canary step. Also cleaned up the orphaned old `deploy-lambdas` inline
policy (the new inline policy is named
`github-actions-lambda-deploy-policy`, following the script's
`<role name>-policy` convention, where the role name is the JSON
filename).

Why this matters beyond tonight: every IAM change from here on is
diffable, reviewable, and recoverable. If a future PR drops a permission,
code review catches it at PR time instead of surfacing as a mysterious
AccessDenied in production.

Follow-up: the deploy.sh script's rollback-on-canary-failure logic still
uses `|| true` to swallow errors silently, which is why the stranded
alias never got rolled back. That's a separate PR.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 infrastructure/iam/apply.sh                   | 122 ++++++++++++++++++
 .../iam/github-actions-lambda-deploy.json     |  70 ++++++++++
 2 files changed, 192 insertions(+)
 create mode 100755 infrastructure/iam/apply.sh
 create mode 100644 infrastructure/iam/github-actions-lambda-deploy.json

diff --git a/infrastructure/iam/apply.sh b/infrastructure/iam/apply.sh
new file mode 100755
index 0000000..23d6d79
--- /dev/null
+++ b/infrastructure/iam/apply.sh
@@ -0,0 +1,122 @@
+#!/usr/bin/env bash
+#
+# apply.sh — Apply all IAM policies in this directory to their matching roles.
+#
+# Each JSON file in this directory is treated as a role policy document. The
+# filename (minus .json) is the target IAM role name; the inline policy name
+# is that role name plus "-policy". This keeps the mapping trivial: one file
+# == one role == one policy. Since the role name comes from the filename, a
+# role cannot have a second file here; add statements to its one file.
+#
+# This is intentionally low-ceremony — no CloudFormation, no Terraform. For
+# a 5-module infra-light project, a flat JSON directory + idempotent apply
+# script is the right amount of rigor. If the blast radius grows, migrate
+# to CloudFormation/Terraform.
+#
+# Usage:
+#   ./infrastructure/iam/apply.sh                               # apply every policy
+#   ./infrastructure/iam/apply.sh github-actions-lambda-deploy  # one role
+#   ./infrastructure/iam/apply.sh --dry-run                     # print planned commands
+#
+# Prerequisites:
+#   - AWS CLI configured with iam:PutRolePolicy on the target roles
+#   - The target IAM roles already exist (this script only updates inline
+#     policies; it does NOT create the roles themselves, because the trust
+#     policies differ and are outside the scope of a simple flat-file
+#     approach)
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REGION="${AWS_REGION:-us-east-1}"
+
+DRY_RUN=0
+TARGET_ROLE=""
+
+for arg in "$@"; do
+  case "$arg" in
+    --dry-run) DRY_RUN=1 ;;
+    -h|--help)
+      # Print only the header comment: drop the shebang, stop at the first
+      # non-comment line.
+      sed -n -e '/^#!/d' -e '/^#/!q' -e 's/^# \{0,1\}//p' "$0"
+      exit 0
+      ;;
+    -*)
+      # A mistyped flag must not be mistaken for a role name.
+      echo "ERROR: unknown option: $arg" >&2
+      exit 2
+      ;;
+    *)
+      if [ -n "$TARGET_ROLE" ]; then
+        echo "ERROR: only one role may be given (got '$TARGET_ROLE' and '$arg')" >&2
+        exit 2
+      fi
+      TARGET_ROLE="${arg%.json}" # accept "role" or "role.json"
+      ;;
+  esac
+done
+
+# Apply one policy file to the role named after it.
+#   $1 - policy file named <role>.json; the inline policy is "<role>-policy"
+# Reads DRY_RUN and REGION. Returns non-zero when the JSON is invalid or the
+# AWS call fails; both are checked explicitly because `set -e` is ignored
+# inside a function that is called on the left side of `||`.
+apply_one() {
+  local file="$1"
+  local role
+  role="$(basename "$file" .json)"
+  local policy_name="${role}-policy"
+
+  # Validate JSON locally before shipping it to IAM. The path is passed as an
+  # argument rather than spliced into Python source, and json.tool's message
+  # on stderr points at the offending line.
+  if ! python3 -m json.tool "$file" >/dev/null; then
+    echo "ERROR: $file is not valid JSON — skipping" >&2
+    return 1
+  fi
+
+  echo "Applying $file -> role=$role policy=$policy_name"
+  if [ "$DRY_RUN" = 1 ]; then
+    echo " [dry-run] aws iam put-role-policy --role-name $role --policy-name $policy_name --policy-document file://$file --region $REGION"
+    return 0
+  fi
+
+  if ! aws iam put-role-policy \
+    --role-name "$role" \
+    --policy-name "$policy_name" \
+    --policy-document "file://$file" \
+    --region "$REGION"; then
+    echo "ERROR: put-role-policy failed for role=$role" >&2
+    return 1
+  fi
+  echo " OK"
+}
+
+cd "$SCRIPT_DIR"
+
+if [ -n "$TARGET_ROLE" ]; then
+  file="${TARGET_ROLE}.json"
+  if [ ! -f "$file" ]; then
+    echo "ERROR: $file not found in $SCRIPT_DIR" >&2
+    exit 1
+  fi
+  apply_one "$file"
+else
+  shopt -s nullglob
+  files=( *.json )
+  if [ "${#files[@]}" -eq 0 ]; then
+    echo "No .json policy files found in $SCRIPT_DIR"
+    exit 0
+  fi
+  # Try every file so one bad policy cannot block the rest, then fail the
+  # run if any of them could not be applied.
+  failed=()
+  for file in "${files[@]}"; do
+    apply_one "$file" || failed+=("$file")
+  done
+  if [ "${#failed[@]}" -gt 0 ]; then
+    echo "ERROR: ${#failed[@]} policy file(s) failed: ${failed[*]}" >&2
+    exit 1
+  fi
+fi
diff --git a/infrastructure/iam/github-actions-lambda-deploy.json b/infrastructure/iam/github-actions-lambda-deploy.json
new file mode 100644
index 0000000..28e4ff2
--- /dev/null
+++ b/infrastructure/iam/github-actions-lambda-deploy.json
@@ -0,0 +1,70 @@
+{
+  "Version": "2012-10-17",
+  "Statement": [
+    {
+      "Sid": "ECRAuth",
+      "Effect": "Allow",
+      "Action": "ecr:GetAuthorizationToken",
+      "Resource": "*"
+    },
+    {
+      "Sid": "ECRPush",
+      "Effect": "Allow",
+      "Action": [
+        "ecr:BatchCheckLayerAvailability",
+        "ecr:InitiateLayerUpload",
+        "ecr:UploadLayerPart",
+        "ecr:CompleteLayerUpload",
+        "ecr:PutImage",
+        "ecr:BatchGetImage",
+        "ecr:DescribeImages",
+        "ecr:DescribeRepositories"
+      ],
+      "Resource": [
+        "arn:aws:ecr:us-east-1:711398986525:repository/alpha-engine-research-runner",
+        "arn:aws:ecr:us-east-1:711398986525:repository/alpha-engine-research-alerts",
+        "arn:aws:ecr:us-east-1:711398986525:repository/alpha-engine-predictor",
+        "arn:aws:ecr:us-east-1:711398986525:repository/alpha-engine-data-collector",
+        "arn:aws:ecr:us-east-1:711398986525:repository/alpha-engine-health-check"
+      ]
+    },
+    {
+      "Sid": "LambdaUpdate",
+      "Effect": "Allow",
+      "Action": [
+        "lambda:UpdateFunctionCode",
+        "lambda:UpdateFunctionConfiguration",
+        "lambda:GetFunction",
+        "lambda:GetFunctionConfiguration",
+        "lambda:PublishVersion",
+        "lambda:UpdateAlias"
+      ],
+      "Resource": [
+        "arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-research-runner",
+        "arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-research-alerts",
+        "arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-predictor-inference",
+        "arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-data-collector",
+        "arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-health-check"
+      ]
+    },
+    {
+      "Sid": "LambdaInvokeCanary",
+      "Effect": "Allow",
+      "Action": [
+        "lambda:InvokeFunction"
+      ],
+      "Resource": [
+        "arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-research-runner",
+        "arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-research-runner:*",
+        "arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-research-alerts",
+        "arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-research-alerts:*",
+        "arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-predictor-inference",
+        "arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-predictor-inference:*",
+        "arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-data-collector",
+        "arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-data-collector:*",
+        "arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-health-check",
+        "arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-health-check:*"
+      ]
+    }
+  ]
+}