From ad90bd3e8baafefbdcf9fa2702aaebab2f801750 Mon Sep 17 00:00:00 2001 From: Brian McMahon Date: Sat, 11 Apr 2026 14:10:36 -0700 Subject: [PATCH] Embed git pull + hard-fail exit propagation in Saturday Step Function SSM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three stacked bugs in the Step Function's SSM commands left the Saturday pipeline blind to DataPhase1 failures and running stale EC2 code for an unknown duration: 1. No `git pull` — every SSM command ran whatever was checked out on /home/ec2-user/alpha-engine-{data,predictor,backtester,research} the last time an operator manually pulled. PR #18's hard-fail fix was merged to main hours ago, but today's pipeline runs were still executing pre-#18 code. 2. `| tee` without `set -o pipefail` — DataPhase1, RAGIngestion, and HealthCheck piped their output through tee without pipefail, so tee's exit code (0 unless tee itself fails) masked the exit code of the command feeding it. Even when main() raised SystemExit(1), SSM reported Success. 3. `echo "EXIT_CODE=$?"` as the final command on DataPhase1 and RAGIngestion — this was cosmetic decoration that made the shell script exit with echo's exit code (always 0), losing whatever had happened earlier in the script. Nothing consumed that log line, and without pipefail the `$?` it printed was tee's status rather than the collector's. All six SSM commands now start with `set -eo pipefail` and pull their repo from origin main with --ff-only (fails loudly if EC2 has diverged); the two that had it (DataPhase1, RAGIngestion) also drop the cosmetic echo. DriftDetection pulls both alpha-engine-data and alpha-engine-predictor since it consumes both. HealthCheck pulls alpha-engine-data. This also answers the "what can we do to prevent this issue?" question at the first layer: git pull in the SSM command itself.
Next layers (per session discussion): - Medium-term: emit git SHA into each phase manifest for drift detection monitoring - Long-term: immutable artifacts — tar/container with git-SHA tag uploaded to S3 on merge, SSM extracts instead of git pull ## Live deployment Applied directly to the live state machine via `aws stepfunctions update-state-machine` (revision ac2011c6) so the rerun can pick it up immediately. This PR is the repo-side record. ## Test plan - [x] JSON validates - [ ] New execution picks up new definition - [ ] DataPhase1 runs new code (PR #18 hard-fail), fails fast if any collector is non-ok - [ ] No "EXIT_CODE=0" lines in CloudWatch despite Python errors Co-Authored-By: Claude Opus 4.6 (1M context) --- infrastructure/step_function.json | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/infrastructure/step_function.json b/infrastructure/step_function.json index 6457aa1..67c561f 100644 --- a/infrastructure/step_function.json +++ b/infrastructure/step_function.json @@ -12,11 +12,12 @@ "InstanceIds.$": "$.ec2_instance_id", "Parameters": { "commands": [ + "set -eo pipefail", "cd /home/ec2-user/alpha-engine-data", + "git pull --ff-only origin main", "set -a && source /home/ec2-user/.alpha-engine.env && set +a", "source .venv/bin/activate", - "python weekly_collector.py --phase 1 2>&1 | tee /var/log/data-phase1.log", - "echo \"EXIT_CODE=$?\"" + "python weekly_collector.py --phase 1 2>&1 | tee /var/log/data-phase1.log" ], "executionTimeout": ["1800"] }, @@ -106,11 +107,12 @@ "InstanceIds.$": "$.ec2_instance_id", "Parameters": { "commands": [ + "set -eo pipefail", "cd /home/ec2-user/alpha-engine-data", + "git pull --ff-only origin main", "set -a && source /home/ec2-user/.alpha-engine.env && set +a", "source .venv/bin/activate", - "bash rag/pipelines/run_weekly_ingestion.sh 2>&1 | tee /var/log/rag-ingestion.log", - "echo \"EXIT_CODE=$?\"" + "bash rag/pipelines/run_weekly_ingestion.sh 2>&1 | tee 
/var/log/rag-ingestion.log" ], "executionTimeout": ["1800"] }, @@ -279,10 +281,11 @@ "InstanceIds.$": "$.ec2_instance_id", "Parameters": { "commands": [ + "set -eo pipefail", "cd /home/ec2-user/alpha-engine-predictor", + "git pull --ff-only origin main", "export HOME=/home/ec2-user", "set -a && source /home/ec2-user/.alpha-engine.env && set +a", - "set -o pipefail", "bash infrastructure/spot_train.sh --full-only 2>&1 | tee /var/log/predictor-training.log" ], "executionTimeout": ["5400"] @@ -374,9 +377,11 @@ "InstanceIds.$": "$.ec2_instance_id", "Parameters": { "commands": [ + "set -eo pipefail", "export HOME=/home/ec2-user", + "cd /home/ec2-user/alpha-engine-data && git pull --ff-only origin main", + "cd /home/ec2-user/alpha-engine-predictor && git pull --ff-only origin main", "set -a && source /home/ec2-user/.alpha-engine.env && set +a", - "set -o pipefail", "export PYTHONPATH=/home/ec2-user/alpha-engine-predictor", "/home/ec2-user/alpha-engine-data/.venv/bin/python -m monitoring.drift_detector --alert 2>&1 | tee /var/log/drift-detection.log" ], @@ -406,10 +411,11 @@ "InstanceIds.$": "$.ec2_instance_id", "Parameters": { "commands": [ + "set -eo pipefail", "cd /home/ec2-user/alpha-engine-backtester", + "git pull --ff-only origin main", "export HOME=/home/ec2-user", "set -a && source /home/ec2-user/.alpha-engine.env && set +a", - "set -o pipefail", "bash infrastructure/spot_backtest.sh 2>&1 | tee /var/log/backtester.log" ], "executionTimeout": ["7200"] @@ -501,7 +507,9 @@ "InstanceIds.$": "$.ec2_instance_id", "Parameters": { "commands": [ + "set -eo pipefail", "cd /home/ec2-user/alpha-engine-data", + "git pull --ff-only origin main", "source .venv/bin/activate", "python health_checker.py --alert 2>&1 | tee /var/log/health-check.log" ],