From 67b3d4b40ce0da91dd633f2a4725e16b18d28fb2 Mon Sep 17 00:00:00 2001
From: Brian McMahon
Date: Thu, 9 Apr 2026 08:05:53 -0700
Subject: [PATCH] feat: add RunMorningPlanner + RunDaemon to daily Step Function

Step Function previously ended at StartExecutorEC2 and relied on systemd
boot triggers to run the executor. This fails on re-runs when the instance
is already running (systemd only triggers on cold boot).

Added:
- WaitForInstanceReady (120s boot wait)
- RunMorningPlanner (SSM: executor/main.py with IB Gateway wait; pipefail
  so a planner failure is not masked by the tee to executor.log)
- WaitForMorningPlanner + CheckMorningPlannerStatus (SSM polling loop)
- RunDaemon (SSM: systemctl restart daemon)
- PipelineComplete (fallback if daemon restart fails)

Also fixed IAM: split SSM into SendCommand (scoped to both instances) and
GetCommandInvocation (requires Resource: *), matching earlier fix that was
only applied at runtime.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 infrastructure/deploy_step_function_daily.sh |  13 +-
 infrastructure/step_function_daily.json      | 135 ++++++++++++++++++-
 2 files changed, 144 insertions(+), 4 deletions(-)

diff --git a/infrastructure/deploy_step_function_daily.sh b/infrastructure/deploy_step_function_daily.sh
index 1467d6c..6aa7325 100755
--- a/infrastructure/deploy_step_function_daily.sh
+++ b/infrastructure/deploy_step_function_daily.sh
@@ -53,14 +53,21 @@ POLICY='{
       ]
     },
     {
-      "Sid": "SSMRunCommand",
+      "Sid": "SSMSendCommand",
       "Effect": "Allow",
-      "Action": ["ssm:SendCommand", "ssm:GetCommandInvocation"],
+      "Action": ["ssm:SendCommand"],
       "Resource": [
         "arn:aws:ssm:'"$REGION"'::document/AWS-RunShellScript",
-        "arn:aws:ec2:'"$REGION"':'"$ACCOUNT_ID"':instance/'"$MICRO_INSTANCE"'"
+        "arn:aws:ec2:'"$REGION"':'"$ACCOUNT_ID"':instance/'"$MICRO_INSTANCE"'",
+        "arn:aws:ec2:'"$REGION"':'"$ACCOUNT_ID"':instance/'"$TRADING_INSTANCE"'"
       ]
     },
+    {
+      "Sid": "SSMGetCommandInvocation",
+      "Effect": "Allow",
+      "Action": ["ssm:GetCommandInvocation"],
+      "Resource": "*"
+    },
     {
       "Sid": "EC2Start",
       "Effect": "Allow",
diff --git a/infrastructure/step_function_daily.json b/infrastructure/step_function_daily.json
index b815e99..61eb4a4 100644
--- a/infrastructure/step_function_daily.json
+++ b/infrastructure/step_function_daily.json
@@ -352,7 +352,7 @@
 
     "StartExecutorEC2": {
       "Type": "Task",
-      "Comment": "Start the trading EC2 instance for executor + daemon",
+      "Comment": "Start the trading EC2 instance (no-op if already running)",
       "Resource": "arn:aws:states:::aws-sdk:ec2:startInstances",
       "Parameters": {
         "InstanceIds.$": "$.trading_instance_id"
@@ -373,9 +373,142 @@
         }
       ],
       "ResultPath": "$.ec2_start_result",
+      "Next": "WaitForInstanceReady"
+    },
+
+    "WaitForInstanceReady": {
+      "Type": "Wait",
+      "Comment": "Wait for instance to boot + IB Gateway to authenticate (~90s cold boot, ~0s if already running)",
+      "Seconds": 120,
+      "Next": "RunMorningPlanner"
+    },
+
+    "RunMorningPlanner": {
+      "Type": "Task",
+      "Comment": "Run morning order-book planner via SSM; pipefail makes a planner crash fail the SSM command instead of being masked by tee's exit code",
+      "Resource": "arn:aws:states:::aws-sdk:ssm:sendCommand",
+      "Parameters": {
+        "DocumentName": "AWS-RunShellScript",
+        "InstanceIds.$": "$.trading_instance_id",
+        "Parameters": {
+          "commands": [
+            "cd /home/ec2-user/alpha-engine",
+            "set -a && source /home/ec2-user/.alpha-engine.env && set +a",
+            "source .venv/bin/activate",
+            "/home/ec2-user/alpha-engine/infrastructure/wait-for-ibgateway.sh",
+            "set -o pipefail; python executor/main.py 2>&1 | tee -a /var/log/executor.log"
+          ],
+          "executionTimeout": ["300"]
+        },
+        "TimeoutSeconds": 300
+      },
+      "TimeoutSeconds": 360,
+      "Retry": [
+        {
+          "ErrorEquals": ["States.TaskFailed"],
+          "MaxAttempts": 1,
+          "IntervalSeconds": 60,
+          "BackoffRate": 1.0
+        }
+      ],
+      "Catch": [
+        {
+          "ErrorEquals": ["States.ALL"],
+          "Next": "HandleFailure",
+          "ResultPath": "$.error"
+        }
+      ],
+      "ResultPath": "$.planner_result",
+      "Next": "WaitForMorningPlanner"
+    },
+
+    "WaitForMorningPlanner": {
+      "Type": "Task",
+      "Resource": "arn:aws:states:::aws-sdk:ssm:getCommandInvocation",
+      "Parameters": {
+        "CommandId.$": "$.planner_result.Command.CommandId",
+        "InstanceId.$": "$.trading_instance_id[0]"
+      },
+      "Retry": [
+        {
+          "ErrorEquals": ["Ssm.InvocationDoesNotExistException"],
+          "MaxAttempts": 10,
+          "IntervalSeconds": 10,
+          "BackoffRate": 1.5
+        }
+      ],
+      "Catch": [
+        {
+          "ErrorEquals": ["States.ALL"],
+          "Next": "HandleFailure",
+          "ResultPath": "$.error"
+        }
+      ],
+      "ResultPath": "$.planner_poll",
+      "Next": "CheckMorningPlannerStatus"
+    },
+
+    "CheckMorningPlannerStatus": {
+      "Type": "Choice",
+      "Choices": [
+        {
+          "Variable": "$.planner_poll.Status",
+          "StringEquals": "Success",
+          "Next": "RunDaemon"
+        },
+        {
+          "Variable": "$.planner_poll.Status",
+          "StringEquals": "InProgress",
+          "Next": "MorningPlannerWait"
+        },
+        {
+          "Variable": "$.planner_poll.Status",
+          "StringEquals": "Pending",
+          "Next": "MorningPlannerWait"
+        }
+      ],
+      "Default": "HandleFailure"
+    },
+
+    "MorningPlannerWait": {
+      "Type": "Wait",
+      "Seconds": 15,
+      "Next": "WaitForMorningPlanner"
+    },
+
+    "RunDaemon": {
+      "Type": "Task",
+      "Comment": "Restart the intraday daemon via SSM",
+      "Resource": "arn:aws:states:::aws-sdk:ssm:sendCommand",
+      "Parameters": {
+        "DocumentName": "AWS-RunShellScript",
+        "InstanceIds.$": "$.trading_instance_id",
+        "Parameters": {
+          "commands": [
+            "sudo systemctl restart alpha-engine-daemon.service",
+            "echo 'Daemon restarted'"
+          ],
+          "executionTimeout": ["30"]
+        },
+        "TimeoutSeconds": 30
+      },
+      "TimeoutSeconds": 60,
+      "Catch": [
+        {
+          "ErrorEquals": ["States.ALL"],
+          "Comment": "Daemon restart failure is non-fatal — systemd boot trigger is backup",
+          "Next": "PipelineComplete",
+          "ResultPath": "$.daemon_error"
+        }
+      ],
+      "ResultPath": "$.daemon_result",
       "End": true
     },
 
+    "PipelineComplete": {
+      "Type": "Succeed"
+    },
+
     "HandleFailure": {
       "Type": "Task",
       "Comment": "Failure alert via SNS",