Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions infrastructure/spot_train.sh
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,15 @@ done
# reads it verbatim (no command-substitution scanning) — matching the
# pattern alpha-engine-data PR 2 (#330) and alpha-engine-backtester
# PR 3 (#251) adopted for their migrations.
# L394 cascade: --diagnostics-bucket + --diagnostics-prefix activate the
# lib v0.39.0 chokepoint that writes a JSON failure record (status +
# command_id + 4KB stdout/stderr tails + instance_id) to
# s3://${S3_BUCKET}/_spot_diagnostics/ae-predictor/{YYYY-MM-DD}.json on
# terminal non-Success. Best-effort write inside the lib — S3 failure
# swallowed; inner SSM exit always preserved. Substrate is failure-only
# (no-op on Success). The per-repo subprefix keeps this repo's records
# apart from the cascade A (ae-data) and cascade B (ae-backtester)
# sibling writes — the lib's {date}.json key shape would otherwise
# clobber them within a shared prefix.
run_ssm() {
local description="$1" script="$2" timeout_s="${3:-3600}"
printf '%s' "$script" | "$LIB_PYTHON" -m alpha_engine_lib.ssm_dispatcher run \
Expand All @@ -223,6 +232,8 @@ run_ssm() {
--output-bucket "$S3_BUCKET" \
--output-key-prefix "${S3_STAGING_PREFIX}/ssm-output" \
--region "$AWS_REGION" \
--diagnostics-bucket "$S3_BUCKET" \
--diagnostics-prefix "_spot_diagnostics/ae-predictor" \
--script-stdin
}

Expand Down
2 changes: 1 addition & 1 deletion requirements-lambda.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
# (root file used by training EC2 + local dev; this file used by the
# Dockerfile in the Lambda image). Drift between them shipped a stale
# lib to prod on 2026-05-07.
alpha-engine-lib[arcticdb,flow_doctor] @ git+https://github.com/cipher813/alpha-engine-lib@v0.33.0
alpha-engine-lib[arcticdb,flow_doctor] @ git+https://github.com/cipher813/alpha-engine-lib@v0.39.0
numpy>=1.24.0,<2
pandas>=2.0.0
pyarrow>=14.0.0
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ scikit-learn>=1.3.0

# Shared lib (used by inference + training entrypoints). Public since
# 2026-05-03 — pip installs from git+https with no auth.
alpha-engine-lib[arcticdb,flow_doctor] @ git+https://github.com/cipher813/alpha-engine-lib@v0.33.0
alpha-engine-lib[arcticdb,flow_doctor] @ git+https://github.com/cipher813/alpha-engine-lib@v0.39.0

# Training dependencies (not needed in Lambda)
arcticdb>=6.11
Expand Down
32 changes: 32 additions & 0 deletions tests/test_spot_train_ssm_lib_chokepoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,38 @@ def test_run_ssm_uses_lib_dispatcher():
)


def test_run_ssm_passes_diagnostics_flags():
    """Guard the L394 cascade wiring inside ``run_ssm``.

    The helper has to hand the lib CLI *both* ``--diagnostics-bucket``
    and ``--diagnostics-prefix``; per the lib v0.39.0 contract that is
    what makes a terminal non-Success write a JSON failure record under
    ``s3://${S3_BUCKET}/_spot_diagnostics/ae-predictor/{date}.json``.
    The lib's partial-config guard turns a missing flag into a silent
    no-op, so each flag is checked on its own, followed by the
    repo-scoped prefix value.
    """
    # Non-greedy match from the ``run_ssm() {`` opener to the first ``}``
    # that sits at the start of a line, i.e. the helper's closing brace.
    helper_pattern = re.compile(
        r"^run_ssm\(\)\s*\{.*?^\}", re.MULTILINE | re.DOTALL
    )
    found = helper_pattern.search(_SCRIPT.read_text())
    assert found, "no run_ssm() helper found in spot_train.sh"
    helper_src = found.group(0)

    # (substring that must appear in the helper, failure message), checked
    # in order so the first missing piece is the one reported.
    expectations = (
        (
            "--diagnostics-bucket",
            "spot_train.sh run_ssm() does not pass --diagnostics-bucket to "
            "the lib CLI. L394 cascade requires both --diagnostics-bucket "
            "and --diagnostics-prefix together; without --diagnostics-bucket "
            "the lib's partial-config guard makes the diagnostics-write a "
            "silent no-op even on terminal non-Success.",
        ),
        (
            "--diagnostics-prefix",
            "spot_train.sh run_ssm() does not pass --diagnostics-prefix to "
            "the lib CLI.",
        ),
        # The per-repo subprefix keeps this repo's records apart from the
        # cascade A (ae-data) and cascade B (ae-backtester) sibling writes;
        # the lib's {date}.json key shape would otherwise clobber within a
        # shared prefix.
        (
            "_spot_diagnostics/ae-predictor",
            "spot_train.sh --diagnostics-prefix must scope to "
            "_spot_diagnostics/ae-predictor so ae-data + ae-backtester "
            "cascade siblings write to disjoint S3 namespaces.",
        ),
    )
    for needle, complaint in expectations:
        assert needle in helper_src, complaint


def test_no_inline_aws_ssm_send_command():
"""The script MUST NOT call ``aws ssm send-command`` directly in a
non-comment line. That's the pre-lift pattern L342 explicitly
Expand Down
Loading