From 153475d4a5afbd4d77ff56007384c10f8b7b8ed7 Mon Sep 17 00:00:00 2001 From: Brian McMahon Date: Wed, 27 May 2026 11:56:14 -0700 Subject: [PATCH] =?UTF-8?q?feat(spot-train):=20wire=20L394=20diagnostics-w?= =?UTF-8?q?rite=20flags=20+=20bump=20lib=20v0.33.0=20=E2=86=92=20v0.39.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit L394 cascade C of 3 (sibling to alpha-engine-data #334 cascade A + alpha-engine-backtester #254 cascade B). Activates the lib v0.39.0 ssm_dispatcher diagnostics-write substrate for ae-predictor's spot SSM dispatch — on terminal non-Success the lib CLI writes a JSON failure record (status + command_id + 4KB stdout/stderr tails + instance_id) to s3://${S3_BUCKET}/_spot_diagnostics/ae-predictor/{YYYY-MM-DD}.json. Best-effort posture inside the lib — S3 failure swallowed; inner SSM exit always preserved. Substrate is failure-only (no-op on Success). Changes: - requirements.txt + requirements-lambda.txt: lib pin v0.33.0 → v0.39.0 in lockstep. Carries forward 6 intervening lib substrate bumps (v0.34 LLMJudgeReranker deletion, v0.35 ssm_dispatcher lift, v0.36 Option-D execution-picker, v0.37 anthropic_payload chokepoint, v0.38 universe_writer_lock + PyPI summary guard, v0.39 ssm_dispatcher diagnostics-write). - spot_train.sh::run_ssm: append --diagnostics-bucket $S3_BUCKET + --diagnostics-prefix _spot_diagnostics/ae-predictor to the lib CLI invocation. Per-repo subprefix discriminates cascade A (ae-data) + cascade B (ae-backtester) sibling writes — lib's {date}.json key shape would otherwise clobber within a shared prefix. - tests/test_spot_train_ssm_lib_chokepoint.py: new test_run_ssm_passes_diagnostics_flags pins both flags + per-repo subprefix. 
Composes with the existing predictor manifest.peak_rss_mb instrumentation (PR #193) — peak_rss covers the OOM-class on EVERY run (success or failure); diagnostics-write covers the broader class (non-OOM terminal failures: missing S3 input, IAM drift, ArcticDB connect timeout) failure-only. Suite 1210 → 1211. Co-Authored-By: Claude Opus 4.7 (1M context) --- infrastructure/spot_train.sh | 11 +++++++ requirements-lambda.txt | 2 +- requirements.txt | 2 +- tests/test_spot_train_ssm_lib_chokepoint.py | 32 +++++++++++++++++++++ 4 files changed, 45 insertions(+), 2 deletions(-) diff --git a/infrastructure/spot_train.sh b/infrastructure/spot_train.sh index 59e4516..8ca926f 100755 --- a/infrastructure/spot_train.sh +++ b/infrastructure/spot_train.sh @@ -214,6 +214,15 @@ done # reads it verbatim (no command-substitution scanning) — matching the # pattern alpha-engine-data PR 2 (#330) and alpha-engine-backtester # PR 3 (#251) adopted for their migrations. +# L394 cascade: --diagnostics-bucket + --diagnostics-prefix activate the +# lib v0.39.0 chokepoint that writes a JSON failure record (status + +# command_id + 4KB stdout/stderr tails + instance_id) to +# s3://${S3_BUCKET}/_spot_diagnostics/ae-predictor/{YYYY-MM-DD}.json on +# terminal non-Success. Best-effort write inside the lib — S3 failure +# swallowed; inner SSM exit always preserved. Substrate is failure-only +# (no-op on Success). Per-repo subprefix discriminates cascade A +# (ae-data) + cascade B (ae-backtester) sibling writes — lib's +# {date}.json key shape would otherwise clobber within a shared prefix. 
run_ssm() { local description="$1" script="$2" timeout_s="${3:-3600}" printf '%s' "$script" | "$LIB_PYTHON" -m alpha_engine_lib.ssm_dispatcher run \ @@ -223,6 +232,8 @@ run_ssm() { --output-bucket "$S3_BUCKET" \ --output-key-prefix "${S3_STAGING_PREFIX}/ssm-output" \ --region "$AWS_REGION" \ + --diagnostics-bucket "$S3_BUCKET" \ + --diagnostics-prefix "_spot_diagnostics/ae-predictor" \ --script-stdin } diff --git a/requirements-lambda.txt b/requirements-lambda.txt index 4787727..86e2430 100644 --- a/requirements-lambda.txt +++ b/requirements-lambda.txt @@ -5,7 +5,7 @@ # (root file used by training EC2 + local dev; this file used by the # Dockerfile in the Lambda image). Drift between them shipped a stale # lib to prod on 2026-05-07. -alpha-engine-lib[arcticdb,flow_doctor] @ git+https://github.com/cipher813/alpha-engine-lib@v0.33.0 +alpha-engine-lib[arcticdb,flow_doctor] @ git+https://github.com/cipher813/alpha-engine-lib@v0.39.0 numpy>=1.24.0,<2 pandas>=2.0.0 pyarrow>=14.0.0 diff --git a/requirements.txt b/requirements.txt index 38cd6b3..a7aa467 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,7 +14,7 @@ scikit-learn>=1.3.0 # Shared lib (used by inference + training entrypoints). Public since # 2026-05-03 — pip installs from git+https with no auth. 
-alpha-engine-lib[arcticdb,flow_doctor] @ git+https://github.com/cipher813/alpha-engine-lib@v0.33.0 +alpha-engine-lib[arcticdb,flow_doctor] @ git+https://github.com/cipher813/alpha-engine-lib@v0.39.0 # Training dependencies (not needed in Lambda) arcticdb>=6.11 diff --git a/tests/test_spot_train_ssm_lib_chokepoint.py b/tests/test_spot_train_ssm_lib_chokepoint.py index b1fd6ba..1537c21 100644 --- a/tests/test_spot_train_ssm_lib_chokepoint.py +++ b/tests/test_spot_train_ssm_lib_chokepoint.py @@ -93,6 +93,38 @@ def test_run_ssm_uses_lib_dispatcher(): ) +def test_run_ssm_passes_diagnostics_flags(): + """L394 cascade — ``run_ssm`` MUST pass both ``--diagnostics-bucket`` + and ``--diagnostics-prefix`` so terminal non-Success in any spot + SSM step writes a JSON failure record to + ``s3://${S3_BUCKET}/_spot_diagnostics/ae-predictor/{date}.json`` per + the lib v0.39.0 contract. Both flags must be present — lib's partial- + config guard makes a missing flag a silent no-op.""" + text = _SCRIPT.read_text() + m = re.search(r"^run_ssm\(\)\s*\{.*?^\}", text, re.MULTILINE | re.DOTALL) + assert m, "no run_ssm() helper found in spot_train.sh" + body = m.group(0) + assert "--diagnostics-bucket" in body, ( + "spot_train.sh run_ssm() does not pass --diagnostics-bucket to " + "the lib CLI. L394 cascade requires both --diagnostics-bucket " + "and --diagnostics-prefix together; without --diagnostics-bucket " + "the lib's partial-config guard makes the diagnostics-write a " + "silent no-op even on terminal non-Success." + ) + assert "--diagnostics-prefix" in body, ( + "spot_train.sh run_ssm() does not pass --diagnostics-prefix to " + "the lib CLI." + ) + # Per-repo subprefix discriminates cascade A (ae-data) + cascade B + # (ae-backtester) sibling writes — lib's {date}.json key shape would + # otherwise clobber within a shared prefix. 
+ assert "_spot_diagnostics/ae-predictor" in body, ( + "spot_train.sh --diagnostics-prefix must scope to " + "_spot_diagnostics/ae-predictor so ae-data + ae-backtester " + "cascade siblings write to disjoint S3 namespaces." + ) + + def test_no_inline_aws_ssm_send_command(): """The script MUST NOT call ``aws ssm send-command`` directly in a non-comment line. That's the pre-lift pattern L342 explicitly