diff --git a/infrastructure/spot_train.sh b/infrastructure/spot_train.sh
index cc99a35..39b4959 100755
--- a/infrastructure/spot_train.sh
+++ b/infrastructure/spot_train.sh
@@ -16,6 +16,11 @@
 #   ./infrastructure/spot_train.sh                     # smoke (dry_run) then full
 #   ./infrastructure/spot_train.sh --full-only         # full training only (Saturday SF)
 #   ./infrastructure/spot_train.sh --smoke-only        # smoke only, then terminate
+#   ./infrastructure/spot_train.sh --preflight-only    # boot + import/lib-pin +
+#                                                      # ArcticDB connectivity probe,
+#                                                      # then exit 0 — NO training,
+#                                                      # NO promotion, ZERO S3/config writes
+#                                                      # outside staging (Friday shell_run dry path)
 #   ./infrastructure/spot_train.sh --instance-type c5.2xlarge  # override type
 #
 # Prerequisites:
@@ -30,6 +35,9 @@
 #   2. Wait for the SSM agent to register (no SSH)
 #   3. Stage config/predictor.yaml to S3; spot bootstraps + fetches it
 #   4. Run smoke (dry_run=True), then full training (dry_run=False)
+#      — OR, under --preflight-only, run the import/lib-pin + read-only
+#      ArcticDB connectivity probe and exit 0 (no training, no promotion,
+#      no S3/config writes outside staging; Friday shell_run dry path)
 #   5. Terminate the spot instance + clean the S3 staging prefix
 #
 # Rollback: `git revert` this commit restores the SSH/SCP script. Port 22
@@ -68,11 +76,15 @@ IAM_PROFILE="alpha-engine-executor-profile"
 REPO_URL="https://github.com/cipher813/alpha-engine-predictor.git"  # public repo, no auth
 
 # Parse flags
-MODE="both"  # both | full-only | smoke-only
+MODE="both"  # both | full-only | smoke-only | preflight-only
 while [ $# -gt 0 ]; do
     case "$1" in
         --full-only) MODE="full-only" ;;
         --smoke-only) MODE="smoke-only" ;;
+        --preflight-only) MODE="preflight-only" ;;
         --instance-type) shift; INSTANCE_TYPE="$1" ;;
+        # Reject unknown flags: a mistyped --preflight-only must not silently
+        # fall through to MODE="both" (real training + promotion).
+        *) echo "ERROR: unknown flag: $1" >&2; exit 2 ;;
     esac
     shift
@@ -278,6 +290,100 @@ $PIP list --format=columns | grep -iE 'numpy|pandas|lightgbm|scikit-learn|scipy|
 DEPS
 )" 900
 
+# ── Preflight-only (Friday shell_run dry path) ────────────────────────────────
+# Boot + lib-pin/import + read-only ArcticDB/universe-freshness probe, then
+# exit 0. This runs the SAME bootstrap+deps steps the real Saturday run uses
+# (so it catches lib-pin drift, sys.path breakage, image gaps, SSM timeouts,
+# empty/unreachable ArcticDB) but stops HERE — before the smoke step and
+# before the full-training step.
+#
+# Hard invariant under this mode:
+#   • run_meta_training() is NEVER invoked → NO model training, NO walk-forward.
+#   • The `if not dry_run:` upload/promote block in meta_trainer.py is never
+#     reached → NO weights/meta/* write, NO manifest, NO dated archive.
+#   • train_handler.main()'s training_summary / triple-barrier-gate / email /
+#     health-status writes are never reached (they live after run_meta_training).
+#   • The probe imports the training package + runs TrainingPreflight (env +
+#     S3-bucket *reachability* check — no object writes) + a read-only
+#     ArcticDB `list_symbols()` / latest-index probe (logged, not gated).
+#     No put_object, no config write, no external API (yfinance/Anthropic) call.
+# The `exit 0` is a clean dispatcher exit; `trap cleanup EXIT` still fires
+# (terminates the spot, clears the S3 staging prefix — staging cleanup only).
+if [ "$MODE" = "preflight-only" ]; then
+    echo ""
+    echo "═══════════════════════════════════════════════════════════════"
+    echo " PREFLIGHT-ONLY (no training, no promotion, no writes)"
+    echo "═══════════════════════════════════════════════════════════════"
+    run_ssm "preflight-only" "$(cat <<'PREFLIGHT'
+set -eo pipefail
+export HOME=/home/ec2-user XDG_CACHE_HOME=/tmp AWS_REGION=us-east-1 AWS_DEFAULT_REGION=us-east-1
+cd /home/ec2-user/predictor
+command -v python3.12 >/dev/null && PY=python3.12 || PY=python3
+$PY - <<'PYEOF'
+import os, sys
+sys.path.insert(0, '.')
+os.environ.setdefault('S3_BUCKET', 'alpha-engine-research')
+bucket = os.environ['S3_BUCKET']
+
+import logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)-8s %(message)s')
+log = logging.getLogger('preflight-only')
+
+# 1. Import the training package (catches sys.path / lib-pin / image gaps).
+#    Importing train_handler transitively imports the lib + training stack
+#    WITHOUT invoking main(), so no training runs.
+log.info('[1/3] Importing training package...')
+import alpha_engine_lib  # lib-pin presence (version asserted by requirements.txt pin)
+from training import train_handler  # noqa: F401 (import-only; main() NOT called)
+from training.preflight import TrainingPreflight
+log.info(' OK — alpha_engine_lib + training.train_handler import clean')
+
+# 2. Reuse the EXISTING training preflight (env vars + S3 bucket
+#    *reachability*; check_s3_bucket is a read/head, no object write).
+log.info('[2/3] Running TrainingPreflight (env + S3 connectivity)...')
+TrainingPreflight(bucket=bucket).run()
+log.info(' OK — env vars present, S3 bucket reachable')
+
+# 3. Read-only ArcticDB connectivity + universe-freshness probe.
+#    list_symbols() + a single read().tail(1) — NO download_from_arctic(),
+#    NO parquet writes, NO training array build. Mirrors the connectivity the
+#    real run depends on; the latest index is logged, not asserted.
+log.info('[3/3] ArcticDB connectivity + universe-freshness probe...')
+from store.arctic_reader import _get_arctic
+arctic = _get_arctic(bucket)
+universe = arctic.get_library('universe')
+symbols = universe.list_symbols()
+n = len(symbols)
+if n == 0:
+    raise RuntimeError(
+        'ArcticDB universe library is empty/unreachable — '
+        'Saturday DataPhase1 + weekly backfill have not run cleanly.'
+    )
+probe = min(symbols)  # first symbol alphabetically; symbols is non-empty here
+df_tail = universe.read(probe).data.tail(1)
+latest = df_tail.index.max() if not df_tail.empty else 'n/a'
+log.info(' OK — universe has %d symbols; %s latest index=%s', n, probe, latest)
+
+print()
+print('=' * 60)
+print(' PREFLIGHT-ONLY RESULT: PASS')
+print('=' * 60)
+print(' Imports: alpha_engine_lib + training stack clean')
+print(' TrainingPreflight: PASS (env + S3 reachable)')
+print(f' ArcticDB: {n} universe symbols (probe {probe} latest={latest})')
+print(' Training: SKIPPED (no run_meta_training call)')
+print(' Promotion: SKIPPED (no weights/meta write)')
+print(' S3/config writes: NONE')
+print('=' * 60)
+PYEOF
+PREFLIGHT
+)" 600
+    echo ""
+    echo "==> Preflight-only mode — PASS. No training, no promotion, no writes."
+    echo " Exiting 0 BEFORE smoke + full-training steps."
+    exit 0
+fi
+
 # ── Smoke test (dry_run=True) ─────────────────────────────────────────────────
 if [ "$MODE" != "full-only" ]; then
     echo ""
diff --git a/tests/test_spot_train_preflight_only.py b/tests/test_spot_train_preflight_only.py
new file mode 100644
index 0000000..6961e5d
--- /dev/null
+++ b/tests/test_spot_train_preflight_only.py
@@ -0,0 +1,127 @@
+"""Pins spot_train.sh `--preflight-only` to the Friday shell_run dry-path
+hard invariant: boot + import/lib-pin + read-only ArcticDB connectivity
+probe, then `exit 0` BEFORE the smoke step and BEFORE the full-training
+step — with NO model training, NO weight promotion, and ZERO S3/config
+writes.
+
+Owed-item #2 of ROADMAP "Friday shell-run — per-module dry-path
+activation" (P1). Static-analysis test (mirrors
+test_spot_train_aws_region.py) — the spot_train.sh path is SSM/EC2 and
+cannot be exercised in CI; these assertions guard the structural
+invariant against a future edit that would let preflight-only fall
+through into training.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+_SCRIPT = Path(__file__).resolve().parent.parent / "infrastructure" / "spot_train.sh"
+
+
+def _text() -> str:
+    """Return spot_train.sh decoded as UTF-8 (it has non-ASCII chars; don't trust the locale)."""
+    return _SCRIPT.read_text(encoding="utf-8")
+
+
+def test_spot_train_exists():
+    assert _SCRIPT.is_file()
+
+
+def test_preflight_only_flag_parses():
+    text = _text()
+    assert "--preflight-only) MODE=\"preflight-only\" ;;" in text, (
+        "--preflight-only flag not wired into the flag parser"
+    )
+
+
+def test_preflight_only_branch_exists_and_exits_zero():
+    text = _text()
+    assert 'if [ "$MODE" = "preflight-only" ]; then' in text, (
+        "no dedicated preflight-only branch found"
+    )
+    # The branch must terminate with `exit 0` (clean dispatcher exit;
+    # trap cleanup still terminates the spot + clears S3 staging).
+    branch = text.split('if [ "$MODE" = "preflight-only" ]; then', 1)[1]
+    branch = branch.split("# ── Smoke test", 1)[0]
+    assert "exit 0" in branch, "preflight-only branch must exit 0"
+
+
+def test_preflight_only_runs_before_smoke_and_full_training():
+    text = _text()
+    i_branch = text.index('if [ "$MODE" = "preflight-only" ]; then')
+    i_smoke = text.index("# ── Smoke test (dry_run=True)")
+    i_full = text.index("# ── Full training (dry_run=False)")
+    assert i_branch < i_smoke < i_full, (
+        "preflight-only branch must precede the smoke and full-training "
+        "sections so its exit 0 short-circuits before any training"
+    )
+
+
+def test_preflight_only_does_not_invoke_training_or_promotion():
+    """The preflight-only SSM step body must not call run_meta_training /
+    train_handler.main / dry_run training — those are the train+promote
+    entrypoints. It may only import the package + run TrainingPreflight +
+    the read-only ArcticDB probe."""
+    text = _text()
+    start = text.index('run_ssm "preflight-only"')
+    # End of the preflight-only run_ssm heredoc payload.
+    end = text.index("PREFLIGHT\n)\"", start)
+    payload = text[start:end]
+
+    # Forbidden as *executable references* (call/import forms), not as
+    # mentions inside the human-readable proof print() strings. We strip
+    # the print(...) / log.info(...) diagnostic lines first so the proof
+    # text ("no run_meta_training call") doesn't false-positive.
+    code_lines = [
+        ln
+        for ln in payload.splitlines()
+        if not ln.lstrip().startswith(("print(", "log.info(", "#"))
+    ]
+    code = "\n".join(code_lines)
+    forbidden = [
+        "run_meta_training(",
+        "import run_meta_training",
+        "train_main(",
+        "from training.train_handler import main",
+        "download_from_arctic",  # that is the training data DOWNLOAD, not a probe
+        "put_object",
+        "upload_file",
+    ]
+    for token in forbidden:
+        assert token not in code, (
+            f"preflight-only step must NOT reference {token!r} — "
+            f"it would break the no-train/no-promote/no-write invariant"
+        )
+
+    # Positive: it must do the reuse-not-rebuild things.
+    assert "TrainingPreflight" in payload, (
+        "preflight-only must reuse the existing training/preflight.py "
+        "TrainingPreflight (do not rebuild a parallel preflight)"
+    )
+    assert "import alpha_engine_lib" in payload, (
+        "preflight-only must import alpha_engine_lib to catch lib-pin drift"
+    )
+    assert "list_symbols()" in payload, (
+        "preflight-only must do a read-only ArcticDB universe probe"
+    )
+
+
+def test_preflight_only_step_keeps_aws_region_export():
+    """Same #247 regression guard as test_spot_train_aws_region.py — the
+    new step's env line must still export AWS_REGION/AWS_DEFAULT_REGION
+    (TrainingPreflight.check_env_vars('AWS_REGION') hard-requires it)."""
+    text = _text()
+    start = text.index('run_ssm "preflight-only"')
+    end = text.index("PREFLIGHT\n)\"", start)
+    payload = text[start:end]
+    export_lines = [
+        ln
+        for ln in payload.splitlines()
+        if ln.startswith("export HOME=/home/ec2-user XDG_CACHE_HOME=/tmp")
+    ]
+    assert export_lines, "preflight-only step missing its per-step export line"
+    for ln in export_lines:
+        assert "AWS_REGION=" in ln and "AWS_DEFAULT_REGION=" in ln, (
+            f"preflight-only export line missing AWS region vars: {ln!r}"
+        )