Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 104 additions & 1 deletion infrastructure/spot_train.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@
# ./infrastructure/spot_train.sh # smoke (dry_run) then full
# ./infrastructure/spot_train.sh --full-only # full training only (Saturday SF)
# ./infrastructure/spot_train.sh --smoke-only # smoke only, then terminate
# ./infrastructure/spot_train.sh --preflight-only # boot + import/lib-pin +
# # ArcticDB connectivity probe,
# # then exit 0 — NO training,
# # NO promotion, ZERO S3/config
# # writes (Friday shell_run dry path)
# ./infrastructure/spot_train.sh --instance-type c5.2xlarge # override type
#
# Prerequisites:
Expand All @@ -30,6 +35,9 @@
# 2. Wait for the SSM agent to register (no SSH)
# 3. Stage config/predictor.yaml to S3; spot bootstraps + fetches it
# 4. Run smoke (dry_run=True), then full training (dry_run=False)
# — OR, under --preflight-only, run the import/lib-pin + read-only
# ArcticDB connectivity probe and exit 0 (no training, no promotion,
# no S3/config writes; Friday shell_run dry path)
# 5. Terminate the spot instance + clean the S3 staging prefix
#
# Rollback: `git revert` this commit restores the SSH/SCP script. Port 22
Expand Down Expand Up @@ -68,11 +76,12 @@ IAM_PROFILE="alpha-engine-executor-profile"
REPO_URL="https://github.com/cipher813/alpha-engine-predictor.git" # public repo, no auth

# Parse flags
MODE="both" # both | full-only | smoke-only
MODE="both" # both | full-only | smoke-only | preflight-only
while [ $# -gt 0 ]; do
case "$1" in
--full-only) MODE="full-only" ;;
--smoke-only) MODE="smoke-only" ;;
--preflight-only) MODE="preflight-only" ;;
--instance-type) shift; INSTANCE_TYPE="$1" ;;
esac
shift
Expand Down Expand Up @@ -278,6 +287,100 @@ $PIP list --format=columns | grep -iE 'numpy|pandas|lightgbm|scikit-learn|scipy|
DEPS
)" 900

# ── Preflight-only (Friday shell_run dry path) ────────────────────────────────
# Boot + lib-pin/import + read-only ArcticDB/universe-freshness probe, then
# exit 0. This runs the SAME bootstrap+deps steps the real Saturday run uses
# (so it catches lib-pin drift, sys.path breakage, image gaps, SSM timeouts,
# and an empty/unreachable ArcticDB universe; the latest index of one probe
# symbol is logged for manual staleness review, not asserted) but stops HERE
# — before the smoke step and before the full-training step.
#
# Hard invariant under this mode:
# • run_meta_training() is NEVER invoked → NO model training, NO walk-forward.
# • The `if not dry_run:` upload/promote block in meta_trainer.py is never
# reached → NO weights/meta/* write, NO manifest, NO dated archive.
# • train_handler.main()'s training_summary / triple-barrier-gate / email /
# health-status writes are never reached (they live after run_meta_training).
# • The probe imports the training package + runs TrainingPreflight (env +
# S3-bucket *reachability* check — no object writes) + a read-only
# ArcticDB `list_symbols()` / latest-index probe. No put_object, no
# config write, no external API (yfinance/Anthropic) call.
# The `exit 0` is a clean dispatcher exit; `trap cleanup EXIT` still fires
# (terminates the spot, clears the S3 staging prefix — staging cleanup only).
if [ "$MODE" = "preflight-only" ]; then
    # Preflight-only dry path: run one SSM step that imports the training
    # stack, runs TrainingPreflight and does a read-only ArcticDB probe, then
    # exit 0 so the smoke and full-training sections below are never reached.
    echo ""
    echo "═══════════════════════════════════════════════════════════════"
    echo " PREFLIGHT-ONLY (no training, no promotion, no writes)"
    echo "═══════════════════════════════════════════════════════════════"
    # Both heredocs are quoted ('PREFLIGHT' / 'PYEOF'), so nothing in the
    # payload is expanded locally; their bodies and terminators must stay at
    # column 0. The trailing 600 is passed to run_ssm as its third argument.
    run_ssm "preflight-only" "$(cat <<'PREFLIGHT'
set -eo pipefail
export HOME=/home/ec2-user XDG_CACHE_HOME=/tmp AWS_REGION=us-east-1 AWS_DEFAULT_REGION=us-east-1
cd /home/ec2-user/predictor
command -v python3.12 >/dev/null && PY=python3.12 || PY=python3
$PY - <<'PYEOF'
import os
import sys
sys.path.insert(0, '.')
# setdefault keeps S3_BUCKET present in the process environment (falling back
# to the default bucket when unset) and returns the effective value.
bucket = os.environ.setdefault('S3_BUCKET', 'alpha-engine-research')

import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)-8s %(message)s')
log = logging.getLogger('preflight-only')

# 1. Import the training package (catches sys.path / lib-pin / image gaps).
#    Importing train_handler transitively imports the lib + training stack
#    WITHOUT invoking main(), so no training runs.
log.info('[1/3] Importing training package...')
import alpha_engine_lib  # lib-pin presence (version asserted by requirements.txt pin)
from training import train_handler  # noqa: F401 (import-only; main() NOT called)
from training.preflight import TrainingPreflight
log.info(' OK — alpha_engine_lib + training.train_handler import clean')

# 2. Reuse the EXISTING training preflight (env vars + S3 bucket
#    *reachability*; check_s3_bucket is a read/head, no object write).
log.info('[2/3] Running TrainingPreflight (env + S3 connectivity)...')
TrainingPreflight(bucket=bucket).run()
log.info(' OK — env vars present, S3 bucket reachable')

# 3. Read-only ArcticDB connectivity + universe-freshness probe.
#    list_symbols() + a single read().tail(1) — NO download_from_arctic(),
#    NO parquet writes, NO training array build. Mirrors the connectivity
#    the real run depends on without doing any work.
log.info('[3/3] ArcticDB connectivity + universe-freshness probe...')
from store.arctic_reader import _get_arctic
arctic = _get_arctic(bucket)
universe = arctic.get_library('universe')
symbols = universe.list_symbols()
n = len(symbols)
if n == 0:
    raise RuntimeError(
        'ArcticDB universe library is empty/unreachable — '
        'Saturday DataPhase1 + weekly backfill have not run cleanly.'
    )
# Deterministic probe symbol: the lexicographically smallest one.
probe = min(symbols)
df_tail = universe.read(probe).data.tail(1)
latest = df_tail.index.max() if not df_tail.empty else 'n/a'
log.info(' OK — universe has %d symbols; %s latest index=%s', n, probe, latest)

print()
print('=' * 60)
print(' PREFLIGHT-ONLY RESULT: PASS')
print('=' * 60)
print(' Imports: alpha_engine_lib + training stack clean')
print(' TrainingPreflight: PASS (env + S3 reachable)')
print(f' ArcticDB: {n} universe symbols (probe {probe} latest={latest})')
print(' Training: SKIPPED (no run_meta_training call)')
print(' Promotion: SKIPPED (no weights/meta write)')
print(' S3/config writes: NONE')
print('=' * 60)
PYEOF
PREFLIGHT
)" 600
    echo ""
    echo "==> Preflight-only mode — PASS. No training, no promotion, no writes."
    echo " Exiting 0 BEFORE smoke + full-training steps."
    # Clean exit before the smoke / full-training sections further down.
    exit 0
fi

# ── Smoke test (dry_run=True) ─────────────────────────────────────────────────
if [ "$MODE" != "full-only" ]; then
echo ""
Expand Down
126 changes: 126 additions & 0 deletions tests/test_spot_train_preflight_only.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
"""Pins spot_train.sh `--preflight-only` to the Friday shell_run dry-path
hard invariant: boot + import/lib-pin + read-only ArcticDB connectivity
probe, then `exit 0` BEFORE the smoke step and BEFORE the full-training
step — with NO model training, NO weight promotion, and ZERO S3/config
writes.

Owed-item #2 of ROADMAP "Friday shell-run — per-module dry-path
activation" (P1). Static-analysis test (mirrors
test_spot_train_aws_region.py) — the spot_train.sh path is SSM/EC2 and
cannot be exercised in CI; these assertions guard the structural
invariant against a future edit that would let preflight-only fall
through into training.
"""

from __future__ import annotations

from pathlib import Path

_SCRIPT = Path(__file__).resolve().parent.parent / "infrastructure" / "spot_train.sh"


def _text() -> str:
    """Return the full contents of spot_train.sh as text.

    The file is decoded explicitly as UTF-8: the script contains non-ASCII
    characters (box-drawing rules, em dashes), so relying on the locale's
    default encoding could raise or mis-decode on a non-UTF-8 platform.
    """
    return _SCRIPT.read_text(encoding="utf-8")


def test_spot_train_exists():
    """The script under test must be present at its expected path."""
    script_present = _SCRIPT.is_file()
    assert script_present


def test_preflight_only_flag_parses():
    """The flag parser must carry a case arm mapping --preflight-only to
    MODE="preflight-only"."""
    case_arm = '--preflight-only) MODE="preflight-only" ;;'
    script = _text()
    assert case_arm in script, "--preflight-only flag not wired into the flag parser"


def test_preflight_only_branch_exists_and_exits_zero():
    """A dedicated preflight-only branch must exist and contain `exit 0`
    somewhere before the smoke-test section header."""
    guard = 'if [ "$MODE" = "preflight-only" ]; then'
    script = _text()
    assert guard in script, "no dedicated preflight-only branch found"

    # Isolate the text between the branch guard and the smoke-test header;
    # the `exit 0` (clean dispatcher exit — trap cleanup still terminates
    # the spot + clears S3 staging) must live inside that window.
    _, _, after_guard = script.partition(guard)
    branch_body, _, _ = after_guard.partition("# ── Smoke test")
    assert "exit 0" in branch_body, "preflight-only branch must exit 0"


def test_preflight_only_runs_before_smoke_and_full_training():
    """The preflight-only branch must appear textually before the smoke
    section, which in turn must appear before the full-training section."""
    script = _text()
    markers = (
        'if [ "$MODE" = "preflight-only" ]; then',
        "# ── Smoke test (dry_run=True)",
        "# ── Full training (dry_run=False)",
    )
    # str.index raises ValueError if any marker is missing from the script.
    branch_at, smoke_at, full_at = (script.index(marker) for marker in markers)
    assert branch_at < smoke_at and smoke_at < full_at, (
        "preflight-only branch must precede the smoke and full-training "
        "sections so its exit 0 short-circuits before any training"
    )


def test_preflight_only_does_not_invoke_training_or_promotion():
    """Guard the no-train / no-promote / no-write invariant of the
    preflight-only SSM step.

    The step body may import the training package, run TrainingPreflight
    and do the read-only ArcticDB probe; it must not reference the
    train+promote entrypoints, the training data download, or S3 write
    calls in executable lines.
    """
    script = _text()
    begin = script.index('run_ssm "preflight-only"')
    # The payload ends at the heredoc terminator followed by the closing `)"`.
    stop = script.index('PREFLIGHT\n)"', begin)
    step = script[begin:stop]

    # Diagnostic and comment lines are dropped before scanning so that the
    # human-readable proof text ("no run_meta_training call") cannot trip
    # the forbidden-token check; only executable references count.
    diagnostic_prefixes = ("print(", "log.info(", "#")
    executable = "\n".join(
        line
        for line in step.splitlines()
        if not line.lstrip().startswith(diagnostic_prefixes)
    )

    banned = (
        "run_meta_training(",
        "import run_meta_training",
        "train_main(",
        "from training.train_handler import main",
        "download_from_arctic",  # the training data DOWNLOAD, not a probe
        "put_object",
        "upload_file",
    )
    for needle in banned:
        assert needle not in executable, (
            f"preflight-only step must NOT reference {needle!r} — "
            f"it would break the no-train/no-promote/no-write invariant"
        )

    # Positive checks: the step must reuse the existing building blocks.
    required = (
        (
            "TrainingPreflight",
            "preflight-only must reuse the existing training/preflight.py "
            "TrainingPreflight (do not rebuild a parallel preflight)",
        ),
        (
            "import alpha_engine_lib",
            "preflight-only must import alpha_engine_lib to catch lib-pin drift",
        ),
        (
            "list_symbols()",
            "preflight-only must do a read-only ArcticDB universe probe",
        ),
    )
    for needle, why in required:
        assert needle in step, why


def test_preflight_only_step_keeps_aws_region_export():
    """#247 regression guard, as in test_spot_train_aws_region.py: the
    preflight-only step's per-step export line must still set both
    AWS_REGION and AWS_DEFAULT_REGION
    (TrainingPreflight.check_env_vars('AWS_REGION') hard-requires it)."""
    script = _text()
    begin = script.index('run_ssm "preflight-only"')
    stop = script.index('PREFLIGHT\n)"', begin)
    step = script[begin:stop]

    # Heredoc lines sit at column 0, so the prefix match is deliberately
    # done without stripping leading whitespace.
    export_prefix = "export HOME=/home/ec2-user XDG_CACHE_HOME=/tmp"
    exports = [line for line in step.splitlines() if line.startswith(export_prefix)]
    assert exports, "preflight-only step missing its per-step export line"

    for line in exports:
        has_both_regions = "AWS_REGION=" in line and "AWS_DEFAULT_REGION=" in line
        assert has_both_regions, (
            f"preflight-only export line missing AWS region vars: {line!r}"
        )
Loading