From 1cb4c6cbbbc7a763162eedb47c8434e0454c858d Mon Sep 17 00:00:00 2001 From: Brian McMahon Date: Wed, 15 Apr 2026 13:00:40 -0700 Subject: [PATCH] Add 6 features: horizon returns + overnight/intraday + dist-from-high MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Predictor ROADMAP P2 diagnostic — test whether 5d forward is reversal regime vs momentum regime, and whether splitting close-to-close returns into overnight/intraday components improves signal. New features (all technical group): return_60d, return_120d Longer-horizon momentum. Neutral name — meta ridge coefficient sign determines regime. At 5d forward, short-horizon returns load negative (reversal). If 60d/120d load positive, momentum persists at longer lookback — a well-documented pattern (Jegadeesh/Titman 1993). overnight_return_5d, intraday_return_5d 5d sum of (Open_t / Close_{t-1} - 1) vs (Close_t / Open_t - 1). Lou/Polk/Skouras 2019 "A Tug of War" found overnight persists positive (earnings, news, macro) while intraday is noisier and often negative (microstructure, flow). Total momentum_5d ≈ overnight_5d + intraday_5d. Decomposing lets the model learn different dynamics. NaN when Open column is missing (no silent zero-fill per feedback_no_silent_fails). dist_from_5d_high, dist_from_20d_high Reversal-native signals. Distance from recent peak as fraction: (Close - rolling_max(High, N)) / rolling_max(High, N). Always ≤ 0. A stock at its 5d high has no short-term reversal room; a stock pulled back has more. Conceptually cleaner than past returns for reversal signal. Registry: 6 FeatureEntry rows added under "v3.1 technical additions". FEATURES list in feature_engineer.py goes from 53 → 59. dropna still correct — rows missing any required feature are dropped. ## Test plan - [x] Synthetic OHLCV smoke: all 6 features compute, values in sensible ranges (dist_from_5d_high ≤ 0 always, overnight/intraday small magnitudes, return_60d/120d larger). 
- [x] Full suite: 43 passed. - [ ] After merge: re-run alpha-engine-data backfill to populate historical rows in ArcticDB with the new columns. - [ ] Predictor PR B2 (follow-up) adds features to MOMENTUM_FEATURES list + 21d forward IC diagnostic. Co-Authored-By: Claude Opus 4.6 (1M context) --- features/feature_engineer.py | 60 ++++++++++++++++++++++++++++++++++++ features/registry.py | 8 +++++ 2 files changed, 68 insertions(+) diff --git a/features/feature_engineer.py b/features/feature_engineer.py index 63d7a00..8ba6c87 100644 --- a/features/feature_engineer.py +++ b/features/feature_engineer.py @@ -120,6 +120,17 @@ "gross_margin", "roe", "current_ratio", + # v3.1 additions — longer-horizon + overnight/intraday decomposition + + # reversal-native signals. Predictor ROADMAP P2: collapse FLAT + + # test whether 5d is reversal or momentum regime. 2026-04-15: neutral + # names chosen — meta ridge coefficient sign determines whether the + # feature behaves as reversal (negative coef) or momentum (positive). + "return_60d", + "return_120d", + "overnight_return_5d", + "intraday_return_5d", + "dist_from_5d_high", + "dist_from_20d_high", ] MIN_ROWS_FOR_FEATURES = 265 # 252 warmup + buffer @@ -272,6 +283,55 @@ def compute_features( _mom_short = _FC["momentum_short"] df["momentum_5d"] = (close / close.shift(_mom_short)) - 1.0 + # ── v3.1: Longer-horizon returns ────────────────────────────────────────── + # ROADMAP P2 diagnostic — test whether 5d is the right label horizon. + # Neutral naming: meta ridge coefficient sign determines reversal vs + # momentum regime. Negative coef → reversal (high past returns predict + # negative future returns). Positive coef → momentum persists at this + # horizon. 
+ df["return_60d"] = (close / close.shift(60)) - 1.0 + df["return_120d"] = (close / close.shift(120)) - 1.0 + + # ── v3.1: Overnight / intraday decomposition ────────────────────────────── + # Lou/Polk/Skouras 2019 "A Tug of War": overnight returns + # (Open_t vs Close_{t-1}) have been historically persistent and positive + # (earnings, news, macro), while intraday returns (Close_t vs Open_t) + # have been noisier and often negative (microstructure, flow). Total + # 5d return = overnight_5d + intraday_5d (approximately — compounding + # differences are small at 5d horizons and this additive sum is the + # form used in the source literature). + if "Open" in df.columns: + open_ = df["Open"].astype(float) + overnight_daily = (open_ / close.shift(1)) - 1.0 + intraday_daily = (close / open_) - 1.0 + df["overnight_return_5d"] = overnight_daily.rolling( + window=_mom_short, min_periods=_mom_short, + ).sum() + df["intraday_return_5d"] = intraday_daily.rolling( + window=_mom_short, min_periods=_mom_short, + ).sum() + else: + # Without Open, these features are undefined — NaN propagates and + # dropna will exclude the ticker. No silent zero-fill (per + # feedback_no_silent_fails). + df["overnight_return_5d"] = float("nan") + df["intraday_return_5d"] = float("nan") + + # ── v3.1: Distance from recent highs (reversal-native) ──────────────────── + # Distance from recent peak is a cleaner reversal signal than past + # returns: a stock at its 5d high has nowhere to "continue" in the + # short-term reversal regime, while a stock pulled back from its 5d + # high has more room to mean-revert. Values are always ≤ 0 (close + # cannot exceed max by definition). Closer to zero = near high. 
+ if "High" in df.columns: + high_col = df["High"].astype(float) + else: + high_col = close + rolling_max_5 = high_col.rolling(window=5, min_periods=5).max() + rolling_max_20 = high_col.rolling(window=20, min_periods=20).max() + df["dist_from_5d_high"] = (close - rolling_max_5) / rolling_max_5 + df["dist_from_20d_high"] = (close - rolling_max_20) / rolling_max_20 + # ── Relative volume ratio ────────────────────────────────────────────────── rolling_mean_vol_20 = volume.rolling(window=_vol_slow, min_periods=_vol_slow).mean() df["rel_volume_ratio"] = volume / rolling_mean_vol_20.replace(0, float("nan")) diff --git a/features/registry.py b/features/registry.py index 388ed8b..078997e 100644 --- a/features/registry.py +++ b/features/registry.py @@ -85,6 +85,14 @@ class FeatureEntry: FeatureEntry("iv_rank", "alternative", "IV percentile rank (0-1)", source="yfinance", refresh="weekly"), FeatureEntry("iv_vs_rv", "alternative", "Implied vol / realized vol ratio", source="yfinance", refresh="weekly"), + # ── v3.1 technical additions — horizon + decomposition + reversal-native ── + FeatureEntry("return_60d", "technical", "60-day price return (Close_t / Close_{t-60} - 1)", source="yfinance", refresh="daily"), + FeatureEntry("return_120d", "technical", "120-day price return (Close_t / Close_{t-120} - 1)", source="yfinance", refresh="daily"), + FeatureEntry("overnight_return_5d", "technical", "5d sum of overnight returns (Open_t vs Close_{t-1})", source="yfinance", refresh="daily"), + FeatureEntry("intraday_return_5d", "technical", "5d sum of intraday returns (Close_t vs Open_t)", source="yfinance", refresh="daily"), + FeatureEntry("dist_from_5d_high", "technical", "(Close - 5d rolling max High) / 5d rolling max High", source="yfinance", refresh="daily"), + FeatureEntry("dist_from_20d_high", "technical", "(Close - 20d rolling max High) / 20d rolling max High", source="yfinance", refresh="daily"), + # ── Fundamental (8) — quarterly financials ──────────────────────────────── 
FeatureEntry("pe_ratio", "fundamental", "Trailing P/E ratio, normalized (PE / 30)", source="fmp", refresh="quarterly"), FeatureEntry("pb_ratio", "fundamental", "Price-to-book ratio, normalized (PB / 5)", source="fmp", refresh="quarterly"),