From 3622378ac0f785851cf1271b753aed1b7badc68f Mon Sep 17 00:00:00 2001 From: Brian McMahon Date: Wed, 15 Apr 2026 16:00:26 -0700 Subject: [PATCH] fix(universe_returns): unstick NULL return_5d rows + drop weekend signals MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two related bugs kept scanner/team/cio grading pinned at N/A even when the merge on (ticker, eval_date) would otherwise have worked. 1. NULL return_5d rows got stuck forever. When a collector run inserted a date whose 5d forward window hadn't closed yet, the row landed with NULL return/spy/beat columns. The next run's `existing` set contained that eval_date, so `dates_to_process` skipped it — the returns were never backfilled. Combined with `INSERT OR IGNORE` this silently orphaned Fri 2026-04-03 and other edge cases. Fix: `_get_existing_dates` now returns only eval_dates where `return_5d IS NOT NULL`; `_insert_rows` uses INSERT OR REPLACE so a reprocessed row overwrites the stale NULL one. 2. Weekend signal folders have no market data. Research runs before alpha-engine-research 9a94e34 (2026-04-13, "Stamp signals with next_trading_day") wrote signals/{Sat,Sun}/. Polygon has no grouped-daily data for those dates, so rows landed with NULL returns and then hit bug #1. Post-fix research stamps trading days, but the legacy weekend folders still exist in S3 and would keep retriggering the empty-prices path. Fix: filter eval_dates to trading days (Mon-Fri) before enqueuing for processing. Existing tests pass (46/46). The next Saturday Step Function run will reprocess the stuck Fri 2026-04-03 row and populate return_5d for any subsequent trading days whose 5d window has since closed. Out of scope: market-holiday filtering (e.g. Good Friday). _is_trading_day only screens weekends; a closed-market weekday will still be attempted — polygon returns empty and the row is skipped at _build_rows_for_date, which is acceptable for now. 
Proper NYSE calendar filtering belongs in a follow-up. --- collectors/universe_returns.py | 37 ++++++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/collectors/universe_returns.py b/collectors/universe_returns.py index 50f0624..b52f0f2 100644 --- a/collectors/universe_returns.py +++ b/collectors/universe_returns.py @@ -115,6 +115,14 @@ def collect( _ensure_table(db_path) existing = _get_existing_dates(db_path) + # Drop non-trading-day signal folders. Research runs before the + # next_trading_day stamping fix (alpha-engine-research 9a94e34, 2026-04-13) + # wrote signals/{Sat,Sun}/... which have no market data — attempting to + # process them produces rows with NULL return_5d that then get stuck in + # the `existing` set and prevent real reprocessing. Keeping the filter + # here is also a defense against future misstamping. + eval_dates = [d for d in eval_dates if _is_trading_day(d)] + dates_to_process = [d for d in eval_dates if d not in existing] if not dates_to_process: logger.info("All %d eval_dates already in universe_returns", len(eval_dates)) @@ -261,24 +269,45 @@ def _ensure_table(db_path: str) -> None: def _get_existing_dates(db_path: str) -> set[str]: - """Return set of eval_dates already populated.""" + """Return set of eval_dates that have return_5d populated for at least one row. + + A date is considered "existing" only when at least one row has return_5d + set. Dates where rows landed but return_5d stayed NULL (e.g. the 5d + forward window hadn't closed when the collector ran, and the date was + then frozen out by the prior all-dates-in-DB skip) get reprocessed so + the returns can be backfilled. 
+ """ conn = sqlite3.connect(db_path) try: - rows = conn.execute("SELECT DISTINCT eval_date FROM universe_returns").fetchall() + rows = conn.execute( + "SELECT DISTINCT eval_date FROM universe_returns " + "WHERE return_5d IS NOT NULL" + ).fetchall() return {r[0] for r in rows} finally: conn.close() +def _is_trading_day(date_str: str) -> bool: + """True if YYYY-MM-DD is Mon–Fri. Does not account for market holidays.""" + return date.fromisoformat(date_str).weekday() < 5 + + def _insert_rows(db_path: str, rows: list[dict]) -> int: - """Insert rows into universe_returns, skipping duplicates.""" + """Insert rows into universe_returns; reprocessed dates overwrite stale rows. + + Uses INSERT OR REPLACE so a date that was previously inserted with NULL + forward-return columns (because the 5d window hadn't closed yet) gets + its returns filled in on reprocessing. The previous INSERT OR IGNORE + behaviour left those NULL rows stuck forever. + """ conn = sqlite3.connect(db_path) try: inserted = 0 for row in rows: try: conn.execute( - "INSERT OR IGNORE INTO universe_returns " + "INSERT OR REPLACE INTO universe_returns " "(ticker, eval_date, sector, close_price, return_5d, return_10d, return_30d, " "spy_return_5d, spy_return_10d, spy_return_30d, beat_spy_5d, beat_spy_10d, beat_spy_30d, " "sector_etf, sector_etf_return_5d, beat_sector_5d) "