From 07d8c3338f2486a8558718975f34361e35c95e82 Mon Sep 17 00:00:00 2001 From: Brian McMahon Date: Thu, 28 May 2026 08:25:08 -0700 Subject: [PATCH] refactor(dedup): retire defensive keep="last" on ArcticDB reads (L2188) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the defensive `keep="last"` index dedup from both the training read path (`data/dataset.py::_load_ticker_parquet`) and the inference read path (`inference/stages/load_prices.py::_read_ohlcv` + `_read_close`). The dedup was a 2026-04-15 transition device added to mask same-date duplicate rows ArcticDB was emitting at the time. The upstream fix (alpha-engine-data builders/daily_append.py → `update()` over `append()`) shipped same day, and 6+ weeks of clean Saturday + weekday cycles have verified the write path is now duplicate-free. Per feedback_no_silent_fails: a silent `keep="last"` on values-differ duplicates would mask an upstream write-path regression as a clean read. The right surface for any future re-recurrence is the downstream pandas reindex raising — which fails the pipeline loudly at the first feature computation site (`compute_features`). Tests: - `TestDefensiveDedupRetiredL2188::test_dataset_loader_does_not_dedup` — source-level pin that `_load_ticker_parquet` carries no `.duplicated(` call. - `TestDefensiveDedupRetiredL2188::test_inference_readers_do_not_dedup` — same pin for `load_price_data_from_arctic`. Note: `data/dataset.py:215` retains `.duplicated(keep="first")` for a DIFFERENT purpose (cross-source merge first-wins semantics, not defensive dedup); intentionally out of scope. Suite: 1216 passed. Co-Authored-By: Claude Opus 4.7 (1M context) --- data/dataset.py | 23 ++++++------------- inference/stages/load_prices.py | 19 +++++++++------- tests/test_dataset.py | 40 +++++++++++++++++++++++++++++++++ 3 files changed, 58 insertions(+), 24 deletions(-) diff --git a/data/dataset.py b/data/dataset.py index 1f1b8bf..ba9b6d1 100644 --- a/data/dataset.py +++ b/data/dataset.py @@ -58,22 +58,13 @@ def _load_ticker_parquet(path: Path) -> pd.DataFrame: df.index = idx if not df.index.is_monotonic_increasing: df = df.sort_index() - # Deduplicate on date index. Mirrors inference's defensive handling - # at inference/stages/load_prices.py:403. 2026-04-15 smoke tests - # showed 904/909 tickers failing feature computation with - # "cannot reindex on an axis with duplicate labels" — ArcticDB reads - # are emitting same-date rows for essentially every ticker. Upstream - # fix belongs in alpha-engine-data's ArcticDB write path; this is - # the consistency fix that brings training into alignment with the - # inference path that already expects and handles the duplicates. - if df.index.has_duplicates: - n_before = len(df) - df = df[~df.index.duplicated(keep="last")] - log.warning( - "Deduplicated %d duplicate date rows in %s (kept last: %d → %d). " - "Upstream ArcticDB write is emitting duplicates — file against alpha-engine-data.", - n_before - len(df), path.name, n_before, len(df), - ) + # ArcticDB writes are duplicate-free post the 2026-04-15 write-path + # fix (alpha-engine-data builders/daily_append.py → update() instead + # of append()). If duplicates ever re-appear here, the downstream + # ``compute_features`` reindex raises and surfaces the upstream + # regression — preferred over silently picking ``keep="last"``, + # which would mask a values-differ write-path bug as a clean read + # per feedback_no_silent_fails. return df except Exception as exc: log.warning("Failed to load %s: %s", path.name, exc) diff --git a/inference/stages/load_prices.py b/inference/stages/load_prices.py index a57484e..add1f6f 100644 --- a/inference/stages/load_prices.py +++ b/inference/stages/load_prices.py @@ -169,12 +169,15 @@ def load_price_data_from_arctic( * Per-ticker read error rate > 5% → ``PipelineAbort``. * Individual ticker missing/empty → logged WARNING and dropped from output. - Defensive dedup - --------------- - The 2026-04-15 write-path fix (builders/daily_append.py → ``update()`` instead - of ``append()``) prevents new duplicate-date rows, but historical rows may - still carry duplicates. Each frame is deduped on read with ``keep="last"``. - Remove after 1-2 clean Saturday cycles confirm upstream is clean. + Upstream invariant + ------------------ + ArcticDB writes are duplicate-free post the 2026-04-15 write-path fix + (builders/daily_append.py → ``update()`` instead of ``append()``). + Reads here do NOT defensively dedup — if duplicates ever re-appear, + pandas raises downstream at the first ``reindex`` callsite (in + ``compute_features``), surfacing the upstream regression instead of + silently picking ``keep="last"`` (which would mask a values-differ + write-path regression as a clean read). """ end_ts = pd.Timestamp(date_str).normalize() start_ts = end_ts - pd.Timedelta(days=lookback_days) @@ -186,14 +189,14 @@ def _read_ohlcv(lib, sym: str) -> pd.DataFrame: df = res.data if df.empty: return df - return df[~df.index.duplicated(keep="last")].sort_index() + return df.sort_index() def _read_close(lib, sym: str) -> pd.DataFrame: res = lib.read(sym, date_range=(start_ts, end_ts), columns=["Close"]) df = res.data if df.empty: return df - return df[~df.index.duplicated(keep="last")].sort_index() + return df.sort_index() # ── Stocks: universe library only ──────────────────────────────────────── price_data: dict[str, pd.DataFrame] = {} diff --git a/tests/test_dataset.py b/tests/test_dataset.py index 8f2a7a9..297c96c 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -1,5 +1,6 @@ """Tests for data/dataset.py — array building and normalization.""" +import inspect import json import tempfile from pathlib import Path @@ -192,3 +193,42 @@ def test_returns_arrays(self): mean, std = load_norm_stats(f.name) assert isinstance(mean, np.ndarray) assert isinstance(std, np.ndarray) + + +class TestDefensiveDedupRetiredL2188: + """Pin L2188 — `keep="last"` defensive dedup retired post-soak. + + The 2026-04-15 ArcticDB write-path fix (alpha-engine-data + builders/daily_append.py → `update()` over `append()`) made + duplicates structurally impossible. The defensive `keep="last"` + dedup on the read path was a transition device; per + feedback_no_silent_fails it must NOT be re-introduced — silent + dedup of values-differ duplicates would mask an upstream regression + as a clean read. + + Sites guarded: + - data/dataset.py::_load_ticker_parquet (training read path) + - inference/stages/load_prices.py::_read_ohlcv + _read_close + (inference read path) + """ + + def test_dataset_loader_does_not_dedup(self): + from data import dataset + src = inspect.getsource(dataset._load_ticker_parquet) + assert ".duplicated(" not in src, ( + "data/dataset.py::_load_ticker_parquet must not silently " + "dedup on read — duplicates are structurally impossible " + "post the 2026-04-15 ArcticDB write-path fix. If duplicates " + "appear, surface them via pandas reindex raising downstream " + "rather than masking values-differ regressions per " + "feedback_no_silent_fails." + ) + + def test_inference_readers_do_not_dedup(self): + from inference.stages import load_prices + src = inspect.getsource(load_prices.load_price_data_from_arctic) + assert ".duplicated(" not in src, ( + "inference/stages/load_prices.py::_read_ohlcv + _read_close " + "must not silently dedup on read — same rationale as the " + "dataset loader pin above." + )