-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathconfig.yaml.example
More file actions
137 lines (127 loc) · 5.92 KB
/
config.yaml.example
File metadata and controls
137 lines (127 loc) · 5.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# ─────────────────────────────────────────────────────────────────────────────
# DESCRIPTIVE / DOCUMENTATION ONLY
#
# Production reads the live config from alpha-engine-config/data/config.yaml
# at runtime (synced to data Lambdas via deploy.sh; synced to spot EC2 via
# boot scripts). Edits to this file affect ONLY:
# 1. New-environment bootstrap (copies this template to config.yaml), and
# 2. Public-repo schema documentation for the data module's config surface.
#
# Edits here have ZERO effect on the live system. To change a runtime value
# or add a flag that needs to take effect, the change MUST land in
# alpha-engine-config (separate private repo).
# ─────────────────────────────────────────────────────────────────────────────
# Alpha Engine Data Collector Configuration
# Copy to config.yaml and customise. config.yaml is gitignored.
# S3 bucket name (shared across all alpha-engine modules).
# NOTE(review): the *_prefix / *_key values further down look like object-key
# paths inside this bucket — confirm against the config loader.
bucket: alpha-engine-research
# Price cache settings (Phase 1).
# All keys below are children of `price_cache`; they must stay indented so
# `s3_prefix` does not collide with the `s3_prefix` keys of other sections.
price_cache:
  s3_prefix: predictor/price_cache/
  slim_prefix: predictor/price_cache_slim/
  fetch_period: "10y"  # yfinance period for full refresh
  staleness_threshold_days: 3  # business days before a parquet is considered stale
  slim_lookback_days: 730  # calendar days for slim cache (2 years)
  refresh_batch_size: 50  # tickers per yfinance batch download
# Market data output.
# `s3_prefix` is a child of `market_data` (indentation is significant).
market_data:
  s3_prefix: market_data/
# Universe returns (Phase 1) — polygon.io forward return computation
universe_returns:
  signals_prefix: signals  # S3 prefix for signal dates
  sector_map_key: predictor/price_cache/sector_map.json
  # Optional: override the local DB path (auto-downloaded from S3 by default).
  # db_path: /tmp/research.db
# Signal returns (Phase 1) — score_performance + predictor_outcomes backfill
signal_returns:
  # Prediction horizon, in trading days. Drives which universe_returns
  # columns the backfill reads and what it records as
  # predictor_outcomes.horizon_days. Must match the predictor's training
  # horizon (cfg.FORWARD_DAYS in alpha-engine-predictor).
  forward_days: 21
# Daily closes (weekday OHLCV archive — intermediate staging,
# 7-day S3 lifecycle expiration; canonical home is ArcticDB universe library)
#
# NOTE(review): `window_days` and `skip_if_canonical` are nested under
# `daily_closes` here because their comments describe the daily-closes
# yfinance/polygon passes — confirm against the config loader.
daily_closes:
  s3_prefix: staging/daily_closes/

  # Windowed-data-reconciliation arc (plan doc:
  # alpha-engine-docs/private/windowed-data-reconciliation-260510.md).
  # When window_days > 1, both the EOD yfinance pass and the morning
  # polygon pass scan the last N business days, applying the source-
  # precedence ladder (NaN < yfinance < polygon) to converge every cell
  # to its highest-authority source in ≤24h.
  #
  # Polygon's free-tier rate limit is honored by design: one
  # `grouped-daily` call per date in the window — total `window_days`
  # polygon calls per morning pass — bounded regardless of universe size.
  #
  # `window_days: 1` preserves single-date legacy behavior. Recommended
  # production default is 14 (two-week recovery horizon for transient
  # outages); staged rollout flips this AFTER one clean Sat SF + 5 clean
  # weekday SFs at 14.
  window_days: 1

  # Source-precedence-ladder skip optimization. When true, the yfinance
  # pass skips tickers that already have an authoritative source in the
  # existing parquet (keeps batch cost near zero in steady state).
  # The polygon pass ignores this flag — it always overwrites within the
  # window so corporate-action backfills are absorbed (option (a) per
  # 2026-05-10 design discussion). Recommended production: true once
  # window_days > 1 takes effect.
  skip_if_canonical: false
# Chronic polygon coverage gaps — tickers polygon does not reliably serve.
# MorningEnrich's `_self_heal_chronic_polygon_gaps` step yfinance-backfills
# any ArcticDB row gap from `last_date+1` to `target_date` for each ticker
# in this list. Adding a ticker is a deliberate operational decision (it
# weakens the "no silent fails" stance for that ticker specifically); a
# follow-up drift alarm flags entries here that start getting polygon
# coverage so they can be removed. Tickers absent from this list still
# hard-fail polygon_only when missing — preserves the strict default.
#
# Shape: tickers -> <TICKER> -> {reason, since}. `since` is quoted so it
# stays a string rather than being parsed as a date.
chronic_polygon_gaps:
  tickers:
    BF-B:
      reason: "Class B share — polygon serves BF.B (dot), our universe uses BF-B (dash)"
      since: "2025-04-01"
    BRK-B:
      reason: "Class B share — polygon serves BRK.B, our universe uses BRK-B"
      since: "2025-04-01"
    MOG-A:
      reason: "Class A share — polygon serves MOG.A, our universe uses MOG-A"
      since: "2025-04-01"
    PSTG:
      reason: "Polygon coverage flaky since ~2026-04 (no clear cause)"
      since: "2026-04-01"
# Index/rate tickers that yfinance expects with a leading caret.
# They are stored locally WITHOUT the caret (VIX.parquet, not ^VIX.parquet),
# so list them here without it as well.
caret_symbols:
  - VIX
  - VIX3M
  - TNX
  - IRX
# Always-download tickers (benchmarks, macro, sector ETFs).
#
# Change log:
#   2026-05-10 (Stage 2.5 regime-conditioning rebuild): added TWO + HYOAS.
#     FRED-only, no yfinance caret. Forward-only collection from this date;
#     historical backfill via collectors/fred_history.py (Stage 2.5b).
#   2026-05-10 (Stage 2.5c): added BAA10Y — full 40y FRED history (1986+),
#     the credit-regime signal HYOAS can't provide across the full training
#     corpus (HYOAS license-gated to 2023+ on FRED).
always_download:
  - SPY
  - VIX
  - VIX3M
  - TNX
  - IRX
  - TWO
  - HYOAS
  - BAA10Y
  - GLD
  - USO
  - XLK
  - XLF
  - XLE
  - XLV
  - XLI
  - XLY
  - XLP
  - XLU
  - XLB
  - XLRE
  - XLC