From 95e01b64a1b7046e7fd7cc3e1d8cb89170d50d54 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Wed, 4 Mar 2026 17:08:08 -0500 Subject: [PATCH] Fix enhanced_cps_2024.h5 being overwritten by sparse version create_sparse_ecps() was writing to enhanced_cps_2024.h5 instead of sparse_enhanced_cps_2024.h5, destroying the full dataset after enhanced_cps.py produced it. This caused all input variables (notably employment_income) to be lost, inflating the baseline poverty rate from ~14% to ~39% on policyengine.org. Introduced in commit 20572be0 ("Streamline data build"). Co-Authored-By: Claude Opus 4.6 --- changelog.d/fix-enhanced-cps-overwrite.fixed.md | 1 + policyengine_us_data/datasets/cps/small_enhanced_cps.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) create mode 100644 changelog.d/fix-enhanced-cps-overwrite.fixed.md diff --git a/changelog.d/fix-enhanced-cps-overwrite.fixed.md b/changelog.d/fix-enhanced-cps-overwrite.fixed.md new file mode 100644 index 00000000..3faa4c36 --- /dev/null +++ b/changelog.d/fix-enhanced-cps-overwrite.fixed.md @@ -0,0 +1 @@ +Fix create_sparse_ecps overwriting enhanced_cps_2024.h5 with sparse version that drops input variables like employment_income. diff --git a/policyengine_us_data/datasets/cps/small_enhanced_cps.py b/policyengine_us_data/datasets/cps/small_enhanced_cps.py index 5e099bec..ccee6458 100644 --- a/policyengine_us_data/datasets/cps/small_enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/small_enhanced_cps.py @@ -104,7 +104,7 @@ def create_sparse_ecps(): if len(data[variable]) == 0: del data[variable] - with h5py.File(STORAGE_FOLDER / "enhanced_cps_2024.h5", "w") as f: + with h5py.File(STORAGE_FOLDER / "sparse_enhanced_cps_2024.h5", "w") as f: for variable, periods in data.items(): grp = f.create_group(variable) for period, values in periods.items():