diff --git a/changelog.d/add-extend-single-year-dataset.added.md b/changelog.d/add-extend-single-year-dataset.added.md new file mode 100644 index 00000000000..fbffda84707 --- /dev/null +++ b/changelog.d/add-extend-single-year-dataset.added.md @@ -0,0 +1 @@ +Add extend_single_year_dataset for fast dataset year projection via multiplicative uprating, with entity-level HDFStore format detection and dual-path loading in Microsimulation. diff --git a/policyengine_us/data/__init__.py b/policyengine_us/data/__init__.py index accc88c8cca..e5215cfad98 100644 --- a/policyengine_us/data/__init__.py +++ b/policyengine_us/data/__init__.py @@ -1 +1,3 @@ from .zip_code_dataset import ZIP_CODE_DATASET +from .dataset_schema import USSingleYearDataset, USMultiYearDataset +from .economic_assumptions import extend_single_year_dataset diff --git a/policyengine_us/data/dataset_schema.py b/policyengine_us/data/dataset_schema.py new file mode 100644 index 00000000000..4663c199042 --- /dev/null +++ b/policyengine_us/data/dataset_schema.py @@ -0,0 +1,231 @@ +import pandas as pd +from pathlib import Path +import h5py + +US_ENTITIES = [ + "person", + "household", + "tax_unit", + "spm_unit", + "family", + "marital_unit", +] + + +class USSingleYearDataset: + person: pd.DataFrame + household: pd.DataFrame + tax_unit: pd.DataFrame + spm_unit: pd.DataFrame + family: pd.DataFrame + marital_unit: pd.DataFrame + + @staticmethod + def validate_file_path(file_path: str, raise_exception: bool = True): + if not file_path.endswith(".h5"): + if raise_exception: + raise ValueError( + "File path must end with '.h5' for USSingleYearDataset." 
+ ) + return False + if not Path(file_path).exists(): + if raise_exception: + raise FileNotFoundError(f"File not found: {file_path}") + return False + + with h5py.File(file_path, "r") as f: + required_datasets = ["person", "household", "tax_unit"] + for dataset in required_datasets: + if dataset not in f: + if raise_exception: + raise ValueError( + f"Dataset '{dataset}' not found in the file: {file_path}" + ) + else: + return False + + return True + + def __init__( + self, + file_path: str = None, + person: pd.DataFrame = None, + household: pd.DataFrame = None, + tax_unit: pd.DataFrame = None, + spm_unit: pd.DataFrame = None, + family: pd.DataFrame = None, + marital_unit: pd.DataFrame = None, + time_period: int = 2024, + ): + file_path = str(file_path) if file_path else None + if file_path is not None: + self.validate_file_path(file_path) + with pd.HDFStore(file_path) as f: + self.person = f["person"] + self.household = f["household"] + self.tax_unit = f["tax_unit"] + self.spm_unit = f["spm_unit"] + self.family = f["family"] + self.marital_unit = f["marital_unit"] + if "_time_period" in f: + self.time_period = str(int(f["_time_period"].iloc[0])) + else: + self.time_period = str(time_period) + else: + if person is None or household is None or tax_unit is None: + raise ValueError( + "Must provide either a file path or at least " + "person, household, and tax_unit DataFrames." 
+ ) + self.person = person + self.household = household + self.tax_unit = tax_unit + self.spm_unit = ( + spm_unit if spm_unit is not None else pd.DataFrame() + ) + self.family = family if family is not None else pd.DataFrame() + self.marital_unit = ( + marital_unit if marital_unit is not None else pd.DataFrame() + ) + self.time_period = str(time_period) + + self.data_format = "arrays" + self.tables = ( + self.person, + self.household, + self.tax_unit, + self.spm_unit, + self.family, + self.marital_unit, + ) + self.table_names = ( + "person", + "household", + "tax_unit", + "spm_unit", + "family", + "marital_unit", + ) + + def save(self, file_path: str): + with pd.HDFStore(file_path) as f: + for name, df in zip(self.table_names, self.tables): + if len(df) > 0: + f.put(name, df, format="table", data_columns=True) + f.put( + "_time_period", + pd.Series([int(self.time_period)]), + format="table", + ) + + def load(self): + data = {} + for df in self.tables: + for col in df.columns: + data[col] = df[col].values + return data + + def copy(self): + return USSingleYearDataset( + person=self.person.copy(), + household=self.household.copy(), + tax_unit=self.tax_unit.copy(), + spm_unit=self.spm_unit.copy(), + family=self.family.copy(), + marital_unit=self.marital_unit.copy(), + time_period=int(self.time_period), + ) + + def validate(self): + for name, df in zip(self.table_names, self.tables): + for col in df.columns: + if df[col].isna().any(): + raise ValueError( + f"Column '{col}' in {name} contains NaN values." + ) + + +class USMultiYearDataset: + def __init__( + self, + file_path: str = None, + datasets: list[USSingleYearDataset] | None = None, + ): + if datasets is not None: + self.datasets = {} + for dataset in datasets: + if not isinstance(dataset, USSingleYearDataset): + raise TypeError( + "All items in datasets must be of type USSingleYearDataset." 
+ ) + year = int(dataset.time_period) + self.datasets[year] = dataset + + if file_path is not None: + with pd.HDFStore(file_path) as f: + self.datasets = {} + for key in f.keys(): + if key.startswith("/person/"): + year = int(key.split("/")[2]) + entity_dfs = {} + for entity in US_ENTITIES: + entity_key = f"/{entity}/{year}" + if entity_key in f: + entity_dfs[entity] = f[entity_key] + else: + entity_dfs[entity] = pd.DataFrame() + self.datasets[year] = USSingleYearDataset( + **entity_dfs, + time_period=year, + ) + + self.data_format = "time_period_arrays" + self.time_period = str(min(self.datasets.keys())) + + def get_year(self, year: int) -> USSingleYearDataset: + if year in self.datasets: + return self.datasets[year] + else: + raise ValueError(f"No dataset found for year {year}.") + + @property + def years(self): + return sorted(self.datasets.keys()) + + def __getitem__(self, year: int): + return self.get_year(year) + + def save(self, file_path: str): + Path(file_path).unlink(missing_ok=True) + with pd.HDFStore(file_path) as f: + for year, dataset in self.datasets.items(): + for name, df in zip(dataset.table_names, dataset.tables): + if len(df) > 0: + f.put( + f"{name}/{year}", + df, + format="table", + data_columns=True, + ) + f.put( + f"time_period/{year}", + pd.Series([year]), + format="table", + data_columns=True, + ) + + def copy(self): + new_datasets = { + year: dataset.copy() for year, dataset in self.datasets.items() + } + return USMultiYearDataset(datasets=list(new_datasets.values())) + + def load(self): + data = {} + for year, dataset in self.datasets.items(): + for df in dataset.tables: + for col in df.columns: + if col not in data: + data[col] = {} + data[col][year] = df[col].values + return data diff --git a/policyengine_us/data/economic_assumptions.py b/policyengine_us/data/economic_assumptions.py new file mode 100644 index 00000000000..98e29bcdd95 --- /dev/null +++ b/policyengine_us/data/economic_assumptions.py @@ -0,0 +1,97 @@ +from 
def extend_single_year_dataset(
    dataset: USSingleYearDataset,
    end_year: int = 2035,
) -> USMultiYearDataset:
    """Extend a single-year US dataset to multiple years via uprating.

    Copies the base-year DataFrames for each year from the base year through
    ``end_year``, then applies multiplicative uprating using growth factors
    derived from the policyengine-us parameter tree.

    Variables without an uprating parameter are carried forward unchanged.
    If ``end_year`` is not after the base year, only the base year is
    returned. The input dataset is not modified.
    """
    start_year = int(dataset.time_period)
    datasets = [dataset]
    for year in range(start_year + 1, end_year + 1):
        projected = dataset.copy()
        projected.time_period = str(year)
        datasets.append(projected)

    return _apply_uprating(USMultiYearDataset(datasets=datasets))


def _apply_uprating(dataset: USMultiYearDataset) -> USMultiYearDataset:
    """Apply year-over-year uprating to all years in a multi-year dataset.

    Works on a copy, so the dataset passed in is left untouched. Each year
    is uprated from the year immediately before it in sorted order, so the
    years do not have to be contiguous.
    """
    # Imported lazily: building the tax-benefit system is expensive.
    from policyengine_us.system import system

    dataset = dataset.copy()

    years = sorted(dataset.datasets)
    for previous_year, current_year in zip(years, years[1:]):
        _apply_single_year_uprating(
            dataset.datasets[current_year],
            dataset.datasets[previous_year],
            system,
        )

    return dataset


def _growth_factor(parameters, uprating_path, previous_period, current_period):
    """Return ``param(current) / param(previous)`` for ``uprating_path``.

    Returns ``None`` when the path cannot be resolved or the previous-period
    value is 0 (no meaningful ratio exists).
    """
    param = _resolve_parameter(parameters, uprating_path)
    if param is None:
        return None

    prev_val = param(previous_period)
    curr_val = param(current_period)
    if prev_val == 0:
        return None
    return curr_val / prev_val


def _apply_single_year_uprating(current, previous, system):
    """Apply multiplicative uprating from previous year to current year.

    For each variable column in each entity DataFrame, looks up the
    variable's uprating parameter path in ``system.variables``. If the
    variable has an uprating parameter, computes the growth factor as
    ``param(current_year) / param(previous_year)`` and overwrites the
    current-year column with the previous-year column times that factor.

    Left unchanged in ``current``: columns unknown to the system, columns
    missing from the previous year's table, variables without an uprating
    parameter, unresolvable parameter paths, and parameters whose
    previous-year value is 0. ``current`` is modified in place; ``previous``
    is only read.
    """
    current_period = f"{int(current.time_period)}-01-01"
    previous_period = f"{int(previous.time_period)}-01-01"

    # Many columns share one uprating path; compute each ratio only once.
    factors = {}

    for current_df, prev_df in zip(current.tables, previous.tables):
        for col in current_df.columns:
            if col not in system.variables:
                continue
            if col not in prev_df.columns:
                # Nothing to uprate from; keep the current values.
                continue

            uprating_path = getattr(system.variables[col], "uprating", None)
            if uprating_path is None:
                continue

            if uprating_path not in factors:
                factors[uprating_path] = _growth_factor(
                    system.parameters,
                    uprating_path,
                    previous_period,
                    current_period,
                )
            factor = factors[uprating_path]
            if factor is None:
                continue

            current_df[col] = prev_df[col] * factor


def _resolve_parameter(parameters, path):
    """Resolve a dotted parameter path like 'gov.bls.cpi.cpi_u'.

    Walks ``parameters`` one attribute per path segment and returns the
    final node, or ``None`` as soon as a segment is missing.
    """
    node = parameters
    for part in path.split("."):
        try:
            node = getattr(node, part)
        except AttributeError:
            return None
    return node
def _resolve_dataset_path(dataset_str):
    """Turn a dataset string into a local file path.

    Strings containing ``hf://`` are parsed and downloaded through the
    policyengine-core Hugging Face helpers. Any other string is returned
    unchanged when it names an existing path, and ``None`` otherwise.
    """
    if "hf://" not in dataset_str:
        # Plain local path: only hand it back if it actually exists.
        return dataset_str if Path(dataset_str).exists() else None

    from policyengine_core.tools.hugging_face import (
        parse_hf_url,
        download_huggingface_dataset,
    )

    owner, repo, filename, version = parse_hf_url(dataset_str)
    return download_huggingface_dataset(
        repo=f"{owner}/{repo}",
        repo_filename=filename,
        version=version,
    )


def _is_hdfstore_format(file_path):
    """Report whether an HDF5 file is laid out per entity.

    A file counts as entity-level when at least one of its top-level keys
    is an entity name ('person', 'household', ...). Variable-centric h5py
    files use variable names as top-level keys instead. Any error while
    opening or reading the file is treated as "not entity-level".
    """
    import h5py

    entity_keys = frozenset(
        (
            "person",
            "household",
            "tax_unit",
            "spm_unit",
            "family",
            "marital_unit",
        )
    )
    try:
        with h5py.File(file_path, "r") as handle:
            found = set(handle.keys())
    except Exception:
        # Best-effort probe: unreadable files simply are not this format.
        return False
    return not entity_keys.isdisjoint(found)
b/policyengine_us/tests/microsimulation/data/fixtures/economic_assumptions_fixtures.py @@ -0,0 +1,225 @@ +""" +Fixtures for extend_single_year_dataset and uprating tests. + +Provides mock system objects, parameter trees, and sample datasets +so tests can run without loading the full policyengine-us tax-benefit +system. +""" + +import numpy as np +import pandas as pd + +from policyengine_us.data.dataset_schema import USSingleYearDataset + +# --------------------------------------------------------------------------- +# Constants +# --------------------------------------------------------------------------- + +BASE_YEAR = 2024 +END_YEAR_DEFAULT = 2035 +END_YEAR_SHORT = 2026 + +NUM_PERSONS = 10 +NUM_HOUSEHOLDS = 4 +NUM_TAX_UNITS = 5 + +# Uprating parameter paths (mirrors real policyengine-us paths) +EMPLOYMENT_INCOME_UPRATING = "calibration.gov.irs.soi.employment_income" +CPI_U_UPRATING = "gov.bls.cpi.cpi_u" +INVALID_UPRATING_PATH = "does.not.exist" + +# Uprating parameter values — simple round numbers for easy verification. +# The growth factor from 2024 to 2025 is 110/100 = 1.10 (10% growth). +# The growth factor from 2025 to 2026 is 121/110 = 1.10 (10% growth). 
+EMPLOYMENT_INCOME_PARAM_VALUES = { + "2024-01-01": 100.0, + "2025-01-01": 110.0, + "2026-01-01": 121.0, + "2027-01-01": 133.1, +} +EMPLOYMENT_INCOME_GROWTH_FACTOR_2024_TO_2025 = 110.0 / 100.0 # 1.10 +EMPLOYMENT_INCOME_GROWTH_FACTOR_2025_TO_2026 = 121.0 / 110.0 # 1.10 + +CPI_U_PARAM_VALUES = { + "2024-01-01": 300.0, + "2025-01-01": 309.0, + "2026-01-01": 318.27, +} +CPI_U_GROWTH_FACTOR_2024_TO_2025 = 309.0 / 300.0 # 1.03 + +# Base-year column values +PERSON_IDS = np.arange(1, NUM_PERSONS + 1) +EMPLOYMENT_INCOME_BASE = np.array( + [ + 50_000, + 60_000, + 70_000, + 80_000, + 90_000, + 40_000, + 30_000, + 0, + 100_000, + 55_000, + ], + dtype=float, +) +AGE_BASE = np.array([25, 30, 35, 40, 45, 50, 55, 60, 65, 70], dtype=float) + +HOUSEHOLD_IDS = np.arange(1, NUM_HOUSEHOLDS + 1) +RENT_BASE = np.array([1_200, 1_500, 900, 2_000], dtype=float) + +TAX_UNIT_IDS = np.arange(1, NUM_TAX_UNITS + 1) + + +# --------------------------------------------------------------------------- +# Mock parameter tree +# --------------------------------------------------------------------------- + + +class MockParameter: + """A callable that returns a value for a given period string.""" + + def __init__(self, values: dict[str, float]): + self._values = values + + def __call__(self, period: str) -> float: + return self._values[period] + + +class _ParameterNode: + """A nested attribute-access node for building mock parameter trees.""" + + def __init__(self): + self._children = {} + + def _add_path(self, path: str, param: MockParameter): + parts = path.split(".", 1) + if len(parts) == 1: + self._children[parts[0]] = param + else: + child = self._children.setdefault(parts[0], _ParameterNode()) + child._add_path(parts[1], param) + + def __getattr__(self, name): + if name.startswith("_"): + return super().__getattribute__(name) + try: + return self._children[name] + except KeyError: + raise AttributeError(name) + + +def build_mock_parameters(param_specs: dict[str, dict[str, float]]): + """Build a 
mock parameter tree from {dotted_path: {period: value}}.""" + root = _ParameterNode() + for path, values in param_specs.items(): + root._add_path(path, MockParameter(values)) + return root + + +# --------------------------------------------------------------------------- +# Mock variable metadata +# --------------------------------------------------------------------------- + + +class MockVariable: + """Minimal stand-in for a policyengine Variable metadata object.""" + + def __init__(self, name: str, uprating: str | None = None): + self.name = name + self.uprating = uprating + + +# Variables with uprating +MOCK_EMPLOYMENT_INCOME_VAR = MockVariable( + "employment_income", uprating=EMPLOYMENT_INCOME_UPRATING +) +MOCK_RENT_VAR = MockVariable("rent", uprating=CPI_U_UPRATING) + +# Variables without uprating (carried forward unchanged) +MOCK_AGE_VAR = MockVariable("age", uprating=None) +MOCK_PERSON_ID_VAR = MockVariable("person_id", uprating=None) + +# Variable with an invalid uprating path +MOCK_BAD_UPRATING_VAR = MockVariable( + "bad_variable", uprating=INVALID_UPRATING_PATH +) + + +def build_mock_variables() -> dict: + """Return a dict mapping variable name -> MockVariable.""" + return { + v.name: v + for v in [ + MOCK_EMPLOYMENT_INCOME_VAR, + MOCK_RENT_VAR, + MOCK_AGE_VAR, + MOCK_PERSON_ID_VAR, + ] + } + + +# --------------------------------------------------------------------------- +# Mock system +# --------------------------------------------------------------------------- + + +class MockSystem: + """Minimal stand-in for policyengine_us.system.system.""" + + def __init__(self, variables=None, parameters=None): + self.variables = variables or build_mock_variables() + self.parameters = parameters or build_mock_parameters( + { + EMPLOYMENT_INCOME_UPRATING: EMPLOYMENT_INCOME_PARAM_VALUES, + CPI_U_UPRATING: CPI_U_PARAM_VALUES, + } + ) + + +def build_mock_system() -> MockSystem: + return MockSystem() + + +# 
# ---------------------------------------------------------------------------
# Sample datasets
# ---------------------------------------------------------------------------


def build_base_person_df() -> pd.DataFrame:
    """Person table: ids, employment income and age for the base year."""
    columns = {
        "person_id": PERSON_IDS,
        "employment_income": EMPLOYMENT_INCOME_BASE.copy(),
        "age": AGE_BASE.copy(),
    }
    return pd.DataFrame(columns)


def build_base_household_df() -> pd.DataFrame:
    """Household table: ids and rent for the base year."""
    columns = {
        "household_id": HOUSEHOLD_IDS,
        "rent": RENT_BASE.copy(),
    }
    return pd.DataFrame(columns)


def build_base_tax_unit_df() -> pd.DataFrame:
    """Tax-unit table holding only the ids."""
    return pd.DataFrame({"tax_unit_id": TAX_UNIT_IDS})


def build_single_year_dataset(
    time_period: int = BASE_YEAR,
) -> USSingleYearDataset:
    """Create a minimal USSingleYearDataset for testing.

    Person, household and tax-unit tables come from the builders above;
    the remaining entities are small id-only frames.
    """
    id_only_tables = {
        "spm_unit": pd.DataFrame({"spm_unit_id": [1, 2, 3]}),
        "family": pd.DataFrame({"family_id": [1, 2]}),
        "marital_unit": pd.DataFrame({"marital_unit_id": [1, 2, 3, 4]}),
    }
    return USSingleYearDataset(
        person=build_base_person_df(),
        household=build_base_household_df(),
        tax_unit=build_base_tax_unit_df(),
        time_period=time_period,
        **id_only_tables,
    )
+""" + +import sys as _sys +from types import ModuleType +from unittest.mock import patch + +import numpy as np +import pandas as pd +import pytest + +from policyengine_us.data.dataset_schema import ( + USSingleYearDataset, + USMultiYearDataset, +) +from policyengine_us.data.economic_assumptions import ( + _apply_single_year_uprating, + _resolve_parameter, +) +from policyengine_us.tests.microsimulation.data.fixtures.economic_assumptions_fixtures import ( + BASE_YEAR, + END_YEAR_SHORT, + END_YEAR_DEFAULT, + NUM_PERSONS, + NUM_HOUSEHOLDS, + EMPLOYMENT_INCOME_BASE, + EMPLOYMENT_INCOME_GROWTH_FACTOR_2024_TO_2025, + EMPLOYMENT_INCOME_GROWTH_FACTOR_2025_TO_2026, + CPI_U_GROWTH_FACTOR_2024_TO_2025, + AGE_BASE, + RENT_BASE, + EMPLOYMENT_INCOME_UPRATING, + EMPLOYMENT_INCOME_PARAM_VALUES, + CPI_U_UPRATING, + CPI_U_PARAM_VALUES, + INVALID_UPRATING_PATH, + build_mock_system, + build_mock_parameters, + build_single_year_dataset, + MockVariable, + MockSystem, +) + + +def _call_extend_with_mock_system(mock_system, dataset, **kwargs): + """Call extend_single_year_dataset with a mock system module. + + The real ``_apply_uprating`` does ``from policyengine_us.system import + system`` at call time. We intercept that import by temporarily + injecting a fake ``policyengine_us.system`` module into ``sys.modules`` + so the real (expensive) tax-benefit system is never loaded. 
+ """ + fake_module = ModuleType("policyengine_us.system") + fake_module.system = mock_system + + saved = _sys.modules.get("policyengine_us.system") + _sys.modules["policyengine_us.system"] = fake_module + try: + # Import here so the patched module is used + from policyengine_us.data.economic_assumptions import ( + extend_single_year_dataset, + ) + + return extend_single_year_dataset(dataset, **kwargs) + finally: + if saved is not None: + _sys.modules["policyengine_us.system"] = saved + else: + _sys.modules.pop("policyengine_us.system", None) + + +@pytest.fixture +def mock_system(): + return build_mock_system() + + +@pytest.fixture +def base_dataset(): + return build_single_year_dataset() + + +# --------------------------------------------------------------------------- +# _resolve_parameter +# --------------------------------------------------------------------------- + + +class TestResolveParameter: + def test_given_valid_dotted_path_then_returns_parameter(self): + # Given + params = build_mock_parameters( + {EMPLOYMENT_INCOME_UPRATING: EMPLOYMENT_INCOME_PARAM_VALUES} + ) + + # When + result = _resolve_parameter(params, EMPLOYMENT_INCOME_UPRATING) + + # Then + assert result is not None + assert result("2024-01-01") == 100.0 + + def test_given_invalid_path_then_returns_none(self): + # Given + params = build_mock_parameters( + {EMPLOYMENT_INCOME_UPRATING: EMPLOYMENT_INCOME_PARAM_VALUES} + ) + + # When + result = _resolve_parameter(params, INVALID_UPRATING_PATH) + + # Then + assert result is None + + def test_given_partial_valid_path_then_returns_none(self): + # Given + params = build_mock_parameters( + {EMPLOYMENT_INCOME_UPRATING: EMPLOYMENT_INCOME_PARAM_VALUES} + ) + + # When — path exists partially but not fully + result = _resolve_parameter(params, "calibration.gov.nonexistent") + + # Then + assert result is None + + +# --------------------------------------------------------------------------- +# _apply_single_year_uprating +# 
# ---------------------------------------------------------------------------
# _apply_single_year_uprating
# ---------------------------------------------------------------------------


def _uprate_one_year_ahead(base_dataset, system):
    """Copy the base dataset into BASE_YEAR + 1 and uprate it from the base."""
    current = base_dataset.copy()
    current.time_period = str(BASE_YEAR + 1)
    _apply_single_year_uprating(current, base_dataset, system)
    return current


def _person_only_dataset(person_df, time_period):
    """Dataset with a copy of ``person_df`` and one-row household/tax unit."""
    return USSingleYearDataset(
        person=person_df.copy(),
        household=pd.DataFrame({"household_id": [1]}),
        tax_unit=pd.DataFrame({"tax_unit_id": [1]}),
        time_period=time_period,
    )


def _uprate_person_frame(person_df, system):
    """Uprate ``person_df`` from BASE_YEAR to BASE_YEAR + 1; return the result."""
    current = _person_only_dataset(person_df, BASE_YEAR + 1)
    previous = _person_only_dataset(person_df, BASE_YEAR)
    _apply_single_year_uprating(current, previous, system)
    return current


def _extend_to_short_horizon(system, dataset):
    """Extend ``dataset`` through END_YEAR_SHORT using the mocked system."""
    return _call_extend_with_mock_system(
        system, dataset, end_year=END_YEAR_SHORT
    )


class TestApplySingleYearUprating:
    def test_given_uprated_variable_then_values_scaled_by_growth_factor(
        self, base_dataset, mock_system
    ):
        current = _uprate_one_year_ahead(base_dataset, mock_system)

        expected = (
            EMPLOYMENT_INCOME_BASE
            * EMPLOYMENT_INCOME_GROWTH_FACTOR_2024_TO_2025
        )
        np.testing.assert_allclose(
            current.person["employment_income"].values, expected
        )

    def test_given_variable_without_uprating_then_values_unchanged(
        self, base_dataset, mock_system
    ):
        current = _uprate_one_year_ahead(base_dataset, mock_system)

        # age has no uprating, should be unchanged
        np.testing.assert_array_equal(current.person["age"].values, AGE_BASE)

    def test_given_household_variable_with_uprating_then_values_scaled(
        self, base_dataset, mock_system
    ):
        current = _uprate_one_year_ahead(base_dataset, mock_system)

        # rent is uprated by CPI-U
        expected = RENT_BASE * CPI_U_GROWTH_FACTOR_2024_TO_2025
        np.testing.assert_allclose(current.household["rent"].values, expected)

    def test_given_variable_not_in_system_then_values_unchanged(
        self, mock_system
    ):
        # Given — a column the system doesn't know about
        person_df = pd.DataFrame(
            {
                "person_id": [1, 2],
                "employment_income": [50_000.0, 60_000.0],
                "mystery_column": [999.0, 888.0],
            }
        )

        # When
        current = _uprate_person_frame(person_df, mock_system)

        # Then — mystery_column should be untouched
        np.testing.assert_array_equal(
            current.person["mystery_column"].values,
            [999.0, 888.0],
        )

    def test_given_uprating_path_unresolvable_then_values_unchanged(self):
        # Given — variable has an uprating path that doesn't exist in params
        system = MockSystem(
            variables={
                "bad_variable": MockVariable(
                    "bad_variable", uprating=INVALID_UPRATING_PATH
                )
            },
            parameters=build_mock_parameters({}),
        )
        person_df = pd.DataFrame({"person_id": [1], "bad_variable": [42.0]})

        # When
        current = _uprate_person_frame(person_df, system)

        # Then
        assert current.person["bad_variable"].values[0] == 42.0

    def test_given_previous_param_value_zero_then_values_unchanged(self):
        # Given — parameter value is 0 for the previous year (avoid div-by-zero)
        system = MockSystem(
            variables={
                "some_income": MockVariable(
                    "some_income", uprating="test.param"
                )
            },
            parameters=build_mock_parameters(
                {"test.param": {"2024-01-01": 0.0, "2025-01-01": 100.0}}
            ),
        )
        person_df = pd.DataFrame({"person_id": [1], "some_income": [5_000.0]})

        # When
        current = _uprate_person_frame(person_df, system)

        # Then — no division by zero, value unchanged
        assert current.person["some_income"].values[0] == 5_000.0

    def test_given_zero_base_values_then_uprating_preserves_zeros(
        self, base_dataset, mock_system
    ):
        # Given — person index 7 has employment_income = 0
        current = _uprate_one_year_ahead(base_dataset, mock_system)

        # Then — 0 * any_factor = 0
        assert current.person["employment_income"].values[7] == 0.0


# ---------------------------------------------------------------------------
# extend_single_year_dataset (end-to-end with mocked system)
# ---------------------------------------------------------------------------


class TestExtendSingleYearDataset:
    def test_given_base_year_and_end_year_then_correct_number_of_years(
        self, base_dataset, mock_system
    ):
        result = _extend_to_short_horizon(mock_system, base_dataset)

        assert result.years == list(range(BASE_YEAR, END_YEAR_SHORT + 1))

    def test_given_base_year_equals_end_year_then_single_year_returned(
        self, base_dataset, mock_system
    ):
        result = _call_extend_with_mock_system(
            mock_system, base_dataset, end_year=BASE_YEAR
        )

        assert result.years == [BASE_YEAR]

    def test_given_default_end_year_then_extends_to_2035(self):
        # Given — need param values through 2035
        horizon = range(BASE_YEAR, END_YEAR_DEFAULT + 1)
        extended_values = {
            f"{y}-01-01": 100.0 * (1.1 ** (y - BASE_YEAR)) for y in horizon
        }
        cpi_values = {
            f"{y}-01-01": 300.0 * (1.03 ** (y - BASE_YEAR)) for y in horizon
        }
        system = MockSystem(
            parameters=build_mock_parameters(
                {
                    EMPLOYMENT_INCOME_UPRATING: extended_values,
                    CPI_U_UPRATING: cpi_values,
                }
            )
        )

        # When
        result = _call_extend_with_mock_system(
            system, build_single_year_dataset()
        )

        # Then
        assert result.years == list(range(BASE_YEAR, END_YEAR_DEFAULT + 1))

    def test_given_extended_dataset_then_base_year_values_unchanged(
        self, base_dataset, mock_system
    ):
        result = _extend_to_short_horizon(mock_system, base_dataset)

        # base year data should match original
        base = result[BASE_YEAR]
        np.testing.assert_array_equal(
            base.person["employment_income"].values,
            EMPLOYMENT_INCOME_BASE,
        )
        np.testing.assert_array_equal(base.person["age"].values, AGE_BASE)
        np.testing.assert_array_equal(base.household["rent"].values, RENT_BASE)

    def test_given_extended_dataset_then_year_one_correctly_uprated(
        self, base_dataset, mock_system
    ):
        result = _extend_to_short_horizon(mock_system, base_dataset)

        expected = (
            EMPLOYMENT_INCOME_BASE
            * EMPLOYMENT_INCOME_GROWTH_FACTOR_2024_TO_2025
        )
        np.testing.assert_allclose(
            result[BASE_YEAR + 1].person["employment_income"].values, expected
        )

    def test_given_extended_dataset_then_year_two_chains_uprating(
        self, base_dataset, mock_system
    ):
        result = _extend_to_short_horizon(mock_system, base_dataset)

        # year 2 should be uprated from year 1, not from base
        expected = (
            EMPLOYMENT_INCOME_BASE
            * EMPLOYMENT_INCOME_GROWTH_FACTOR_2024_TO_2025
            * EMPLOYMENT_INCOME_GROWTH_FACTOR_2025_TO_2026
        )
        np.testing.assert_allclose(
            result[BASE_YEAR + 2].person["employment_income"].values, expected
        )

    def test_given_extended_dataset_then_non_uprated_variable_same_all_years(
        self, base_dataset, mock_system
    ):
        result = _extend_to_short_horizon(mock_system, base_dataset)

        # age has no uprating, identical across all years
        for year in result.years:
            np.testing.assert_array_equal(
                result[year].person["age"].values, AGE_BASE
            )

    def test_given_extended_dataset_then_row_counts_preserved(
        self, base_dataset, mock_system
    ):
        result = _extend_to_short_horizon(mock_system, base_dataset)

        # every year has the same number of rows per entity
        for year in result.years:
            ds = result[year]
            assert len(ds.person) == NUM_PERSONS
            assert len(ds.household) == NUM_HOUSEHOLDS

    def test_given_extended_dataset_then_each_year_has_correct_time_period(
        self, base_dataset, mock_system
    ):
        result = _extend_to_short_horizon(mock_system, base_dataset)

        for year in result.years:
            assert int(result[year].time_period) == year

    def test_given_extended_dataset_then_result_is_multi_year_dataset(
        self, base_dataset, mock_system
    ):
        result = _extend_to_short_horizon(mock_system, base_dataset)

        assert isinstance(result, USMultiYearDataset)
        assert result.data_format == "time_period_arrays"

    def test_given_extended_dataset_then_input_dataset_not_mutated(
        self, mock_system
    ):
        # Given
        dataset = build_single_year_dataset()
        original_values = dataset.person["employment_income"].values.copy()

        # When
        _extend_to_short_horizon(mock_system, dataset)

        # Then — original dataset should be untouched
        np.testing.assert_array_equal(
            dataset.person["employment_income"].values,
            original_values,
        )

    def test_given_multiple_entities_uprated_then_all_apply_correctly(
        self, base_dataset, mock_system
    ):
        result = _extend_to_short_horizon(mock_system, base_dataset)

        # both person (employment_income) and household (rent) uprated
        year_one = result[BASE_YEAR + 1]
        np.testing.assert_allclose(
            year_one.person["employment_income"].values,
            EMPLOYMENT_INCOME_BASE
            * EMPLOYMENT_INCOME_GROWTH_FACTOR_2024_TO_2025,
        )
        np.testing.assert_allclose(
            year_one.household["rent"].values,
            RENT_BASE * CPI_U_GROWTH_FACTOR_2024_TO_2025,
        )