PolicyEngine · DTrim99 · Feb 26, 2026 · Feb 26, 2026 · Feb 26, 2026 · Feb 26, 2026
diff --git a/us/states/sc/data_exploration.ipynb b/us/states/sc/data_exploration.ipynb
@@ -0,0 +1,290 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# SC Dataset Exploration\n",
+    "\n",
+    "This notebook explores the South Carolina (SC) dataset to understand household counts, income distribution, and demographic characteristics."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from policyengine_us import Microsimulation\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "\n",
+    "# Dataset location handed to Microsimulation(dataset=...) when the simulation is built\n",
+    "# (the hf:// prefix presumably refers to the Hugging Face Hub - TODO confirm).\n",
+    "SC_DATASET = \"hf://policyengine/policyengine-us-data/states/SC.h5\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load SC dataset\n",
+    "# Build the microsimulation from the path stored in SC_DATASET; every later\n",
+    "# cell in this notebook queries this `sim` object.\n",
+    "sim = Microsimulation(dataset=SC_DATASET)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Number of households in dataset: 35,324\n",
+      "Household count (weighted): 1,887,388\n",
+      "Person count (weighted): 5,451,832\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Check dataset size\n",
+    "# The record count is the length of the household weight vector; the two\n",
+    "# totals are sums of the household-mapped count variables (reported as\n",
+    "# weighted in the output below).\n",
+    "household_weight = sim.calculate(\"household_weight\", period=2025)\n",
+    "household_count, person_count = (\n",
+    "    sim.calculate(variable, period=2025, map_to=\"household\")\n",
+    "    for variable in (\"household_count\", \"person_count\")\n",
+    ")\n",
+    "\n",
+    "print(f\"Number of households in dataset: {len(household_weight):,}\")\n",
+    "print(f\"Household count (weighted): {household_count.sum():,.0f}\")\n",
+    "print(f\"Person count (weighted): {person_count.sum():,.0f}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Check income distribution (weighted vs unweighted, household and person level)\n",
+    "\n",
+    "def weighted_percentile(values, weights, percentile):\n",
+    "    \"\"\"Return the value sitting at a weighted percentile.\n",
+    "\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    values : np.ndarray\n",
+    "        Observations to rank.\n",
+    "    weights : np.ndarray\n",
+    "        One weight per observation, aligned with `values`.\n",
+    "    percentile : float\n",
+    "        Percentile on a 0-100 scale.\n",
+    "\n",
+    "    Returns\n",
+    "    -------\n",
+    "    The first value (in ascending order) whose cumulative weight reaches\n",
+    "    `percentile` percent of the total weight, clamped to the largest value.\n",
+    "    \"\"\"\n",
+    "    order = np.argsort(values)\n",
+    "    running_weight = np.cumsum(weights[order])\n",
+    "    cutoff = running_weight[-1] * percentile / 100\n",
+    "    # searchsorted can return len(values) when the cutoff exceeds every entry\n",
+    "    position = min(np.searchsorted(running_weight, cutoff), len(values) - 1)\n",
+    "    return values[order][position]\n",
+    "\n",
+    "# Household-level AGI and weights as plain numpy arrays\n",
+    "agi_household = sim.calculate(\"adjusted_gross_income\", period=2025, map_to=\"household\")\n",
+    "agi_hh_array = np.array(agi_household)\n",
+    "hh_weights = np.array(sim.calculate(\"household_weight\", period=2025))\n",
+    "\n",
+    "# Person-level AGI and weights as plain numpy arrays\n",
+    "agi_person = sim.calculate(\"adjusted_gross_income\", period=2025, map_to=\"person\")\n",
+    "agi_person_array = np.array(agi_person)\n",
+    "person_weights = np.array(sim.calculate(\"person_weight\", period=2025))\n",
+    "\n",
+    "# Household statistics: unweighted median, weighted median, weighted mean\n",
+    "unweighted_median_hh = np.median(agi_hh_array)\n",
+    "weighted_median_hh = weighted_percentile(agi_hh_array, hh_weights, 50)\n",
+    "weighted_avg_hh = np.average(agi_hh_array, weights=hh_weights)\n",
+    "\n",
+    "# Person statistics: unweighted median, weighted median, weighted mean\n",
+    "unweighted_median_person = np.median(agi_person_array)\n",
+    "weighted_median_person = weighted_percentile(agi_person_array, person_weights, 50)\n",
+    "weighted_avg_person = np.average(agi_person_array, weights=person_weights)\n",
+    "\n",
+    "# Average household size = total person weight / total household weight\n",
+    "total_households = hh_weights.sum()\n",
+    "total_persons = person_weights.sum()\n",
+    "avg_hh_size = total_persons / total_households\n",
+    "\n",
+    "rule = \"=\" * 60\n",
+    "print(rule)\n",
+    "print(\"INCOME DISTRIBUTION SUMMARY\")\n",
+    "print(rule)\n",
+    "print(f\"\\nHousehold AGI:\")\n",
+    "print(f\"  Unweighted median: ${unweighted_median_hh:,.0f}\")\n",
+    "print(f\"  Weighted median:   ${weighted_median_hh:,.0f}\")\n",
+    "print(f\"  Weighted average:  ${weighted_avg_hh:,.0f}\")\n",
+    "\n",
+    "print(f\"\\nPerson AGI:\")\n",
+    "print(f\"  Unweighted median: ${unweighted_median_person:,.0f}\")\n",
+    "print(f\"  Weighted median:   ${weighted_median_person:,.0f}\")\n",
+    "print(f\"  Weighted average:  ${weighted_avg_person:,.0f}\")\n",
+    "\n",
+    "print(f\"\\nAverage household size: {avg_hh_size:.1f}\")\n",
+    "\n",
+    "print(f\"\\nWeighted household AGI percentiles:\")\n",
+    "print(f\"  25th percentile: ${weighted_percentile(agi_hh_array, hh_weights, 25):,.0f}\")\n",
+    "print(f\"  50th percentile: ${weighted_percentile(agi_hh_array, hh_weights, 50):,.0f}\")\n",
+    "print(f\"  75th percentile: ${weighted_percentile(agi_hh_array, hh_weights, 75):,.0f}\")\n",
+    "print(f\"  90th percentile: ${weighted_percentile(agi_hh_array, hh_weights, 90):,.0f}\")\n",
+    "print(f\"  95th percentile: ${weighted_percentile(agi_hh_array, hh_weights, 95):,.0f}\")\n",
+    "print(f\"  Max AGI: ${agi_hh_array.max():,.0f}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Households with children (weighted):\n",
+      "  Total households with children: 598,564\n",
+      "  Households with 1 child: 247,956\n",
+      "  Households with 2 children: 190,545\n",
+      "  Households with 3+ children: 160,063\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Check households with children\n",
+    "# Person-level inputs: one entry per person, each carrying its household's id\n",
+    "# and weight. The person-mapped weight gets its own name so the household-level\n",
+    "# `household_weight` defined in the dataset-size cell is not overwritten.\n",
+    "is_child = sim.calculate(\"is_child\", period=2025, map_to=\"person\")\n",
+    "household_id = sim.calculate(\"household_id\", period=2025, map_to=\"person\")\n",
+    "person_level_hh_weight = sim.calculate(\"household_weight\", period=2025, map_to=\"person\")\n",
+    "\n",
+    "# Create DataFrame\n",
+    "df_households = pd.DataFrame({\n",
+    "    'household_id': household_id,\n",
+    "    'is_child': is_child,\n",
+    "    'household_weight': person_level_hh_weight\n",
+    "})\n",
+    "\n",
+    "# Count children per household; 'first' keeps one copy of the household weight\n",
+    "# (presumably identical on every member's row, since it is a household variable\n",
+    "# mapped to persons - TODO confirm).\n",
+    "children_per_household = df_households.groupby('household_id').agg({\n",
+    "    'is_child': 'sum',\n",
+    "    'household_weight': 'first'\n",
+    "}).reset_index()\n",
+    "\n",
+    "# Calculate weighted household counts by summing the weights of the\n",
+    "# households that fall in each child-count group\n",
+    "n_children = children_per_household['is_child']\n",
+    "weight_per_household = children_per_household['household_weight']\n",
+    "total_households_with_children = weight_per_household[n_children > 0].sum()\n",
+    "households_with_1_child = weight_per_household[n_children == 1].sum()\n",
+    "households_with_2_children = weight_per_household[n_children == 2].sum()\n",
+    "households_with_3plus_children = weight_per_household[n_children >= 3].sum()\n",
+    "\n",
+    "print(f\"\\nHouseholds with children (weighted):\")\n",
+    "print(f\"  Total households with children: {total_households_with_children:,.0f}\")\n",
+    "print(f\"  Households with 1 child: {households_with_1_child:,.0f}\")\n",
+    "print(f\"  Households with 2 children: {households_with_2_children:,.0f}\")\n",
+    "print(f\"  Households with 3+ children: {households_with_3plus_children:,.0f}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Children by age:\n",
+      "  Total children under 18: 1,198,147\n",
+      "  Children under 6: 349,101\n",
+      "  Children under 3: 169,412\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Check children by age groups\n",
+    "# Every variable is requested for 2025 explicitly, matching the other cells.\n",
+    "# Without a period the values presumably come from the simulation's default\n",
+    "# calculation period, which is not guaranteed to be 2025 - TODO confirm.\n",
+    "df = pd.DataFrame({\n",
+    "    \"household_id\": sim.calculate(\"household_id\", period=2025, map_to=\"person\"),\n",
+    "    \"tax_unit_id\": sim.calculate(\"tax_unit_id\", period=2025, map_to=\"person\"),\n",
+    "    \"person_id\": sim.calculate(\"person_id\", period=2025, map_to=\"person\"),\n",
+    "    \"age\": sim.calculate(\"age\", period=2025, map_to=\"person\"),\n",
+    "    \"person_weight\": sim.calculate(\"person_weight\", period=2025, map_to=\"person\")\n",
+    "})\n",
+    "\n",
+    "# Filter for children; the age cut-offs are strict (< 18, < 6, < 3), so the\n",
+    "# younger groups are subsets of the older ones\n",
+    "children_under_18_df = df[df['age'] < 18]\n",
+    "children_under_6_df = df[df['age'] < 6]\n",
+    "children_under_3_df = df[df['age'] < 3]\n",
+    "\n",
+    "# Calculate weighted totals by summing person weights within each age group\n",
+    "total_children = children_under_18_df['person_weight'].sum()\n",
+    "children_under_6 = children_under_6_df['person_weight'].sum()\n",
+    "children_under_3 = children_under_3_df['person_weight'].sum()\n",
+    "\n",
+    "print(f\"\\nChildren by age:\")\n",
+    "print(f\"  Total children under 18: {total_children:,.0f}\")\n",
+    "print(f\"  Children under 6: {children_under_6:,.0f}\")\n",
+    "print(f\"  Children under 3: {children_under_3:,.0f}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create comprehensive summary table\n",
+    "# Each row pairs a metric label with its already-formatted value, so a label\n",
+    "# and its number cannot drift apart. All inputs come from the earlier cells.\n",
+    "summary_rows = [\n",
+    "    ('Household count (weighted)', f\"{household_count.sum():,.0f}\"),\n",
+    "    ('Person count (weighted)', f\"{person_count.sum():,.0f}\"),\n",
+    "    ('Average household size', f\"{avg_hh_size:.1f}\"),\n",
+    "    ('Weighted median household AGI', f\"${weighted_median_hh:,.0f}\"),\n",
+    "    ('Weighted average household AGI', f\"${weighted_avg_hh:,.0f}\"),\n",
+    "    ('Weighted median person AGI', f\"${weighted_median_person:,.0f}\"),\n",
+    "    ('Weighted average person AGI', f\"${weighted_avg_person:,.0f}\"),\n",
+    "    ('Unweighted median household AGI', f\"${unweighted_median_hh:,.0f}\"),\n",
+    "    ('Unweighted median person AGI', f\"${unweighted_median_person:,.0f}\"),\n",
+    "    ('25th percentile household AGI', f\"${weighted_percentile(agi_hh_array, hh_weights, 25):,.0f}\"),\n",
+    "    ('75th percentile household AGI', f\"${weighted_percentile(agi_hh_array, hh_weights, 75):,.0f}\"),\n",
+    "    ('90th percentile household AGI', f\"${weighted_percentile(agi_hh_array, hh_weights, 90):,.0f}\"),\n",
+    "    ('95th percentile household AGI', f\"${weighted_percentile(agi_hh_array, hh_weights, 95):,.0f}\"),\n",
+    "    ('Max household AGI', f\"${agi_hh_array.max():,.0f}\"),\n",
+    "    ('Total households with children', f\"{total_households_with_children:,.0f}\"),\n",
+    "    ('Households with 1 child', f\"{households_with_1_child:,.0f}\"),\n",
+    "    ('Households with 2 children', f\"{households_with_2_children:,.0f}\"),\n",
+    "    ('Households with 3+ children', f\"{households_with_3plus_children:,.0f}\"),\n",
+    "    ('Total children under 18', f\"{total_children:,.0f}\"),\n",
+    "    ('Children under 6', f\"{children_under_6:,.0f}\"),\n",
+    "    ('Children under 3', f\"{children_under_3:,.0f}\"),\n",
+    "]\n",
+    "\n",
+    "# Same two-column layout as before: labels under 'Metric', strings under 'Value'\n",
+    "summary_data = {\n",
+    "    'Metric': [metric for metric, _ in summary_rows],\n",
+    "    'Value': [value for _, value in summary_rows],\n",
+    "}\n",
+    "\n",
+    "summary_df = pd.DataFrame(summary_data)\n",
+    "\n",
+    "rule = \"=\"*65\n",
+    "print(\"\\n\" + rule)\n",
+    "print(\"SC DATASET SUMMARY - WEIGHTED (Population Estimates)\")\n",
+    "print(rule)\n",
+    "print(summary_df.to_string(index=False))\n",
+    "print(rule)\n",
+    "\n",
+    "# Save table (written relative to the current working directory)\n",
+    "summary_df.to_csv('sc_dataset_summary_weighted.csv', index=False)\n",
+    "print(\"\\nSummary saved to: sc_dataset_summary_weighted.csv\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "======================================================================\n",
+      "HOUSEHOLDS WITH $0 INCOME\n",
+      "======================================================================\n",
+      "Household count: 179,119\n",
+      "Percentage of all households: 9.49%\n",
+      "======================================================================\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Households with $0 income\n",
+    "# Household-level AGI and weights as plain arrays; `agi_hh`, `weights` and\n",
+    "# `total_households` are reused by the income-bracket cell that follows.\n",
+    "weights = np.array(sim.calculate(\"household_weight\", period=2025))\n",
+    "agi_hh = np.array(sim.calculate(\"adjusted_gross_income\", period=2025, map_to=\"household\"))\n",
+    "total_households = weights.sum()\n",
+    "\n",
+    "# Exact-zero AGI only: negative AGI households are not counted here\n",
+    "zero_income_mask = agi_hh == 0\n",
+    "zero_income_count = weights[zero_income_mask].sum()\n",
+    "\n",
+    "rule = \"=\"*70\n",
+    "print(\"\\n\" + rule)\n",
+    "print(\"HOUSEHOLDS WITH $0 INCOME\")\n",
+    "print(rule)\n",
+    "print(f\"Household count: {zero_income_count:,.0f}\")\n",
+    "print(f\"Percentage of all households: {zero_income_count / total_households * 100:.2f}%\")\n",
+    "print(rule)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "======================================================================\n",
+      "HOUSEHOLD COUNTS BY INCOME BRACKET\n",
+      "======================================================================\n",
+      "Income Bracket Households % of All Households\n",
+      "       $0-$10k    434,505              23.02%\n",
+      "     $10k-$20k    155,370               8.23%\n",
+      "     $20k-$30k    149,595               7.93%\n",
+      "     $30k-$40k    115,365               6.11%\n",
+      "     $40k-$50k    127,566               6.76%\n",
+      "     $50k-$60k    110,405               5.85%\n",
+      "======================================================================\n",
+      "\n",
+      "Total households in $0-$60k range: 1,092,805\n",
+      "Percentage of all households in $0-$60k range: 57.90%\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Household counts by income brackets\n",
+    "# Each entry is (lower bound inclusive, upper bound exclusive, display label).\n",
+    "# Uses `agi_hh`, `weights` and `total_households` from the $0-income cell.\n",
+    "income_brackets = [\n",
+    "    (0, 10000, \"$0-$10k\"),\n",
+    "    (10000, 20000, \"$10k-$20k\"),\n",
+    "    (20000, 30000, \"$20k-$30k\"),\n",
+    "    (30000, 40000, \"$30k-$40k\"),\n",
+    "    (40000, 50000, \"$40k-$50k\"),\n",
+    "    (50000, 60000, \"$50k-$60k\")\n",
+    "]\n",
+    "\n",
+    "# Weighted household total per bracket, computed once and reused for both the\n",
+    "# table rows and the grand total below\n",
+    "bracket_counts = [\n",
+    "    weights[(agi_hh >= lower) & (agi_hh < upper)].sum()\n",
+    "    for lower, upper, _ in income_brackets\n",
+    "]\n",
+    "\n",
+    "bracket_data = []\n",
+    "for (_, _, label), count in zip(income_brackets, bracket_counts):\n",
+    "    pct_of_total = (count / total_households) * 100\n",
+    "    bracket_data.append({\n",
+    "        \"Income Bracket\": label,\n",
+    "        \"Households\": f\"{count:,.0f}\",\n",
+    "        \"% of All Households\": f\"{pct_of_total:.2f}%\"\n",
+    "    })\n",
+    "\n",
+    "income_df = pd.DataFrame(bracket_data)\n",
+    "\n",
+    "rule = \"=\"*70\n",
+    "print(\"\\n\" + rule)\n",
+    "print(\"HOUSEHOLD COUNTS BY INCOME BRACKET\")\n",
+    "print(rule)\n",
+    "print(income_df.to_string(index=False))\n",
+    "print(rule)\n",
+    "\n",
+    "# Total in $0-$60k range\n",
+    "total_in_range = sum(bracket_counts)\n",
+    "print(f\"\\nTotal households in $0-$60k range: {total_in_range:,.0f}\")\n",
+    "print(f\"Percentage of all households in $0-$60k range: {total_in_range / total_households * 100:.2f}%\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}