Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
290 changes: 290 additions & 0 deletions us/states/sc/data_exploration.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,290 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# SC Dataset Exploration\n",
"\n",
"This notebook explores the South Carolina (SC) dataset to understand household counts, income distribution, and demographic characteristics."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from policyengine_us import Microsimulation\n",
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"SC_DATASET = \"hf://policyengine/policyengine-us-data/states/SC.h5\""
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Load the South Carolina (SC) dataset referenced by SC_DATASET into a microsimulation\n",
"sim = Microsimulation(dataset=SC_DATASET)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of households in dataset: 35,324\n",
"Household count (weighted): 1,887,388\n",
"Person count (weighted): 5,451,832\n"
]
}
],
"source": [
"# Dataset size: number of household records versus the weighted totals\n",
"household_weight = sim.calculate(\"household_weight\", period=2025)\n",
"household_count = sim.calculate(\"household_count\", period=2025, map_to=\"household\")\n",
"person_count = sim.calculate(\"person_count\", period=2025, map_to=\"household\")\n",
"\n",
"n_household_records = len(household_weight)\n",
"weighted_households = household_count.sum()\n",
"weighted_persons = person_count.sum()\n",
"\n",
"# Assemble the report first, then emit it in a single print call\n",
"size_report = [\n",
"    f\"Number of households in dataset: {n_household_records:,}\",\n",
"    f\"Household count (weighted): {weighted_households:,.0f}\",\n",
"    f\"Person count (weighted): {weighted_persons:,.0f}\",\n",
"]\n",
"print(\"\\n\".join(size_report))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Check income distribution (weighted vs unweighted, household and person level)\n",
"agi_household = sim.calculate(\"adjusted_gross_income\", period=2025, map_to=\"household\")\n",
"agi_hh_array = np.array(agi_household)\n",
"hh_weights = np.array(sim.calculate(\"household_weight\", period=2025))\n",
"\n",
"agi_person = sim.calculate(\"adjusted_gross_income\", period=2025, map_to=\"person\")\n",
"agi_person_array = np.array(agi_person)\n",
"person_weights = np.array(sim.calculate(\"person_weight\", period=2025))\n",
"\n",
"\n",
"def weighted_percentile(values, weights, percentile):\n",
"    \"\"\"Return the weighted percentile of `values`.\n",
"\n",
"    The values are sorted ascending, their weights are accumulated in that\n",
"    order, and the first value whose cumulative weight reaches\n",
"    `percentile` percent of the total weight is returned.\n",
"\n",
"    Args:\n",
"        values: 1-D array-like of observations.\n",
"        weights: 1-D array-like of weights, one per observation.\n",
"        percentile: Percentile to extract, on a 0-100 scale.\n",
"\n",
"    Returns:\n",
"        The element of `values` located at the requested weighted percentile.\n",
"\n",
"    Raises:\n",
"        ValueError: If `values` is empty or `values` and `weights` differ in shape.\n",
"    \"\"\"\n",
"    values = np.asarray(values)\n",
"    weights = np.asarray(weights)\n",
"    if values.size == 0:\n",
"        raise ValueError(\"weighted_percentile requires at least one value\")\n",
"    if values.shape != weights.shape:\n",
"        # Without this check a longer weights array would be silently mis-paired\n",
"        raise ValueError(\"values and weights must have the same shape\")\n",
"\n",
"    sorted_indices = np.argsort(values)\n",
"    sorted_values = values[sorted_indices]\n",
"    cumulative_weight = np.cumsum(weights[sorted_indices])\n",
"    target_weight = cumulative_weight[-1] * percentile / 100\n",
"    idx = np.searchsorted(cumulative_weight, target_weight)\n",
"    # Clamp so that rounding at the top end can never index past the last value\n",
"    return sorted_values[min(idx, len(sorted_values) - 1)]\n",
"\n",
"\n",
"# Unweighted medians\n",
"unweighted_median_hh = np.median(agi_hh_array)\n",
"unweighted_median_person = np.median(agi_person_array)\n",
"\n",
"# Weighted medians\n",
"weighted_median_hh = weighted_percentile(agi_hh_array, hh_weights, 50)\n",
"weighted_median_person = weighted_percentile(agi_person_array, person_weights, 50)\n",
"\n",
"# Weighted averages\n",
"weighted_avg_hh = np.average(agi_hh_array, weights=hh_weights)\n",
"weighted_avg_person = np.average(agi_person_array, weights=person_weights)\n",
"\n",
"# Average household size\n",
"total_persons = person_weights.sum()\n",
"total_households = hh_weights.sum()\n",
"avg_hh_size = total_persons / total_households\n",
"\n",
"print(\"=\" * 60)\n",
"print(\"INCOME DISTRIBUTION SUMMARY\")\n",
"print(\"=\" * 60)\n",
"print(\"\\nHousehold AGI:\")\n",
"print(f\" Unweighted median: ${unweighted_median_hh:,.0f}\")\n",
"print(f\" Weighted median: ${weighted_median_hh:,.0f}\")\n",
"print(f\" Weighted average: ${weighted_avg_hh:,.0f}\")\n",
"\n",
"print(\"\\nPerson AGI:\")\n",
"print(f\" Unweighted median: ${unweighted_median_person:,.0f}\")\n",
"print(f\" Weighted median: ${weighted_median_person:,.0f}\")\n",
"print(f\" Weighted average: ${weighted_avg_person:,.0f}\")\n",
"\n",
"print(f\"\\nAverage household size: {avg_hh_size:.1f}\")\n",
"\n",
"# One loop instead of five copy-pasted print statements\n",
"print(\"\\nWeighted household AGI percentiles:\")\n",
"for pct in (25, 50, 75, 90, 95):\n",
"    print(f\" {pct}th percentile: ${weighted_percentile(agi_hh_array, hh_weights, pct):,.0f}\")\n",
"print(f\" Max AGI: ${agi_hh_array.max():,.0f}\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Households with children (weighted):\n",
" Total households with children: 598,564\n",
" Households with 1 child: 247,956\n",
" Households with 2 children: 190,545\n",
" Households with 3+ children: 160,063\n"
]
}
],
"source": [
"# Check households with children\n",
"is_child = sim.calculate(\"is_child\", period=2025, map_to=\"person\")\n",
"household_id = sim.calculate(\"household_id\", period=2025, map_to=\"person\")\n",
"household_weight = sim.calculate(\"household_weight\", period=2025, map_to=\"person\")\n",
"\n",
"# Create DataFrame\n",
"df_households = pd.DataFrame({\n",
" 'household_id': household_id,\n",
" 'is_child': is_child,\n",
" 'household_weight': household_weight\n",
"})\n",
"\n",
"# Count children per household\n",
"children_per_household = df_households.groupby('household_id').agg({\n",
" 'is_child': 'sum',\n",
" 'household_weight': 'first'\n",
"}).reset_index()\n",
"\n",
"# Calculate weighted household counts\n",
"total_households_with_children = children_per_household[children_per_household['is_child'] > 0]['household_weight'].sum()\n",
"households_with_1_child = children_per_household[children_per_household['is_child'] == 1]['household_weight'].sum()\n",
"households_with_2_children = children_per_household[children_per_household['is_child'] == 2]['household_weight'].sum()\n",
"households_with_3plus_children = children_per_household[children_per_household['is_child'] >= 3]['household_weight'].sum()\n",
"\n",
"print(f\"\\nHouseholds with children (weighted):\")\n",
"print(f\" Total households with children: {total_households_with_children:,.0f}\")\n",
"print(f\" Households with 1 child: {households_with_1_child:,.0f}\")\n",
"print(f\" Households with 2 children: {households_with_2_children:,.0f}\")\n",
"print(f\" Households with 3+ children: {households_with_3plus_children:,.0f}\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Children by age:\n",
" Total children under 18: 1,198,147\n",
" Children under 6: 349,101\n",
" Children under 3: 169,412\n"
]
}
],
"source": [
"# Check children by age groups\n",
"df = pd.DataFrame({\n",
" \"household_id\": sim.calculate(\"household_id\", map_to=\"person\"),\n",
" \"tax_unit_id\": sim.calculate(\"tax_unit_id\", map_to=\"person\"),\n",
" \"person_id\": sim.calculate(\"person_id\", map_to=\"person\"),\n",
" \"age\": sim.calculate(\"age\", map_to=\"person\"),\n",
" \"person_weight\": sim.calculate(\"person_weight\", map_to=\"person\")\n",
"})\n",
"\n",
"# Filter for children and apply weights\n",
"children_under_18_df = df[df['age'] < 18]\n",
"children_under_6_df = df[df['age'] < 6]\n",
"children_under_3_df = df[df['age'] < 3]\n",
"\n",
"# Calculate weighted totals\n",
"total_children = children_under_18_df['person_weight'].sum()\n",
"children_under_6 = children_under_6_df['person_weight'].sum()\n",
"children_under_3 = children_under_3_df['person_weight'].sum()\n",
"\n",
"print(f\"\\nChildren by age:\")\n",
"print(f\" Total children under 18: {total_children:,.0f}\")\n",
"print(f\" Children under 6: {children_under_6:,.0f}\")\n",
"print(f\" Children under 3: {children_under_3:,.0f}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create comprehensive summary table\n",
"# Single definition of the output path, used by both the save call and the message\n",
"SUMMARY_CSV_PATH = 'sc_dataset_summary_weighted.csv'\n",
"\n",
"# Each metric label sits next to its formatted value so the two cannot drift\n",
"# out of alignment the way two parallel lists can.\n",
"summary_rows = [\n",
"    ('Household count (weighted)', f\"{household_count.sum():,.0f}\"),\n",
"    ('Person count (weighted)', f\"{person_count.sum():,.0f}\"),\n",
"    ('Average household size', f\"{avg_hh_size:.1f}\"),\n",
"    ('Weighted median household AGI', f\"${weighted_median_hh:,.0f}\"),\n",
"    ('Weighted average household AGI', f\"${weighted_avg_hh:,.0f}\"),\n",
"    ('Weighted median person AGI', f\"${weighted_median_person:,.0f}\"),\n",
"    ('Weighted average person AGI', f\"${weighted_avg_person:,.0f}\"),\n",
"    ('Unweighted median household AGI', f\"${unweighted_median_hh:,.0f}\"),\n",
"    ('Unweighted median person AGI', f\"${unweighted_median_person:,.0f}\"),\n",
"    ('25th percentile household AGI', f\"${weighted_percentile(agi_hh_array, hh_weights, 25):,.0f}\"),\n",
"    ('75th percentile household AGI', f\"${weighted_percentile(agi_hh_array, hh_weights, 75):,.0f}\"),\n",
"    ('90th percentile household AGI', f\"${weighted_percentile(agi_hh_array, hh_weights, 90):,.0f}\"),\n",
"    ('95th percentile household AGI', f\"${weighted_percentile(agi_hh_array, hh_weights, 95):,.0f}\"),\n",
"    ('Max household AGI', f\"${agi_hh_array.max():,.0f}\"),\n",
"    ('Total households with children', f\"{total_households_with_children:,.0f}\"),\n",
"    ('Households with 1 child', f\"{households_with_1_child:,.0f}\"),\n",
"    ('Households with 2 children', f\"{households_with_2_children:,.0f}\"),\n",
"    ('Households with 3+ children', f\"{households_with_3plus_children:,.0f}\"),\n",
"    ('Total children under 18', f\"{total_children:,.0f}\"),\n",
"    ('Children under 6', f\"{children_under_6:,.0f}\"),\n",
"    ('Children under 3', f\"{children_under_3:,.0f}\"),\n",
"]\n",
"\n",
"summary_data = {\n",
"    'Metric': [metric for metric, _ in summary_rows],\n",
"    'Value': [value for _, value in summary_rows],\n",
"}\n",
"\n",
"summary_df = pd.DataFrame(summary_data)\n",
"\n",
"print(\"\\n\" + \"=\"*65)\n",
"print(\"SC DATASET SUMMARY - WEIGHTED (Population Estimates)\")\n",
"print(\"=\"*65)\n",
"print(summary_df.to_string(index=False))\n",
"print(\"=\"*65)\n",
"\n",
"# Save table\n",
"summary_df.to_csv(SUMMARY_CSV_PATH, index=False)\n",
"print(f\"\\nSummary saved to: {SUMMARY_CSV_PATH}\")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"======================================================================\n",
"HOUSEHOLDS WITH $0 INCOME\n",
"======================================================================\n",
"Household count: 179,119\n",
"Percentage of all households: 9.49%\n",
"======================================================================\n"
]
}
],
"source": [
"# Weighted count and share of households whose AGI equals zero\n",
"agi_hh = np.array(sim.calculate(\"adjusted_gross_income\", period=2025, map_to=\"household\"))\n",
"weights = np.array(sim.calculate(\"household_weight\", period=2025))\n",
"\n",
"total_households = weights.sum()\n",
"zero_income_mask = agi_hh == 0\n",
"zero_income_count = weights[zero_income_mask].sum()\n",
"zero_income_share = zero_income_count / total_households * 100\n",
"\n",
"rule = \"=\"*70\n",
"print(\"\\n\" + rule)\n",
"print(\"HOUSEHOLDS WITH $0 INCOME\")\n",
"print(rule)\n",
"print(f\"Household count: {zero_income_count:,.0f}\")\n",
"print(f\"Percentage of all households: {zero_income_share:.2f}%\")\n",
"print(rule)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"======================================================================\n",
"HOUSEHOLD COUNTS BY INCOME BRACKET\n",
"======================================================================\n",
"Income Bracket Households % of All Households\n",
" $0-$10k 434,505 23.02%\n",
" $10k-$20k 155,370 8.23%\n",
" $20k-$30k 149,595 7.93%\n",
" $30k-$40k 115,365 6.11%\n",
" $40k-$50k 127,566 6.76%\n",
" $50k-$60k 110,405 5.85%\n",
"======================================================================\n",
"\n",
"Total households in $0-$60k range: 1,092,805\n",
"Percentage of all households in $0-$60k range: 57.90%\n"
]
}
],
"source": [
"# Household counts by income brackets\n",
"# Each bracket is half-open: lower <= AGI < upper. Households with negative AGI\n",
"# match no bracket, so they are excluded from the table and the range total.\n",
"income_brackets = [\n",
"    (0, 10000, \"$0-$10k\"),\n",
"    (10000, 20000, \"$10k-$20k\"),\n",
"    (20000, 30000, \"$20k-$30k\"),\n",
"    (30000, 40000, \"$30k-$40k\"),\n",
"    (40000, 50000, \"$40k-$50k\"),\n",
"    (50000, 60000, \"$50k-$60k\")\n",
"]\n",
"\n",
"bracket_data = []\n",
"total_in_range = 0\n",
"for lower, upper, label in income_brackets:\n",
"    mask = (agi_hh >= lower) & (agi_hh < upper)\n",
"    count = weights[mask].sum()\n",
"    # Accumulate here so the masks are not rebuilt a second time for the total\n",
"    total_in_range += count\n",
"    pct_of_total = (count / total_households) * 100\n",
"\n",
"    bracket_data.append({\n",
"        \"Income Bracket\": label,\n",
"        \"Households\": f\"{count:,.0f}\",\n",
"        \"% of All Households\": f\"{pct_of_total:.2f}%\"\n",
"    })\n",
"\n",
"income_df = pd.DataFrame(bracket_data)\n",
"\n",
"print(\"\\n\" + \"=\"*70)\n",
"print(\"HOUSEHOLD COUNTS BY INCOME BRACKET\")\n",
"print(\"=\"*70)\n",
"print(income_df.to_string(index=False))\n",
"print(\"=\"*70)\n",
"\n",
"# Total in $0-$60k range\n",
"print(f\"\\nTotal households in $0-$60k range: {total_in_range:,.0f}\")\n",
"print(f\"Percentage of all households in $0-$60k range: {total_in_range / total_households * 100:.2f}%\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Loading