texas-district-analysis/rebuild/add_census_data.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "b6671d51",
   "metadata": {},
   "source": [
    "# Census Demographics Enrichment for Wells\n",
    "\n",
    "This notebook:\n",
    "1. Connects to Postgres and reads tract GEOIDs from `well_shape_tract`.\n",
    "2. Fetches ACS 2021 5-year demographic variables relevant to environmental justice and performance analysis via the Census API (dataset: `acs/acs5`).\n",
    "3. Computes derived metrics (percent minority, poverty rate, unemployment, educational attainment, linguistic isolation proxy, etc.).\n",
    "4. Writes a new table `census_tract_demographics` and (optionally) demonstrates joining it back to `well_shape_tract`.\n",
    "\n",
    "Prerequisites:\n",
    "- Environment variables for Postgres (PGHOST, PGPORT, PGUSER, PGPASSWORD, PGDATABASE).\n",
    "- Optional: `CENSUS_API_KEY` for higher request volume (without a key, small pulls usually still work but may be throttled).\n",
    "\n",
    "Adjust the ACS variable list (add or remove variables) as needed before running the fetch cell."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "5d5672c2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Distinct tracts from wells: 3551\n"
     ]
    },
    {
     "data": {
      "application/vnd.microsoft.datawrangler.viewer.v0+json": {
       "columns": [
        {
         "name": "index",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "geoid",
         "rawType": "object",
         "type": "string"
        }
       ],
       "ref": "1697201f-d939-4f51-a9af-8ccdf2da3849",
       "rows": [
        [
         "0",
         "48001950100"
        ],
        [
         "1",
         "48001950401"
        ],
        [
         "2",
         "48001950402"
        ],
        [
         "3",
         "48001950500"
        ],
        [
         "4",
         "48001950600"
        ]
       ],
       "shape": {
        "columns": 1,
        "rows": 5
       }
      },
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>geoid</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>48001950100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>48001950401</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>48001950402</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>48001950500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>48001950600</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         geoid\n",
       "0  48001950100\n",
       "1  48001950401\n",
       "2  48001950402\n",
       "3  48001950500\n",
       "4  48001950600"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Cell 2: Pull the distinct tract GEOIDs referenced by well_shape_tract\n",
    "# NOTE(review): `engine` and `pd` are not defined in any cell visible above this one;\n",
    "# presumably a setup cell creates them -- confirm it runs first on a fresh kernel.\n",
    "from sqlalchemy import text\n",
    "\n",
    "# ORDER BY makes the row order (and therefore the preview) reproducible; DISTINCT alone\n",
    "# does not guarantee any particular order.\n",
    "with engine.begin() as conn:\n",
    "    tracts = pd.read_sql(text(\"\"\"\n",
    "        SELECT DISTINCT census_tract_geoid AS geoid\n",
    "        FROM well_shape_tract\n",
    "        WHERE census_tract_geoid IS NOT NULL\n",
    "        ORDER BY geoid\n",
    "    \"\"\"), conn)\n",
    "\n",
    "# The fetch cell slices each GEOID at fixed offsets (county = chars 2-4, tract = chars 5+)\n",
    "# and always queries state 48, so surface anything that is not an 11-digit Texas tract GEOID.\n",
    "malformed = tracts.loc[~tracts['geoid'].astype(str).str.fullmatch(r'48\\d{9}'), 'geoid']\n",
    "if not malformed.empty:\n",
    "    print(f\"[warn] {len(malformed)} GEOIDs are not 11-digit Texas tract IDs, e.g. {malformed.head().tolist()}\")\n",
    "\n",
    "print(f\"Distinct tracts from wells: {len(tracts)}\")\n",
    "tracts.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "25eff3bb",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total vars: 54; chunk count: 2\n",
      "Raw merged ACS rows: 3551\n",
      "Raw merged ACS rows: 3551\n"
     ]
    },
    {
     "data": {
      "application/vnd.microsoft.datawrangler.viewer.v0+json": {
       "columns": [
        {
         "name": "index",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "geoid",
         "rawType": "object",
         "type": "string"
        },
        {
         "name": "name",
         "rawType": "object",
         "type": "string"
        },
        {
         "name": "total_population",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "white_alone",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "black_alone",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "american_indian_alone",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "asian_alone",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "native_hawaiian_pacific_alone",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "other_race_alone",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "two_or_more_races",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "hispanic_any_race",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "hispanic_base",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "poverty_universe",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "below_poverty",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "education_universe",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "b15003_002",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "b15003_003",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "b15003_004",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "b15003_005",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "b15003_006",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "b15003_007",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "b15003_008",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "b15003_009",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "b15003_010",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "b15003_011",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "b15003_012",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "b15003_013",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "b15003_014",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "b15003_015",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "b15003_016",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "civilian_labor_force",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "civilian_unemployed",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "language_universe",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "linguistic_isolation_est",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "median_household_income",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "renters_universe",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "b25070_008",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "b25070_009",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "b25070_010",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "b25070_011",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "disability_universe",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "with_disability",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "male_under_5",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "female_under_5",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "male_65_66",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "male_67_69",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "male_70_74",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "male_75_79",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "male_80_84",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "male_85_plus",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "female_65_66",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "female_67_69",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "female_70_74",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "female_75_79",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "female_80_84",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "female_85_plus",
         "rawType": "int64",
         "type": "integer"
        }
       ],
       "ref": "bde8ae8b-2bba-4fb2-9b8d-0891f465bd5c",
       "rows": [
        [
         "0",
         "48001950100",
         "Census Tract 9501, Anderson County, Texas",
         "5447",
         "4540",
         "212",
         "4",
         "21",
         "0",
         "66",
         "604",
         "644",
         "5447",
         "5404",
         "913",
         "3648",
         "22",
         "0",
         "0",
         "0",
         "0",
         "0",
         "0",
         "9",
         "21",
         "0",
         "92",
         "54",
         "76",
         "83",
         "3",
         "2092",
         "157",
         "1823",
         "0",
         "61325",
         "219",
         "0",
         "13",
         "25",
         "42",
         "5409",
         "1433",
         "325",
         "214",
         "57",
         "77",
         "138",
         "119",
         "49",
         "13",
         "36",
         "101",
         "264",
         "114",
         "19",
         "83"
        ],
        [
         "1",
         "48001950401",
         "Census Tract 9504.01, Anderson County, Texas",
         "4544",
         "2145",
         "1682",
         "10",
         "8",
         "0",
         "58",
         "641",
         "1435",
         "4544",
         "114",
         "0",
         "3854",
         "0",
         "0",
         "0",
         "10",
         "13",
         "0",
         "7",
         "34",
         "92",
         "71",
         "57",
         "264",
         "264",
         "318",
         "70",
         "55",
         "0",
         "27",
         "0",
         "92813",
         "15",
         "6",
         "0",
         "0",
         "0",
         "114",
         "49",
         "0",
         "15",
         "28",
         "15",
         "14",
         "8",
         "10",
         "10",
         "0",
         "0",
         "4",
         "0",
         "7",
         "0"
        ],
        [
         "2",
         "48001950402",
         "Census Tract 9504.02, Anderson County, Texas",
         "6997",
         "3476",
         "2587",
         "19",
         "0",
         "10",
         "133",
         "772",
         "2045",
         "6997",
         "20",
         "20",
         "6624",
         "35",
         "0",
         "0",
         "20",
         "10",
         "24",
         "31",
         "35",
         "113",
         "88",
         "247",
         "405",
         "386",
         "469",
         "46",
         "0",
         "0",
         "0",
         "0",
         "-666666666",
         "0",
         "0",
         "0",
         "0",
         "0",
         "20",
         "0",
         "0",
         "0",
         "53",
         "96",
         "56",
         "0",
         "0",
         "0",
         "0",
         "0",
         "0",
         "0",
         "0",
         "0"
        ],
        [
         "3",
         "48001950500",
         "Census Tract 9505, Anderson County, Texas",
         "4236",
         "2948",
         "673",
         "33",
         "0",
         "34",
         "194",
         "354",
         "1307",
         "4236",
         "4182",
         "802",
         "2850",
         "64",
         "0",
         "0",
         "20",
         "0",
         "0",
         "0",
         "10",
         "67",
         "0",
         "17",
         "28",
         "42",
         "43",
         "32",
         "1682",
         "48",
         "1379",
         "0",
         "41713",
         "477",
         "49",
         "62",
         "77",
         "28",
         "4190",
         "1217",
         "360",
         "137",
         "23",
         "14",
         "115",
         "56",
         "55",
         "33",
         "15",
         "61",
         "189",
         "106",
         "81",
         "43"
        ],
        [
         "4",
         "48001950600",
         "Census Tract 9506, Anderson County, Texas",
         "5843",
         "3408",
         "1901",
         "0",
         "0",
         "0",
         "126",
         "408",
         "1018",
         "5843",
         "5766",
         "1913",
         "3770",
         "174",
         "0",
         "0",
         "20",
         "0",
         "41",
         "0",
         "0",
         "9",
         "0",
         "0",
         "93",
         "61",
         "218",
         "556",
         "2128",
         "71",
         "1919",
         "0",
         "32552",
         "833",
         "60",
         "0",
         "199",
         "287",
         "5779",
         "1402",
         "60",
         "199",
         "16",
         "28",
         "33",
         "58",
         "12",
         "49",
         "142",
         "117",
         "170",
         "77",
         "45",
         "96"
        ]
       ],
       "shape": {
        "columns": 56,
        "rows": 5
       }
      },
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>geoid</th>\n",
       "      <th>name</th>\n",
       "      <th>total_population</th>\n",
       "      <th>white_alone</th>\n",
       "      <th>black_alone</th>\n",
       "      <th>american_indian_alone</th>\n",
       "      <th>asian_alone</th>\n",
       "      <th>native_hawaiian_pacific_alone</th>\n",
       "      <th>other_race_alone</th>\n",
       "      <th>two_or_more_races</th>\n",
       "      <th>...</th>\n",
       "      <th>male_70_74</th>\n",
       "      <th>male_75_79</th>\n",
       "      <th>male_80_84</th>\n",
       "      <th>male_85_plus</th>\n",
       "      <th>female_65_66</th>\n",
       "      <th>female_67_69</th>\n",
       "      <th>female_70_74</th>\n",
       "      <th>female_75_79</th>\n",
       "      <th>female_80_84</th>\n",
       "      <th>female_85_plus</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>48001950100</td>\n",
       "      <td>Census Tract 9501, Anderson County, Texas</td>\n",
       "      <td>5447</td>\n",
       "      <td>4540</td>\n",
       "      <td>212</td>\n",
       "      <td>4</td>\n",
       "      <td>21</td>\n",
       "      <td>0</td>\n",
       "      <td>66</td>\n",
       "      <td>604</td>\n",
       "      <td>...</td>\n",
       "      <td>138</td>\n",
       "      <td>119</td>\n",
       "      <td>49</td>\n",
       "      <td>13</td>\n",
       "      <td>36</td>\n",
       "      <td>101</td>\n",
       "      <td>264</td>\n",
       "      <td>114</td>\n",
       "      <td>19</td>\n",
       "      <td>83</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>48001950401</td>\n",
       "      <td>Census Tract 9504.01, Anderson County, Texas</td>\n",
       "      <td>4544</td>\n",
       "      <td>2145</td>\n",
       "      <td>1682</td>\n",
       "      <td>10</td>\n",
       "      <td>8</td>\n",
       "      <td>0</td>\n",
       "      <td>58</td>\n",
       "      <td>641</td>\n",
       "      <td>...</td>\n",
       "      <td>14</td>\n",
       "      <td>8</td>\n",
       "      <td>10</td>\n",
       "      <td>10</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>7</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>48001950402</td>\n",
       "      <td>Census Tract 9504.02, Anderson County, Texas</td>\n",
       "      <td>6997</td>\n",
       "      <td>3476</td>\n",
       "      <td>2587</td>\n",
       "      <td>19</td>\n",
       "      <td>0</td>\n",
       "      <td>10</td>\n",
       "      <td>133</td>\n",
       "      <td>772</td>\n",
       "      <td>...</td>\n",
       "      <td>56</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>48001950500</td>\n",
       "      <td>Census Tract 9505, Anderson County, Texas</td>\n",
       "      <td>4236</td>\n",
       "      <td>2948</td>\n",
       "      <td>673</td>\n",
       "      <td>33</td>\n",
       "      <td>0</td>\n",
       "      <td>34</td>\n",
       "      <td>194</td>\n",
       "      <td>354</td>\n",
       "      <td>...</td>\n",
       "      <td>115</td>\n",
       "      <td>56</td>\n",
       "      <td>55</td>\n",
       "      <td>33</td>\n",
       "      <td>15</td>\n",
       "      <td>61</td>\n",
       "      <td>189</td>\n",
       "      <td>106</td>\n",
       "      <td>81</td>\n",
       "      <td>43</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>48001950600</td>\n",
       "      <td>Census Tract 9506, Anderson County, Texas</td>\n",
       "      <td>5843</td>\n",
       "      <td>3408</td>\n",
       "      <td>1901</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>126</td>\n",
       "      <td>408</td>\n",
       "      <td>...</td>\n",
       "      <td>33</td>\n",
       "      <td>58</td>\n",
       "      <td>12</td>\n",
       "      <td>49</td>\n",
       "      <td>142</td>\n",
       "      <td>117</td>\n",
       "      <td>170</td>\n",
       "      <td>77</td>\n",
       "      <td>45</td>\n",
       "      <td>96</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 56 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         geoid                                          name  \\\n",
       "0  48001950100     Census Tract 9501, Anderson County, Texas   \n",
       "1  48001950401  Census Tract 9504.01, Anderson County, Texas   \n",
       "2  48001950402  Census Tract 9504.02, Anderson County, Texas   \n",
       "3  48001950500     Census Tract 9505, Anderson County, Texas   \n",
       "4  48001950600     Census Tract 9506, Anderson County, Texas   \n",
       "\n",
       "   total_population  white_alone  black_alone  american_indian_alone  \\\n",
       "0              5447         4540          212                      4   \n",
       "1              4544         2145         1682                     10   \n",
       "2              6997         3476         2587                     19   \n",
       "3              4236         2948          673                     33   \n",
       "4              5843         3408         1901                      0   \n",
       "\n",
       "   asian_alone  native_hawaiian_pacific_alone  other_race_alone  \\\n",
       "0           21                              0                66   \n",
       "1            8                              0                58   \n",
       "2            0                             10               133   \n",
       "3            0                             34               194   \n",
       "4            0                              0               126   \n",
       "\n",
       "   two_or_more_races  ...  male_70_74  male_75_79  male_80_84  male_85_plus  \\\n",
       "0                604  ...         138         119          49            13   \n",
       "1                641  ...          14           8          10            10   \n",
       "2                772  ...          56           0           0             0   \n",
       "3                354  ...         115          56          55            33   \n",
       "4                408  ...          33          58          12            49   \n",
       "\n",
       "   female_65_66  female_67_69  female_70_74  female_75_79  female_80_84  \\\n",
       "0            36           101           264           114            19   \n",
       "1             0             0             4             0             7   \n",
       "2             0             0             0             0             0   \n",
       "3            15            61           189           106            81   \n",
       "4           142           117           170            77            45   \n",
       "\n",
       "   female_85_plus  \n",
       "0              83  \n",
       "1               0  \n",
       "2               0  \n",
       "3              43  \n",
       "4              96  \n",
       "\n",
       "[5 rows x 56 columns]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Cell 4: Fetch ACS data for all Texas tracts intersecting our wells (chunked to avoid 400 errors)\n",
    "import math\n",
    "\n",
    "import pandas as pd\n",
    "import requests\n",
    "\n",
    "# Request settings\n",
    "BASE_URL = \"https://api.census.gov/data/2021/acs/acs5\"\n",
    "NAME_VAR = 'NAME'\n",
    "CHUNK_SIZE = 40  # variables per request; very long variable lists tend to be rejected by the API\n",
    "\n",
    "# Split each GEOID into the pieces the API wants (state is fixed to 48 = Texas)\n",
    "tracts['state'] = '48'\n",
    "tracts['county'] = tracts['geoid'].str[2:5]\n",
    "tracts['tract'] = tracts['geoid'].str[5:]\n",
    "\n",
    "# ACS variable codes, cut into request-sized groups\n",
    "all_vars = list(acs_vars)\n",
    "chunk_starts = range(0, len(all_vars), CHUNK_SIZE)\n",
    "var_chunks = [all_vars[start:start + CHUNK_SIZE] for start in chunk_starts]\n",
    "print(f\"Total vars: {len(all_vars)}; chunk count: {len(var_chunks)}\")\n",
    "\n",
    "def fetch_for_county(county, tract_subset):\n",
    "    \"\"\"Fetch every ACS variable chunk for one Texas county and merge the chunks per tract.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    county : str\n",
    "        County code, sent to the API as ``state:48 county:{county}``.\n",
    "    tract_subset : pandas.DataFrame\n",
    "        Rows for this county; only its 'tract' column is read, to filter the\n",
    "        county-wide API response down to the tracts of interest.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        One row per matching tract with a 'geoid' column plus the API columns of every\n",
    "        chunk that was fetched successfully. Empty if no chunk succeeded.\n",
    "\n",
    "    Notes\n",
    "    -----\n",
    "    Fetching is best-effort: a chunk whose request raises, returns a non-200 status, or\n",
    "    returns a body that is not a non-empty JSON list is reported with a ``[warn]`` line\n",
    "    and skipped, so its variables are absent from the result.\n",
    "    \"\"\"\n",
    "    keep = set(tract_subset['tract'])  # loop-invariant, so build it once\n",
    "    frames = []\n",
    "    for chunk in var_chunks:\n",
    "        params = {\n",
    "            'get': ','.join(chunk + [NAME_VAR]),\n",
    "            'for': 'tract:*',\n",
    "            'in': f'state:48 county:{county}'\n",
    "        }\n",
    "        if API_KEY:\n",
    "            params['key'] = API_KEY\n",
    "        label = f\"county {county} chunk starting {chunk[0]}\"\n",
    "        try:\n",
    "            resp = requests.get(BASE_URL, params=params, timeout=30)\n",
    "        except requests.RequestException as exc:\n",
    "            # Timeouts / connection errors get the same warn-and-skip treatment as HTTP errors\n",
    "            print(f\"[warn] request failed for {label}: {exc}\")\n",
    "            continue\n",
    "        if resp.status_code != 200:\n",
    "            print(f\"[warn] status {resp.status_code} for {label}\")\n",
    "            print(resp.text[:500])\n",
    "            continue\n",
    "        try:\n",
    "            data = resp.json()\n",
    "        except ValueError as exc:\n",
    "            # A 200 response can still carry a non-JSON body (e.g. an HTML error page)\n",
    "            print(f\"[warn] unparseable response for {label}: {exc}\")\n",
    "            print(resp.text[:500])\n",
    "            continue\n",
    "        if not isinstance(data, list) or not data:\n",
    "            print(f\"[warn] unexpected payload for {label}\")\n",
    "            print(resp.text[:500])\n",
    "            continue\n",
    "        header, *records = data  # first row is the column header\n",
    "        df = pd.DataFrame(records, columns=header)\n",
    "        df = df[df['tract'].isin(keep)].copy()\n",
    "        df['geoid'] = '48' + df['county'] + df['tract']\n",
    "        frames.append(df)\n",
    "    if not frames:\n",
    "        return pd.DataFrame()\n",
    "    # Accumulate variables on geoid; NAME/state/county/tract are kept from the first frame only\n",
    "    merged = frames[0]\n",
    "    for extra in frames[1:]:\n",
    "        new_cols = [c for c in extra.columns if c == 'geoid' or c not in merged.columns]\n",
    "        merged = merged.merge(extra[new_cols], on='geoid', how='left')\n",
    "    return merged\n",
    "\n",
    "# Fetch county by county, keeping only counties that returned rows\n",
    "county_frames = []\n",
    "for county, part in tracts.groupby('county'):\n",
    "    cf = fetch_for_county(county, part)\n",
    "    if not cf.empty:\n",
    "        county_frames.append(cf)\n",
    "\n",
    "# Fail with a clear message instead of a KeyError further down when nothing came back\n",
    "if not county_frames:\n",
    "    raise RuntimeError('No ACS data was returned for any county; check the [warn] lines above, '\n",
    "                       'network access and CENSUS_API_KEY.')\n",
    "\n",
    "acs_raw = pd.concat(county_frames, ignore_index=True)\n",
    "print(f\"Raw merged ACS rows: {len(acs_raw)}\")\n",
    "\n",
    "# Report requested tracts that the API did not return (e.g. a county whose requests failed)\n",
    "missing_tracts = sorted(set(tracts['geoid']) - set(acs_raw['geoid']))\n",
    "if missing_tracts:\n",
    "    print(f\"[warn] {len(missing_tracts)} tracts have no ACS row, e.g. {missing_tracts[:5]}\")\n",
    "\n",
    "# Rename variables to friendly names and coerce numerics\n",
    "rename_map = dict(acs_vars)\n",
    "acs_df = acs_raw.rename(columns=rename_map)\n",
    "num_cols = list(rename_map.values())\n",
    "\n",
    "# The Census API encodes 'estimate not available' as large negative sentinels\n",
    "# (e.g. -666666666 for a median that could not be computed). Treat them as missing so\n",
    "# they cannot leak into medians, rates or the database as real values.\n",
    "ACS_MISSING_SENTINELS = [-999999999, -888888888, -666666666, -555555555, -333333333, -222222222]\n",
    "for c in num_cols:\n",
    "    if c in acs_df.columns:\n",
    "        values = pd.to_numeric(acs_df[c], errors='coerce')\n",
    "        acs_df[c] = values.mask(values.isin(ACS_MISSING_SENTINELS))\n",
    "    else:\n",
    "        # Guarantee the column exists; NaN keeps it numeric (None would make it object dtype)\n",
    "        acs_df[c] = float('nan')\n",
    "\n",
    "# Standardize the tract name column to lowercase 'name' to avoid quoted identifier issues\n",
    "if NAME_VAR in acs_df.columns:\n",
    "    acs_df = acs_df.rename(columns={NAME_VAR: 'name'})\n",
    "elif 'name' not in acs_df.columns:\n",
    "    acs_df['name'] = None\n",
    "\n",
    "acs_df = acs_df[['geoid', 'name'] + num_cols].copy()\n",
    "assert acs_df['geoid'].is_unique, 'duplicate tract GEOIDs in ACS result'\n",
    "acs_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "3b8739b1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "         geoid  pct_minority  pct_hispanic  poverty_rate  unemployment_rate  \\\n",
      "0  48001950100      0.166514      0.118230      0.168949           0.075048   \n",
      "1  48001950401      0.527949      0.315801      0.000000           0.000000   \n",
      "2  48001950402      0.503216      0.292268      1.000000                NaN   \n",
      "3  48001950500      0.304060      0.308546      0.191774           0.028537   \n",
      "4  48001950600      0.416738      0.174226      0.331772           0.033365   \n",
      "\n",
      "   less_than_hs_pct  linguistic_isolation_rate  renter_cost_burden_rate  \\\n",
      "0          0.098684                        0.0                 0.365297   \n",
      "1          0.311365                        0.0                 0.400000   \n",
      "2          0.288194                        NaN                      NaN   \n",
      "3          0.113333                        0.0                 0.452830   \n",
      "4          0.310875                        0.0                 0.655462   \n",
      "\n",
      "   disability_rate  pct_under5  pct_65plus  ej_composite_score  \\\n",
      "0         0.264929    0.098954    0.196438            0.474332   \n",
      "1         0.429825    0.003301    0.021127            0.501327   \n",
      "2         0.000000    0.000000    0.029298            0.668572   \n",
      "3         0.290453    0.117328    0.186733            0.511775   \n",
      "4         0.242603    0.044327    0.144275            0.646844   \n",
      "\n",
      "   median_household_income  \n",
      "0                    61325  \n",
      "1                    92813  \n",
      "2               -666666666  \n",
      "3                    41713  \n",
      "4                    32552  \n"
     ]
    }
   ],
   "source": [
    "# Cell 5: Derived EJ metrics\n",
    "def safe_rate(numerator, denominator):\n",
    "    \"\"\"Return numerator / denominator element-wise; NaN where the denominator is missing or not > 0.\"\"\"\n",
    "    return (numerator / denominator).where(denominator > 0)\n",
    "\n",
    "\n",
    "def row_total(frame, columns):\n",
    "    \"\"\"Sum `columns` across each row; NaN (rather than 0) when every component is missing.\"\"\"\n",
    "    return frame[columns].sum(axis=1, min_count=1)\n",
    "\n",
    "\n",
    "# Minority and Hispanic\n",
    "# A missing white_alone count now yields NaN instead of being read as 0 (= 100% minority).\n",
    "# NOTE(review): if white_alone is a race-only count (not non-Hispanic white), Hispanic white\n",
    "# residents are not counted as minority here -- confirm against acs_vars.\n",
    "acs_df['minority_population'] = acs_df['total_population'] - acs_df['white_alone']\n",
    "acs_df['pct_minority'] = safe_rate(acs_df['minority_population'], acs_df['total_population'])\n",
    "acs_df['pct_hispanic'] = safe_rate(acs_df['hispanic_any_race'], acs_df['hispanic_base'])\n",
    "\n",
    "# Poverty\n",
    "acs_df['poverty_rate'] = safe_rate(acs_df['below_poverty'], acs_df['poverty_universe'])\n",
    "\n",
    "# Unemployment: unemployed / civilian labor force\n",
    "acs_df['unemployment_rate'] = safe_rate(acs_df['civilian_unemployed'], acs_df['civilian_labor_force'])\n",
    "\n",
    "# Linguistic isolation proxy\n",
    "acs_df['linguistic_isolation_rate'] = safe_rate(acs_df['linguistic_isolation_est'], acs_df['language_universe'])\n",
    "\n",
    "# Educational attainment (< HS): sum B15003_002.._016 over education_universe\n",
    "b15003_bins = [f'b15003_{i:03d}' for i in range(2, 17)]\n",
    "acs_df['less_than_hs'] = row_total(acs_df, b15003_bins)\n",
    "acs_df['less_than_hs_pct'] = safe_rate(acs_df['less_than_hs'], acs_df['education_universe'])\n",
    "\n",
    "# Renter cost burden >30% = sum bins 8-11 over renters_universe\n",
    "rent_burden_bins = ['b25070_008','b25070_009','b25070_010','b25070_011']\n",
    "acs_df['renter_cost_burden_over_30'] = row_total(acs_df, rent_burden_bins)\n",
    "acs_df['renter_cost_burden_rate'] = safe_rate(acs_df['renter_cost_burden_over_30'], acs_df['renters_universe'])\n",
    "\n",
    "# Disability rate\n",
    "acs_df['disability_rate'] = safe_rate(acs_df['with_disability'], acs_df['disability_universe'])\n",
    "\n",
    "# Age structure: pct under 5, pct 65+\n",
    "acs_df['under5_total'] = row_total(acs_df, ['male_under_5', 'female_under_5'])\n",
    "acs_df['pct_under5'] = safe_rate(acs_df['under5_total'], acs_df['total_population'])\n",
    "\n",
    "elder_bins = ['male_65_66','male_67_69','male_70_74','male_75_79','male_80_84','male_85_plus',\n",
    "              'female_65_66','female_67_69','female_70_74','female_75_79','female_80_84','female_85_plus']\n",
    "acs_df['elder_total'] = row_total(acs_df, elder_bins)\n",
    "acs_df['pct_65plus'] = safe_rate(acs_df['elder_total'], acs_df['total_population'])\n",
    "\n",
    "# Composite EJ score: percentile rank across selected metrics and average\n",
    "# Higher is worse (more vulnerable): poverty_rate, pct_minority, renter_cost_burden_rate, linguistic_isolation_rate,\n",
    "# less_than_hs_pct, unemployment_rate, disability_rate\n",
    "comp_cols = ['poverty_rate','pct_minority','renter_cost_burden_rate','linguistic_isolation_rate',\n",
    "             'less_than_hs_pct','unemployment_rate','disability_rate']\n",
    "\n",
    "# Percentile rank per metric; rank() leaves NaN inputs as NaN, so tracts missing a metric\n",
    "# are simply left out of that metric's ranking.\n",
    "for c in comp_cols:\n",
    "    acs_df[c + '_pctile'] = acs_df[c].rank(pct=True)\n",
    "\n",
    "# Average over whichever percentiles are available for the tract (missing ones are skipped)\n",
    "acs_df['ej_composite_score'] = acs_df[[c + '_pctile' for c in comp_cols]].mean(axis=1, skipna=True)\n",
    "\n",
    "# Keep a tidy preview\n",
    "preview_cols = ['geoid','pct_minority','pct_hispanic','poverty_rate','unemployment_rate','less_than_hs_pct',\n",
    "                'linguistic_isolation_rate','renter_cost_burden_rate','disability_rate','pct_under5','pct_65plus',\n",
    "                'ej_composite_score','median_household_income']\n",
    "acs_df[preview_cols].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "92ac800b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Updated census_tract_demographics (schema reconciled, data refreshed, indexes ensured).\n"
     ]
    },
    {
     "data": {
      "application/vnd.microsoft.datawrangler.viewer.v0+json": {
       "columns": [
        {
         "name": "index",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "canonical_api10",
         "rawType": "object",
         "type": "string"
        },
        {
         "name": "census_tract_geoid",
         "rawType": "object",
         "type": "string"
        },
        {
         "name": "ruca_category",
         "rawType": "object",
         "type": "string"
        },
        {
         "name": "is_rural",
         "rawType": "object",
         "type": "string"
        },
        {
         "name": "ej_composite_score",
         "rawType": "float64",
         "type": "float"
        },
        {
         "name": "pct_minority",
         "rawType": "float64",
         "type": "float"
        },
        {
         "name": "poverty_rate",
         "rawType": "float64",
         "type": "float"
        },
        {
         "name": "unemployment_rate",
         "rawType": "float64",
         "type": "float"
        },
        {
         "name": "less_than_hs_pct",
         "rawType": "float64",
         "type": "float"
        },
        {
         "name": "linguistic_isolation_rate",
         "rawType": "float64",
         "type": "float"
        },
        {
         "name": "renter_cost_burden_rate",
         "rawType": "float64",
         "type": "float"
        },
        {
         "name": "disability_rate",
         "rawType": "float64",
         "type": "float"
        },
        {
         "name": "pct_under5",
         "rawType": "float64",
         "type": "float"
        },
        {
         "name": "pct_65plus",
         "rawType": "float64",
         "type": "float"
        },
        {
         "name": "median_household_income",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "ruca_primary_description",
         "rawType": "object",
         "type": "string"
        },
        {
         "name": "ruca_secondary_description",
         "rawType": "object",
         "type": "string"
        }
       ],
       "ref": "b720feb6-5b69-4250-bf7f-5b130bf14b85",
       "rows": [
        [
         "0",
         "4236101293",
         "48361020300",
         "Micropolitan",
         "false",
         "0.762412654987491",
         "0.34279038718291055",
         "0.3413793103448276",
         "0.08861859252823631",
         "0.18162839248434237",
         "0.021718602455146365",
         "0.47280334728033474",
         "0.32164634146341464",
         "0.0931241655540721",
         "0.15453938584779706",
         "48218",
         "Micropolitan core",
         "Micropolitan core, no additional code"
        ],
        [
         "1",
         "4236130846",
         "48361020300",
         "Micropolitan",
         "false",
         "0.762412654987491",
         "0.34279038718291055",
         "0.3413793103448276",
         "0.08861859252823631",
         "0.18162839248434237",
         "0.021718602455146365",
         "0.47280334728033474",
         "0.32164634146341464",
         "0.0931241655540721",
         "0.15453938584779706",
         "48218",
         "Micropolitan core",
         "Micropolitan core, no additional code"
        ],
        [
         "2",
         "4236130889",
         "48361020300",
         "Micropolitan",
         "false",
         "0.762412654987491",
         "0.34279038718291055",
         "0.3413793103448276",
         "0.08861859252823631",
         "0.18162839248434237",
         "0.021718602455146365",
         "0.47280334728033474",
         "0.32164634146341464",
         "0.0931241655540721",
         "0.15453938584779706",
         "48218",
         "Micropolitan core",
         "Micropolitan core, no additional code"
        ],
        [
         "3",
         "4236130612",
         "48361020300",
         "Micropolitan",
         "false",
         "0.762412654987491",
         "0.34279038718291055",
         "0.3413793103448276",
         "0.08861859252823631",
         "0.18162839248434237",
         "0.021718602455146365",
         "0.47280334728033474",
         "0.32164634146341464",
         "0.0931241655540721",
         "0.15453938584779706",
         "48218",
         "Micropolitan core",
         "Micropolitan core, no additional code"
        ],
        [
         "4",
         "4236130951",
         "48361020300",
         "Micropolitan",
         "false",
         "0.762412654987491",
         "0.34279038718291055",
         "0.3413793103448276",
         "0.08861859252823631",
         "0.18162839248434237",
         "0.021718602455146365",
         "0.47280334728033474",
         "0.32164634146341464",
         "0.0931241655540721",
         "0.15453938584779706",
         "48218",
         "Micropolitan core",
         "Micropolitan core, no additional code"
        ],
        [
         "5",
         "4236130691",
         "48361021200",
         "Micropolitan",
         "false",
         "0.4149936725524403",
         "0.07561597281223449",
         "0.11989741397734559",
         "0.04049586776859504",
         "0.07249129471351694",
         "0.0",
         "0.7454545454545455",
         "0.225785896346644",
         "0.04927782497875956",
         "0.17587085811384875",
         "75030",
         "Micropolitan high commuting",
         "Micropolitan high commuting, no additional code"
        ],
        [
         "6",
         "4236130768",
         "48361021200",
         "Micropolitan",
         "false",
         "0.4149936725524403",
         "0.07561597281223449",
         "0.11989741397734559",
         "0.04049586776859504",
         "0.07249129471351694",
         "0.0",
         "0.7454545454545455",
         "0.225785896346644",
         "0.04927782497875956",
         "0.17587085811384875",
         "75030",
         "Micropolitan high commuting",
         "Micropolitan high commuting, no additional code"
        ],
        [
         "7",
         "4236100686",
         "48361022200",
         "Micropolitan",
         "false",
         "0.3776728029920977",
         "0.05415499533146592",
         "0.07862004224360479",
         "0.04386374241717219",
         "0.03543613707165109",
         "0.0",
         "0.458128078817734",
         "0.3131289492160075",
         "0.08029878618113913",
         "0.1092436974789916",
         "110550",
         "Micropolitan low commuting",
         "Micropolitan low commuting, no additional code"
        ],
        [
         "8",
         "4236130968",
         "48361022301",
         "Micropolitan",
         "false",
         "0.28235883683560237",
         "0.12156951689725562",
         "0.0381038784304831",
         "0.027311744049941473",
         "0.06414091060152875",
         "0.015587529976019185",
         "0.20195439739413681",
         "0.19505556815604447",
         "0.05057836244046269",
         "0.13608528010886822",
         "94161",
         "Micropolitan core",
         "Micropolitan core, no additional code"
        ],
        [
         "9",
         "4236130847",
         "48361021300",
         "Micropolitan",
         "false",
         "0.3443398903976459",
         "0.19737715803452854",
         "0.05079681274900399",
         "0.029069767441860465",
         "0.06048387096774194",
         "0.0",
         "0.31693989071038253",
         "0.2974904437427289",
         "0.08598937583001329",
         "0.13645418326693226",
         "86287",
         "Micropolitan core",
         "Micropolitan core, no additional code"
        ]
       ],
       "shape": {
        "columns": 17,
        "rows": 10
       }
      },
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>canonical_api10</th>\n",
       "      <th>census_tract_geoid</th>\n",
       "      <th>ruca_category</th>\n",
       "      <th>is_rural</th>\n",
       "      <th>ej_composite_score</th>\n",
       "      <th>pct_minority</th>\n",
       "      <th>poverty_rate</th>\n",
       "      <th>unemployment_rate</th>\n",
       "      <th>less_than_hs_pct</th>\n",
       "      <th>linguistic_isolation_rate</th>\n",
       "      <th>renter_cost_burden_rate</th>\n",
       "      <th>disability_rate</th>\n",
       "      <th>pct_under5</th>\n",
       "      <th>pct_65plus</th>\n",
       "      <th>median_household_income</th>\n",
       "      <th>ruca_primary_description</th>\n",
       "      <th>ruca_secondary_description</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>4236101293</td>\n",
       "      <td>48361020300</td>\n",
       "      <td>Micropolitan</td>\n",
       "      <td>false</td>\n",
       "      <td>0.762413</td>\n",
       "      <td>0.342790</td>\n",
       "      <td>0.341379</td>\n",
       "      <td>0.088619</td>\n",
       "      <td>0.181628</td>\n",
       "      <td>0.021719</td>\n",
       "      <td>0.472803</td>\n",
       "      <td>0.321646</td>\n",
       "      <td>0.093124</td>\n",
       "      <td>0.154539</td>\n",
       "      <td>48218</td>\n",
       "      <td>Micropolitan core</td>\n",
       "      <td>Micropolitan core, no additional code</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4236130846</td>\n",
       "      <td>48361020300</td>\n",
       "      <td>Micropolitan</td>\n",
       "      <td>false</td>\n",
       "      <td>0.762413</td>\n",
       "      <td>0.342790</td>\n",
       "      <td>0.341379</td>\n",
       "      <td>0.088619</td>\n",
       "      <td>0.181628</td>\n",
       "      <td>0.021719</td>\n",
       "      <td>0.472803</td>\n",
       "      <td>0.321646</td>\n",
       "      <td>0.093124</td>\n",
       "      <td>0.154539</td>\n",
       "      <td>48218</td>\n",
       "      <td>Micropolitan core</td>\n",
       "      <td>Micropolitan core, no additional code</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>4236130889</td>\n",
       "      <td>48361020300</td>\n",
       "      <td>Micropolitan</td>\n",
       "      <td>false</td>\n",
       "      <td>0.762413</td>\n",
       "      <td>0.342790</td>\n",
       "      <td>0.341379</td>\n",
       "      <td>0.088619</td>\n",
       "      <td>0.181628</td>\n",
       "      <td>0.021719</td>\n",
       "      <td>0.472803</td>\n",
       "      <td>0.321646</td>\n",
       "      <td>0.093124</td>\n",
       "      <td>0.154539</td>\n",
       "      <td>48218</td>\n",
       "      <td>Micropolitan core</td>\n",
       "      <td>Micropolitan core, no additional code</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4236130612</td>\n",
       "      <td>48361020300</td>\n",
       "      <td>Micropolitan</td>\n",
       "      <td>false</td>\n",
       "      <td>0.762413</td>\n",
       "      <td>0.342790</td>\n",
       "      <td>0.341379</td>\n",
       "      <td>0.088619</td>\n",
       "      <td>0.181628</td>\n",
       "      <td>0.021719</td>\n",
       "      <td>0.472803</td>\n",
       "      <td>0.321646</td>\n",
       "      <td>0.093124</td>\n",
       "      <td>0.154539</td>\n",
       "      <td>48218</td>\n",
       "      <td>Micropolitan core</td>\n",
       "      <td>Micropolitan core, no additional code</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4236130951</td>\n",
       "      <td>48361020300</td>\n",
       "      <td>Micropolitan</td>\n",
       "      <td>false</td>\n",
       "      <td>0.762413</td>\n",
       "      <td>0.342790</td>\n",
       "      <td>0.341379</td>\n",
       "      <td>0.088619</td>\n",
       "      <td>0.181628</td>\n",
       "      <td>0.021719</td>\n",
       "      <td>0.472803</td>\n",
       "      <td>0.321646</td>\n",
       "      <td>0.093124</td>\n",
       "      <td>0.154539</td>\n",
       "      <td>48218</td>\n",
       "      <td>Micropolitan core</td>\n",
       "      <td>Micropolitan core, no additional code</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>4236130691</td>\n",
       "      <td>48361021200</td>\n",
       "      <td>Micropolitan</td>\n",
       "      <td>false</td>\n",
       "      <td>0.414994</td>\n",
       "      <td>0.075616</td>\n",
       "      <td>0.119897</td>\n",
       "      <td>0.040496</td>\n",
       "      <td>0.072491</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.745455</td>\n",
       "      <td>0.225786</td>\n",
       "      <td>0.049278</td>\n",
       "      <td>0.175871</td>\n",
       "      <td>75030</td>\n",
       "      <td>Micropolitan high commuting</td>\n",
       "      <td>Micropolitan high commuting, no additional code</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>4236130768</td>\n",
       "      <td>48361021200</td>\n",
       "      <td>Micropolitan</td>\n",
       "      <td>false</td>\n",
       "      <td>0.414994</td>\n",
       "      <td>0.075616</td>\n",
       "      <td>0.119897</td>\n",
       "      <td>0.040496</td>\n",
       "      <td>0.072491</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.745455</td>\n",
       "      <td>0.225786</td>\n",
       "      <td>0.049278</td>\n",
       "      <td>0.175871</td>\n",
       "      <td>75030</td>\n",
       "      <td>Micropolitan high commuting</td>\n",
       "      <td>Micropolitan high commuting, no additional code</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>4236100686</td>\n",
       "      <td>48361022200</td>\n",
       "      <td>Micropolitan</td>\n",
       "      <td>false</td>\n",
       "      <td>0.377673</td>\n",
       "      <td>0.054155</td>\n",
       "      <td>0.078620</td>\n",
       "      <td>0.043864</td>\n",
       "      <td>0.035436</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.458128</td>\n",
       "      <td>0.313129</td>\n",
       "      <td>0.080299</td>\n",
       "      <td>0.109244</td>\n",
       "      <td>110550</td>\n",
       "      <td>Micropolitan low commuting</td>\n",
       "      <td>Micropolitan low commuting, no additional code</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>4236130968</td>\n",
       "      <td>48361022301</td>\n",
       "      <td>Micropolitan</td>\n",
       "      <td>false</td>\n",
       "      <td>0.282359</td>\n",
       "      <td>0.121570</td>\n",
       "      <td>0.038104</td>\n",
       "      <td>0.027312</td>\n",
       "      <td>0.064141</td>\n",
       "      <td>0.015588</td>\n",
       "      <td>0.201954</td>\n",
       "      <td>0.195056</td>\n",
       "      <td>0.050578</td>\n",
       "      <td>0.136085</td>\n",
       "      <td>94161</td>\n",
       "      <td>Micropolitan core</td>\n",
       "      <td>Micropolitan core, no additional code</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>4236130847</td>\n",
       "      <td>48361021300</td>\n",
       "      <td>Micropolitan</td>\n",
       "      <td>false</td>\n",
       "      <td>0.344340</td>\n",
       "      <td>0.197377</td>\n",
       "      <td>0.050797</td>\n",
       "      <td>0.029070</td>\n",
       "      <td>0.060484</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.316940</td>\n",
       "      <td>0.297490</td>\n",
       "      <td>0.085989</td>\n",
       "      <td>0.136454</td>\n",
       "      <td>86287</td>\n",
       "      <td>Micropolitan core</td>\n",
       "      <td>Micropolitan core, no additional code</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  canonical_api10 census_tract_geoid ruca_category is_rural  \\\n",
       "0      4236101293        48361020300  Micropolitan    false   \n",
       "1      4236130846        48361020300  Micropolitan    false   \n",
       "2      4236130889        48361020300  Micropolitan    false   \n",
       "3      4236130612        48361020300  Micropolitan    false   \n",
       "4      4236130951        48361020300  Micropolitan    false   \n",
       "5      4236130691        48361021200  Micropolitan    false   \n",
       "6      4236130768        48361021200  Micropolitan    false   \n",
       "7      4236100686        48361022200  Micropolitan    false   \n",
       "8      4236130968        48361022301  Micropolitan    false   \n",
       "9      4236130847        48361021300  Micropolitan    false   \n",
       "\n",
       "   ej_composite_score  pct_minority  poverty_rate  unemployment_rate  \\\n",
       "0            0.762413      0.342790      0.341379           0.088619   \n",
       "1            0.762413      0.342790      0.341379           0.088619   \n",
       "2            0.762413      0.342790      0.341379           0.088619   \n",
       "3            0.762413      0.342790      0.341379           0.088619   \n",
       "4            0.762413      0.342790      0.341379           0.088619   \n",
       "5            0.414994      0.075616      0.119897           0.040496   \n",
       "6            0.414994      0.075616      0.119897           0.040496   \n",
       "7            0.377673      0.054155      0.078620           0.043864   \n",
       "8            0.282359      0.121570      0.038104           0.027312   \n",
       "9            0.344340      0.197377      0.050797           0.029070   \n",
       "\n",
       "   less_than_hs_pct  linguistic_isolation_rate  renter_cost_burden_rate  \\\n",
       "0          0.181628                   0.021719                 0.472803   \n",
       "1          0.181628                   0.021719                 0.472803   \n",
       "2          0.181628                   0.021719                 0.472803   \n",
       "3          0.181628                   0.021719                 0.472803   \n",
       "4          0.181628                   0.021719                 0.472803   \n",
       "5          0.072491                   0.000000                 0.745455   \n",
       "6          0.072491                   0.000000                 0.745455   \n",
       "7          0.035436                   0.000000                 0.458128   \n",
       "8          0.064141                   0.015588                 0.201954   \n",
       "9          0.060484                   0.000000                 0.316940   \n",
       "\n",
       "   disability_rate  pct_under5  pct_65plus  median_household_income  \\\n",
       "0         0.321646    0.093124    0.154539                    48218   \n",
       "1         0.321646    0.093124    0.154539                    48218   \n",
       "2         0.321646    0.093124    0.154539                    48218   \n",
       "3         0.321646    0.093124    0.154539                    48218   \n",
       "4         0.321646    0.093124    0.154539                    48218   \n",
       "5         0.225786    0.049278    0.175871                    75030   \n",
       "6         0.225786    0.049278    0.175871                    75030   \n",
       "7         0.313129    0.080299    0.109244                   110550   \n",
       "8         0.195056    0.050578    0.136085                    94161   \n",
       "9         0.297490    0.085989    0.136454                    86287   \n",
       "\n",
       "      ruca_primary_description  \\\n",
       "0            Micropolitan core   \n",
       "1            Micropolitan core   \n",
       "2            Micropolitan core   \n",
       "3            Micropolitan core   \n",
       "4            Micropolitan core   \n",
       "5  Micropolitan high commuting   \n",
       "6  Micropolitan high commuting   \n",
       "7   Micropolitan low commuting   \n",
       "8            Micropolitan core   \n",
       "9            Micropolitan core   \n",
       "\n",
       "                        ruca_secondary_description  \n",
       "0            Micropolitan core, no additional code  \n",
       "1            Micropolitan core, no additional code  \n",
       "2            Micropolitan core, no additional code  \n",
       "3            Micropolitan core, no additional code  \n",
       "4            Micropolitan core, no additional code  \n",
       "5  Micropolitan high commuting, no additional code  \n",
       "6  Micropolitan high commuting, no additional code  \n",
       "7   Micropolitan low commuting, no additional code  \n",
       "8            Micropolitan core, no additional code  \n",
       "9            Micropolitan core, no additional code  "
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Cell 6: Persist tract demographics to Postgres and join preview (non-destructive schema-safe update)\n",
    "from sqlalchemy import text\n",
    "\n",
    "persist_cols = [\n",
    "    'geoid','name','total_population','minority_population','pct_minority','pct_hispanic','poverty_rate',\n",
    "    'unemployment_rate','less_than_hs_pct','linguistic_isolation_rate','renter_cost_burden_rate','disability_rate',\n",
    "    'pct_under5','pct_65plus','ej_composite_score','median_household_income',\n",
    "    'ruca_code_2020','ruca_primary','ruca_primary_description','ruca_secondary','ruca_secondary_description','ruca_category','is_nonmetro','is_rural'\n",
    "]\n",
    "\n",
    "write_df = acs_df[persist_cols].copy()\n",
    "\n",
    "with engine.begin() as conn:\n",
    "    # Ensure target table exists; if not, create fresh via to_sql\n",
    "    existing = conn.execute(text(\"\"\"\n",
    "        SELECT to_regclass('census_tract_demographics') IS NOT NULL AS exists\n",
    "    \"\"\")) .scalar()\n",
    "    if not existing:\n",
    "        write_df.to_sql('census_tract_demographics', con=conn, if_exists='replace', index=False, method='multi')\n",
    "    else:\n",
    "        # Add any missing columns before load, and normalize legacy NAME -> name\n",
    "        existing_cols = [r[0] for r in conn.execute(text(\"SELECT column_name FROM information_schema.columns WHERE table_name='census_tract_demographics'\"))]\n",
    "        if 'NAME' in existing_cols and 'name' not in existing_cols:\n",
    "            # If a quoted \"NAME\" exists, rename to lowercase name to avoid identifier issues\n",
    "            conn.execute(text('ALTER TABLE census_tract_demographics RENAME COLUMN \"NAME\" TO name'))\n",
    "            existing_cols = [r[0] for r in conn.execute(text(\"SELECT column_name FROM information_schema.columns WHERE table_name='census_tract_demographics'\"))]\n",
    "        for col in persist_cols:\n",
    "            if col not in existing_cols:\n",
    "                # Infer a reasonable SQL type from pandas dtype\n",
    "                series = write_df[col]\n",
    "                if pd.api.types.is_integer_dtype(series.dropna()):\n",
    "                    sql_type = 'BIGINT'\n",
    "                elif pd.api.types.is_float_dtype(series.dropna()):\n",
    "                    sql_type = 'DOUBLE PRECISION'\n",
    "                elif pd.api.types.is_bool_dtype(series.dropna()):\n",
    "                    sql_type = 'BOOLEAN'\n",
    "                else:\n",
    "                    sql_type = 'TEXT'\n",
    "                conn.execute(text(f\"ALTER TABLE census_tract_demographics ADD COLUMN IF NOT EXISTS {col} {sql_type}\"))\n",
    "        # Stage data in a temp table\n",
    "        write_df.to_sql('_census_tract_demographics_stage', con=conn, if_exists='replace', index=False, method='multi')\n",
    "        # Upsert strategy: delete all then insert (tract-level snapshot)\n",
    "        conn.execute(text(\"DELETE FROM census_tract_demographics\"))\n",
    "        insert_cols = ','.join(persist_cols)\n",
    "        conn.execute(text(f\"INSERT INTO census_tract_demographics ({insert_cols}) SELECT {insert_cols} FROM _census_tract_demographics_stage\"))\n",
    "        conn.execute(text(\"DROP TABLE IF EXISTS _census_tract_demographics_stage\"))\n",
    "    # Indexes (create if absent)\n",
    "    conn.execute(text(\"CREATE INDEX IF NOT EXISTS idx_ctd_geoid ON census_tract_demographics (geoid)\"))\n",
    "    conn.execute(text(\"CREATE INDEX IF NOT EXISTS idx_ctd_ej_score ON census_tract_demographics (ej_composite_score)\"))\n",
    "    conn.execute(text(\"CREATE INDEX IF NOT EXISTS idx_ctd_poverty_rate ON census_tract_demographics (poverty_rate)\"))\n",
    "    conn.execute(text(\"CREATE INDEX IF NOT EXISTS idx_ctd_pct_minority ON census_tract_demographics (pct_minority)\"))\n",
    "    conn.execute(text(\"ANALYZE census_tract_demographics\"))\n",
    "\n",
    "print(\"Updated census_tract_demographics (schema reconciled, data refreshed, indexes ensured).\")\n",
    "\n",
    "# Preview join back to wells\n",
    "with engine.begin() as conn:\n",
    "    preview = pd.read_sql(text(\"\"\"\n",
    "        SELECT w.canonical_api10, w.census_tract_geoid,\n",
    "               d.ruca_category, d.is_rural, d.ej_composite_score, d.pct_minority, d.poverty_rate, d.unemployment_rate,\n",
    "               d.less_than_hs_pct, d.linguistic_isolation_rate, d.renter_cost_burden_rate,\n",
    "               d.disability_rate, d.pct_under5, d.pct_65plus, d.median_household_income,\n",
    "               d.ruca_primary_description, d.ruca_secondary_description\n",
    "        FROM well_shape_tract w\n",
    "        LEFT JOIN census_tract_demographics d\n",
    "          ON w.census_tract_geoid = d.geoid\n",
    "        LIMIT 10\n",
    "    \"\"\"), conn)\n",
    "\n",
    "preview"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "d38db1df",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'well_shape_tract_columns_count': 11}\n",
      "{'column': 'canonical_api10', 'present': True, 'nonnull_count': 1010430}\n",
      "{'column': 'api10_number', 'present': True, 'nonnull_count': 852539}\n",
      "{'column': 'api_number', 'present': True, 'nonnull_count': 1010430}\n",
      "{'materialized_table': 'well_with_demographics_table', 'rows': 1373579}\n",
      "Sample (first 10) from materialized table:\n",
      "  canonical_api10 api10_number api_number census_tract_geoid\n",
      "0      4236101293         None   36101293        48361020300\n",
      "1      4236130846         None   36130846        48361020300\n",
      "2      4236130889         None   36130889        48361020300\n",
      "3      4236130612         None   36130612        48361020300\n",
      "4      4236130951         None   36130951        48361020300\n",
      "5      4236130691         None   36130691        48361021200\n",
      "6      4236130768         None   36130768        48361021200\n",
      "7      4236100686         None   36100686        48361022200\n",
      "8      4236130968         None   36130968        48361022301\n",
      "9      4236130847         None   36130847        48361021300\n",
      "{'materialized_table': 'well_with_demographics_table', 'rows': 1373579}\n",
      "Sample (first 10) from materialized table:\n",
      "  canonical_api10 api10_number api_number census_tract_geoid\n",
      "0      4236101293         None   36101293        48361020300\n",
      "1      4236130846         None   36130846        48361020300\n",
      "2      4236130889         None   36130889        48361020300\n",
      "3      4236130612         None   36130612        48361020300\n",
      "4      4236130951         None   36130951        48361020300\n",
      "5      4236130691         None   36130691        48361021200\n",
      "6      4236130768         None   36130768        48361021200\n",
      "7      4236100686         None   36100686        48361022200\n",
      "8      4236130968         None   36130968        48361022301\n",
      "9      4236130847         None   36130847        48361021300\n"
     ]
    }
   ],
   "source": [
    "# Cell 8: Materialize wells + demographics into a PostGIS table (defensive: preserve API ids)\n",
    "from sqlalchemy import text\n",
    "\n",
    "target_table = 'well_with_demographics_table'\n",
    "\n",
    "with engine.begin() as conn:\n",
    "    # Inspect available identifier columns on well_shape_tract\n",
    "    existing_cols = [r[0] for r in conn.execute(text(\"SELECT column_name FROM information_schema.columns WHERE table_name='well_shape_tract'\"))]\n",
    "    print({'well_shape_tract_columns_count': len(existing_cols)})\n",
    "    for col in ['canonical_api10','api10_number','api_number']:\n",
    "        present = col in existing_cols\n",
    "        nonnull = 0\n",
    "        if present:\n",
    "            nonnull = conn.execute(text(f\"SELECT COUNT(*) FROM well_shape_tract WHERE {col} IS NOT NULL\")).scalar()\n",
    "        print({ 'column': col, 'present': present, 'nonnull_count': int(nonnull) if present else None })\n",
    "\n",
    "    # Build canonical_api10 expression: prefer canonical_api10, then api10_number, then api_number\n",
    "    if 'canonical_api10' in existing_cols:\n",
    "        canonical_expr = 'w.canonical_api10::text'\n",
    "    elif 'api10_number' in existing_cols:\n",
    "        canonical_expr = 'w.api10_number::text'\n",
    "    elif 'api_number' in existing_cols:\n",
    "        canonical_expr = 'w.api_number::text'\n",
    "    else:\n",
    "        canonical_expr = \"NULL::text\"\n",
    "\n",
    "    # Keep raw api columns too if present\n",
    "    raw_api_selects = []\n",
    "    if 'api10_number' in existing_cols:\n",
    "        raw_api_selects.append('w.api10_number')\n",
    "    if 'api_number' in existing_cols:\n",
    "        raw_api_selects.append('w.api_number')\n",
    "\n",
    "    # Define select list with canonical_api10 first, then raw APIs, then other fields\n",
    "    select_list = [f\"{canonical_expr} AS canonical_api10\"] + raw_api_selects + [\n",
    "        'w.census_tract_geoid',\n",
    "        'w.latitude',\n",
    "        'w.longitude',\n",
    "        'w.geom'\n",
    "    ]\n",
    "\n",
    "    # Add demographic columns to the select (same as before)\n",
    "    dem_cols = [\n",
    "        'd.name AS tract_name', 'd.ruca_code_2020', 'd.ruca_category', 'd.ruca_primary_description', 'd.ruca_secondary_description',\n",
    "        'd.ej_composite_score', 'd.pct_minority','d.pct_hispanic','d.poverty_rate','d.unemployment_rate','d.less_than_hs_pct',\n",
    "        'd.linguistic_isolation_rate','d.renter_cost_burden_rate','d.disability_rate','d.pct_under5','d.pct_65plus','d.median_household_income'\n",
    "    ]\n",
    "    select_list.extend(dem_cols)\n",
    "\n",
    "    select_sql = \"SELECT\\n  \" + \",\\n  \".join(select_list) + f\"\\nFROM well_shape_tract w\\nLEFT JOIN census_tract_demographics d ON w.census_tract_geoid = d.geoid\"\n",
    "\n",
    "    # Ensure PostGIS\n",
    "    conn.execute(text(\"CREATE EXTENSION IF NOT EXISTS postgis\"))\n",
    "\n",
    "    # Stage and atomically replace\n",
    "    conn.execute(text(\"DROP TABLE IF EXISTS _well_with_demographics_stage\"))\n",
    "    conn.execute(text(f\"CREATE TABLE _well_with_demographics_stage AS {select_sql}\"))\n",
    "\n",
    "    conn.execute(text(f\"DROP TABLE IF EXISTS {target_table}\"))\n",
    "    conn.execute(text(f\"ALTER TABLE _well_with_demographics_stage RENAME TO {target_table}\"))\n",
    "\n",
    "    # Indexes\n",
    "    conn.execute(text(f\"CREATE INDEX IF NOT EXISTS idx_wd_api10 ON {target_table} (canonical_api10)\"))\n",
    "    conn.execute(text(f\"CREATE INDEX IF NOT EXISTS idx_wd_geoid ON {target_table} (census_tract_geoid)\"))\n",
    "    # geometry column may be named geom in this table\n",
    "    # create GIST index on geom if present\n",
    "    cols_after = [r[0] for r in conn.execute(text(\"SELECT column_name FROM information_schema.columns WHERE table_name=:t\"), {'t': target_table})]\n",
    "    if 'geom' in cols_after:\n",
    "        conn.execute(text(f\"CREATE INDEX IF NOT EXISTS idx_wd_geom ON {target_table} USING GIST (geom)\"))\n",
    "    elif 'geometry' in cols_after:\n",
    "        conn.execute(text(f\"CREATE INDEX IF NOT EXISTS idx_wd_geometry ON {target_table} USING GIST (geometry)\"))\n",
    "\n",
    "    conn.execute(text(f\"ANALYZE {target_table}\"))\n",
    "\n",
    "# Report row count and a quick sample\n",
    "with engine.begin() as conn:\n",
    "    cnt = conn.execute(text(f\"SELECT COUNT(*) FROM {target_table}\")).scalar()\n",
    "    print({\"materialized_table\": target_table, \"rows\": int(cnt)})\n",
    "    # Build a safe sample query using only existing columns\n",
    "    cols_now = [r[0] for r in conn.execute(text(\"SELECT column_name FROM information_schema.columns WHERE table_name=:t\"), {'t': target_table})]\n",
    "    sample_cols = ['canonical_api10'] + [c for c in ['api10_number','api_number','census_tract_geoid'] if c in cols_now]\n",
    "    sample_sql = f\"SELECT {', '.join(sample_cols)} FROM {target_table} LIMIT 10\"\n",
    "    sample = pd.read_sql(text(sample_sql), conn)\n",
    "\n",
    "print(\"Sample (first 10) from materialized table:\")\n",
    "print(sample)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}