{
"cells": [
{
"cell_type": "markdown",
"id": "b6671d51",
"metadata": {},
"source": [
"# Census Demographics Enrichment for Wells\n",
"\n",
"This notebook:\n",
"1. Connects to Postgres and reads tract GEOIDs from `well_shape_tract`.\n",
"2. Fetches ACS 2021 5-year demographic variables relevant to environmental justice and performance analysis via the Census API (dataset: `acs/acs5`).\n",
"3. Computes derived metrics (percent minority, poverty rate, unemployment, educational attainment, linguistic isolation proxy, etc.).\n",
"4. Writes a new table `census_tract_demographics` and (optionally) demonstrates joining it back to `well_shape_tract`.\n",
"\n",
"Prerequisites:\n",
"- Environment variables for Postgres (PGHOST, PGPORT, PGUSER, PGPASSWORD, PGDATABASE).\n",
"- Optional: `CENSUS_API_KEY` for higher request volume (without a key, small pulls usually still work but may be throttled).\n",
"\n",
"Adjust variables or add/remove as needed before running the fetch cell."
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "5d5672c2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Distinct tracts from wells: 3551\n"
]
},
{
"data": {
"application/vnd.microsoft.datawrangler.viewer.v0+json": {
"columns": [
{
"name": "index",
"rawType": "int64",
"type": "integer"
},
{
"name": "geoid",
"rawType": "object",
"type": "string"
}
],
"ref": "1697201f-d939-4f51-a9af-8ccdf2da3849",
"rows": [
[
"0",
"48001950100"
],
[
"1",
"48001950401"
],
[
"2",
"48001950402"
],
[
"3",
"48001950500"
],
[
"4",
"48001950600"
]
],
"shape": {
"columns": 1,
"rows": 5
}
},
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" geoid | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 48001950100 | \n",
"
\n",
" \n",
" | 1 | \n",
" 48001950401 | \n",
"
\n",
" \n",
" | 2 | \n",
" 48001950402 | \n",
"
\n",
" \n",
" | 3 | \n",
" 48001950500 | \n",
"
\n",
" \n",
" | 4 | \n",
" 48001950600 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" geoid\n",
"0 48001950100\n",
"1 48001950401\n",
"2 48001950402\n",
"3 48001950500\n",
"4 48001950600"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Cell 2: Pull tract list from well_shape_tract\n",
"from sqlalchemy import text\n",
"\n",
"with engine.begin() as conn:\n",
" tracts = pd.read_sql(text(\"\"\"\n",
" SELECT DISTINCT census_tract_geoid AS geoid\n",
" FROM well_shape_tract\n",
" WHERE census_tract_geoid IS NOT NULL\n",
" \"\"\"), conn)\n",
"\n",
"print(f\"Distinct tracts from wells: {len(tracts)}\")\n",
"tracts.head()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "25eff3bb",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Total vars: 54; chunk count: 2\n",
"Raw merged ACS rows: 3551\n",
"Raw merged ACS rows: 3551\n"
]
},
{
"data": {
"application/vnd.microsoft.datawrangler.viewer.v0+json": {
"columns": [
{
"name": "index",
"rawType": "int64",
"type": "integer"
},
{
"name": "geoid",
"rawType": "object",
"type": "string"
},
{
"name": "name",
"rawType": "object",
"type": "string"
},
{
"name": "total_population",
"rawType": "int64",
"type": "integer"
},
{
"name": "white_alone",
"rawType": "int64",
"type": "integer"
},
{
"name": "black_alone",
"rawType": "int64",
"type": "integer"
},
{
"name": "american_indian_alone",
"rawType": "int64",
"type": "integer"
},
{
"name": "asian_alone",
"rawType": "int64",
"type": "integer"
},
{
"name": "native_hawaiian_pacific_alone",
"rawType": "int64",
"type": "integer"
},
{
"name": "other_race_alone",
"rawType": "int64",
"type": "integer"
},
{
"name": "two_or_more_races",
"rawType": "int64",
"type": "integer"
},
{
"name": "hispanic_any_race",
"rawType": "int64",
"type": "integer"
},
{
"name": "hispanic_base",
"rawType": "int64",
"type": "integer"
},
{
"name": "poverty_universe",
"rawType": "int64",
"type": "integer"
},
{
"name": "below_poverty",
"rawType": "int64",
"type": "integer"
},
{
"name": "education_universe",
"rawType": "int64",
"type": "integer"
},
{
"name": "b15003_002",
"rawType": "int64",
"type": "integer"
},
{
"name": "b15003_003",
"rawType": "int64",
"type": "integer"
},
{
"name": "b15003_004",
"rawType": "int64",
"type": "integer"
},
{
"name": "b15003_005",
"rawType": "int64",
"type": "integer"
},
{
"name": "b15003_006",
"rawType": "int64",
"type": "integer"
},
{
"name": "b15003_007",
"rawType": "int64",
"type": "integer"
},
{
"name": "b15003_008",
"rawType": "int64",
"type": "integer"
},
{
"name": "b15003_009",
"rawType": "int64",
"type": "integer"
},
{
"name": "b15003_010",
"rawType": "int64",
"type": "integer"
},
{
"name": "b15003_011",
"rawType": "int64",
"type": "integer"
},
{
"name": "b15003_012",
"rawType": "int64",
"type": "integer"
},
{
"name": "b15003_013",
"rawType": "int64",
"type": "integer"
},
{
"name": "b15003_014",
"rawType": "int64",
"type": "integer"
},
{
"name": "b15003_015",
"rawType": "int64",
"type": "integer"
},
{
"name": "b15003_016",
"rawType": "int64",
"type": "integer"
},
{
"name": "civilian_labor_force",
"rawType": "int64",
"type": "integer"
},
{
"name": "civilian_unemployed",
"rawType": "int64",
"type": "integer"
},
{
"name": "language_universe",
"rawType": "int64",
"type": "integer"
},
{
"name": "linguistic_isolation_est",
"rawType": "int64",
"type": "integer"
},
{
"name": "median_household_income",
"rawType": "int64",
"type": "integer"
},
{
"name": "renters_universe",
"rawType": "int64",
"type": "integer"
},
{
"name": "b25070_008",
"rawType": "int64",
"type": "integer"
},
{
"name": "b25070_009",
"rawType": "int64",
"type": "integer"
},
{
"name": "b25070_010",
"rawType": "int64",
"type": "integer"
},
{
"name": "b25070_011",
"rawType": "int64",
"type": "integer"
},
{
"name": "disability_universe",
"rawType": "int64",
"type": "integer"
},
{
"name": "with_disability",
"rawType": "int64",
"type": "integer"
},
{
"name": "male_under_5",
"rawType": "int64",
"type": "integer"
},
{
"name": "female_under_5",
"rawType": "int64",
"type": "integer"
},
{
"name": "male_65_66",
"rawType": "int64",
"type": "integer"
},
{
"name": "male_67_69",
"rawType": "int64",
"type": "integer"
},
{
"name": "male_70_74",
"rawType": "int64",
"type": "integer"
},
{
"name": "male_75_79",
"rawType": "int64",
"type": "integer"
},
{
"name": "male_80_84",
"rawType": "int64",
"type": "integer"
},
{
"name": "male_85_plus",
"rawType": "int64",
"type": "integer"
},
{
"name": "female_65_66",
"rawType": "int64",
"type": "integer"
},
{
"name": "female_67_69",
"rawType": "int64",
"type": "integer"
},
{
"name": "female_70_74",
"rawType": "int64",
"type": "integer"
},
{
"name": "female_75_79",
"rawType": "int64",
"type": "integer"
},
{
"name": "female_80_84",
"rawType": "int64",
"type": "integer"
},
{
"name": "female_85_plus",
"rawType": "int64",
"type": "integer"
}
],
"ref": "bde8ae8b-2bba-4fb2-9b8d-0891f465bd5c",
"rows": [
[
"0",
"48001950100",
"Census Tract 9501, Anderson County, Texas",
"5447",
"4540",
"212",
"4",
"21",
"0",
"66",
"604",
"644",
"5447",
"5404",
"913",
"3648",
"22",
"0",
"0",
"0",
"0",
"0",
"0",
"9",
"21",
"0",
"92",
"54",
"76",
"83",
"3",
"2092",
"157",
"1823",
"0",
"61325",
"219",
"0",
"13",
"25",
"42",
"5409",
"1433",
"325",
"214",
"57",
"77",
"138",
"119",
"49",
"13",
"36",
"101",
"264",
"114",
"19",
"83"
],
[
"1",
"48001950401",
"Census Tract 9504.01, Anderson County, Texas",
"4544",
"2145",
"1682",
"10",
"8",
"0",
"58",
"641",
"1435",
"4544",
"114",
"0",
"3854",
"0",
"0",
"0",
"10",
"13",
"0",
"7",
"34",
"92",
"71",
"57",
"264",
"264",
"318",
"70",
"55",
"0",
"27",
"0",
"92813",
"15",
"6",
"0",
"0",
"0",
"114",
"49",
"0",
"15",
"28",
"15",
"14",
"8",
"10",
"10",
"0",
"0",
"4",
"0",
"7",
"0"
],
[
"2",
"48001950402",
"Census Tract 9504.02, Anderson County, Texas",
"6997",
"3476",
"2587",
"19",
"0",
"10",
"133",
"772",
"2045",
"6997",
"20",
"20",
"6624",
"35",
"0",
"0",
"20",
"10",
"24",
"31",
"35",
"113",
"88",
"247",
"405",
"386",
"469",
"46",
"0",
"0",
"0",
"0",
"-666666666",
"0",
"0",
"0",
"0",
"0",
"20",
"0",
"0",
"0",
"53",
"96",
"56",
"0",
"0",
"0",
"0",
"0",
"0",
"0",
"0",
"0"
],
[
"3",
"48001950500",
"Census Tract 9505, Anderson County, Texas",
"4236",
"2948",
"673",
"33",
"0",
"34",
"194",
"354",
"1307",
"4236",
"4182",
"802",
"2850",
"64",
"0",
"0",
"20",
"0",
"0",
"0",
"10",
"67",
"0",
"17",
"28",
"42",
"43",
"32",
"1682",
"48",
"1379",
"0",
"41713",
"477",
"49",
"62",
"77",
"28",
"4190",
"1217",
"360",
"137",
"23",
"14",
"115",
"56",
"55",
"33",
"15",
"61",
"189",
"106",
"81",
"43"
],
[
"4",
"48001950600",
"Census Tract 9506, Anderson County, Texas",
"5843",
"3408",
"1901",
"0",
"0",
"0",
"126",
"408",
"1018",
"5843",
"5766",
"1913",
"3770",
"174",
"0",
"0",
"20",
"0",
"41",
"0",
"0",
"9",
"0",
"0",
"93",
"61",
"218",
"556",
"2128",
"71",
"1919",
"0",
"32552",
"833",
"60",
"0",
"199",
"287",
"5779",
"1402",
"60",
"199",
"16",
"28",
"33",
"58",
"12",
"49",
"142",
"117",
"170",
"77",
"45",
"96"
]
],
"shape": {
"columns": 56,
"rows": 5
}
},
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" geoid | \n",
" name | \n",
" total_population | \n",
" white_alone | \n",
" black_alone | \n",
" american_indian_alone | \n",
" asian_alone | \n",
" native_hawaiian_pacific_alone | \n",
" other_race_alone | \n",
" two_or_more_races | \n",
" ... | \n",
" male_70_74 | \n",
" male_75_79 | \n",
" male_80_84 | \n",
" male_85_plus | \n",
" female_65_66 | \n",
" female_67_69 | \n",
" female_70_74 | \n",
" female_75_79 | \n",
" female_80_84 | \n",
" female_85_plus | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 48001950100 | \n",
" Census Tract 9501, Anderson County, Texas | \n",
" 5447 | \n",
" 4540 | \n",
" 212 | \n",
" 4 | \n",
" 21 | \n",
" 0 | \n",
" 66 | \n",
" 604 | \n",
" ... | \n",
" 138 | \n",
" 119 | \n",
" 49 | \n",
" 13 | \n",
" 36 | \n",
" 101 | \n",
" 264 | \n",
" 114 | \n",
" 19 | \n",
" 83 | \n",
"
\n",
" \n",
" | 1 | \n",
" 48001950401 | \n",
" Census Tract 9504.01, Anderson County, Texas | \n",
" 4544 | \n",
" 2145 | \n",
" 1682 | \n",
" 10 | \n",
" 8 | \n",
" 0 | \n",
" 58 | \n",
" 641 | \n",
" ... | \n",
" 14 | \n",
" 8 | \n",
" 10 | \n",
" 10 | \n",
" 0 | \n",
" 0 | \n",
" 4 | \n",
" 0 | \n",
" 7 | \n",
" 0 | \n",
"
\n",
" \n",
" | 2 | \n",
" 48001950402 | \n",
" Census Tract 9504.02, Anderson County, Texas | \n",
" 6997 | \n",
" 3476 | \n",
" 2587 | \n",
" 19 | \n",
" 0 | \n",
" 10 | \n",
" 133 | \n",
" 772 | \n",
" ... | \n",
" 56 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 3 | \n",
" 48001950500 | \n",
" Census Tract 9505, Anderson County, Texas | \n",
" 4236 | \n",
" 2948 | \n",
" 673 | \n",
" 33 | \n",
" 0 | \n",
" 34 | \n",
" 194 | \n",
" 354 | \n",
" ... | \n",
" 115 | \n",
" 56 | \n",
" 55 | \n",
" 33 | \n",
" 15 | \n",
" 61 | \n",
" 189 | \n",
" 106 | \n",
" 81 | \n",
" 43 | \n",
"
\n",
" \n",
" | 4 | \n",
" 48001950600 | \n",
" Census Tract 9506, Anderson County, Texas | \n",
" 5843 | \n",
" 3408 | \n",
" 1901 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 126 | \n",
" 408 | \n",
" ... | \n",
" 33 | \n",
" 58 | \n",
" 12 | \n",
" 49 | \n",
" 142 | \n",
" 117 | \n",
" 170 | \n",
" 77 | \n",
" 45 | \n",
" 96 | \n",
"
\n",
" \n",
"
\n",
"
5 rows × 56 columns
\n",
"
"
],
"text/plain": [
" geoid name \\\n",
"0 48001950100 Census Tract 9501, Anderson County, Texas \n",
"1 48001950401 Census Tract 9504.01, Anderson County, Texas \n",
"2 48001950402 Census Tract 9504.02, Anderson County, Texas \n",
"3 48001950500 Census Tract 9505, Anderson County, Texas \n",
"4 48001950600 Census Tract 9506, Anderson County, Texas \n",
"\n",
" total_population white_alone black_alone american_indian_alone \\\n",
"0 5447 4540 212 4 \n",
"1 4544 2145 1682 10 \n",
"2 6997 3476 2587 19 \n",
"3 4236 2948 673 33 \n",
"4 5843 3408 1901 0 \n",
"\n",
" asian_alone native_hawaiian_pacific_alone other_race_alone \\\n",
"0 21 0 66 \n",
"1 8 0 58 \n",
"2 0 10 133 \n",
"3 0 34 194 \n",
"4 0 0 126 \n",
"\n",
" two_or_more_races ... male_70_74 male_75_79 male_80_84 male_85_plus \\\n",
"0 604 ... 138 119 49 13 \n",
"1 641 ... 14 8 10 10 \n",
"2 772 ... 56 0 0 0 \n",
"3 354 ... 115 56 55 33 \n",
"4 408 ... 33 58 12 49 \n",
"\n",
" female_65_66 female_67_69 female_70_74 female_75_79 female_80_84 \\\n",
"0 36 101 264 114 19 \n",
"1 0 0 4 0 7 \n",
"2 0 0 0 0 0 \n",
"3 15 61 189 106 81 \n",
"4 142 117 170 77 45 \n",
"\n",
" female_85_plus \n",
"0 83 \n",
"1 0 \n",
"2 0 \n",
"3 43 \n",
"4 96 \n",
"\n",
"[5 rows x 56 columns]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Cell 4: Fetch ACS data for all Texas tracts intersecting our wells (chunked to avoid 400 errors)\n",
"import math\n",
"\n",
"# Build list of tract components (state=48 always for Texas GEOIDs)\n",
"tracts['state'] = '48'\n",
"tracts['county'] = tracts['geoid'].str.slice(2, 5)\n",
"tracts['tract'] = tracts['geoid'].str.slice(5)\n",
"\n",
"BASE_URL = \"https://api.census.gov/data/2021/acs/acs5\"\n",
"all_vars = list(acs_vars.keys()) # variable codes\n",
"NAME_VAR = 'NAME'\n",
"\n",
"# Census API often fails if too many vars or malformed ones; chunk variables (e.g., 40 per request)\n",
"CHUNK_SIZE = 40\n",
"var_chunks = [all_vars[i:i+CHUNK_SIZE] for i in range(0, len(all_vars), CHUNK_SIZE)]\n",
"print(f\"Total vars: {len(all_vars)}; chunk count: {len(var_chunks)}\")\n",
"\n",
"import requests\n",
"import pandas as pd\n",
"\n",
"def fetch_for_county(county, tract_subset):\n",
" frames = []\n",
" for chunk in var_chunks:\n",
" params = {\n",
" 'get': ','.join(chunk + [NAME_VAR]),\n",
" 'for': 'tract:*',\n",
" 'in': f'state:48 county:{county}'\n",
" }\n",
" if API_KEY:\n",
" params['key'] = API_KEY\n",
" resp = requests.get(BASE_URL, params=params, timeout=30)\n",
" if resp.status_code != 200:\n",
" print(f\"[warn] status {resp.status_code} for county {county} chunk starting {chunk[0]}\")\n",
" try:\n",
" print(resp.text[:500])\n",
" except Exception:\n",
" pass\n",
" continue\n",
" data = resp.json()\n",
" header, *records = data\n",
" df = pd.DataFrame(records, columns=header)\n",
" keep = set(tract_subset['tract'])\n",
" df = df[df['tract'].isin(keep)].copy()\n",
" df['geoid'] = '48' + df['county'] + df['tract']\n",
" frames.append(df)\n",
" if not frames:\n",
" return pd.DataFrame()\n",
" # Merge on geoid only to accumulate variables; keep NAME/state/county/tract from the first frame\n",
" base = frames[0]\n",
" for f in frames[1:]:\n",
" # Drop duplicate columns present in base, but never drop the join key\n",
" join_keys = {'geoid'}\n",
" dup_cols = set(c for c in f.columns if c in base.columns and c not in join_keys)\n",
" if dup_cols:\n",
" f = f.drop(columns=list(dup_cols))\n",
" base = base.merge(f, on='geoid', how='left')\n",
" return base\n",
"\n",
"county_frames = []\n",
"for county, part in tracts.groupby('county'):\n",
" cf = fetch_for_county(county, part)\n",
" if not cf.empty:\n",
" county_frames.append(cf)\n",
"\n",
"acs_raw = pd.concat(county_frames, ignore_index=True) if county_frames else pd.DataFrame()\n",
"print(f\"Raw merged ACS rows: {len(acs_raw)}\")\n",
"\n",
"# Rename variables to friendly names and coerce numerics\n",
"rename_map = {k: v for k, v in acs_vars.items()}\n",
"acs_df = acs_raw.rename(columns=rename_map)\n",
"\n",
"num_cols = list(rename_map.values())\n",
"for c in num_cols:\n",
" if c in acs_df.columns:\n",
" acs_df[c] = pd.to_numeric(acs_df[c], errors='coerce')\n",
"\n",
"# Guarantee columns exist (fill missing with NA)\n",
"for c in num_cols:\n",
" if c not in acs_df.columns:\n",
" acs_df[c] = None\n",
"\n",
"# Standardize the tract name column to lowercase 'name' to avoid quoted identifier issues\n",
"if NAME_VAR in acs_df.columns:\n",
" acs_df = acs_df.rename(columns={NAME_VAR: 'name'})\n",
"\n",
"acs_df = acs_df[['geoid', 'name'] + num_cols]\n",
"acs_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "3b8739b1",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" geoid pct_minority pct_hispanic poverty_rate unemployment_rate \\\n",
"0 48001950100 0.166514 0.118230 0.168949 0.075048 \n",
"1 48001950401 0.527949 0.315801 0.000000 0.000000 \n",
"2 48001950402 0.503216 0.292268 1.000000 NaN \n",
"3 48001950500 0.304060 0.308546 0.191774 0.028537 \n",
"4 48001950600 0.416738 0.174226 0.331772 0.033365 \n",
"\n",
" less_than_hs_pct linguistic_isolation_rate renter_cost_burden_rate \\\n",
"0 0.098684 0.0 0.365297 \n",
"1 0.311365 0.0 0.400000 \n",
"2 0.288194 NaN NaN \n",
"3 0.113333 0.0 0.452830 \n",
"4 0.310875 0.0 0.655462 \n",
"\n",
" disability_rate pct_under5 pct_65plus ej_composite_score \\\n",
"0 0.264929 0.098954 0.196438 0.474332 \n",
"1 0.429825 0.003301 0.021127 0.501327 \n",
"2 0.000000 0.000000 0.029298 0.668572 \n",
"3 0.290453 0.117328 0.186733 0.511775 \n",
"4 0.242603 0.044327 0.144275 0.646844 \n",
"\n",
" median_household_income \n",
"0 61325 \n",
"1 92813 \n",
"2 -666666666 \n",
"3 41713 \n",
"4 32552 \n"
]
}
],
"source": [
"# Cell 5: Derived EJ metrics\n",
"# Minority and Hispanic\n",
"acs_df['minority_population'] = (acs_df['total_population'] - acs_df['white_alone'].fillna(0))\n",
"acs_df['pct_minority'] = (acs_df['minority_population'] / acs_df['total_population']).where(acs_df['total_population']>0)\n",
"acs_df['pct_hispanic'] = (acs_df['hispanic_any_race'] / acs_df['hispanic_base']).where(acs_df['hispanic_base']>0)\n",
"\n",
"# Poverty\n",
"acs_df['poverty_rate'] = (acs_df['below_poverty'] / acs_df['poverty_universe']).where(acs_df['poverty_universe']>0)\n",
"\n",
"# Unemployment: unemployed / civilian labor force\n",
"acs_df['unemployment_rate'] = (acs_df['civilian_unemployed'] / acs_df['civilian_labor_force']).where(acs_df['civilian_labor_force']>0)\n",
"\n",
"# Linguistic isolation proxy\n",
"acs_df['linguistic_isolation_rate'] = (acs_df['linguistic_isolation_est'] / acs_df['language_universe']).where(acs_df['language_universe']>0)\n",
"\n",
"# Educational attainment (< HS): sum B15003_002.._016 over education_universe\n",
"b15003_bins = [f'b15003_{i:03d}' for i in range(2, 17)]\n",
"acs_df['less_than_hs'] = acs_df[b15003_bins].sum(axis=1, skipna=True)\n",
"acs_df['less_than_hs_pct'] = (acs_df['less_than_hs'] / acs_df['education_universe']).where(acs_df['education_universe']>0)\n",
"\n",
"# Renter cost burden >30% = sum bins 8-11 over renters_universe\n",
"rent_burden_bins = ['b25070_008','b25070_009','b25070_010','b25070_011']\n",
"acs_df['renter_cost_burden_over_30'] = acs_df[rent_burden_bins].sum(axis=1, skipna=True)\n",
"acs_df['renter_cost_burden_rate'] = (acs_df['renter_cost_burden_over_30'] / acs_df['renters_universe']).where(acs_df['renters_universe']>0)\n",
"\n",
"# Disability rate\n",
"acs_df['disability_rate'] = (acs_df['with_disability'] / acs_df['disability_universe']).where(acs_df['disability_universe']>0)\n",
"\n",
"# Age structure: pct under 5, pct 65+\n",
"acs_df['under5_total'] = acs_df['male_under_5'].fillna(0) + acs_df['female_under_5'].fillna(0)\n",
"acs_df['pct_under5'] = (acs_df['under5_total'] / acs_df['total_population']).where(acs_df['total_population']>0)\n",
"\n",
"elder_bins = ['male_65_66','male_67_69','male_70_74','male_75_79','male_80_84','male_85_plus',\n",
" 'female_65_66','female_67_69','female_70_74','female_75_79','female_80_84','female_85_plus']\n",
"acs_df['elder_total'] = acs_df[elder_bins].sum(axis=1, skipna=True)\n",
"acs_df['pct_65plus'] = (acs_df['elder_total'] / acs_df['total_population']).where(acs_df['total_population']>0)\n",
"\n",
"# Composite EJ score: percentile rank across selected metrics and average\n",
"# Higher is worse (more vulnerable): poverty_rate, pct_minority, renter_cost_burden_rate, linguistic_isolation_rate,\n",
"# less_than_hs_pct, unemployment_rate, disability_rate\n",
"comp_cols = ['poverty_rate','pct_minority','renter_cost_burden_rate','linguistic_isolation_rate',\n",
" 'less_than_hs_pct','unemployment_rate','disability_rate']\n",
"\n",
"# Compute percentiles per column (0..1)\n",
"for c in comp_cols:\n",
" # rank method=average, pct=True gives 0..1 percentile; handle NA by leaving NA then fill with 0\n",
" acs_df[c + '_pctile'] = acs_df[c].rank(pct=True)\n",
" acs_df.loc[acs_df[c].isna(), c + '_pctile'] = None\n",
"\n",
"acs_df['ej_composite_score'] = acs_df[[c + '_pctile' for c in comp_cols]].mean(axis=1, skipna=True)\n",
"\n",
"# Keep a tidy preview\n",
"print(acs_df[['geoid','pct_minority','pct_hispanic','poverty_rate','unemployment_rate','less_than_hs_pct',\n",
" 'linguistic_isolation_rate','renter_cost_burden_rate','disability_rate','pct_under5','pct_65plus',\n",
" 'ej_composite_score','median_household_income']].head())"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "92ac800b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Updated census_tract_demographics (schema reconciled, data refreshed, indexes ensured).\n"
]
},
{
"data": {
"application/vnd.microsoft.datawrangler.viewer.v0+json": {
"columns": [
{
"name": "index",
"rawType": "int64",
"type": "integer"
},
{
"name": "canonical_api10",
"rawType": "object",
"type": "string"
},
{
"name": "census_tract_geoid",
"rawType": "object",
"type": "string"
},
{
"name": "ruca_category",
"rawType": "object",
"type": "string"
},
{
"name": "is_rural",
"rawType": "object",
"type": "string"
},
{
"name": "ej_composite_score",
"rawType": "float64",
"type": "float"
},
{
"name": "pct_minority",
"rawType": "float64",
"type": "float"
},
{
"name": "poverty_rate",
"rawType": "float64",
"type": "float"
},
{
"name": "unemployment_rate",
"rawType": "float64",
"type": "float"
},
{
"name": "less_than_hs_pct",
"rawType": "float64",
"type": "float"
},
{
"name": "linguistic_isolation_rate",
"rawType": "float64",
"type": "float"
},
{
"name": "renter_cost_burden_rate",
"rawType": "float64",
"type": "float"
},
{
"name": "disability_rate",
"rawType": "float64",
"type": "float"
},
{
"name": "pct_under5",
"rawType": "float64",
"type": "float"
},
{
"name": "pct_65plus",
"rawType": "float64",
"type": "float"
},
{
"name": "median_household_income",
"rawType": "int64",
"type": "integer"
},
{
"name": "ruca_primary_description",
"rawType": "object",
"type": "string"
},
{
"name": "ruca_secondary_description",
"rawType": "object",
"type": "string"
}
],
"ref": "b720feb6-5b69-4250-bf7f-5b130bf14b85",
"rows": [
[
"0",
"4236101293",
"48361020300",
"Micropolitan",
"false",
"0.762412654987491",
"0.34279038718291055",
"0.3413793103448276",
"0.08861859252823631",
"0.18162839248434237",
"0.021718602455146365",
"0.47280334728033474",
"0.32164634146341464",
"0.0931241655540721",
"0.15453938584779706",
"48218",
"Micropolitan core",
"Micropolitan core, no additional code"
],
[
"1",
"4236130846",
"48361020300",
"Micropolitan",
"false",
"0.762412654987491",
"0.34279038718291055",
"0.3413793103448276",
"0.08861859252823631",
"0.18162839248434237",
"0.021718602455146365",
"0.47280334728033474",
"0.32164634146341464",
"0.0931241655540721",
"0.15453938584779706",
"48218",
"Micropolitan core",
"Micropolitan core, no additional code"
],
[
"2",
"4236130889",
"48361020300",
"Micropolitan",
"false",
"0.762412654987491",
"0.34279038718291055",
"0.3413793103448276",
"0.08861859252823631",
"0.18162839248434237",
"0.021718602455146365",
"0.47280334728033474",
"0.32164634146341464",
"0.0931241655540721",
"0.15453938584779706",
"48218",
"Micropolitan core",
"Micropolitan core, no additional code"
],
[
"3",
"4236130612",
"48361020300",
"Micropolitan",
"false",
"0.762412654987491",
"0.34279038718291055",
"0.3413793103448276",
"0.08861859252823631",
"0.18162839248434237",
"0.021718602455146365",
"0.47280334728033474",
"0.32164634146341464",
"0.0931241655540721",
"0.15453938584779706",
"48218",
"Micropolitan core",
"Micropolitan core, no additional code"
],
[
"4",
"4236130951",
"48361020300",
"Micropolitan",
"false",
"0.762412654987491",
"0.34279038718291055",
"0.3413793103448276",
"0.08861859252823631",
"0.18162839248434237",
"0.021718602455146365",
"0.47280334728033474",
"0.32164634146341464",
"0.0931241655540721",
"0.15453938584779706",
"48218",
"Micropolitan core",
"Micropolitan core, no additional code"
],
[
"5",
"4236130691",
"48361021200",
"Micropolitan",
"false",
"0.4149936725524403",
"0.07561597281223449",
"0.11989741397734559",
"0.04049586776859504",
"0.07249129471351694",
"0.0",
"0.7454545454545455",
"0.225785896346644",
"0.04927782497875956",
"0.17587085811384875",
"75030",
"Micropolitan high commuting",
"Micropolitan high commuting, no additional code"
],
[
"6",
"4236130768",
"48361021200",
"Micropolitan",
"false",
"0.4149936725524403",
"0.07561597281223449",
"0.11989741397734559",
"0.04049586776859504",
"0.07249129471351694",
"0.0",
"0.7454545454545455",
"0.225785896346644",
"0.04927782497875956",
"0.17587085811384875",
"75030",
"Micropolitan high commuting",
"Micropolitan high commuting, no additional code"
],
[
"7",
"4236100686",
"48361022200",
"Micropolitan",
"false",
"0.3776728029920977",
"0.05415499533146592",
"0.07862004224360479",
"0.04386374241717219",
"0.03543613707165109",
"0.0",
"0.458128078817734",
"0.3131289492160075",
"0.08029878618113913",
"0.1092436974789916",
"110550",
"Micropolitan low commuting",
"Micropolitan low commuting, no additional code"
],
[
"8",
"4236130968",
"48361022301",
"Micropolitan",
"false",
"0.28235883683560237",
"0.12156951689725562",
"0.0381038784304831",
"0.027311744049941473",
"0.06414091060152875",
"0.015587529976019185",
"0.20195439739413681",
"0.19505556815604447",
"0.05057836244046269",
"0.13608528010886822",
"94161",
"Micropolitan core",
"Micropolitan core, no additional code"
],
[
"9",
"4236130847",
"48361021300",
"Micropolitan",
"false",
"0.3443398903976459",
"0.19737715803452854",
"0.05079681274900399",
"0.029069767441860465",
"0.06048387096774194",
"0.0",
"0.31693989071038253",
"0.2974904437427289",
"0.08598937583001329",
"0.13645418326693226",
"86287",
"Micropolitan core",
"Micropolitan core, no additional code"
]
],
"shape": {
"columns": 17,
"rows": 10
}
},
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" canonical_api10 | \n",
" census_tract_geoid | \n",
" ruca_category | \n",
" is_rural | \n",
" ej_composite_score | \n",
" pct_minority | \n",
" poverty_rate | \n",
" unemployment_rate | \n",
" less_than_hs_pct | \n",
" linguistic_isolation_rate | \n",
" renter_cost_burden_rate | \n",
" disability_rate | \n",
" pct_under5 | \n",
" pct_65plus | \n",
" median_household_income | \n",
" ruca_primary_description | \n",
" ruca_secondary_description | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 4236101293 | \n",
" 48361020300 | \n",
" Micropolitan | \n",
" false | \n",
" 0.762413 | \n",
" 0.342790 | \n",
" 0.341379 | \n",
" 0.088619 | \n",
" 0.181628 | \n",
" 0.021719 | \n",
" 0.472803 | \n",
" 0.321646 | \n",
" 0.093124 | \n",
" 0.154539 | \n",
" 48218 | \n",
" Micropolitan core | \n",
" Micropolitan core, no additional code | \n",
"
\n",
" \n",
" | 1 | \n",
" 4236130846 | \n",
" 48361020300 | \n",
" Micropolitan | \n",
" false | \n",
" 0.762413 | \n",
" 0.342790 | \n",
" 0.341379 | \n",
" 0.088619 | \n",
" 0.181628 | \n",
" 0.021719 | \n",
" 0.472803 | \n",
" 0.321646 | \n",
" 0.093124 | \n",
" 0.154539 | \n",
" 48218 | \n",
" Micropolitan core | \n",
" Micropolitan core, no additional code | \n",
"
\n",
" \n",
" | 2 | \n",
" 4236130889 | \n",
" 48361020300 | \n",
" Micropolitan | \n",
" false | \n",
" 0.762413 | \n",
" 0.342790 | \n",
" 0.341379 | \n",
" 0.088619 | \n",
" 0.181628 | \n",
" 0.021719 | \n",
" 0.472803 | \n",
" 0.321646 | \n",
" 0.093124 | \n",
" 0.154539 | \n",
" 48218 | \n",
" Micropolitan core | \n",
" Micropolitan core, no additional code | \n",
"
\n",
" \n",
" | 3 | \n",
" 4236130612 | \n",
" 48361020300 | \n",
" Micropolitan | \n",
" false | \n",
" 0.762413 | \n",
" 0.342790 | \n",
" 0.341379 | \n",
" 0.088619 | \n",
" 0.181628 | \n",
" 0.021719 | \n",
" 0.472803 | \n",
" 0.321646 | \n",
" 0.093124 | \n",
" 0.154539 | \n",
" 48218 | \n",
" Micropolitan core | \n",
" Micropolitan core, no additional code | \n",
"
\n",
" \n",
" | 4 | \n",
" 4236130951 | \n",
" 48361020300 | \n",
" Micropolitan | \n",
" false | \n",
" 0.762413 | \n",
" 0.342790 | \n",
" 0.341379 | \n",
" 0.088619 | \n",
" 0.181628 | \n",
" 0.021719 | \n",
" 0.472803 | \n",
" 0.321646 | \n",
" 0.093124 | \n",
" 0.154539 | \n",
" 48218 | \n",
" Micropolitan core | \n",
" Micropolitan core, no additional code | \n",
"
\n",
" \n",
" | 5 | \n",
" 4236130691 | \n",
" 48361021200 | \n",
" Micropolitan | \n",
" false | \n",
" 0.414994 | \n",
" 0.075616 | \n",
" 0.119897 | \n",
" 0.040496 | \n",
" 0.072491 | \n",
" 0.000000 | \n",
" 0.745455 | \n",
" 0.225786 | \n",
" 0.049278 | \n",
" 0.175871 | \n",
" 75030 | \n",
" Micropolitan high commuting | \n",
" Micropolitan high commuting, no additional code | \n",
"
\n",
" \n",
" | 6 | \n",
" 4236130768 | \n",
" 48361021200 | \n",
" Micropolitan | \n",
" false | \n",
" 0.414994 | \n",
" 0.075616 | \n",
" 0.119897 | \n",
" 0.040496 | \n",
" 0.072491 | \n",
" 0.000000 | \n",
" 0.745455 | \n",
" 0.225786 | \n",
" 0.049278 | \n",
" 0.175871 | \n",
" 75030 | \n",
" Micropolitan high commuting | \n",
" Micropolitan high commuting, no additional code | \n",
"
\n",
" \n",
" | 7 | \n",
" 4236100686 | \n",
" 48361022200 | \n",
" Micropolitan | \n",
" false | \n",
" 0.377673 | \n",
" 0.054155 | \n",
" 0.078620 | \n",
" 0.043864 | \n",
" 0.035436 | \n",
" 0.000000 | \n",
" 0.458128 | \n",
" 0.313129 | \n",
" 0.080299 | \n",
" 0.109244 | \n",
" 110550 | \n",
" Micropolitan low commuting | \n",
" Micropolitan low commuting, no additional code | \n",
"
\n",
" \n",
" | 8 | \n",
" 4236130968 | \n",
" 48361022301 | \n",
" Micropolitan | \n",
" false | \n",
" 0.282359 | \n",
" 0.121570 | \n",
" 0.038104 | \n",
" 0.027312 | \n",
" 0.064141 | \n",
" 0.015588 | \n",
" 0.201954 | \n",
" 0.195056 | \n",
" 0.050578 | \n",
" 0.136085 | \n",
" 94161 | \n",
" Micropolitan core | \n",
" Micropolitan core, no additional code | \n",
"
\n",
" \n",
" | 9 | \n",
" 4236130847 | \n",
" 48361021300 | \n",
" Micropolitan | \n",
" false | \n",
" 0.344340 | \n",
" 0.197377 | \n",
" 0.050797 | \n",
" 0.029070 | \n",
" 0.060484 | \n",
" 0.000000 | \n",
" 0.316940 | \n",
" 0.297490 | \n",
" 0.085989 | \n",
" 0.136454 | \n",
" 86287 | \n",
" Micropolitan core | \n",
" Micropolitan core, no additional code | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" canonical_api10 census_tract_geoid ruca_category is_rural \\\n",
"0 4236101293 48361020300 Micropolitan false \n",
"1 4236130846 48361020300 Micropolitan false \n",
"2 4236130889 48361020300 Micropolitan false \n",
"3 4236130612 48361020300 Micropolitan false \n",
"4 4236130951 48361020300 Micropolitan false \n",
"5 4236130691 48361021200 Micropolitan false \n",
"6 4236130768 48361021200 Micropolitan false \n",
"7 4236100686 48361022200 Micropolitan false \n",
"8 4236130968 48361022301 Micropolitan false \n",
"9 4236130847 48361021300 Micropolitan false \n",
"\n",
" ej_composite_score pct_minority poverty_rate unemployment_rate \\\n",
"0 0.762413 0.342790 0.341379 0.088619 \n",
"1 0.762413 0.342790 0.341379 0.088619 \n",
"2 0.762413 0.342790 0.341379 0.088619 \n",
"3 0.762413 0.342790 0.341379 0.088619 \n",
"4 0.762413 0.342790 0.341379 0.088619 \n",
"5 0.414994 0.075616 0.119897 0.040496 \n",
"6 0.414994 0.075616 0.119897 0.040496 \n",
"7 0.377673 0.054155 0.078620 0.043864 \n",
"8 0.282359 0.121570 0.038104 0.027312 \n",
"9 0.344340 0.197377 0.050797 0.029070 \n",
"\n",
" less_than_hs_pct linguistic_isolation_rate renter_cost_burden_rate \\\n",
"0 0.181628 0.021719 0.472803 \n",
"1 0.181628 0.021719 0.472803 \n",
"2 0.181628 0.021719 0.472803 \n",
"3 0.181628 0.021719 0.472803 \n",
"4 0.181628 0.021719 0.472803 \n",
"5 0.072491 0.000000 0.745455 \n",
"6 0.072491 0.000000 0.745455 \n",
"7 0.035436 0.000000 0.458128 \n",
"8 0.064141 0.015588 0.201954 \n",
"9 0.060484 0.000000 0.316940 \n",
"\n",
" disability_rate pct_under5 pct_65plus median_household_income \\\n",
"0 0.321646 0.093124 0.154539 48218 \n",
"1 0.321646 0.093124 0.154539 48218 \n",
"2 0.321646 0.093124 0.154539 48218 \n",
"3 0.321646 0.093124 0.154539 48218 \n",
"4 0.321646 0.093124 0.154539 48218 \n",
"5 0.225786 0.049278 0.175871 75030 \n",
"6 0.225786 0.049278 0.175871 75030 \n",
"7 0.313129 0.080299 0.109244 110550 \n",
"8 0.195056 0.050578 0.136085 94161 \n",
"9 0.297490 0.085989 0.136454 86287 \n",
"\n",
" ruca_primary_description \\\n",
"0 Micropolitan core \n",
"1 Micropolitan core \n",
"2 Micropolitan core \n",
"3 Micropolitan core \n",
"4 Micropolitan core \n",
"5 Micropolitan high commuting \n",
"6 Micropolitan high commuting \n",
"7 Micropolitan low commuting \n",
"8 Micropolitan core \n",
"9 Micropolitan core \n",
"\n",
" ruca_secondary_description \n",
"0 Micropolitan core, no additional code \n",
"1 Micropolitan core, no additional code \n",
"2 Micropolitan core, no additional code \n",
"3 Micropolitan core, no additional code \n",
"4 Micropolitan core, no additional code \n",
"5 Micropolitan high commuting, no additional code \n",
"6 Micropolitan high commuting, no additional code \n",
"7 Micropolitan low commuting, no additional code \n",
"8 Micropolitan core, no additional code \n",
"9 Micropolitan core, no additional code "
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Cell 6: Persist tract demographics to Postgres and join preview (non-destructive schema-safe update)\n",
"from sqlalchemy import text\n",
"\n",
"persist_cols = [\n",
" 'geoid','name','total_population','minority_population','pct_minority','pct_hispanic','poverty_rate',\n",
" 'unemployment_rate','less_than_hs_pct','linguistic_isolation_rate','renter_cost_burden_rate','disability_rate',\n",
" 'pct_under5','pct_65plus','ej_composite_score','median_household_income',\n",
" 'ruca_code_2020','ruca_primary','ruca_primary_description','ruca_secondary','ruca_secondary_description','ruca_category','is_nonmetro','is_rural'\n",
"]\n",
"\n",
"write_df = acs_df[persist_cols].copy()\n",
"\n",
"with engine.begin() as conn:\n",
" # Ensure target table exists; if not, create fresh via to_sql\n",
" existing = conn.execute(text(\"\"\"\n",
" SELECT to_regclass('census_tract_demographics') IS NOT NULL AS exists\n",
" \"\"\")) .scalar()\n",
" if not existing:\n",
" write_df.to_sql('census_tract_demographics', con=conn, if_exists='replace', index=False, method='multi')\n",
" else:\n",
" # Add any missing columns before load, and normalize legacy NAME -> name\n",
" existing_cols = [r[0] for r in conn.execute(text(\"SELECT column_name FROM information_schema.columns WHERE table_name='census_tract_demographics'\"))]\n",
" if 'NAME' in existing_cols and 'name' not in existing_cols:\n",
" # If a quoted \"NAME\" exists, rename to lowercase name to avoid identifier issues\n",
" conn.execute(text('ALTER TABLE census_tract_demographics RENAME COLUMN \"NAME\" TO name'))\n",
" existing_cols = [r[0] for r in conn.execute(text(\"SELECT column_name FROM information_schema.columns WHERE table_name='census_tract_demographics'\"))]\n",
" for col in persist_cols:\n",
" if col not in existing_cols:\n",
" # Infer a reasonable SQL type from pandas dtype\n",
" series = write_df[col]\n",
" if pd.api.types.is_integer_dtype(series.dropna()):\n",
" sql_type = 'BIGINT'\n",
" elif pd.api.types.is_float_dtype(series.dropna()):\n",
" sql_type = 'DOUBLE PRECISION'\n",
" elif pd.api.types.is_bool_dtype(series.dropna()):\n",
" sql_type = 'BOOLEAN'\n",
" else:\n",
" sql_type = 'TEXT'\n",
" conn.execute(text(f\"ALTER TABLE census_tract_demographics ADD COLUMN IF NOT EXISTS {col} {sql_type}\"))\n",
" # Stage data in a temp table\n",
" write_df.to_sql('_census_tract_demographics_stage', con=conn, if_exists='replace', index=False, method='multi')\n",
" # Upsert strategy: delete all then insert (tract-level snapshot)\n",
" conn.execute(text(\"DELETE FROM census_tract_demographics\"))\n",
" insert_cols = ','.join(persist_cols)\n",
" conn.execute(text(f\"INSERT INTO census_tract_demographics ({insert_cols}) SELECT {insert_cols} FROM _census_tract_demographics_stage\"))\n",
" conn.execute(text(\"DROP TABLE IF EXISTS _census_tract_demographics_stage\"))\n",
" # Indexes (create if absent)\n",
" conn.execute(text(\"CREATE INDEX IF NOT EXISTS idx_ctd_geoid ON census_tract_demographics (geoid)\"))\n",
" conn.execute(text(\"CREATE INDEX IF NOT EXISTS idx_ctd_ej_score ON census_tract_demographics (ej_composite_score)\"))\n",
" conn.execute(text(\"CREATE INDEX IF NOT EXISTS idx_ctd_poverty_rate ON census_tract_demographics (poverty_rate)\"))\n",
" conn.execute(text(\"CREATE INDEX IF NOT EXISTS idx_ctd_pct_minority ON census_tract_demographics (pct_minority)\"))\n",
" conn.execute(text(\"ANALYZE census_tract_demographics\"))\n",
"\n",
"print(\"Updated census_tract_demographics (schema reconciled, data refreshed, indexes ensured).\")\n",
"\n",
"# Preview join back to wells\n",
"with engine.begin() as conn:\n",
" preview = pd.read_sql(text(\"\"\"\n",
" SELECT w.canonical_api10, w.census_tract_geoid,\n",
" d.ruca_category, d.is_rural, d.ej_composite_score, d.pct_minority, d.poverty_rate, d.unemployment_rate,\n",
" d.less_than_hs_pct, d.linguistic_isolation_rate, d.renter_cost_burden_rate,\n",
" d.disability_rate, d.pct_under5, d.pct_65plus, d.median_household_income,\n",
" d.ruca_primary_description, d.ruca_secondary_description\n",
" FROM well_shape_tract w\n",
" LEFT JOIN census_tract_demographics d\n",
" ON w.census_tract_geoid = d.geoid\n",
" LIMIT 10\n",
" \"\"\"), conn)\n",
"\n",
"preview"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "d38db1df",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'well_shape_tract_columns_count': 11}\n",
"{'column': 'canonical_api10', 'present': True, 'nonnull_count': 1010430}\n",
"{'column': 'api10_number', 'present': True, 'nonnull_count': 852539}\n",
"{'column': 'api_number', 'present': True, 'nonnull_count': 1010430}\n",
"{'materialized_table': 'well_with_demographics_table', 'rows': 1373579}\n",
"Sample (first 10) from materialized table:\n",
" canonical_api10 api10_number api_number census_tract_geoid\n",
"0 4236101293 None 36101293 48361020300\n",
"1 4236130846 None 36130846 48361020300\n",
"2 4236130889 None 36130889 48361020300\n",
"3 4236130612 None 36130612 48361020300\n",
"4 4236130951 None 36130951 48361020300\n",
"5 4236130691 None 36130691 48361021200\n",
"6 4236130768 None 36130768 48361021200\n",
"7 4236100686 None 36100686 48361022200\n",
"8 4236130968 None 36130968 48361022301\n",
"9 4236130847 None 36130847 48361021300\n",
"{'materialized_table': 'well_with_demographics_table', 'rows': 1373579}\n",
"Sample (first 10) from materialized table:\n",
" canonical_api10 api10_number api_number census_tract_geoid\n",
"0 4236101293 None 36101293 48361020300\n",
"1 4236130846 None 36130846 48361020300\n",
"2 4236130889 None 36130889 48361020300\n",
"3 4236130612 None 36130612 48361020300\n",
"4 4236130951 None 36130951 48361020300\n",
"5 4236130691 None 36130691 48361021200\n",
"6 4236130768 None 36130768 48361021200\n",
"7 4236100686 None 36100686 48361022200\n",
"8 4236130968 None 36130968 48361022301\n",
"9 4236130847 None 36130847 48361021300\n"
]
}
],
"source": [
"# Cell 8: Materialize wells + demographics into a PostGIS table (defensive: preserve API ids)\n",
"from sqlalchemy import text\n",
"\n",
"target_table = 'well_with_demographics_table'\n",
"\n",
"with engine.begin() as conn:\n",
" # Inspect available identifier columns on well_shape_tract\n",
" existing_cols = [r[0] for r in conn.execute(text(\"SELECT column_name FROM information_schema.columns WHERE table_name='well_shape_tract'\"))]\n",
" print({'well_shape_tract_columns_count': len(existing_cols)})\n",
" for col in ['canonical_api10','api10_number','api_number']:\n",
" present = col in existing_cols\n",
" nonnull = 0\n",
" if present:\n",
" nonnull = conn.execute(text(f\"SELECT COUNT(*) FROM well_shape_tract WHERE {col} IS NOT NULL\")).scalar()\n",
" print({ 'column': col, 'present': present, 'nonnull_count': int(nonnull) if present else None })\n",
"\n",
" # Build canonical_api10 expression: prefer canonical_api10, then api10_number, then api_number\n",
" if 'canonical_api10' in existing_cols:\n",
" canonical_expr = 'w.canonical_api10::text'\n",
" elif 'api10_number' in existing_cols:\n",
" canonical_expr = 'w.api10_number::text'\n",
" elif 'api_number' in existing_cols:\n",
" canonical_expr = 'w.api_number::text'\n",
" else:\n",
" canonical_expr = \"NULL::text\"\n",
"\n",
" # Keep raw api columns too if present\n",
" raw_api_selects = []\n",
" if 'api10_number' in existing_cols:\n",
" raw_api_selects.append('w.api10_number')\n",
" if 'api_number' in existing_cols:\n",
" raw_api_selects.append('w.api_number')\n",
"\n",
" # Define select list with canonical_api10 first, then raw APIs, then other fields\n",
" select_list = [f\"{canonical_expr} AS canonical_api10\"] + raw_api_selects + [\n",
" 'w.census_tract_geoid',\n",
" 'w.latitude',\n",
" 'w.longitude',\n",
" 'w.geom'\n",
" ]\n",
"\n",
" # Add demographic columns to the select (same as before)\n",
" dem_cols = [\n",
" 'd.name AS tract_name', 'd.ruca_code_2020', 'd.ruca_category', 'd.ruca_primary_description', 'd.ruca_secondary_description',\n",
" 'd.ej_composite_score', 'd.pct_minority','d.pct_hispanic','d.poverty_rate','d.unemployment_rate','d.less_than_hs_pct',\n",
" 'd.linguistic_isolation_rate','d.renter_cost_burden_rate','d.disability_rate','d.pct_under5','d.pct_65plus','d.median_household_income'\n",
" ]\n",
" select_list.extend(dem_cols)\n",
"\n",
" select_sql = \"SELECT\\n \" + \",\\n \".join(select_list) + f\"\\nFROM well_shape_tract w\\nLEFT JOIN census_tract_demographics d ON w.census_tract_geoid = d.geoid\"\n",
"\n",
" # Ensure PostGIS\n",
" conn.execute(text(\"CREATE EXTENSION IF NOT EXISTS postgis\"))\n",
"\n",
" # Stage and atomically replace\n",
" conn.execute(text(\"DROP TABLE IF EXISTS _well_with_demographics_stage\"))\n",
" conn.execute(text(f\"CREATE TABLE _well_with_demographics_stage AS {select_sql}\"))\n",
"\n",
" conn.execute(text(f\"DROP TABLE IF EXISTS {target_table}\"))\n",
" conn.execute(text(f\"ALTER TABLE _well_with_demographics_stage RENAME TO {target_table}\"))\n",
"\n",
" # Indexes\n",
" conn.execute(text(f\"CREATE INDEX IF NOT EXISTS idx_wd_api10 ON {target_table} (canonical_api10)\"))\n",
" conn.execute(text(f\"CREATE INDEX IF NOT EXISTS idx_wd_geoid ON {target_table} (census_tract_geoid)\"))\n",
" # geometry column may be named geom in this table\n",
" # create GIST index on geom if present\n",
" cols_after = [r[0] for r in conn.execute(text(\"SELECT column_name FROM information_schema.columns WHERE table_name=:t\"), {'t': target_table})]\n",
" if 'geom' in cols_after:\n",
" conn.execute(text(f\"CREATE INDEX IF NOT EXISTS idx_wd_geom ON {target_table} USING GIST (geom)\"))\n",
" elif 'geometry' in cols_after:\n",
" conn.execute(text(f\"CREATE INDEX IF NOT EXISTS idx_wd_geometry ON {target_table} USING GIST (geometry)\"))\n",
"\n",
" conn.execute(text(f\"ANALYZE {target_table}\"))\n",
"\n",
"# Report row count and a quick sample\n",
"with engine.begin() as conn:\n",
" cnt = conn.execute(text(f\"SELECT COUNT(*) FROM {target_table}\")).scalar()\n",
" print({\"materialized_table\": target_table, \"rows\": int(cnt)})\n",
" # Build a safe sample query using only existing columns\n",
" cols_now = [r[0] for r in conn.execute(text(\"SELECT column_name FROM information_schema.columns WHERE table_name=:t\"), {'t': target_table})]\n",
" sample_cols = ['canonical_api10'] + [c for c in ['api10_number','api_number','census_tract_geoid'] if c in cols_now]\n",
" sample_sql = f\"SELECT {', '.join(sample_cols)} FROM {target_table} LIMIT 10\"\n",
" sample = pd.read_sql(text(sample_sql), conn)\n",
"\n",
"print(\"Sample (first 10) from materialized table:\")\n",
"print(sample)\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}