tables for manuscript. recreated ruca table

This commit is contained in:
2025-10-10 08:58:06 -07:00
parent 388f0ae1c2
commit abff7981c2
54 changed files with 679283 additions and 17766 deletions

533
data_setup/add_ruca.ipynb Normal file
View File

@@ -0,0 +1,533 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "a79e3ddd",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Document #</th>\n",
" <th>Report</th>\n",
" <th>Operator</th>\n",
" <th>Operator #</th>\n",
" <th>Tracking #</th>\n",
" <th>Initial Report Date</th>\n",
" <th>Date of Discovery</th>\n",
" <th>Spill Type</th>\n",
" <th>Qtr Qtr</th>\n",
" <th>Section</th>\n",
" <th>...</th>\n",
" <th>total_population</th>\n",
" <th>white_population</th>\n",
" <th>hispanic_population</th>\n",
" <th>median_household_income</th>\n",
" <th>poverty_population</th>\n",
" <th>unemployed_population</th>\n",
" <th>percent_white</th>\n",
" <th>percent_hispanic</th>\n",
" <th>percent_poverty</th>\n",
" <th>unemployment_rate</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>400827079</td>\n",
" <td>S</td>\n",
" <td>NOBLE ENERGY INC</td>\n",
" <td>100322</td>\n",
" <td>400823757</td>\n",
" <td>04/10/2015</td>\n",
" <td>04/09/2015</td>\n",
" <td>Historical</td>\n",
" <td>NWNW</td>\n",
" <td>12</td>\n",
" <td>...</td>\n",
" <td>11173.0</td>\n",
" <td>9194.0</td>\n",
" <td>3065.0</td>\n",
" <td>83193.0</td>\n",
" <td>247.0</td>\n",
" <td>245.0</td>\n",
" <td>82.287658</td>\n",
" <td>27.432203</td>\n",
" <td>2.210686</td>\n",
" <td>2.192786</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>400827243</td>\n",
" <td>I</td>\n",
" <td>NOBLE ENERGY INC</td>\n",
" <td>100322</td>\n",
" <td>400827243</td>\n",
" <td>04/17/2015</td>\n",
" <td>04/17/2015</td>\n",
" <td>Historical</td>\n",
" <td>SESW</td>\n",
" <td>34</td>\n",
" <td>...</td>\n",
" <td>11173.0</td>\n",
" <td>9194.0</td>\n",
" <td>3065.0</td>\n",
" <td>83193.0</td>\n",
" <td>247.0</td>\n",
" <td>245.0</td>\n",
" <td>82.287658</td>\n",
" <td>27.432203</td>\n",
" <td>2.210686</td>\n",
" <td>2.192786</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>400827326</td>\n",
" <td>I</td>\n",
" <td>KINDER MORGAN CO2 CO LP</td>\n",
" <td>46685</td>\n",
" <td>400827326</td>\n",
" <td>04/18/2015</td>\n",
" <td>04/17/2015</td>\n",
" <td>Recent</td>\n",
" <td>NWSW</td>\n",
" <td>23</td>\n",
" <td>...</td>\n",
" <td>2459.0</td>\n",
" <td>2404.0</td>\n",
" <td>81.0</td>\n",
" <td>66683.0</td>\n",
" <td>330.0</td>\n",
" <td>26.0</td>\n",
" <td>97.763318</td>\n",
" <td>3.294022</td>\n",
" <td>13.420089</td>\n",
" <td>1.057340</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>400834096</td>\n",
" <td>I</td>\n",
" <td>SMITH OIL PROPERTIES INC</td>\n",
" <td>79905</td>\n",
" <td>400834096</td>\n",
" <td>04/30/2015</td>\n",
" <td>03/26/2015</td>\n",
" <td>Historical</td>\n",
" <td>NENW</td>\n",
" <td>4</td>\n",
" <td>...</td>\n",
" <td>7335.0</td>\n",
" <td>6302.0</td>\n",
" <td>2011.0</td>\n",
" <td>71440.0</td>\n",
" <td>831.0</td>\n",
" <td>166.0</td>\n",
" <td>85.916837</td>\n",
" <td>27.416496</td>\n",
" <td>11.329243</td>\n",
" <td>2.263122</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>400834131</td>\n",
" <td>I</td>\n",
" <td>LINN OPERATING INC</td>\n",
" <td>10516</td>\n",
" <td>400834131</td>\n",
" <td>05/01/2015</td>\n",
" <td>04/30/2015</td>\n",
" <td>Recent</td>\n",
" <td>NESW</td>\n",
" <td>15</td>\n",
" <td>...</td>\n",
" <td>7240.0</td>\n",
" <td>5646.0</td>\n",
" <td>1659.0</td>\n",
" <td>64573.0</td>\n",
" <td>693.0</td>\n",
" <td>240.0</td>\n",
" <td>77.983425</td>\n",
" <td>22.914365</td>\n",
" <td>9.571823</td>\n",
" <td>3.314917</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 118 columns</p>\n",
"</div>"
],
"text/plain": [
" Document # Report Operator Operator # Tracking # \\\n",
"0 400827079 S NOBLE ENERGY INC 100322 400823757 \n",
"1 400827243 I NOBLE ENERGY INC 100322 400827243 \n",
"2 400827326 I KINDER MORGAN CO2 CO LP 46685 400827326 \n",
"3 400834096 I SMITH OIL PROPERTIES INC 79905 400834096 \n",
"4 400834131 I LINN OPERATING INC 10516 400834131 \n",
"\n",
" Initial Report Date Date of Discovery Spill Type Qtr Qtr Section ... \\\n",
"0 04/10/2015 04/09/2015 Historical NWNW 12 ... \n",
"1 04/17/2015 04/17/2015 Historical SESW 34 ... \n",
"2 04/18/2015 04/17/2015 Recent NWSW 23 ... \n",
"3 04/30/2015 03/26/2015 Historical NENW 4 ... \n",
"4 05/01/2015 04/30/2015 Recent NESW 15 ... \n",
"\n",
" total_population white_population hispanic_population \\\n",
"0 11173.0 9194.0 3065.0 \n",
"1 11173.0 9194.0 3065.0 \n",
"2 2459.0 2404.0 81.0 \n",
"3 7335.0 6302.0 2011.0 \n",
"4 7240.0 5646.0 1659.0 \n",
"\n",
" median_household_income poverty_population unemployed_population \\\n",
"0 83193.0 247.0 245.0 \n",
"1 83193.0 247.0 245.0 \n",
"2 66683.0 330.0 26.0 \n",
"3 71440.0 831.0 166.0 \n",
"4 64573.0 693.0 240.0 \n",
"\n",
" percent_white percent_hispanic percent_poverty unemployment_rate \n",
"0 82.287658 27.432203 2.210686 2.192786 \n",
"1 82.287658 27.432203 2.210686 2.192786 \n",
"2 97.763318 3.294022 13.420089 1.057340 \n",
"3 85.916837 27.416496 11.329243 2.263122 \n",
"4 77.983425 22.914365 9.571823 3.314917 \n",
"\n",
"[5 rows x 118 columns]"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"from sqlalchemy import create_engine\n",
"import geopandas as gpd\n",
"from dotenv import load_dotenv\n",
"load_dotenv()\n",
"\n",
"import os\n",
"\n",
"# Database connection details from zshrc environment variables\n",
"db_name = 'colorado_spills'\n",
"user = os.getenv('DB_USER')\n",
"password = os.getenv('DB_PASSWORD')\n",
"host = os.getenv('DB_HOST')\n",
"port = os.getenv('DB_PORT')\n",
"\n",
"\n",
"# Create an engine to connect to the PostgreSQL database\n",
"engine = create_engine(f'postgresql+psycopg2://{user}:{password}@{host}:{port}/{db_name}')\n",
"\n",
"# Function to load data from a table\n",
"def load_table(table_name):\n",
" query = f'SELECT * FROM {table_name}'\n",
" df = pd.read_sql(query, engine)\n",
" return df\n",
"\n",
"# Load the spills data\n",
"spills_with_demographics = load_table('spills_with_demographics')\n",
"spills_with_demographics.head()\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "dba1c393",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"gdf_spills_with_demographics has been created.\n"
]
}
],
"source": [
"# create a GeoDataFrame with spills and demographics\n",
"gdf_spills_with_demographics = gpd.GeoDataFrame(\n",
" spills_with_demographics,\n",
" geometry=gpd.points_from_xy(spills_with_demographics.Longitude, spills_with_demographics.Latitude),\n",
" crs=\"EPSG:4326\"\n",
") \n",
"print(\"gdf_spills_with_demographics has been created.\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "317abe05",
"metadata": {},
"outputs": [],
"source": [
"# verify that the demographics data has been merged correctly\n",
"assert not spills_with_demographics['total_population'].isna().any(), \"Some spills are missing demographic data\"\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "73411f29",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"gdf_spills_with_demographics has been verified.\n"
]
}
],
"source": [
"# verify the GeoDataFrame\n",
"assert gdf_spills_with_demographics.geometry.notnull().all(), \"Some geometries are null in gdf_spills_with_demographics\"\n",
"assert gdf_spills_with_demographics.crs == \"EPSG:4326\", \"CRS of gdf_spills_with_demographics is not EPSG:4326\"\n",
"print(\"gdf_spills_with_demographics has been verified.\")\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "3d0d0791",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting geoalchemy2\n",
" Downloading geoalchemy2-0.18.0-py3-none-any.whl.metadata (2.3 kB)\n",
"Requirement already satisfied: SQLAlchemy>=1.4 in /home/dadams/Repos/colorado_spills/.venv/lib/python3.13/site-packages (from geoalchemy2) (2.0.43)\n",
"Requirement already satisfied: packaging in /home/dadams/Repos/colorado_spills/.venv/lib/python3.13/site-packages (from geoalchemy2) (25.0)\n",
"Requirement already satisfied: greenlet>=1 in /home/dadams/Repos/colorado_spills/.venv/lib/python3.13/site-packages (from SQLAlchemy>=1.4->geoalchemy2) (3.2.4)\n",
"Requirement already satisfied: typing-extensions>=4.6.0 in /home/dadams/Repos/colorado_spills/.venv/lib/python3.13/site-packages (from SQLAlchemy>=1.4->geoalchemy2) (4.15.0)\n",
"Downloading geoalchemy2-0.18.0-py3-none-any.whl (81 kB)\n",
"Installing collected packages: geoalchemy2\n",
"Successfully installed geoalchemy2-0.18.0\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"%pip install geoalchemy2"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "d30657cf",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"gdf_spills_with_demographics has been created.\n"
]
}
],
"source": [
"# create a GeoDataFrame with spills and demographics\n",
"gdf_spills_with_demographics = gpd.GeoDataFrame(\n",
" spills_with_demographics,\n",
" geometry=gpd.points_from_xy(spills_with_demographics.Longitude, spills_with_demographics.Latitude),\n",
" crs=\"EPSG:4326\"\n",
") \n",
"print(\"gdf_spills_with_demographics has been created.\")"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "55ac288c",
"metadata": {},
"outputs": [],
"source": [
"# ...existing code...\n",
"# ...existing code...\n",
"from sqlalchemy import text\n",
"# ...existing code...\n",
"# Enable PostGIS extension (run this once)\n",
"with engine.connect() as conn:\n",
" conn.execute(text(\"CREATE EXTENSION IF NOT EXISTS postgis;\"))\n",
" conn.commit()\n",
"\n",
"# Now this should work\n",
"gdf_spills_with_demographics.to_postgis('gdf_spills_with_demographics', engine, if_exists='replace', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "4f9f93a6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"gdf_spills_with_demographics has been saved to the database.\n",
"gdf_spills_with_demographics has been saved to a CSV file.\n",
"gdf_spills_with_demographics has been saved to a GeoJSON file.\n"
]
}
],
"source": [
"import os\n",
"import geoalchemy2\n",
"# Save the GeoDataFrame to a new table in the database\n",
"gdf_spills_with_demographics.to_postgis('gdf_spills_with_demographics', engine, if_exists='replace', index=False)\n",
"print(\"gdf_spills_with_demographics has been saved to the database.\")\n",
"# Save the GeoDataFrame to a CSV file\n",
"gdf_spills_with_demographics.to_csv('gdf_spills_with_demographics.csv', index=False)\n",
"print(\"gdf_spills_with_demographics has been saved to a CSV file.\")\n",
"# Save GeoDataFrame as a GeoJSON file\n",
"gdf_spills_with_demographics.to_file('gdf_spills_with_demographics.geojson', driver='GeoJSON')\n",
"print(\"gdf_spills_with_demographics has been saved to a GeoJSON file.\")\n"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "3713ba8d",
"metadata": {},
"outputs": [],
"source": [
"ruca_df = pd.read_csv(\n",
" '/home/dadams/CSU Fullerton Dropbox/David Adams/Research Projects/colorado_spills_ejproject/github/colorado_spills/data/RUCA-codes-2020-tract.csv', # update path\n",
" encoding='latin1',\n",
" dtype={'TractFIPS20': str}\n",
")\n",
"\n",
"# Keep and rename needed columns\n",
"ruca_df = ruca_df.rename(columns={\n",
" 'TractFIPS20': 'GEOID',\n",
" 'PrimaryRUCA': 'ruca_code',\n",
" 'PrimaryRUCADescription': 'ruca_description'\n",
"})[['GEOID', 'ruca_code', 'ruca_description']]\n",
"\n",
"# Ensure GEOID is 11-character string\n",
"ruca_df['GEOID'] = ruca_df['GEOID'].str.zfill(11)\n"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "9051abcb",
"metadata": {},
"outputs": [],
"source": [
"spills = pd.read_sql_table('gdf_spills_with_demographics', engine)\n",
"\n",
"# Make sure GEOID is also string-padded\n",
"spills['GEOID'] = spills['GEOID'].astype(str).str.zfill(11)\n"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "f5578b3b",
"metadata": {},
"outputs": [],
"source": [
"spills_with_ruca = spills.merge(ruca_df, on='GEOID', how='left')\n"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "343a6d52",
"metadata": {},
"outputs": [],
"source": [
"def classify_rurality(code):\n",
" if pd.isna(code):\n",
" return 'Unknown'\n",
" code = int(code)\n",
" if code <= 3:\n",
" return 'Urban'\n",
" elif 4 <= code <= 6:\n",
" return 'Suburban'\n",
" else:\n",
" return 'Rural'\n",
"\n",
"spills_with_ruca['rurality'] = spills_with_ruca['ruca_code'].apply(classify_rurality)\n"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "49b419cb",
"metadata": {},
"outputs": [],
"source": [
"from shapely import wkb\n",
"import geopandas as gpd\n",
"\n",
"# Convert WKBElement objects to Shapely\n",
"spills_with_ruca['geometry'] = spills_with_ruca['geometry'].apply(lambda g: wkb.loads(bytes(g.data)) if g else None)\n",
"\n",
"# Now safely create GeoDataFrame\n",
"spills_with_ruca = gpd.GeoDataFrame(spills_with_ruca, geometry='geometry', crs='EPSG:4326')\n"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "f78e96b2",
"metadata": {},
"outputs": [],
"source": [
"spills_with_ruca.to_postgis(\n",
" name='spills_with_ruca',\n",
" con=engine,\n",
" if_exists='replace',\n",
" index=False\n",
")\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long