tables for manuscript. recreated ruca table

2025-10-10 08:58:06 -07:00
parent 388f0ae1c2
commit abff7981c2
54 changed files with 679283 additions and 17766 deletions
--- a/data_setup/add_ruca.ipynb
+++ b/data_setup/add_ruca.ipynb
@@ -0,0 +1,533 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "a79e3ddd",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Document #</th>\n",
+       "      <th>Report</th>\n",
+       "      <th>Operator</th>\n",
+       "      <th>Operator #</th>\n",
+       "      <th>Tracking #</th>\n",
+       "      <th>Initial Report Date</th>\n",
+       "      <th>Date of Discovery</th>\n",
+       "      <th>Spill Type</th>\n",
+       "      <th>Qtr Qtr</th>\n",
+       "      <th>Section</th>\n",
+       "      <th>...</th>\n",
+       "      <th>total_population</th>\n",
+       "      <th>white_population</th>\n",
+       "      <th>hispanic_population</th>\n",
+       "      <th>median_household_income</th>\n",
+       "      <th>poverty_population</th>\n",
+       "      <th>unemployed_population</th>\n",
+       "      <th>percent_white</th>\n",
+       "      <th>percent_hispanic</th>\n",
+       "      <th>percent_poverty</th>\n",
+       "      <th>unemployment_rate</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>400827079</td>\n",
+       "      <td>S</td>\n",
+       "      <td>NOBLE ENERGY INC</td>\n",
+       "      <td>100322</td>\n",
+       "      <td>400823757</td>\n",
+       "      <td>04/10/2015</td>\n",
+       "      <td>04/09/2015</td>\n",
+       "      <td>Historical</td>\n",
+       "      <td>NWNW</td>\n",
+       "      <td>12</td>\n",
+       "      <td>...</td>\n",
+       "      <td>11173.0</td>\n",
+       "      <td>9194.0</td>\n",
+       "      <td>3065.0</td>\n",
+       "      <td>83193.0</td>\n",
+       "      <td>247.0</td>\n",
+       "      <td>245.0</td>\n",
+       "      <td>82.287658</td>\n",
+       "      <td>27.432203</td>\n",
+       "      <td>2.210686</td>\n",
+       "      <td>2.192786</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>400827243</td>\n",
+       "      <td>I</td>\n",
+       "      <td>NOBLE ENERGY INC</td>\n",
+       "      <td>100322</td>\n",
+       "      <td>400827243</td>\n",
+       "      <td>04/17/2015</td>\n",
+       "      <td>04/17/2015</td>\n",
+       "      <td>Historical</td>\n",
+       "      <td>SESW</td>\n",
+       "      <td>34</td>\n",
+       "      <td>...</td>\n",
+       "      <td>11173.0</td>\n",
+       "      <td>9194.0</td>\n",
+       "      <td>3065.0</td>\n",
+       "      <td>83193.0</td>\n",
+       "      <td>247.0</td>\n",
+       "      <td>245.0</td>\n",
+       "      <td>82.287658</td>\n",
+       "      <td>27.432203</td>\n",
+       "      <td>2.210686</td>\n",
+       "      <td>2.192786</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>400827326</td>\n",
+       "      <td>I</td>\n",
+       "      <td>KINDER MORGAN CO2 CO LP</td>\n",
+       "      <td>46685</td>\n",
+       "      <td>400827326</td>\n",
+       "      <td>04/18/2015</td>\n",
+       "      <td>04/17/2015</td>\n",
+       "      <td>Recent</td>\n",
+       "      <td>NWSW</td>\n",
+       "      <td>23</td>\n",
+       "      <td>...</td>\n",
+       "      <td>2459.0</td>\n",
+       "      <td>2404.0</td>\n",
+       "      <td>81.0</td>\n",
+       "      <td>66683.0</td>\n",
+       "      <td>330.0</td>\n",
+       "      <td>26.0</td>\n",
+       "      <td>97.763318</td>\n",
+       "      <td>3.294022</td>\n",
+       "      <td>13.420089</td>\n",
+       "      <td>1.057340</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>400834096</td>\n",
+       "      <td>I</td>\n",
+       "      <td>SMITH OIL PROPERTIES INC</td>\n",
+       "      <td>79905</td>\n",
+       "      <td>400834096</td>\n",
+       "      <td>04/30/2015</td>\n",
+       "      <td>03/26/2015</td>\n",
+       "      <td>Historical</td>\n",
+       "      <td>NENW</td>\n",
+       "      <td>4</td>\n",
+       "      <td>...</td>\n",
+       "      <td>7335.0</td>\n",
+       "      <td>6302.0</td>\n",
+       "      <td>2011.0</td>\n",
+       "      <td>71440.0</td>\n",
+       "      <td>831.0</td>\n",
+       "      <td>166.0</td>\n",
+       "      <td>85.916837</td>\n",
+       "      <td>27.416496</td>\n",
+       "      <td>11.329243</td>\n",
+       "      <td>2.263122</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>400834131</td>\n",
+       "      <td>I</td>\n",
+       "      <td>LINN OPERATING INC</td>\n",
+       "      <td>10516</td>\n",
+       "      <td>400834131</td>\n",
+       "      <td>05/01/2015</td>\n",
+       "      <td>04/30/2015</td>\n",
+       "      <td>Recent</td>\n",
+       "      <td>NESW</td>\n",
+       "      <td>15</td>\n",
+       "      <td>...</td>\n",
+       "      <td>7240.0</td>\n",
+       "      <td>5646.0</td>\n",
+       "      <td>1659.0</td>\n",
+       "      <td>64573.0</td>\n",
+       "      <td>693.0</td>\n",
+       "      <td>240.0</td>\n",
+       "      <td>77.983425</td>\n",
+       "      <td>22.914365</td>\n",
+       "      <td>9.571823</td>\n",
+       "      <td>3.314917</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>5 rows × 118 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   Document # Report                  Operator  Operator #  Tracking #  \\\n",
+       "0   400827079      S          NOBLE ENERGY INC      100322   400823757   \n",
+       "1   400827243      I          NOBLE ENERGY INC      100322   400827243   \n",
+       "2   400827326      I   KINDER MORGAN CO2 CO LP       46685   400827326   \n",
+       "3   400834096      I  SMITH OIL PROPERTIES INC       79905   400834096   \n",
+       "4   400834131      I        LINN OPERATING INC       10516   400834131   \n",
+       "\n",
+       "  Initial Report Date Date of Discovery  Spill Type Qtr Qtr  Section  ...  \\\n",
+       "0          04/10/2015        04/09/2015  Historical  NWNW         12  ...   \n",
+       "1          04/17/2015        04/17/2015  Historical  SESW         34  ...   \n",
+       "2          04/18/2015        04/17/2015      Recent  NWSW         23  ...   \n",
+       "3          04/30/2015        03/26/2015  Historical  NENW          4  ...   \n",
+       "4          05/01/2015        04/30/2015      Recent  NESW         15  ...   \n",
+       "\n",
+       "  total_population white_population hispanic_population  \\\n",
+       "0          11173.0           9194.0              3065.0   \n",
+       "1          11173.0           9194.0              3065.0   \n",
+       "2           2459.0           2404.0                81.0   \n",
+       "3           7335.0           6302.0              2011.0   \n",
+       "4           7240.0           5646.0              1659.0   \n",
+       "\n",
+       "   median_household_income  poverty_population unemployed_population  \\\n",
+       "0                  83193.0               247.0                 245.0   \n",
+       "1                  83193.0               247.0                 245.0   \n",
+       "2                  66683.0               330.0                  26.0   \n",
+       "3                  71440.0               831.0                 166.0   \n",
+       "4                  64573.0               693.0                 240.0   \n",
+       "\n",
+       "  percent_white percent_hispanic  percent_poverty unemployment_rate  \n",
+       "0     82.287658        27.432203         2.210686          2.192786  \n",
+       "1     82.287658        27.432203         2.210686          2.192786  \n",
+       "2     97.763318         3.294022        13.420089          1.057340  \n",
+       "3     85.916837        27.416496        11.329243          2.263122  \n",
+       "4     77.983425        22.914365         9.571823          3.314917  \n",
+       "\n",
+       "[5 rows x 118 columns]"
+      ]
+     },
+     "execution_count": 1,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "from sqlalchemy import create_engine\n",
+    "import geopandas as gpd\n",
+    "from dotenv import load_dotenv\n",
+    "load_dotenv()\n",
+    "\n",
+    "import os\n",
+    "\n",
+    "from sqlalchemy.engine import URL\n",
+    "\n",
+    "# Database connection details, read from environment variables\n",
+    "# (load_dotenv() above also picks up a local .env file if present).\n",
+    "db_name = 'colorado_spills'\n",
+    "user = os.getenv('DB_USER')\n",
+    "password = os.getenv('DB_PASSWORD')\n",
+    "host = os.getenv('DB_HOST')\n",
+    "port = os.getenv('DB_PORT')\n",
+    "\n",
+    "# Build the URL with URL.create instead of string formatting so that special\n",
+    "# characters in the credentials (e.g. '@', ':' or '/') are escaped, and an\n",
+    "# unset variable is omitted rather than rendered as the text 'None'.\n",
+    "db_url = URL.create(\n",
+    "    drivername='postgresql+psycopg2',\n",
+    "    username=user,\n",
+    "    password=password,\n",
+    "    host=host,\n",
+    "    port=int(port) if port else None,\n",
+    "    database=db_name,\n",
+    ")\n",
+    "\n",
+    "# Create an engine to connect to the PostgreSQL database\n",
+    "engine = create_engine(db_url)\n",
+    "\n",
+    "# Function to load data from a table\n",
+    "def load_table(table_name):\n",
+    "    \"\"\"Return every row of the database table `table_name` as a DataFrame.\n",
+    "\n",
+    "    The table name is interpolated directly into the SQL text, so pass only\n",
+    "    trusted, hard-coded identifiers. Reads through the notebook-level `engine`.\n",
+    "    \"\"\"\n",
+    "    return pd.read_sql(f'SELECT * FROM {table_name}', engine)\n",
+    "\n",
+    "# Load the spills table (spill records plus demographic columns) from PostgreSQL\n",
+    "spills_with_demographics = load_table('spills_with_demographics')\n",
+    "# Last expression of the cell: preview the first rows as rich output\n",
+    "spills_with_demographics.head()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "dba1c393",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "gdf_spills_with_demographics has been created.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Build point geometries from the Longitude/Latitude columns (x = longitude,\n",
+    "# y = latitude) and attach them to the spills table, tagged as EPSG:4326.\n",
+    "gdf_spills_with_demographics = gpd.GeoDataFrame(\n",
+    "    spills_with_demographics,\n",
+    "    geometry=gpd.points_from_xy(\n",
+    "        x=spills_with_demographics['Longitude'],\n",
+    "        y=spills_with_demographics['Latitude'],\n",
+    "    ),\n",
+    "    crs=\"EPSG:4326\",\n",
+    ")\n",
+    "print(\"gdf_spills_with_demographics has been created.\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "317abe05",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sanity check: every spill row must carry a total_population value,\n",
+    "# i.e. the demographic columns were populated for all spills.\n",
+    "assert spills_with_demographics['total_population'].notna().all(), \"Some spills are missing demographic data\"\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "73411f29",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "gdf_spills_with_demographics has been verified.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Verify the GeoDataFrame: no null geometries and the expected CRS.\n",
+    "# NOTE(review): points built from missing coordinates may not register as null\n",
+    "# here -- consider also checking Longitude/Latitude for NaN.\n",
+    "assert not gdf_spills_with_demographics.geometry.isna().any(), \"Some geometries are null in gdf_spills_with_demographics\"\n",
+    "assert gdf_spills_with_demographics.crs == \"EPSG:4326\", \"CRS of gdf_spills_with_demographics is not EPSG:4326\"\n",
+    "print(\"gdf_spills_with_demographics has been verified.\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "3d0d0791",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Collecting geoalchemy2\n",
+      "  Downloading geoalchemy2-0.18.0-py3-none-any.whl.metadata (2.3 kB)\n",
+      "Requirement already satisfied: SQLAlchemy>=1.4 in /home/dadams/Repos/colorado_spills/.venv/lib/python3.13/site-packages (from geoalchemy2) (2.0.43)\n",
+      "Requirement already satisfied: packaging in /home/dadams/Repos/colorado_spills/.venv/lib/python3.13/site-packages (from geoalchemy2) (25.0)\n",
+      "Requirement already satisfied: greenlet>=1 in /home/dadams/Repos/colorado_spills/.venv/lib/python3.13/site-packages (from SQLAlchemy>=1.4->geoalchemy2) (3.2.4)\n",
+      "Requirement already satisfied: typing-extensions>=4.6.0 in /home/dadams/Repos/colorado_spills/.venv/lib/python3.13/site-packages (from SQLAlchemy>=1.4->geoalchemy2) (4.15.0)\n",
+      "Downloading geoalchemy2-0.18.0-py3-none-any.whl (81 kB)\n",
+      "Installing collected packages: geoalchemy2\n",
+      "Successfully installed geoalchemy2-0.18.0\n",
+      "Note: you may need to restart the kernel to use updated packages.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Pinned to the version this notebook was last run with, for reproducibility.\n",
+    "%pip install geoalchemy2==0.18.0"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "d30657cf",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "gdf_spills_with_demographics has been created.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# NOTE(review): this cell repeats the GeoDataFrame construction from an earlier\n",
+    "# cell (apparently re-run after installing geoalchemy2); one of the two can go.\n",
+    "# Build point geometries from Longitude/Latitude and tag them as EPSG:4326.\n",
+    "gdf_spills_with_demographics = gpd.GeoDataFrame(\n",
+    "    spills_with_demographics,\n",
+    "    geometry=gpd.points_from_xy(\n",
+    "        x=spills_with_demographics['Longitude'],\n",
+    "        y=spills_with_demographics['Latitude'],\n",
+    "    ),\n",
+    "    crs=\"EPSG:4326\",\n",
+    ")\n",
+    "print(\"gdf_spills_with_demographics has been created.\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "55ac288c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sqlalchemy import text\n",
+    "\n",
+    "# Enable the PostGIS extension so the database can store the geometry column\n",
+    "# written by to_postgis(). IF NOT EXISTS makes this safe to re-run.\n",
+    "# engine.begin() commits on success and rolls back on error, so no explicit\n",
+    "# conn.commit() call is needed.\n",
+    "with engine.begin() as conn:\n",
+    "    conn.execute(text(\"CREATE EXTENSION IF NOT EXISTS postgis;\"))\n",
+    "\n",
+    "# Write the GeoDataFrame to PostGIS, replacing any existing table.\n",
+    "# NOTE(review): the following cell performs this same write again.\n",
+    "gdf_spills_with_demographics.to_postgis('gdf_spills_with_demographics', engine, if_exists='replace', index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "4f9f93a6",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "gdf_spills_with_demographics has been saved to the database.\n",
+      "gdf_spills_with_demographics has been saved to a CSV file.\n",
+      "gdf_spills_with_demographics has been saved to a GeoJSON file.\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os\n",
+    "import geoalchemy2\n",
+    "# NOTE(review): neither import is referenced by name in this cell; geoalchemy2 is\n",
+    "# presumably imported so the PostGIS geometry type is registered -- confirm.\n",
+    "# Save the GeoDataFrame to a table in the database (an existing table is replaced)\n",
+    "gdf_spills_with_demographics.to_postgis('gdf_spills_with_demographics', engine, if_exists='replace', index=False)\n",
+    "print(\"gdf_spills_with_demographics has been saved to the database.\")\n",
+    "# Save the GeoDataFrame to a CSV file (relative path: written to the working directory)\n",
+    "gdf_spills_with_demographics.to_csv('gdf_spills_with_demographics.csv', index=False)\n",
+    "print(\"gdf_spills_with_demographics has been saved to a CSV file.\")\n",
+    "# Save the GeoDataFrame as a GeoJSON file (relative path, same directory as the CSV)\n",
+    "gdf_spills_with_demographics.to_file('gdf_spills_with_demographics.geojson', driver='GeoJSON')\n",
+    "print(\"gdf_spills_with_demographics has been saved to a GeoJSON file.\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "3713ba8d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Location of the 2020 tract-level RUCA codes file. Set RUCA_CSV_PATH (for\n",
+    "# example in .env) to run on another machine; the default is the original\n",
+    "# author's local copy.\n",
+    "ruca_csv_path = os.getenv(\n",
+    "    'RUCA_CSV_PATH',\n",
+    "    '/home/dadams/CSU Fullerton Dropbox/David Adams/Research Projects/colorado_spills_ejproject/github/colorado_spills/data/RUCA-codes-2020-tract.csv',\n",
+    ")\n",
+    "\n",
+    "# Read the tract FIPS as text so leading zeros are not lost\n",
+    "ruca_df = pd.read_csv(\n",
+    "    ruca_csv_path,\n",
+    "    encoding='latin1',\n",
+    "    dtype={'TractFIPS20': str}\n",
+    ")\n",
+    "\n",
+    "# Keep and rename needed columns\n",
+    "ruca_df = ruca_df.rename(columns={\n",
+    "    'TractFIPS20': 'GEOID',\n",
+    "    'PrimaryRUCA': 'ruca_code',\n",
+    "    'PrimaryRUCADescription': 'ruca_description'\n",
+    "})[['GEOID', 'ruca_code', 'ruca_description']]\n",
+    "\n",
+    "# Ensure GEOID is 11-character string\n",
+    "ruca_df['GEOID'] = ruca_df['GEOID'].str.zfill(11)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "9051abcb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read back the table written to the database earlier in this notebook.\n",
+    "# This is a plain pandas read, so the geometry column is converted to shapely\n",
+    "# objects in a later cell.\n",
+    "spills = pd.read_sql_table('gdf_spills_with_demographics', engine)\n",
+    "\n",
+    "# Make sure GEOID is also string-padded\n",
+    "# NOTE(review): assumes GEOID is stored as an integer or string; a float column\n",
+    "# would stringify with a trailing '.0' and never match -- TODO confirm dtype.\n",
+    "spills['GEOID'] = spills['GEOID'].astype(str).str.zfill(11)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "f5578b3b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Left join keeps every spill; tracts with no RUCA row get NaN codes.\n",
+    "# validate='many_to_one' raises MergeError if the RUCA table has duplicate\n",
+    "# GEOIDs, which would otherwise silently duplicate spill rows.\n",
+    "spills_with_ruca = spills.merge(ruca_df, on='GEOID', how='left', validate='many_to_one')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "343a6d52",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def classify_rurality(code):\n",
+    "    \"\"\"Collapse a primary RUCA code into a coarse rurality label.\n",
+    "\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    code : int, float, numeric string, or missing value\n",
+    "        Primary RUCA code for a census tract.\n",
+    "\n",
+    "    Returns\n",
+    "    -------\n",
+    "    str\n",
+    "        'Urban' for codes 1-3, 'Suburban' for 4-6, 'Rural' for 7-10, and\n",
+    "        'Unknown' for missing values or any code outside 1-10 (this includes\n",
+    "        99, the RUCA 'not coded' value).\n",
+    "\n",
+    "    Raises\n",
+    "    ------\n",
+    "    ValueError\n",
+    "        If `code` is a non-missing value that int() cannot convert.\n",
+    "    \"\"\"\n",
+    "    if pd.isna(code):\n",
+    "        return 'Unknown'\n",
+    "    code = int(code)\n",
+    "    if 1 <= code <= 3:\n",
+    "        return 'Urban'\n",
+    "    if 4 <= code <= 6:\n",
+    "        return 'Suburban'\n",
+    "    if 7 <= code <= 10:\n",
+    "        return 'Rural'\n",
+    "    # Out-of-range codes (e.g. 99 = not coded) must not be counted as rural.\n",
+    "    return 'Unknown'\n",
+    "\n",
+    "# Label every spill with the rurality class derived from its primary RUCA code\n",
+    "spills_with_ruca['rurality'] = spills_with_ruca['ruca_code'].map(classify_rurality)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "49b419cb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from shapely import wkb\n",
+    "import geopandas as gpd\n",
+    "\n",
+    "# Convert WKBElement objects to Shapely; falsy entries become None.\n",
+    "# NOTE(review): relies on each value exposing a bytes-like `.data` attribute, so\n",
+    "# re-running this cell after the conversion will presumably fail -- confirm.\n",
+    "spills_with_ruca['geometry'] = spills_with_ruca['geometry'].apply(lambda g: wkb.loads(bytes(g.data)) if g else None)\n",
+    "\n",
+    "# Rebuild the GeoDataFrame on the converted column, declaring the CRS as EPSG:4326\n",
+    "spills_with_ruca = gpd.GeoDataFrame(spills_with_ruca, geometry='geometry', crs='EPSG:4326')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "f78e96b2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Persist the RUCA-enriched spills to PostGIS, replacing any existing table\n",
+    "spills_with_ruca.to_postgis('spills_with_ruca', engine, if_exists='replace', index=False)\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/data_setup/gdf_spills_with_demographics.csv
+++ b/data_setup/gdf_spills_with_demographics.csv
--- a/data_setup/gdf_spills_with_demographics.geojson
+++ b/data_setup/gdf_spills_with_demographics.geojson
--- a/data_setup/get_demographics.ipynb
+++ b/data_setup/get_demographics.ipynb