Files
colorado_spills/analysis/vunerable_analysis.ipynb
2025-03-08 18:07:58 -08:00

195 lines
10 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: python-dotenv in /home/dadams/miniconda3/envs/spatial_env2/lib/python3.10/site-packages (1.0.1)\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"# Pinned to the release this notebook was last run with, so a fresh environment installs the same version\n",
"%pip install python-dotenv==1.0.1"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Unique columns in spills_df: ['Document #', 'Report', 'Operator', 'Operator #', 'Tracking #', 'Initial Report Date', 'Date of Discovery', 'Spill Type', 'Qtr Qtr', 'Section', 'Township', 'range', 'meridian', 'Latitude', 'Longitude', 'Municipality', 'county', 'Facility Type', 'Facility ID', 'API County Code', 'API Sequence Number', 'Spilled outside of berms', 'More than five barrels spilled', 'Oil Spill Volume', 'Condensate Spill Volume', 'Flow Back Spill Volume', 'Produced Water Spill Volume', 'E&P Waste Spill Volume', 'Other Waste', 'Drilling Fluid Spill Volume', 'Current Land Use', 'Other Land Use', 'Weather Conditions', 'Surface Owner', 'Surface Owner Other', 'Waters of the State', 'Residence / Occupied Structure', 'livestock', 'Public Byway', 'Surface Water Supply Area', 'Spill Description', 'Supplemental Report Date', 'Oil BBLs Spilled', 'Oil BBLs Recovered', 'Oil Unknown', 'Condensate BBLs Spilled', 'Condensate BBLs Recovered', 'Condensate Unknown', 'Produced Water BBLs Spilled', 'Produced Water BBLs Recovered', 'Produced Water Unknown', 'Drilling Fluid BBLs Spilled', 'Drilling Fluid BBLs Recovered', 'Drilling Fluid Unknown', 'Flow Back Fluid BBLs Spilled', 'Flow Back Fluid BBLs Recovered', 'Flow Back Fluid Unkown', 'Other E&P Waste BBLS Spilled', 'Other E&P Waste BBLS Recovered', 'Other E&P Waste Unknown', 'Other E&P Waste', 'Spill Contained within Berm', 'Emergency Pit Constructed', 'soil', 'groundwater', 'Surface Water', 'Dry Drainage Feature', 'Surface Area Length', 'Surface Area Width', 'Depth of Impact in Feet', 'Depth of Impact in Inches', 'Area Depth Determined', 'Geology Description', 'Depth to Groundwater', 'Water wells in area', 'Water Wells', 'Water Wells None', 'Surface Water Near', 'Surface Water None', 'Wetlands', 'Wetlands None', 'Springs', 'Springs None', 'Livestock Near', 'Livestock None', 'Occupied Buildings', 'Occupied Buildings None', 'Additional Spill Details', 'Supplemental Report Date CA', 'Human Error', 'Equipment Failure', 
'Historical Unkown', 'Other', 'Other Description', 'Root Cause', 'Preventative Measures', 'Soil Excavated', 'Offsite Disposal', 'Onsite Treatment', 'Other Disposition', 'Other Disposition Description', 'Ground Water Removed', 'Surface Water Removed', 'Corrective Actions Completed', 'Approved Form 27', 'Form 27 Project Number', 'geometry', 'GEOID', 'TRACT_NAME', 'total_population', 'white_population', 'hispanic_population', 'median_household_income', 'poverty_population', 'unemployed_population', 'percent_white', 'percent_hispanic', 'percent_poverty', 'unemployment_rate', 'Report Delay (Days)', 'Report Year']\n",
"Analysis completed. Results saved to CSV files.\n"
]
}
],
"source": [
"import os\n",
"\n",
"import geopandas as gpd\n",
"import numpy as np\n",
"import pandas as pd\n",
"from dotenv import load_dotenv\n",
"from shapely.geometry import Point\n",
"from sqlalchemy import create_engine\n",
"from sqlalchemy.engine import URL\n",
"\n",
"# Pull settings from a local .env file (if any) into the process environment\n",
"load_dotenv()\n",
"\n",
"# Directory holding the GeoJSON layers. It defaults to the original checkout\n",
"# location and can be overridden with SPILLS_DATA_DIR (environment or .env)\n",
"# so the notebook is not tied to one machine's home directory.\n",
"data_dir = os.getenv(\"SPILLS_DATA_DIR\", \"/home/dadams/Repos/colorado_spills\")\n",
"\n",
"# Load GeoJSON data\n",
"schools_gdf = gpd.read_file(os.path.join(data_dir, \"Public_School_Locations_-_Current.geojson\"))\n",
"nursing_homes_gdf = gpd.read_file(os.path.join(data_dir, \"Nursing_Homes.geojson\"))\n",
"\n",
"# Put both layers in the CRS that the spill points are given below\n",
"target_crs = \"EPSG:4326\"  # WGS 84 longitude/latitude\n",
"schools_gdf = schools_gdf.to_crs(target_crs)\n",
"nursing_homes_gdf = nursing_homes_gdf.to_crs(target_crs)\n",
"\n",
"# Database connection details are read from environment variables\n",
"db_name = 'colorado_spills'\n",
"user = os.getenv('DB_USER')\n",
"password = os.getenv('DB_PASSWORD')\n",
"host = os.getenv('DB_HOST')\n",
"port = os.getenv('DB_PORT', '5432')  # fall back to 5432 when DB_PORT is unset\n",
"\n",
"# Stop early with a readable message: an unset user or host would otherwise be\n",
"# passed along as a missing value and fail later with a less obvious error.\n",
"missing_settings = [\n",
"    name\n",
"    for name, value in ((\"DB_USER\", user), (\"DB_HOST\", host))\n",
"    if not value\n",
"]\n",
"if missing_settings:\n",
"    raise RuntimeError(\"Missing database settings: \" + \", \".join(missing_settings))\n",
"\n",
"# Build the URL from components so special characters in the password\n",
"# (such as '@', ':' or '/') are escaped instead of corrupting the URL.\n",
"connection_url = URL.create(\n",
"    drivername=\"postgresql+psycopg2\",\n",
"    username=user,\n",
"    password=password,\n",
"    host=host,\n",
"    port=int(port),\n",
"    database=db_name,\n",
")\n",
"engine = create_engine(connection_url)\n",
"\n",
"# SELECT * already returns \"Longitude\" and \"Latitude\"; listing them a second\n",
"# time only produced duplicate columns in the result.\n",
"spills_query = \"\"\"\n",
"SELECT *\n",
"FROM spills_with_demographics_geog;\n",
"\"\"\"\n",
"\n",
"# Read the whole table into a DataFrame with a single query\n",
"spills_df = pd.read_sql(spills_query, engine)\n",
"\n",
"# Keep only the first occurrence of any repeated column name\n",
"unique_column_mask = ~spills_df.columns.duplicated()\n",
"spills_df = spills_df.loc[:, unique_column_mask]\n",
"\n",
"# Print the remaining column names as a structure check\n",
"print(\"Unique columns in spills_df:\", spills_df.columns.tolist())\n",
"\n",
"# Coerce the coordinates to numbers (unparseable values become NaN),\n",
"# then discard rows that lack either coordinate\n",
"spills_df = spills_df.assign(\n",
"    Longitude=pd.to_numeric(spills_df[\"Longitude\"], errors=\"coerce\"),\n",
"    Latitude=pd.to_numeric(spills_df[\"Latitude\"], errors=\"coerce\"),\n",
").dropna(subset=[\"Longitude\", \"Latitude\"])\n",
"\n",
"# Build point geometries from the coordinates and label them as EPSG:4326\n",
"# (the coordinates are assumed to be WGS 84)\n",
"spill_points = gpd.points_from_xy(spills_df[\"Longitude\"].values, spills_df[\"Latitude\"].values)\n",
"spills_gdf = gpd.GeoDataFrame(spills_df, geometry=spill_points).set_crs(\"EPSG:4326\")\n",
"\n",
"# Radius, in ground metres, within which a spill counts as near a facility\n",
"buffer_distance = 5000  # 5 km radius\n",
"\n",
"# Buffering needs a projected CRS, but Web Mercator (EPSG:3857) units are not ground\n",
"# metres: lengths are stretched by roughly 1 / cos(latitude). At about 39 deg N an\n",
"# uncorrected 5000-unit buffer spans only ~3.9 km on the ground, so each facility's\n",
"# radius is scaled by its own latitude before buffering.\n",
"projected_crs = \"EPSG:3857\"  # Web Mercator Projection\n",
"\n",
"\n",
"def buffer_in_ground_metres(facilities_gdf, radius_m, metric_crs, output_crs):\n",
"    \"\"\"Return a copy of ``facilities_gdf`` with each geometry buffered by ``radius_m``.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    facilities_gdf : GeoDataFrame with a CRS set.\n",
"    radius_m : buffer radius in ground metres.\n",
"    metric_crs : Mercator-type CRS used for buffering; the latitude correction\n",
"        applied here is the one appropriate for EPSG:3857.\n",
"    output_crs : CRS of the returned frame.\n",
"    \"\"\"\n",
"    # Read each feature's latitude in degrees, whatever CRS the input is in\n",
"    latitudes = facilities_gdf.to_crs(\"EPSG:4326\").geometry.representative_point().y\n",
"    # Mercator units per ground metre at each feature's latitude\n",
"    mercator_scale = 1.0 / np.cos(np.radians(latitudes.to_numpy()))\n",
"\n",
"    projected = facilities_gdf.to_crs(metric_crs)\n",
"    projected[\"geometry\"] = projected.geometry.buffer(radius_m * mercator_scale)\n",
"    return projected.to_crs(output_crs)\n",
"\n",
"\n",
"# Replace each facility point with its buffer polygon, back in the geographic CRS\n",
"schools_gdf = buffer_in_ground_metres(schools_gdf, buffer_distance, projected_crs, target_crs)\n",
"nursing_homes_gdf = buffer_in_ground_metres(nursing_homes_gdf, buffer_distance, projected_crs, target_crs)\n",
"\n",
"# Spatial join: one output row per (spill, facility buffer) pair that intersects\n",
"spills_near_schools = gpd.sjoin(spills_gdf, schools_gdf, how=\"inner\", predicate=\"intersects\")\n",
"spills_near_nursing_homes = gpd.sjoin(spills_gdf, nursing_homes_gdf, how=\"inner\", predicate=\"intersects\")\n",
"\n",
"# Per-facility roll-up: sum the two volume columns and average the report delay\n",
"spill_metrics = {\n",
"    \"Oil Spill Volume\": \"sum\",\n",
"    \"Produced Water Spill Volume\": \"sum\",\n",
"    \"Report Delay (Days)\": \"mean\",\n",
"}\n",
"\n",
"school_spill_summary = spills_near_schools.groupby(\"NCESSCH\").agg(spill_metrics).reset_index()\n",
"nursing_home_spill_summary = spills_near_nursing_homes.groupby(\"ID\").agg(spill_metrics).reset_index()\n",
"\n",
"# Write each summary to a CSV file in the working directory\n",
"for summary_df, csv_name in (\n",
"    (school_spill_summary, \"school_spill_analysis.csv\"),\n",
"    (nursing_home_spill_summary, \"nursing_home_spill_analysis.csv\"),\n",
"):\n",
"    summary_df.to_csv(csv_name, index=False)\n",
"\n",
"print(\"Analysis completed. Results saved to CSV files.\")\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Columns in schools_gdf: ['OBJECTID', 'NCESSCH', 'LEAID', 'NAME', 'OPSTFIPS', 'STREET', 'CITY', 'STATE', 'ZIP', 'STFIP', 'CNTY', 'NMCNTY', 'LOCALE', 'LAT', 'LON', 'CBSA', 'NMCBSA', 'CBSATYPE', 'CSA', 'NMCSA', 'NECTA', 'NMNECTA', 'CD', 'SLDL', 'SLDU', 'SCHOOLYEAR', 'geometry']\n",
"Columns in nursing_homes_gdf: ['OBJECTID', 'ID', 'NAME', 'ADDRESS', 'CITY', 'STATE', 'ZIP', 'ZIP4', 'TELEPHONE', 'TYPE', 'STATUS', 'POPULATION', 'COUNTY', 'COUNTYFIPS', 'COUNTRY', 'LATITUDE', 'LONGITUDE', 'NAICS_CODE', 'NAICS_DESC', 'SOURCE', 'SOURCEDATE', 'VAL_METHOD', 'VAL_DATE', 'WEBSITE', 'TOT_RES', 'TOT_STAFF', 'BEDS', 'EXCESS_BED', 'OWNERSHIP', 'MEDICAIDID', 'MEDICAREID', 'STATE_LIC', 'SOURCETYPE', 'geometry']\n"
]
}
],
"source": [
"# Debugging: Check column names in schools_gdf and nursing_homes_gdf\n",
"print(\"Columns in schools_gdf:\", schools_gdf.columns.tolist())\n",
"print(\"Columns in nursing_homes_gdf:\", nursing_homes_gdf.columns.tolist())\n",
"\n",
"\n",
"def pick_id_column(joined_gdf, candidates):\n",
"    \"\"\"Return the first name in ``candidates`` that is a column of ``joined_gdf``.\n",
"\n",
"    Raises KeyError when none is present. The previous fallback to the first\n",
"    column of the facility layer selected OBJECTID, a row number rather than a\n",
"    facility identifier, and did so silently.\n",
"    \"\"\"\n",
"    for column_name in candidates:\n",
"        if column_name in joined_gdf.columns:\n",
"            return column_name\n",
"    raise KeyError(\n",
"        f\"None of {list(candidates)} is a column of the joined frame: {joined_gdf.columns.tolist()}\"\n",
"    )\n",
"\n",
"\n",
"# Look the identifier up in the joined frames, since those are what get grouped\n",
"correct_school_id_col = pick_id_column(spills_near_schools, [\"school_id\", \"NCESSCH\"])\n",
"correct_nursing_home_id_col = pick_id_column(spills_near_nursing_homes, [\"nursing_home_id\", \"ID\"])\n",
"\n",
"# Same aggregation for both facility types: sum the volume columns, average the delay\n",
"spill_metrics_by_facility = {\n",
"    \"Oil Spill Volume\": \"sum\",\n",
"    \"Produced Water Spill Volume\": \"sum\",\n",
"    \"Report Delay (Days)\": \"mean\",\n",
"}\n",
"\n",
"school_spill_summary = (\n",
"    spills_near_schools.groupby(correct_school_id_col).agg(spill_metrics_by_facility).reset_index()\n",
")\n",
"nursing_home_spill_summary = (\n",
"    spills_near_nursing_homes.groupby(correct_nursing_home_id_col).agg(spill_metrics_by_facility).reset_index()\n",
")\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "spatial_env2",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.15"
}
},
"nbformat": 4,
"nbformat_minor": 2
}