In [ ]:
import pandas as pd
from sqlalchemy import create_engine
import geopandas as gpd
import os
# Database connection details from zshrc environment variables
db_name = 'colorado_spills'
user = os.getenv('DB_USER')
password = os.getenv('DB_PASSWORD')
host = os.getenv('DB_HOST')
port = os.getenv('DB_PORT')
# Create an engine to connect to the PostgreSQL database
engine = create_engine(f'postgresql+psycopg2://{user}:{password}@{host}:{port}/{db_name}')
# Read in the spills_with_demographics as spills
spills = pd.read_sql_table('spills_with_demographics', engine)
In [ ]:
# use longitude and latitude to create a GeoDataFrame from spills data
spills['geometry'] = gpd.points_from_xy(spills['Longitude'], spills['Latitude'])
In [ ]:
# Create a GeoDataFrame from spills
spills_gdf = gpd.GeoDataFrame(spills, crs='EPSG:4326')
# Write the GeoDataFrame to a new table in the database
spills_gdf.to_postgis('spills_with_demographics_geog', engine, if_exists='replace')
# Close the database connection
engine.dispose()
In [ ]:
# Ensure the date columns are in datetime format
spills_gdf['Date of Discovery'] = pd.to_datetime(spills_gdf['Date of Discovery'])
spills_gdf['Initial Report Date'] = pd.to_datetime(spills_gdf['Initial Report Date'])
# Calculate the time difference in days
spills_gdf['Report Delay (Days)'] = (spills_gdf['Initial Report Date'] - spills_gdf['Date of Discovery']).dt.days
# Display the first few rows to check the new column
spills_gdf[['Date of Discovery', 'Initial Report Date', 'Report Delay (Days)']].head()
Out[ ]:
| Date of Discovery | Initial Report Date | Report Delay (Days) | |
|---|---|---|---|
| 0 | 2014-06-11 | 2014-06-18 | 7 |
| 1 | 2014-06-14 | 2014-06-19 | 5 |
| 2 | 2014-06-14 | 2014-06-19 | 5 |
| 3 | 2014-06-19 | 2014-06-20 | 1 |
| 4 | 2014-06-20 | 2014-06-23 | 3 |
In [ ]:
# Set negative reporting delays to zero
spills_gdf['Report Delay (Days)'] = spills_gdf['Report Delay (Days)'].apply(lambda x: max(x, 0))
# Check how many negative values were corrected
corrected_values = (spills_gdf['Report Delay (Days)'] == 0).sum()
print(f'Corrected {corrected_values} negative report delays to zero.')
Corrected 5889 negative report delays to zero.
In [ ]:
import numpy as np
from sklearn.neighbors import NearestNeighbors
# Identify rows with negative or missing median household income
negative_income_rows = spills_gdf[spills_gdf['median_household_income'] < 0].copy()
# Identify rows with valid median household income
valid_income_rows = spills_gdf[spills_gdf['median_household_income'] >= 0].copy()
In [ ]:
# Use latitude and longitude for nearest neighbor search
nbrs = NearestNeighbors(n_neighbors=1, algorithm='ball_tree').fit(valid_income_rows[['Latitude', 'Longitude']])
distances, indices = nbrs.kneighbors(negative_income_rows[['Latitude', 'Longitude']])
# Extract the nearest neighbor's median household income for each negative income row
nearest_incomes = valid_income_rows.iloc[indices.flatten()]['median_household_income'].values
In [ ]:
# Replace the negative income values with the nearest neighbor's income
spills_gdf.loc[negative_income_rows.index, 'median_household_income'] = nearest_incomes
# Check the result
imputed_values = len(negative_income_rows)
print(f'Imputed {imputed_values} negative income values using nearest neighbors.')
Imputed 159 negative income values using nearest neighbors.
In [ ]:
# Summary statistics for reporting delay
report_delay_summary = spills_gdf['Report Delay (Days)'].describe()
report_delay_summary
Out[ ]:
count 16890.000000 mean 9.911131 std 126.932370 min 0.000000 25% 0.000000 50% 1.000000 75% 2.000000 max 9261.000000 Name: Report Delay (Days), dtype: float64
In [ ]:
import seaborn as sns
import matplotlib.pyplot as plt
# Scatter plot of reporting delay vs. median household income
plt.figure(figsize=(10, 6))
sns.scatterplot(x='median_household_income', y='Report Delay (Days)', data=spills_gdf)
plt.title('Reporting Delay vs. Median Household Income')
plt.xlabel('Median Household Income')
plt.ylabel('Reporting Delay (Days)')
plt.show()
In [ ]:
import statsmodels.api as sm
# Define the independent variable (adding a constant for intercept)
X = sm.add_constant(spills_gdf['median_household_income'])
# Define the dependent variable
y = spills_gdf['Report Delay (Days)']
# Fit the regression model
model = sm.OLS(y, X).fit()
# Get the summary of the model
model_summary = model.summary()
print(model_summary)
OLS Regression Results
===============================================================================
Dep. Variable: Report Delay (Days) R-squared: 0.000
Model: OLS Adj. R-squared: -0.000
Method: Least Squares F-statistic: 0.2970
Date: Fri, 09 Aug 2024 Prob (F-statistic): 0.586
Time: 15:02:29 Log-Likelihood: -1.0577e+05
No. Observations: 16890 AIC: 2.116e+05
Df Residuals: 16888 BIC: 2.116e+05
Df Model: 1
Covariance Type: nonrobust
===========================================================================================
coef std err t P>|t| [0.025 0.975]
-------------------------------------------------------------------------------------------
const 8.1613 3.356 2.432 0.015 1.583 14.739
median_household_income 2.207e-05 4.05e-05 0.545 0.586 -5.73e-05 0.000
==============================================================================
Omnibus: 48059.926 Durbin-Watson: 1.990
Prob(Omnibus): 0.000 Jarque-Bera (JB): 3185509630.382
Skew: 37.898 Prob(JB): 0.00
Kurtosis: 2129.202 Cond. No. 2.85e+05
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.85e+05. This might indicate that there are
strong multicollinearity or other numerical problems.
In [ ]:
import seaborn as sns
import matplotlib.pyplot as plt
# Plot spill locations by median household income
plt.figure(figsize=(10, 6))
sns.histplot(data=spills_gdf, x='median_household_income', bins=30, kde=True)
plt.title('Distribution of Spills by Median Household Income')
plt.xlabel('Median Household Income')
plt.ylabel('Number of Spills')
plt.show()
# Plot spill locations by percent poverty
plt.figure(figsize=(10, 6))
sns.histplot(data=spills_gdf, x='percent_poverty', bins=30, kde=True)
plt.title('Distribution of Spills by Percent Poverty')
plt.xlabel('Percent Poverty')
plt.ylabel('Number of Spills')
plt.show()
# Plot spill locations by unemployment rate
plt.figure(figsize=(10, 6))
sns.histplot(data=spills_gdf, x='unemployment_rate', bins=30, kde=True)
plt.title('Distribution of Spills by Unemployment Rate')
plt.xlabel('Unemployment Rate')
plt.ylabel('Number of Spills')
plt.show()
In [ ]:
# Plot spill locations by percent white
plt.figure(figsize=(10, 6))
sns.histplot(data=spills_gdf, x='percent_white', bins=30, kde=True)
plt.title('Distribution of Spills by Percent White Population')
plt.xlabel('Percent White')
plt.ylabel('Number of Spills')
plt.show()
# Plot spill locations by percent hispanic
plt.figure(figsize=(10, 6))
sns.histplot(data=spills_gdf, x='percent_hispanic', bins=30, kde=True)
plt.title('Distribution of Spills by Percent Hispanic Population')
plt.xlabel('Percent Hispanic')
plt.ylabel('Number of Spills')
plt.show()
In [ ]:
# Scatter plot of reporting delay vs. median household income
plt.figure(figsize=(10, 6))
sns.scatterplot(x='median_household_income', y='Report Delay (Days)', data=spills_gdf)
plt.title('Reporting Delay vs. Median Household Income')
plt.xlabel('Median Household Income')
plt.ylabel('Reporting Delay (Days)')
plt.show()
# Scatter plot of reporting delay vs. percent poverty
plt.figure(figsize=(10, 6))
sns.scatterplot(x='percent_poverty', y='Report Delay (Days)', data=spills_gdf)
plt.title('Reporting Delay vs. Percent Poverty')
plt.xlabel('Percent Poverty')
plt.ylabel('Reporting Delay (Days)')
plt.show()
# Scatter plot of reporting delay vs. unemployment rate
plt.figure(figsize=(10, 6))
sns.scatterplot(x='unemployment_rate', y='Report Delay (Days)', data=spills_gdf)
plt.title('Reporting Delay vs. Unemployment Rate')
plt.xlabel('Unemployment Rate')
plt.ylabel('Reporting Delay (Days)')
plt.show()
In [ ]:
# Scatter plot of reporting delay vs. percent white
plt.figure(figsize=(10, 6))
sns.scatterplot(x='percent_white', y='Report Delay (Days)', data=spills_gdf)
plt.title('Reporting Delay vs. Percent White Population')
plt.xlabel('Percent White')
plt.ylabel('Reporting Delay (Days)')
plt.show()
# Scatter plot of reporting delay vs. percent hispanic
plt.figure(figsize=(10, 6))
sns.scatterplot(x='percent_hispanic', y='Report Delay (Days)', data=spills_gdf)
plt.title('Reporting Delay vs. Percent Hispanic Population')
plt.xlabel('Percent Hispanic')
plt.ylabel('Reporting Delay (Days)')
plt.show()
In [ ]:
# Select relevant columns for correlation analysis
correlation_columns = [
'Report Delay (Days)',
'median_household_income',
'percent_poverty',
'unemployment_rate',
'percent_white',
'percent_hispanic'
]
# Create a DataFrame with the selected columns
correlation_df = spills_gdf[correlation_columns]
In [ ]:
# Calculate the correlation matrix
correlation_matrix = correlation_df.corr()
# Display the correlation matrix
correlation_matrix
Out[ ]:
| Report Delay (Days) | median_household_income | percent_poverty | unemployment_rate | percent_white | percent_hispanic | |
|---|---|---|---|---|---|---|
| Report Delay (Days) | 1.000000 | 0.004194 | -0.011415 | -0.008741 | -0.033242 | -0.003342 |
| median_household_income | 0.004194 | 1.000000 | -0.572311 | -0.218962 | 0.070998 | -0.224321 |
| percent_poverty | -0.011415 | -0.572311 | 1.000000 | 0.374435 | -0.093304 | -0.032890 |
| unemployment_rate | -0.008741 | -0.218962 | 0.374435 | 1.000000 | 0.122596 | -0.131676 |
| percent_white | -0.033242 | 0.070998 | -0.093304 | 0.122596 | 1.000000 | -0.643538 |
| percent_hispanic | -0.003342 | -0.224321 | -0.032890 | -0.131676 | -0.643538 | 1.000000 |
In [ ]:
import seaborn as sns
import matplotlib.pyplot as plt
# Set up the matplotlib figure
plt.figure(figsize=(10, 8))
# Draw the heatmap
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', vmin=-1, vmax=1)
# Add title and labels
plt.title('Correlation Matrix of Spill-Related Variables and EJ Characteristics')
plt.show()
In [ ]:
## Count the number of spills by operator
operator_spill_count = spills_gdf['Operator'].value_counts()
# Display the top operators with the most spills
operator_spill_count.head(10)
Out[ ]:
Operator NOBLE ENERGY INC 2402 KERR MCGEE OIL & GAS ONSHORE LP 2243 PDC ENERGY INC 1182 CAERUS PICEANCE LLC 1051 BONANZA CREEK ENERGY OPERATING COMPANY LLC 643 KP KAUFFMAN COMPANY INC 468 CHEVRON USA INC 467 HIGHPOINT OPERATING CORPORATION 411 CRESTONE PEAK RESOURCES OPERATING LLC 394 LARAMIE ENERGY LLC 376 Name: count, dtype: int64
In [ ]:
# Count the number of spills by county
county_spill_count = spills_gdf['county'].value_counts()
# Display the top counties with the most spills
county_spill_count.head(10)
Out[ ]:
county WELD 9850 GARFIELD 1886 RIO BLANCO 1034 LAS ANIMAS 614 ADAMS 583 LA PLATA 496 MESA 362 JACKSON 271 YUMA 270 WASHINGTON 226 Name: count, dtype: int64
In [ ]:
# Identify operators with more than one spill
recurring_operator_spills = operator_spill_count[operator_spill_count > 1]
# Display the operators with recurring spills
recurring_operator_spills
Out[ ]:
Operator
NOBLE ENERGY INC 2402
KERR MCGEE OIL & GAS ONSHORE LP 2243
PDC ENERGY INC 1182
CAERUS PICEANCE LLC 1051
BONANZA CREEK ENERGY OPERATING COMPANY LLC 643
...
WOLVERINE RESOURCES LLC 2
DAN A HUGHES COMPANY LP 2
SUMMIT MIDSTREAM NIOBRARA LLC 2
LYSTER OIL COMPANY INC 2
SOVEREIGN OPERATING COMPANY LLC 2
Name: count, Length: 275, dtype: int64
In [ ]:
# Identify counties with more than one spill
recurring_county_spills = county_spill_count[county_spill_count > 1]
# Display the counties with recurring spills
recurring_county_spills
Out[ ]:
county WELD 9850 GARFIELD 1886 RIO BLANCO 1034 LAS ANIMAS 614 ADAMS 583 LA PLATA 496 MESA 362 JACKSON 271 YUMA 270 WASHINGTON 226 MOFFAT 172 ARAPAHOE 153 LARIMER 122 BOULDER 117 CHEYENNE 106 MONTEZUMA 97 LOGAN 85 LINCOLN 60 ARCHULETA 56 BROOMFIELD 52 ELBERT 50 GUNNISON 45 KIOWA 39 HUERFANO 32 MORGAN 31 BACA 29 PROWERS 10 DOLORES 9 ROUTT 9 KIT CARSON 8 PITKIN 7 DENVER 5 FREMONT 4 Name: count, dtype: int64
In [ ]:
# Group by GEOID and aggregate the number of spills and demographic characteristics
tract_spill_summary = spills_gdf.groupby('GEOID').agg({
'Document #': 'count',
'median_household_income': 'mean',
'percent_poverty': 'mean',
'percent_white': 'mean'
}).rename(columns={'Document #': 'Number of Spills'})
# Display the summary
tract_spill_summary.head()
Out[ ]:
| Number of Spills | median_household_income | percent_poverty | percent_white | |
|---|---|---|---|---|
| GEOID | ||||
| 08001008354 | 31 | 101250.0 | 10.757540 | 28.591783 |
| 08001008401 | 122 | 88286.0 | 5.918776 | 86.814788 |
| 08001008402 | 95 | 114276.0 | 4.121431 | 82.046643 |
| 08001008526 | 1 | 98283.0 | 3.801249 | 79.011675 |
| 08001008538 | 9 | 131231.0 | 4.150727 | 75.181714 |
In [ ]:
# Calculate the correlation matrix for the tract-level data
correlation_matrix_tract = tract_spill_summary.corr()
# Display the correlation matrix
correlation_matrix_tract
Out[ ]:
| Number of Spills | median_household_income | percent_poverty | percent_white | |
|---|---|---|---|---|
| Number of Spills | 1.000000 | -0.157514 | 0.133947 | -0.008777 |
| median_household_income | -0.157514 | 1.000000 | -0.625752 | -0.001162 |
| percent_poverty | 0.133947 | -0.625752 | 1.000000 | -0.067320 |
| percent_white | -0.008777 | -0.001162 | -0.067320 | 1.000000 |
In [ ]:
import seaborn as sns
import matplotlib.pyplot as plt
# Scatter plot of Number of Spills vs. Median Household Income
plt.figure(figsize=(10, 6))
sns.scatterplot(x='median_household_income', y='Number of Spills', data=tract_spill_summary)
plt.title('Number of Spills vs. Median Household Income (Census Tract Level)')
plt.xlabel('Median Household Income')
plt.ylabel('Number of Spills')
plt.show()
# Scatter plot of Number of Spills vs. Percent Poverty
plt.figure(figsize=(10, 6))
sns.scatterplot(x='percent_poverty', y='Number of Spills', data=tract_spill_summary)
plt.title('Number of Spills vs. Percent Poverty (Census Tract Level)')
plt.xlabel('Percent Poverty')
plt.ylabel('Number of Spills')
plt.show()
# Scatter plot of Number of Spills vs. Percent White Population
plt.figure(figsize=(10, 6))
sns.scatterplot(x='percent_white', y='Number of Spills', data=tract_spill_summary)
plt.title('Number of Spills vs. Percent White Population (Census Tract Level)')
plt.xlabel('Percent White Population')
plt.ylabel('Number of Spills')
plt.show()
In [ ]:
# Remove rows with NaN or infinite values
tract_spill_summary_cleaned = tract_spill_summary.replace([np.inf, -np.inf], np.nan).dropna()
# Check that the data is clean
tract_spill_summary_cleaned.isna().sum()
Out[ ]:
Number of Spills 0 median_household_income 0 percent_poverty 0 percent_white 0 dtype: int64
In [ ]:
# Define the independent variables (with a constant for intercept)
X = tract_spill_summary_cleaned[['median_household_income', 'percent_poverty', 'percent_white']]
X = sm.add_constant(X)
# Define the dependent variable
y = tract_spill_summary_cleaned['Number of Spills']
# Fit the regression model
model = sm.OLS(y, X).fit()
# Get the summary of the model
model_summary = model.summary()
print(model_summary)
OLS Regression Results
==============================================================================
Dep. Variable: Number of Spills R-squared: 0.027
Model: OLS Adj. R-squared: 0.008
Method: Least Squares F-statistic: 1.403
Date: Fri, 09 Aug 2024 Prob (F-statistic): 0.244
Time: 15:08:59 Log-Likelihood: -1091.6
No. Observations: 157 AIC: 2191.
Df Residuals: 153 BIC: 2203.
Df Model: 3
Covariance Type: nonrobust
===========================================================================================
coef std err t P>|t| [0.025 0.975]
-------------------------------------------------------------------------------------------
const 185.0017 197.832 0.935 0.351 -205.834 575.837
median_household_income -0.0010 0.001 -1.178 0.241 -0.003 0.001
percent_poverty 2.5445 4.494 0.566 0.572 -6.335 11.424
percent_white -0.1193 1.909 -0.063 0.950 -3.890 3.652
==============================================================================
Omnibus: 196.289 Durbin-Watson: 1.335
Prob(Omnibus): 0.000 Jarque-Bera (JB): 6950.948
Skew: 4.985 Prob(JB): 0.00
Kurtosis: 34.035 Cond. No. 9.33e+05
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 9.33e+05. This might indicate that there are
strong multicollinearity or other numerical problems.
In [ ]:
# Re-calculate the correlation matrix
correlation_matrix_cleaned = tract_spill_summary_cleaned.corr()
# Display the cleaned correlation matrix
correlation_matrix_cleaned
Out[ ]:
| Number of Spills | median_household_income | percent_poverty | percent_white | |
|---|---|---|---|---|
| Number of Spills | 1.000000 | -0.157012 | 0.133947 | -0.008777 |
| median_household_income | -0.157012 | 1.000000 | -0.625752 | -0.001162 |
| percent_poverty | 0.133947 | -0.625752 | 1.000000 | -0.067320 |
| percent_white | -0.008777 | -0.001162 | -0.067320 | 1.000000 |
In [ ]: