In [ ]:
import pandas as pd
from sqlalchemy import create_engine
import geopandas as gpd

import os

# Database connection details from zshrc environment variables
db_name = 'colorado_spills'
user = os.getenv('DB_USER')
password = os.getenv('DB_PASSWORD')
host = os.getenv('DB_HOST')
port = os.getenv('DB_PORT')


# Create an engine to connect to the PostgreSQL database
engine = create_engine(f'postgresql+psycopg2://{user}:{password}@{host}:{port}/{db_name}')

# Read in the spills_with_demographics as spills
spills = pd.read_sql_table('spills_with_demographics', engine)
In [ ]:
# use longitude and latitude to create a GeoDataFrame from spills data
spills['geometry'] = gpd.points_from_xy(spills['Longitude'], spills['Latitude'])
In [ ]:
# Create a GeoDataFrame from spills
spills_gdf = gpd.GeoDataFrame(spills, crs='EPSG:4326') 

# Write the GeoDataFrame to a new table in the database
spills_gdf.to_postgis('spills_with_demographics_geog', engine, if_exists='replace')

# Close the database connection
engine.dispose()
In [ ]:
# Ensure the date columns are in datetime format
spills_gdf['Date of Discovery'] = pd.to_datetime(spills_gdf['Date of Discovery'])
spills_gdf['Initial Report Date'] = pd.to_datetime(spills_gdf['Initial Report Date'])

# Calculate the time difference in days
spills_gdf['Report Delay (Days)'] = (spills_gdf['Initial Report Date'] - spills_gdf['Date of Discovery']).dt.days

# Display the first few rows to check the new column
spills_gdf[['Date of Discovery', 'Initial Report Date', 'Report Delay (Days)']].head()
Out[ ]:
Date of Discovery Initial Report Date Report Delay (Days)
0 2014-06-11 2014-06-18 7
1 2014-06-14 2014-06-19 5
2 2014-06-14 2014-06-19 5
3 2014-06-19 2014-06-20 1
4 2014-06-20 2014-06-23 3
In [ ]:
# Set negative reporting delays to zero
spills_gdf['Report Delay (Days)'] = spills_gdf['Report Delay (Days)'].apply(lambda x: max(x, 0))

# Check how many negative values were corrected
corrected_values = (spills_gdf['Report Delay (Days)'] == 0).sum()
print(f'Corrected {corrected_values} negative report delays to zero.')
Corrected 5889 negative report delays to zero.
In [ ]:
import numpy as np
from sklearn.neighbors import NearestNeighbors

# Identify rows with negative or missing median household income
negative_income_rows = spills_gdf[spills_gdf['median_household_income'] < 0].copy()

# Identify rows with valid median household income
valid_income_rows = spills_gdf[spills_gdf['median_household_income'] >= 0].copy()
In [ ]:
# Use latitude and longitude for nearest neighbor search
nbrs = NearestNeighbors(n_neighbors=1, algorithm='ball_tree').fit(valid_income_rows[['Latitude', 'Longitude']])
distances, indices = nbrs.kneighbors(negative_income_rows[['Latitude', 'Longitude']])

# Extract the nearest neighbor's median household income for each negative income row
nearest_incomes = valid_income_rows.iloc[indices.flatten()]['median_household_income'].values
In [ ]:
# Replace the negative income values with the nearest neighbor's income
spills_gdf.loc[negative_income_rows.index, 'median_household_income'] = nearest_incomes

# Check the result
imputed_values = len(negative_income_rows)
print(f'Imputed {imputed_values} negative income values using nearest neighbors.')
Imputed 159 negative income values using nearest neighbors.
In [ ]:
# Summary statistics for reporting delay
report_delay_summary = spills_gdf['Report Delay (Days)'].describe()
report_delay_summary
Out[ ]:
count    16890.000000
mean         9.911131
std        126.932370
min          0.000000
25%          0.000000
50%          1.000000
75%          2.000000
max       9261.000000
Name: Report Delay (Days), dtype: float64
In [ ]:
import seaborn as sns
import matplotlib.pyplot as plt

# Scatter plot of reporting delay vs. median household income
plt.figure(figsize=(10, 6))
sns.scatterplot(x='median_household_income', y='Report Delay (Days)', data=spills_gdf)
plt.title('Reporting Delay vs. Median Household Income')
plt.xlabel('Median Household Income')
plt.ylabel('Reporting Delay (Days)')
plt.show()
No description has been provided for this image
In [ ]:
import statsmodels.api as sm

# Define the independent variable (adding a constant for intercept)
X = sm.add_constant(spills_gdf['median_household_income'])

# Define the dependent variable
y = spills_gdf['Report Delay (Days)']

# Fit the regression model
model = sm.OLS(y, X).fit()

# Get the summary of the model
model_summary = model.summary()
print(model_summary)
                             OLS Regression Results                            
===============================================================================
Dep. Variable:     Report Delay (Days)   R-squared:                       0.000
Model:                             OLS   Adj. R-squared:                 -0.000
Method:                  Least Squares   F-statistic:                    0.2970
Date:                 Fri, 09 Aug 2024   Prob (F-statistic):              0.586
Time:                         15:02:29   Log-Likelihood:            -1.0577e+05
No. Observations:                16890   AIC:                         2.116e+05
Df Residuals:                    16888   BIC:                         2.116e+05
Df Model:                            1                                         
Covariance Type:             nonrobust                                         
===========================================================================================
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                       8.1613      3.356      2.432      0.015       1.583      14.739
median_household_income  2.207e-05   4.05e-05      0.545      0.586   -5.73e-05       0.000
==============================================================================
Omnibus:                    48059.926   Durbin-Watson:                   1.990
Prob(Omnibus):                  0.000   Jarque-Bera (JB):       3185509630.382
Skew:                          37.898   Prob(JB):                         0.00
Kurtosis:                    2129.202   Cond. No.                     2.85e+05
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.85e+05. This might indicate that there are
strong multicollinearity or other numerical problems.
In [ ]:
import seaborn as sns
import matplotlib.pyplot as plt

# Plot spill locations by median household income
plt.figure(figsize=(10, 6))
sns.histplot(data=spills_gdf, x='median_household_income', bins=30, kde=True)
plt.title('Distribution of Spills by Median Household Income')
plt.xlabel('Median Household Income')
plt.ylabel('Number of Spills')
plt.show()

# Plot spill locations by percent poverty
plt.figure(figsize=(10, 6))
sns.histplot(data=spills_gdf, x='percent_poverty', bins=30, kde=True)
plt.title('Distribution of Spills by Percent Poverty')
plt.xlabel('Percent Poverty')
plt.ylabel('Number of Spills')
plt.show()

# Plot spill locations by unemployment rate
plt.figure(figsize=(10, 6))
sns.histplot(data=spills_gdf, x='unemployment_rate', bins=30, kde=True)
plt.title('Distribution of Spills by Unemployment Rate')
plt.xlabel('Unemployment Rate')
plt.ylabel('Number of Spills')
plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [ ]:
# Plot spill locations by percent white
plt.figure(figsize=(10, 6))
sns.histplot(data=spills_gdf, x='percent_white', bins=30, kde=True)
plt.title('Distribution of Spills by Percent White Population')
plt.xlabel('Percent White')
plt.ylabel('Number of Spills')
plt.show()

# Plot spill locations by percent hispanic
plt.figure(figsize=(10, 6))
sns.histplot(data=spills_gdf, x='percent_hispanic', bins=30, kde=True)
plt.title('Distribution of Spills by Percent Hispanic Population')
plt.xlabel('Percent Hispanic')
plt.ylabel('Number of Spills')
plt.show()
No description has been provided for this image
No description has been provided for this image
In [ ]:
# Scatter plot of reporting delay vs. median household income
plt.figure(figsize=(10, 6))
sns.scatterplot(x='median_household_income', y='Report Delay (Days)', data=spills_gdf)
plt.title('Reporting Delay vs. Median Household Income')
plt.xlabel('Median Household Income')
plt.ylabel('Reporting Delay (Days)')
plt.show()

# Scatter plot of reporting delay vs. percent poverty
plt.figure(figsize=(10, 6))
sns.scatterplot(x='percent_poverty', y='Report Delay (Days)', data=spills_gdf)
plt.title('Reporting Delay vs. Percent Poverty')
plt.xlabel('Percent Poverty')
plt.ylabel('Reporting Delay (Days)')
plt.show()

# Scatter plot of reporting delay vs. unemployment rate
plt.figure(figsize=(10, 6))
sns.scatterplot(x='unemployment_rate', y='Report Delay (Days)', data=spills_gdf)
plt.title('Reporting Delay vs. Unemployment Rate')
plt.xlabel('Unemployment Rate')
plt.ylabel('Reporting Delay (Days)')
plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [ ]:
# Scatter plot of reporting delay vs. percent white
plt.figure(figsize=(10, 6))
sns.scatterplot(x='percent_white', y='Report Delay (Days)', data=spills_gdf)
plt.title('Reporting Delay vs. Percent White Population')
plt.xlabel('Percent White')
plt.ylabel('Reporting Delay (Days)')
plt.show()

# Scatter plot of reporting delay vs. percent hispanic
plt.figure(figsize=(10, 6))
sns.scatterplot(x='percent_hispanic', y='Report Delay (Days)', data=spills_gdf)
plt.title('Reporting Delay vs. Percent Hispanic Population')
plt.xlabel('Percent Hispanic')
plt.ylabel('Reporting Delay (Days)')
plt.show()
No description has been provided for this image
No description has been provided for this image
In [ ]:
# Select relevant columns for correlation analysis
correlation_columns = [
    'Report Delay (Days)', 
    'median_household_income', 
    'percent_poverty', 
    'unemployment_rate', 
    'percent_white', 
    'percent_hispanic'
]

# Create a DataFrame with the selected columns
correlation_df = spills_gdf[correlation_columns]
In [ ]:
# Calculate the correlation matrix
correlation_matrix = correlation_df.corr()

# Display the correlation matrix
correlation_matrix
Out[ ]:
Report Delay (Days) median_household_income percent_poverty unemployment_rate percent_white percent_hispanic
Report Delay (Days) 1.000000 0.004194 -0.011415 -0.008741 -0.033242 -0.003342
median_household_income 0.004194 1.000000 -0.572311 -0.218962 0.070998 -0.224321
percent_poverty -0.011415 -0.572311 1.000000 0.374435 -0.093304 -0.032890
unemployment_rate -0.008741 -0.218962 0.374435 1.000000 0.122596 -0.131676
percent_white -0.033242 0.070998 -0.093304 0.122596 1.000000 -0.643538
percent_hispanic -0.003342 -0.224321 -0.032890 -0.131676 -0.643538 1.000000
In [ ]:
import seaborn as sns
import matplotlib.pyplot as plt

# Set up the matplotlib figure
plt.figure(figsize=(10, 8))

# Draw the heatmap
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', vmin=-1, vmax=1)

# Add title and labels
plt.title('Correlation Matrix of Spill-Related Variables and EJ Characteristics')
plt.show()
No description has been provided for this image
In [ ]:
## Count the number of spills by operator
operator_spill_count = spills_gdf['Operator'].value_counts()

# Display the top operators with the most spills
operator_spill_count.head(10)
Out[ ]:
Operator
NOBLE ENERGY INC                              2402
KERR MCGEE OIL & GAS ONSHORE LP               2243
PDC ENERGY INC                                1182
CAERUS PICEANCE LLC                           1051
BONANZA CREEK ENERGY OPERATING COMPANY LLC     643
KP KAUFFMAN COMPANY INC                        468
CHEVRON USA INC                                467
HIGHPOINT OPERATING CORPORATION                411
CRESTONE PEAK RESOURCES OPERATING LLC          394
LARAMIE ENERGY LLC                             376
Name: count, dtype: int64
In [ ]:
# Count the number of spills by county
county_spill_count = spills_gdf['county'].value_counts()

# Display the top counties with the most spills
county_spill_count.head(10)
Out[ ]:
county
WELD               9850
GARFIELD           1886
RIO BLANCO         1034
LAS ANIMAS          614
ADAMS               583
LA PLATA            496
MESA                362
JACKSON             271
YUMA                270
WASHINGTON          226
Name: count, dtype: int64
In [ ]:
# Identify operators with more than one spill
recurring_operator_spills = operator_spill_count[operator_spill_count > 1]

# Display the operators with recurring spills
recurring_operator_spills
Out[ ]:
Operator
NOBLE ENERGY INC                              2402
KERR MCGEE OIL & GAS ONSHORE LP               2243
PDC ENERGY INC                                1182
CAERUS PICEANCE LLC                           1051
BONANZA CREEK ENERGY OPERATING COMPANY LLC     643
                                              ... 
WOLVERINE RESOURCES LLC                          2
DAN A HUGHES COMPANY LP                          2
SUMMIT MIDSTREAM NIOBRARA LLC                    2
LYSTER OIL COMPANY INC                           2
SOVEREIGN OPERATING COMPANY LLC                  2
Name: count, Length: 275, dtype: int64
In [ ]:
# Identify counties with more than one spill
recurring_county_spills = county_spill_count[county_spill_count > 1]

# Display the counties with recurring spills
recurring_county_spills
Out[ ]:
county
WELD               9850
GARFIELD           1886
RIO BLANCO         1034
LAS ANIMAS          614
ADAMS               583
LA PLATA            496
MESA                362
JACKSON             271
YUMA                270
WASHINGTON          226
MOFFAT              172
ARAPAHOE            153
LARIMER             122
BOULDER             117
CHEYENNE            106
MONTEZUMA            97
LOGAN                85
LINCOLN              60
ARCHULETA            56
BROOMFIELD           52
ELBERT               50
GUNNISON             45
KIOWA                39
HUERFANO             32
MORGAN               31
BACA                 29
PROWERS              10
DOLORES               9
ROUTT                 9
KIT CARSON            8
PITKIN                7
DENVER                5
FREMONT               4
Name: count, dtype: int64
In [ ]:
# Group by GEOID and aggregate the number of spills and demographic characteristics
tract_spill_summary = spills_gdf.groupby('GEOID').agg({
    'Document #': 'count',
    'median_household_income': 'mean',
    'percent_poverty': 'mean',
    'percent_white': 'mean'
}).rename(columns={'Document #': 'Number of Spills'})

# Display the summary
tract_spill_summary.head()
Out[ ]:
Number of Spills median_household_income percent_poverty percent_white
GEOID
08001008354 31 101250.0 10.757540 28.591783
08001008401 122 88286.0 5.918776 86.814788
08001008402 95 114276.0 4.121431 82.046643
08001008526 1 98283.0 3.801249 79.011675
08001008538 9 131231.0 4.150727 75.181714
In [ ]:
# Calculate the correlation matrix for the tract-level data
correlation_matrix_tract = tract_spill_summary.corr()

# Display the correlation matrix
correlation_matrix_tract
Out[ ]:
Number of Spills median_household_income percent_poverty percent_white
Number of Spills 1.000000 -0.157514 0.133947 -0.008777
median_household_income -0.157514 1.000000 -0.625752 -0.001162
percent_poverty 0.133947 -0.625752 1.000000 -0.067320
percent_white -0.008777 -0.001162 -0.067320 1.000000
In [ ]:
import seaborn as sns
import matplotlib.pyplot as plt

# Scatter plot of Number of Spills vs. Median Household Income
plt.figure(figsize=(10, 6))
sns.scatterplot(x='median_household_income', y='Number of Spills', data=tract_spill_summary)
plt.title('Number of Spills vs. Median Household Income (Census Tract Level)')
plt.xlabel('Median Household Income')
plt.ylabel('Number of Spills')
plt.show()

# Scatter plot of Number of Spills vs. Percent Poverty
plt.figure(figsize=(10, 6))
sns.scatterplot(x='percent_poverty', y='Number of Spills', data=tract_spill_summary)
plt.title('Number of Spills vs. Percent Poverty (Census Tract Level)')
plt.xlabel('Percent Poverty')
plt.ylabel('Number of Spills')
plt.show()

# Scatter plot of Number of Spills vs. Percent White Population
plt.figure(figsize=(10, 6))
sns.scatterplot(x='percent_white', y='Number of Spills', data=tract_spill_summary)
plt.title('Number of Spills vs. Percent White Population (Census Tract Level)')
plt.xlabel('Percent White Population')
plt.ylabel('Number of Spills')
plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [ ]:
# Remove rows with NaN or infinite values
tract_spill_summary_cleaned = tract_spill_summary.replace([np.inf, -np.inf], np.nan).dropna()

# Check that the data is clean
tract_spill_summary_cleaned.isna().sum()
Out[ ]:
Number of Spills           0
median_household_income    0
percent_poverty            0
percent_white              0
dtype: int64
In [ ]:
# Define the independent variables (with a constant for intercept)
X = tract_spill_summary_cleaned[['median_household_income', 'percent_poverty', 'percent_white']]
X = sm.add_constant(X)

# Define the dependent variable
y = tract_spill_summary_cleaned['Number of Spills']

# Fit the regression model
model = sm.OLS(y, X).fit()

# Get the summary of the model
model_summary = model.summary()
print(model_summary)
                            OLS Regression Results                            
==============================================================================
Dep. Variable:       Number of Spills   R-squared:                       0.027
Model:                            OLS   Adj. R-squared:                  0.008
Method:                 Least Squares   F-statistic:                     1.403
Date:                Fri, 09 Aug 2024   Prob (F-statistic):              0.244
Time:                        15:08:59   Log-Likelihood:                -1091.6
No. Observations:                 157   AIC:                             2191.
Df Residuals:                     153   BIC:                             2203.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
===========================================================================================
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     185.0017    197.832      0.935      0.351    -205.834     575.837
median_household_income    -0.0010      0.001     -1.178      0.241      -0.003       0.001
percent_poverty             2.5445      4.494      0.566      0.572      -6.335      11.424
percent_white              -0.1193      1.909     -0.063      0.950      -3.890       3.652
==============================================================================
Omnibus:                      196.289   Durbin-Watson:                   1.335
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             6950.948
Skew:                           4.985   Prob(JB):                         0.00
Kurtosis:                      34.035   Cond. No.                     9.33e+05
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 9.33e+05. This might indicate that there are
strong multicollinearity or other numerical problems.
In [ ]:
# Re-calculate the correlation matrix
correlation_matrix_cleaned = tract_spill_summary_cleaned.corr()

# Display the cleaned correlation matrix
correlation_matrix_cleaned
Out[ ]:
Number of Spills median_household_income percent_poverty percent_white
Number of Spills 1.000000 -0.157012 0.133947 -0.008777
median_household_income -0.157012 1.000000 -0.625752 -0.001162
percent_poverty 0.133947 -0.625752 1.000000 -0.067320
percent_white -0.008777 -0.001162 -0.067320 1.000000
In [ ]: