import pandas as pd
+from sqlalchemy import create_engine
+import geopandas as gpd
+from dotenv import load_dotenv
+load_dotenv()
+
+import os
+
+# Database connection details from zshrc environment variables
+# use .env file for local development
+
+
+db_name = 'colorado_spills'
+user = os.getenv('DB_USER')
+password = os.getenv('DB_PASSWORD')
+host = os.getenv('DB_HOST')
+port = os.getenv('DB_PORT')
+
+
+# Create an engine to connect to the PostgreSQL database
+engine = create_engine(f'postgresql+psycopg2://{user}:{password}@{host}:{port}/{db_name}')
+
+# Read in the spills_with_demographics_geog as spills
+spills = pd.read_sql_table('spills_with_demographics_geog', engine)
+/home/dadams/miniconda3/envs/spatial_env2/lib/python3.10/site-packages/pandas/io/sql.py:1725: SAWarning: Did not recognize type 'geometry' of column 'geometry' + self.meta.reflect(bind=self.con, only=[table_name], views=True) ++
# use longitude and latitude to create a GeoDataFrame from spills data
+spills['geometry'] = gpd.points_from_xy(spills['Longitude'], spills['Latitude'])
+spills_gdf = gpd.GeoDataFrame(spills, crs='EPSG:4326')
+# Ensure date columns are in datetime format
+spills_gdf['Date of Discovery'] = pd.to_datetime(spills_gdf['Date of Discovery'])
+spills_gdf['Initial Report Date'] = pd.to_datetime(spills_gdf['Initial Report Date'])
+# drop 2024 data for date of discovery and initial report date
+spills_gdf = spills_gdf[spills_gdf['Date of Discovery'].dt.year < 2024]
+spills_gdf = spills_gdf[spills_gdf['Initial Report Date'].dt.year < 2024]
+# Separate Historical vs. Recent Spills
+historical_spills = spills_gdf[spills_gdf['Spill Type'] == 'Historical']
+recent_spills = spills_gdf[spills_gdf['Spill Type'] == 'Recent']
+# Compare Reporting Delay
+report_delay_summary = spills_gdf.groupby('Spill Type')['Report Delay (Days)'].describe()
+print(report_delay_summary)
+
+# Ensure the output directory exists
+output_dir = 'analysis/output'
+os.makedirs(output_dir, exist_ok=True)
+
+# Save the report delay summary to a markdown file
+with open(os.path.join(output_dir, 'report_delay_summary.md'), 'w') as f:
+ f.write("# Report Delay Summary\n\n")
+ f.write(report_delay_summary.to_markdown())
+count mean std min 25% 50% 75% max +Spill Type +Historical 5312.0 21.490023 205.615913 0.0 0.0 1.0 2.0 9261.0 +Recent 10678.0 3.964506 46.997749 0.0 0.0 1.0 2.0 2232.0 ++
import matplotlib.pyplot as plt
+
+# Compare Spill Trends Over Time
+spills_by_year = spills_gdf.groupby(['Report Year', 'Spill Type']).size().unstack()
+ax = spills_by_year.plot(kind='bar', figsize=(12,6), title='Spills by Year and Type')
+ax.set_xlabel('Year')
+ax.set_ylabel('Number of Spills')
+ax.legend(title='Spill Type')
+
+# Ensure the output directory exists
+output_dir = 'analysis/output'
+os.makedirs(output_dir, exist_ok=True)
+
+# Save the plot
+plt.savefig(f'{output_dir}/spills_by_year_and_type.png')
+
+# Display the plot
+plt.show()
+import matplotlib.pyplot as plt
+import geopandas as gpd
+
+# Ensure spills_gdf is already a GeoDataFrame with the necessary data
+# spills_gdf = gpd.GeoDataFrame(...)
+
+# Create a plot for the spatial distribution of spills
+fig, ax = plt.subplots(figsize=(10, 6))
+
+# Plot the GeoDataFrame with optimizations
+spills_gdf.plot(
+ ax=ax,
+ marker='o',
+ alpha=0.6, # Adjust transparency for better visibility
+ column='Spill Type',
+ legend=True,
+ legend_kwds={'title': "Spill Type"}, # Correctly specify the legend title
+ cmap='viridis' # Use a colormap for better visual distinction
+)
+
+# Set plot title and labels
+ax.set_title('Spatial Distribution of Historical vs. Recent Spills', fontsize=14)
+ax.set_xlabel('Longitude', fontsize=12)
+ax.set_ylabel('Latitude', fontsize=12)
+
+# Improve layout
+plt.tight_layout()
+
+# Display the plot
+plt.show()
+import matplotlib.pyplot as plt
+import contextily as ctx
+import numpy as np
+
+# Use your ORIGINAL plot code exactly as it is, just adding the basemap at the end
+# This ensures all your data points, colors and categories remain intact
+
+# Starting with your original code:
+fig, ax = plt.subplots(figsize=(10, 6))
+
+# Plot the GeoDataFrame with optimizations - YOUR ORIGINAL PLOTTING CODE
+spills_gdf.plot(
+ ax=ax,
+ marker='o',
+ alpha=0.6, # Adjust transparency for better visibility
+ column='Spill Type',
+ legend=True,
+ legend_kwds={'title': "Spill Type"}, # Correctly specify the legend title
+ cmap='viridis' # Use a colormap for better visual distinction
+)
+
+# Set plot title and labels
+ax.set_title('Spatial Distribution of Historical vs. Recent Spills in Colorado', fontsize=14)
+ax.set_xlabel('Longitude', fontsize=12)
+ax.set_ylabel('Latitude', fontsize=12)
+
+# Store the current axis limits to restore them after adding basemap
+xlim = ax.get_xlim()
+ylim = ax.get_ylim()
+
+# Before adding the basemap, convert the axes coordinates to Web Mercator
+# This is a crucial step that lets us keep your original points exactly as they are
+from pyproj import Transformer
+
+# Create transformer from lat/lon to Web Mercator
+transformer = Transformer.from_crs("EPSG:4326", "EPSG:3857", always_xy=True)
+
+# Transform the limits
+west, south = transformer.transform(xlim[0], ylim[0])
+east, north = transformer.transform(xlim[1], ylim[1])
+
+# Add the basemap - this will be UNDER your existing points
+ctx.add_basemap(
+ ax,
+ crs=spills_gdf.crs,
+ source=ctx.providers.OpenStreetMap.Mapnik,
+ zoom="auto"
+)
+
+# Restore original axis limits
+ax.set_xlim(xlim)
+ax.set_ylim(ylim)
+
+# Improve layout
+plt.tight_layout()
+
+# Save the plot to the analysis/output directory
+output_dir = 'analysis/output'
+os.makedirs(output_dir, exist_ok=True)
+plt.savefig(f'{output_dir}/spatial_distribution_spills.png')
+
+# Show the plot
+plt.show()
+# Add a column to indicate whether the spill occurred before or after 2020
+spills_gdf['Period'] = spills_gdf['Report Year'].apply(lambda x: 'Before 2020' if x < 2020 else '2020 and After')
+
+# Group by the new period column and calculate the mean response time
+response_time_summary = spills_gdf.groupby('Period')['Report Delay (Days)'].describe()
+print(response_time_summary)
+
+# Ensure the output directory exists
+output_dir = 'analysis/output'
+os.makedirs(output_dir, exist_ok=True)
+
+# Save the response time summary to a markdown file
+with open(os.path.join(output_dir, 'response_time_summary.md'), 'w') as f:
+ f.write("# Response Time Summary\n\n")
+ f.write(response_time_summary.to_markdown())
+count mean std min 25% 50% 75% max +Period +2020 and After 7177.0 6.924342 124.965839 0.0 0.0 1.0 1.0 9261.0 +Before 2020 8813.0 12.117554 124.705876 0.0 0.0 1.0 2.0 5681.0 ++
# t-test for the difference in response time before and after 2020
+from scipy.stats import ttest_ind
+
+before_2020 = spills_gdf[spills_gdf['Period'] == 'Before 2020']['Report Delay (Days)']
+after_2020 = spills_gdf[spills_gdf['Period'] == '2020 and After']['Report Delay (Days)']
+
+t_stat, p_value = ttest_ind(before_2020, after_2020, equal_var=False)
+print(f'T-Statistic: {t_stat:.4f}')
+print(f'P-Value: {p_value:.4f}')
+
+# Check if the difference is statistically significant
+if p_value < 0.05:
+ print('The difference in response time is statistically significant.')
+else:
+ print('The difference in response time is not statistically significant.')
+
+# Create a plot for the response time distribution before and after 2020
+fig, ax = plt.subplots(figsize=(10, 6))
+
+# Plot the response time distribution for each period
+spills_gdf.boxplot(column='Report Delay (Days)', by='Period', ax=ax)
+T-Statistic: 2.6161 +P-Value: 0.0089 +The difference in response time is statistically significant. ++
<Axes: title={'center': 'Report Delay (Days)'}, xlabel='Period'>
+%pip install tabulate
+Requirement already satisfied: tabulate in /home/dadams/miniconda3/envs/spatial_env2/lib/python3.10/site-packages (0.9.0) ++
Note: you may need to restart the kernel to use updated packages. ++
import tabulate
+from scipy.stats import ttest_ind
+
+# Ensure the output directory exists
+output_dir = 'analysis/output'
+os.makedirs(output_dir, exist_ok=True)
+
+# Add a column to indicate whether the spill occurred before or after 2020 for historical and recent spills
+historical_spills['Period'] = historical_spills['Report Year'].apply(lambda x: 'Before 2020' if x < 2020 else '2020 and After')
+recent_spills['Period'] = recent_spills['Report Year'].apply(lambda x: 'Before 2020' if x < 2020 else '2020 and After')
+
+# Historical Spills Analysis
+historical_report_delay_summary = historical_spills['Report Delay (Days)'].describe()
+print("Historical Spills Report Delay Summary:")
+print(historical_report_delay_summary)
+
+# Recent Spills Analysis
+recent_report_delay_summary = recent_spills['Report Delay (Days)'].describe()
+print("\nRecent Spills Report Delay Summary:")
+print(recent_report_delay_summary)
+
+# Historical Spills Response Time Summary
+historical_response_time_summary = historical_spills.groupby('Period')['Report Delay (Days)'].describe()
+print("\nHistorical Spills Response Time Summary:")
+print(historical_response_time_summary)
+
+# Recent Spills Response Time Summary
+recent_response_time_summary = recent_spills.groupby('Period')['Report Delay (Days)'].describe()
+print("\nRecent Spills Response Time Summary:")
+print(recent_response_time_summary)
+
+# t-test for the difference in response time before and after 2020 for historical spills
+historical_before_2020 = historical_spills[historical_spills['Period'] == 'Before 2020']['Report Delay (Days)']
+historical_after_2020 = historical_spills[historical_spills['Period'] == '2020 and After']['Report Delay (Days)']
+
+t_stat_historical, p_value_historical = ttest_ind(historical_before_2020, historical_after_2020, equal_var=False)
+print(f'\nHistorical Spills T-Statistic: {t_stat_historical:.4f}')
+print(f'Historical Spills P-Value: {p_value_historical:.4f}')
+
+# Check if the difference is statistically significant for historical spills
+if p_value_historical < 0.05:
+ print('The difference in response time for historical spills is statistically significant.')
+else:
+ print('The difference in response time for historical spills is not statistically significant.')
+
+# t-test for the difference in response time before and after 2020 for recent spills
+recent_before_2020 = recent_spills[recent_spills['Period'] == 'Before 2020']['Report Delay (Days)']
+recent_after_2020 = recent_spills[recent_spills['Period'] == '2020 and After']['Report Delay (Days)']
+
+t_stat_recent, p_value_recent = ttest_ind(recent_before_2020, recent_after_2020, equal_var=False)
+print(f'\nRecent Spills T-Statistic: {t_stat_recent:.4f}')
+print(f'Recent Spills P-Value: {p_value_recent:.4f}')
+
+# Check if the difference is statistically significant for recent spills
+if p_value_recent < 0.05:
+ print('The difference in response time for recent spills is statistically significant.')
+else:
+ print('The difference in response time for recent spills is not statistically significant.')
+
+# Save the summaries and t-test results to a markdown file
+with open(os.path.join(output_dir, 'analysis_results.md'), 'w') as f:
+ f.write("# Analysis Results\n\n")
+
+ f.write("## Historical Spills Report Delay Summary:\n")
+ f.write(historical_report_delay_summary.to_markdown() + "\n\n")
+
+ f.write("## Recent Spills Report Delay Summary:\n")
+ f.write(recent_report_delay_summary.to_markdown() + "\n\n")
+
+ f.write("## Historical Spills Response Time Summary:\n")
+ f.write(historical_response_time_summary.to_markdown() + "\n\n")
+
+ f.write("## Recent Spills Response Time Summary:\n")
+ f.write(recent_response_time_summary.to_markdown() + "\n\n")
+
+ f.write("## T-Test Results for Historical Spills\n")
+ f.write(f"T-Statistic: {t_stat_historical:.4f}\n")
+ f.write(f"P-Value: {p_value_historical:.4f}\n")
+ if p_value_historical < 0.05:
+ f.write('The difference in response time for historical spills is statistically significant.\n')
+ else:
+ f.write('The difference in response time for historical spills is not statistically significant.\n')
+
+ f.write("\n## T-Test Results for Recent Spills\n")
+ f.write(f"T-Statistic: {t_stat_recent:.4f}\n")
+ f.write(f"P-Value: {p_value_recent:.4f}\n")
+ if p_value_recent < 0.05:
+ f.write('The difference in response time for recent spills is statistically significant.\n')
+ else:
+ f.write('The difference in response time for recent spills is not statistically significant.\n')
+Historical Spills Report Delay Summary: +count 5312.000000 +mean 21.490023 +std 205.615913 +min 0.000000 +25% 0.000000 +50% 1.000000 +75% 2.000000 +max 9261.000000 +Name: Report Delay (Days), dtype: float64 + +Recent Spills Report Delay Summary: +count 10678.000000 +mean 3.964506 +std 46.997749 +min 0.000000 +25% 0.000000 +50% 1.000000 +75% 2.000000 +max 2232.000000 +Name: Report Delay (Days), dtype: float64 + +Historical Spills Response Time Summary: + count mean std min 25% 50% 75% max +Period +2020 and After 2705.0 13.533087 200.352339 0.0 0.0 0.0 1.0 9261.0 +Before 2020 2607.0 29.746068 210.659491 0.0 0.0 1.0 2.0 5681.0 + +Recent Spills Response Time Summary: + count mean std min 25% 50% 75% max +Period +2020 and After 4472.0 2.926878 27.302062 0.0 0.0 1.0 1.0 1329.0 +Before 2020 6206.0 4.712214 57.116100 0.0 1.0 1.0 2.0 2232.0 + +Historical Spills T-Statistic: 2.8723 +Historical Spills P-Value: 0.0041 +The difference in response time for historical spills is statistically significant. + +Recent Spills T-Statistic: 2.1457 +Recent Spills P-Value: 0.0319 +The difference in response time for recent spills is statistically significant. ++
/home/dadams/miniconda3/envs/spatial_env2/lib/python3.10/site-packages/geopandas/geodataframe.py:1819: SettingWithCopyWarning: +A value is trying to be set on a copy of a slice from a DataFrame. +Try using .loc[row_indexer,col_indexer] = value instead + +See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy + super().__setitem__(key, value) +/home/dadams/miniconda3/envs/spatial_env2/lib/python3.10/site-packages/geopandas/geodataframe.py:1819: SettingWithCopyWarning: +A value is trying to be set on a copy of a slice from a DataFrame. +Try using .loc[row_indexer,col_indexer] = value instead + +See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy + super().__setitem__(key, value) ++
import os
+
+# Ensure the output directory exists
+output_dir = 'analysis/output'
+os.makedirs(output_dir, exist_ok=True)
+
+# Save the t-test results to a markdown file
+with open(os.path.join(output_dir, 't_test_results.md'), 'w') as f:
+ f.write(f"## T-Test Results for Historical Spills\n")
+ f.write(f"T-Statistic: {t_stat_historical:.4f}\n")
+ f.write(f"P-Value: {p_value_historical:.4f}\n")
+ if p_value_historical < 0.05:
+ f.write('The difference in response time for historical spills is statistically significant.\n')
+ else:
+ f.write('The difference in response time for historical spills is not statistically significant.\n')
+
+ f.write(f"\n## T-Test Results for Recent Spills\n")
+ f.write(f"T-Statistic: {t_stat_recent:.4f}\n")
+ f.write(f"P-Value: {p_value_recent:.4f}\n")
+ if p_value_recent < 0.05:
+ f.write('The difference in response time for recent spills is statistically significant.\n')
+ else:
+ f.write('The difference in response time for recent spills is not statistically significant.\n')
+
+# Create and save the plots
+fig, ax = plt.subplots(figsize=(10, 6))
+historical_spills.boxplot(column='Report Delay (Days)', by='Period', ax=ax)
+ax.set_title('Historical Spills Response Time Distribution Before and After 2020', fontsize=14)
+ax.set_xlabel('Period', fontsize=12)
+ax.set_ylabel('Report Delay (Days)', fontsize=12)
+plt.suptitle('')
+plt.tight_layout()
+plt.savefig(os.path.join(output_dir, 'historical_spills_response_time_distribution.png'))
+
+fig, ax = plt.subplots(figsize=(10, 6))
+recent_spills.boxplot(column='Report Delay (Days)', by='Period', ax=ax)
+ax.set_title('Recent Spills Response Time Distribution Before and After 2020', fontsize=14)
+ax.set_xlabel('Period', fontsize=12)
+ax.set_ylabel('Report Delay (Days)', fontsize=12)
+plt.suptitle('')
+plt.tight_layout()
+plt.savefig(os.path.join(output_dir, 'recent_spills_response_time_distribution.png'))
+
+
+# t-test for the difference in response time before and after 2020 for historical spills
+historical_before_2020 = historical_spills[historical_spills['Period'] == 'Before 2020']['Report Delay (Days)']
+historical_after_2020 = historical_spills[historical_spills['Period'] == '2020 and After']['Report Delay (Days)']
+
+t_stat, p_value = ttest_ind(historical_before_2020, historical_after_2020, equal_var=False)
+print(f'\nHistorical Spills T-Statistic: {t_stat:.4f}')
+print(f'Historical Spills P-Value: {p_value:.4f}')
+
+# Check if the difference is statistically significant for historical spills
+if p_value < 0.05:
+ print('The difference in response time for historical spills is statistically significant.')
+else:
+ print('The difference in response time for historical spills is not statistically significant.')
+
+# t-test for the difference in response time before and after 2020 for recent spills
+recent_before_2020 = recent_spills[recent_spills['Period'] == 'Before 2020']['Report Delay (Days)']
+recent_after_2020 = recent_spills[recent_spills['Period'] == '2020 and After']['Report Delay (Days)']
+
+t_stat, p_value = ttest_ind(recent_before_2020, recent_after_2020, equal_var=False)
+print(f'\nRecent Spills T-Statistic: {t_stat:.4f}')
+print(f'Recent Spills P-Value: {p_value:.4f}')
+
+# Check if the difference is statistically significant for recent spills
+if p_value < 0.05:
+ print('The difference in response time for recent spills is statistically significant.')
+else:
+ print('The difference in response time for recent spills is not statistically significant.')
+
+# Create a plot for the response time distribution before and after 2020 for historical and recent spills
+fig, ax = plt.subplots(figsize=(10, 6))
+
+# Plot the response time distribution for historical spills
+historical_spills.boxplot(column='Report Delay (Days)', by='Period', ax=ax, positions=[1, 2])
+
+# Plot the response time distribution for recent spills
+recent_spills.boxplot(column='Report Delay (Days)', by='Period', ax=ax, positions=[4, 5])
+
+# Save the plot to the /output directory
+plt.savefig('analysis/output/response_time_distribution.png')
++Historical Spills T-Statistic: 2.8723 +Historical Spills P-Value: 0.0041 +The difference in response time for historical spills is statistically significant. + +Recent Spills T-Statistic: 2.1457 +Recent Spills P-Value: 0.0319 +The difference in response time for recent spills is statistically significant. ++
# Historical only
+historical_spills = historical_spills.copy()
+historical_spills['Post2020'] = historical_spills['Period'].apply(lambda x: 1 if x == '2020 and After' else 0)
+model_hist = smf.ols('Q("Report Delay (Days)") ~ Post2020', data=historical_spills).fit()
+print("Historical Spills:\n", model_hist.summary())
+
+# Recent only
+recent_spills = recent_spills.copy()
+recent_spills['Post2020'] = recent_spills['Period'].apply(lambda x: 1 if x == '2020 and After' else 0)
+model_recent = smf.ols('Q("Report Delay (Days)") ~ Post2020', data=recent_spills).fit()
+print("\nRecent Spills:\n", model_recent.summary())
+Historical Spills:
+ OLS Regression Results
+====================================================================================
+Dep. Variable: Q("Report Delay (Days)") R-squared: 0.002
+Model: OLS Adj. R-squared: 0.001
+Method: Least Squares F-statistic: 8.265
+Date: Mon, 31 Mar 2025 Prob (F-statistic): 0.00406
+Time: 13:58:43 Log-Likelihood: -35825.
+No. Observations: 5312 AIC: 7.165e+04
+Df Residuals: 5310 BIC: 7.167e+04
+Df Model: 1
+Covariance Type: nonrobust
+==============================================================================
+ coef std err t P>|t| [0.025 0.975]
+------------------------------------------------------------------------------
+Intercept 29.7461 4.024 7.392 0.000 21.857 37.635
+Post2020 -16.2130 5.639 -2.875 0.004 -27.269 -5.157
+==============================================================================
+Omnibus: 12943.048 Durbin-Watson: 1.944
+Prob(Omnibus): 0.000 Jarque-Bera (JB): 189644913.057
+Skew: 25.310 Prob(JB): 0.00
+Kurtosis: 927.266 Cond. No. 2.64
+==============================================================================
+
+Notes:
+[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
+
+Recent Spills:
+ OLS Regression Results
+====================================================================================
+Dep. Variable: Q("Report Delay (Days)") R-squared: 0.000
+Model: OLS Adj. R-squared: 0.000
+Method: Least Squares F-statistic: 3.752
+Date: Mon, 31 Mar 2025 Prob (F-statistic): 0.0528
+Time: 13:58:43 Log-Likelihood: -56260.
+No. Observations: 10678 AIC: 1.125e+05
+Df Residuals: 10676 BIC: 1.125e+05
+Df Model: 1
+Covariance Type: nonrobust
+==============================================================================
+ coef std err t P>|t| [0.025 0.975]
+------------------------------------------------------------------------------
+Intercept 4.7122 0.597 7.900 0.000 3.543 5.881
+Post2020 -1.7853 0.922 -1.937 0.053 -3.592 0.021
+==============================================================================
+Omnibus: 28541.449 Durbin-Watson: 1.643
+Prob(Omnibus): 0.000 Jarque-Bera (JB): 676851501.928
+Skew: 32.390 Prob(JB): 0.00
+Kurtosis: 1234.707 Cond. No. 2.47
+==============================================================================
+
+Notes:
+[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
+
+# Label both groups, combine
+historical_spills['Group'] = 'Historical'
+recent_spills['Group'] = 'Recent'
+combined = pd.concat([historical_spills, recent_spills], ignore_index=True)
+
+combined['Post2020'] = combined['Period'].apply(lambda x: 1 if x == '2020 and After' else 0)
+
+# Run regression with interactions by group
+model = smf.ols('Q("Report Delay (Days)") ~ Post2020 * Group', data=combined).fit()
+print(model.summary())
+ OLS Regression Results
+====================================================================================
+Dep. Variable: Q("Report Delay (Days)") R-squared: 0.006
+Model: OLS Adj. R-squared: 0.006
+Method: Least Squares F-statistic: 31.12
+Date: Mon, 31 Mar 2025 Prob (F-statistic): 4.78e-20
+Time: 13:53:41 Log-Likelihood: -99827.
+No. Observations: 15990 AIC: 1.997e+05
+Df Residuals: 15986 BIC: 1.997e+05
+Df Model: 3
+Covariance Type: nonrobust
+============================================================================================
+ coef std err t P>|t| [0.025 0.975]
+--------------------------------------------------------------------------------------------
+Intercept 29.7461 2.438 12.200 0.000 24.967 34.525
+Group[T.Recent] -25.0339 2.906 -8.616 0.000 -30.729 -19.339
+Post2020 -16.2130 3.417 -4.745 0.000 -22.910 -9.516
+Post2020:Group[T.Recent] 14.4276 4.200 3.435 0.001 6.196 22.660
+==============================================================================
+Omnibus: 45995.174 Durbin-Watson: 1.915
+Prob(Omnibus): 0.000 Jarque-Bera (JB): 3525625525.882
+Skew: 38.959 Prob(JB): 0.00
+Kurtosis: 2302.059 Cond. No. 8.21
+==============================================================================
+
+Notes:
+[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
+
+import matplotlib.pyplot as plt
+
+# Data
+pre_means = [29.75, 4.71]
+post_means = [29.75 - 16.21, 4.71 - 1.79]
+labels = ['Historical Spills', 'Recent Spills']
+
+x = range(len(labels))
+width = 0.35
+
+# Plot
+fig, ax = plt.subplots(figsize=(8, 6))
+ax.bar([i - width / 2 for i in x], pre_means, width, label='Pre-2020', color='skyblue')
+ax.bar([i + width / 2 for i in x], post_means, width, label='Post-2020', color='salmon')
+
+# Labels and aesthetics
+ax.set_ylabel('Mean Report Delay (Days)')
+ax.set_title('Mean Report Delay Before and After 2020 by Spill Type')
+ax.set_xticks(x)
+ax.set_xticklabels(labels)
+ax.legend()
+
+# Save and show
+plt.tight_layout()
+plt.savefig('report_delay_barplot.png')
+plt.show()
+| Group | +Pre-2020 Mean | +Post-2020 Change (coef) | +p-value | +Significant (p < 0.05) | +
|---|---|---|---|---|
| Historical Spills | +29.75 | +-16.21 | +0.004 | +Yes | +
| Recent Spills | +4.71 | +-1.79 | +0.053 | +No | +
Analysis of Reporting Delays for Historical and Recent Spills¶
We analyzed reporting delays for historical and recent spills before and after 2020 using simple OLS regressions. For historical spills, the average delay decreased significantly by approximately 16.2 days after 2020 (p = 0.004), dropping from a pre-2020 mean of 29.75 days. In contrast, the reduction for recent spills was smaller—about 1.8 days—and only marginally significant (p = 0.053), starting from a much lower pre-2020 average of 4.71 days. These results suggest that the most substantial improvements in timeliness occurred within the historical spill reporting process.
+Differences in Reporting Delays by Spill Type¶
+import pandas as pd
+import statsmodels.formula.api as smf
+
+# Combine the datasets and label the groups
+historical_spills['SpillType'] = 0 # historical = 0
+recent_spills['SpillType'] = 1 # recent = 1
+
+combined = pd.concat([historical_spills, recent_spills], ignore_index=True)
+
+# Convert 'Period' to binary: 0 = Before 2020, 1 = 2020 and After
+combined['Post2020'] = combined['Period'].apply(lambda x: 1 if x == '2020 and After' else 0)
+
+# Interaction term is created implicitly in the regression
+model = smf.ols('Q("Report Delay (Days)") ~ Post2020 + SpillType + Post2020:SpillType', data=combined).fit()
+
+print(model.summary())
+ OLS Regression Results
+====================================================================================
+Dep. Variable: Q("Report Delay (Days)") R-squared: 0.006
+Model: OLS Adj. R-squared: 0.006
+Method: Least Squares F-statistic: 31.12
+Date: Mon, 31 Mar 2025 Prob (F-statistic): 4.78e-20
+Time: 14:02:43 Log-Likelihood: -99827.
+No. Observations: 15990 AIC: 1.997e+05
+Df Residuals: 15986 BIC: 1.997e+05
+Df Model: 3
+Covariance Type: nonrobust
+======================================================================================
+ coef std err t P>|t| [0.025 0.975]
+--------------------------------------------------------------------------------------
+Intercept 29.7461 2.438 12.200 0.000 24.967 34.525
+Post2020 -16.2130 3.417 -4.745 0.000 -22.910 -9.516
+SpillType -25.0339 2.906 -8.616 0.000 -30.729 -19.339
+Post2020:SpillType 14.4276 4.200 3.435 0.001 6.196 22.660
+==============================================================================
+Omnibus: 45995.174 Durbin-Watson: 1.915
+Prob(Omnibus): 0.000 Jarque-Bera (JB): 3525625525.882
+Skew: 38.959 Prob(JB): 0.00
+Kurtosis: 2302.059 Cond. No. 8.21
+==============================================================================
+
+Notes:
+[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
+
+| Term | +Coefficient | +Std. Error | +p-value | +Interpretation | +
|---|---|---|---|---|
| Intercept | +29.75 | +2.44 | +<0.001 | +Historical spills, pre-2020 average report delay | +
| Post2020 | +-16.21 | +3.42 | +<0.001 | +Historical spills saw a 16.2-day reduction in delay after 2020 | +
| Group: Recent Spills | +-25.03 | +2.91 | +<0.001 | +Recent spills had a 25-day shorter delay than historical spills pre-2020 | +
| Post2020 × Recent Spills | ++14.43 | +4.20 | +0.001 | +Recent spills improved 14.4 days less than historical spills post-2020 | +
import matplotlib.pyplot as plt
+
+# X-axis positions: 0 = Before 2020, 1 = 2020 and After
+x = [0, 1]
+
+# Y-values for each group
+historical = [29.75, 29.75 - 16.21] # Before and after for historical
+recent = [4.71, 4.71 - 1.79] # Before and after for recent
+
+# Plot
+fig, ax = plt.subplots(figsize=(8, 6))
+
+ax.plot(x, historical, marker='o', label='Historical Spills', linewidth=2)
+ax.plot(x, recent, marker='o', label='Recent Spills', linewidth=2)
+
+# Aesthetics
+ax.set_xticks(x)
+ax.set_xticklabels(['Before 2020', '2020 and After'])
+ax.set_ylabel('Mean Report Delay (Days)')
+ax.set_title('Difference-in-Differences: Report Delay by Spill Type')
+ax.legend()
+ax.grid(True)
+plt.tight_layout()
+plt.show()
+import matplotlib.pyplot as plt
+
+improvements = [-16.21, -1.79]
+labels = ['Historical Spills', 'Recent Spills']
+
+fig, ax = plt.subplots(figsize=(7, 5))
+ax.bar(labels, improvements, color=['steelblue', 'orange'])
+ax.axhline(0, color='black', linewidth=0.8)
+ax.set_ylabel('Change in Report Delay (Days)')
+ax.set_title('Change in Report Delay After 2020')
+plt.tight_layout()
+plt.show()
+Interpretation of Difference-in-Differences Model
+This Difference-in-Differences model estimates how report delays changed after 2020 for two groups—historical and recent spills—and compares the magnitude of change between them. The pre-2020 average delay for historical spills was approximately 29.75 days. After 2020, historical spills saw a statistically significant reduction of about 16.2 days (p < 0.001).
+Recent spills, by contrast, already had much shorter average delays (about 25 days less than historical spills before 2020). The DiD interaction term (+14.4 days, p = 0.001) shows that the improvement in timeliness for recent spills was significantly smaller than that for historical spills. In other words, while both groups saw reductions in report delays after 2020, the gains were substantially greater for historical spills.
+This suggests that any policy or process changes implemented around 2020 had a disproportionately stronger effect on addressing delays in the historical spill category.
+