Setup¶
import os
import pandas as pd
import geopandas as gpd
from sqlalchemy import create_engine
from dotenv import load_dotenv

# Pull DB_* settings from a .env file (or the shell environment).
load_dotenv()

# Connection settings; credentials are never hard-coded.
database = 'colorado_spills'
db_user = os.getenv('DB_USER')
db_password = os.getenv('DB_PASSWORD')
db_host = os.getenv('DB_HOST')
db_port = os.getenv('DB_PORT')

# SQLAlchemy engine for the PostgreSQL/PostGIS database.
engine = create_engine(
    f'postgresql+psycopg2://{db_user}:{db_password}@{db_host}:{db_port}/{database}'
)
# --- Load data with geometry preserved ---
try:
    spills = gpd.read_postgis(
        sql='SELECT * FROM spills_with_ruca',
        con=engine,
        geom_col='geometry',
        crs='EPSG:4326',
    )
except Exception:
    # Fall back to a plain DataFrame when PostGIS geometry can't be read.
    print("GeoPandas failed to load geometry. Falling back to plain Pandas.")
    spills = pd.read_sql('SELECT * FROM spills_with_ruca', engine)
    # Optional: spills.drop(columns='geometry', inplace=True, errors='ignore')

# Rebuild point geometry from the longitude/latitude columns.
# NOTE(review): this overwrites any geometry loaded by read_postgis above —
# confirm that is intended.
spills['geometry'] = gpd.points_from_xy(spills['Longitude'], spills['Latitude'])
spills_gdf = gpd.GeoDataFrame(spills, crs='EPSG:4326')

# Parse date columns and derive the report year from the discovery date.
spills_gdf['Date of Discovery'] = pd.to_datetime(spills_gdf['Date of Discovery'])
spills_gdf['Initial Report Date'] = pd.to_datetime(spills_gdf['Initial Report Date'])
spills_gdf['Report Year'] = spills_gdf['Date of Discovery'].dt.year
from datetime import datetime

# Analysis window around the 2021-01-01 policy change.
# NOTE(review): the original comment claimed "48 months before and after
# January 1, 2021", but these bounds give 42 months before and 54 months
# after — confirm the intended window.
start_date = datetime(2017, 7, 1)
end_date = datetime(2025, 7, 1)  # exclusive upper bound

in_window = (
    spills_gdf['Date of Discovery'].ge(start_date)
    & spills_gdf['Date of Discovery'].lt(end_date)
)
spills_gdf = spills_gdf[in_window]

# Split into Historical vs. Recent spills.
historical_spills = spills_gdf[spills_gdf['Spill Type'] == 'Historical']
recent_spills = spills_gdf[spills_gdf['Spill Type'] == 'Recent']

# Restrict every frame to the three counties with the most spills.
top_counties = spills_gdf['county'].value_counts().nlargest(3).index.tolist()
spills_gdf = spills_gdf[spills_gdf['county'].isin(top_counties)]
historical_spills = historical_spills[historical_spills['county'].isin(top_counties)]
recent_spills = recent_spills[recent_spills['county'].isin(top_counties)]
import statsmodels.formula.api as smf

# Label each record as before/after the 2021 policy change.
spills_gdf['Period'] = spills_gdf['Report Year'].apply(
    lambda x: 'Before 2021' if x < 2021 else '2021 and After'
)

# Ensure date columns are datetimes (no-op if already parsed upstream).
spills_gdf['Initial Report Date'] = pd.to_datetime(spills_gdf['Initial Report Date'])
spills_gdf['Date of Discovery'] = pd.to_datetime(spills_gdf['Date of Discovery'])

# Delay between discovery and initial report; drop impossible negative delays.
spills_gdf['report_delay'] = (
    spills_gdf['Initial Report Date'] - spills_gdf['Date of Discovery']
).dt.days
spills_gdf = spills_gdf[spills_gdf['report_delay'] >= 0]

# Model-friendly column name. FIX: the original also renamed
# 'Report Delay (Days)' -> 'report_delay', but 'report_delay' is already
# taken by the computed column above, so that rename could create a
# duplicate column label; only 'Spill Type' needs renaming.
spills_gdf = spills_gdf.rename(columns={'Spill Type': 'spill_type'})
OLS Regression¶
import statsmodels.formula.api as smf

# OLS of report delay on spill type, period, and their interaction.
ols_formula = "report_delay ~ C(spill_type) * C(Period)"
model = smf.ols(ols_formula, data=spills_gdf).fit()
print(model.summary())
OLS Regression Results
==============================================================================
Dep. Variable: report_delay R-squared: 0.005
Model: OLS Adj. R-squared: 0.005
Method: Least Squares F-statistic: 19.14
Date: Wed, 06 Aug 2025 Prob (F-statistic): 2.26e-12
Time: 09:11:58 Log-Likelihood: -53847.
No. Observations: 11196 AIC: 1.077e+05
Df Residuals: 11192 BIC: 1.077e+05
Df Model: 3
Covariance Type: nonrobust
====================================================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------------------------------------
Intercept 6.7689 0.478 14.166 0.000 5.832 7.706
C(spill_type)[T.Recent] -4.9332 0.691 -7.139 0.000 -6.288 -3.579
C(Period)[T.Before 2021] -3.4361 0.986 -3.484 0.000 -5.369 -1.503
C(spill_type)[T.Recent]:C(Period)[T.Before 2021] 4.3645 1.249 3.496 0.000 1.917 6.812
==============================================================================
Omnibus: 24335.353 Durbin-Watson: 1.954
Prob(Omnibus): 0.000 Jarque-Bera (JB): 152505098.951
Skew: 19.556 Prob(JB): 0.00
Kurtosis: 573.424 Cond. No. 7.15
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Negative Binomial Regression¶
import statsmodels.formula.api as smf
import statsmodels.api as sm

# Columns were already renamed to 'report_delay'/'spill_type' upstream, so the
# original re-run of the rename is dropped here — mapping 'Report Delay (Days)'
# to 'report_delay' again risked a duplicate column label.

# Negative Binomial GLM with a spill-type x period interaction.
# NOTE(review): the dispersion parameter alpha is left at the statsmodels
# default (1.0), which triggers a ValueWarning — consider estimating alpha
# (e.g. with sm.NegativeBinomial) and passing it explicitly.
nb_model = smf.glm(
    formula="report_delay ~ C(spill_type) * C(Period)",
    data=spills_gdf,
    family=sm.families.NegativeBinomial(),
).fit()

# View the results
print(nb_model.summary())
Generalized Linear Model Regression Results
==============================================================================
Dep. Variable: report_delay No. Observations: 11196
Model: GLM Df Residuals: 11192
Model Family: NegativeBinomial Df Model: 3
Link Function: Log Scale: 1.0000
Method: IRLS Log-Likelihood: -26493.
Date: Wed, 06 Aug 2025 Deviance: 29512.
Time: 09:12:11 Pearson chi2: 4.90e+05
No. Iterations: 9 Pseudo R-squ. (CS): 0.2067
Covariance Type: nonrobust
====================================================================================================================
coef std err z P>|z| [0.025 0.975]
--------------------------------------------------------------------------------------------------------------------
Intercept 1.9123 0.017 110.902 0.000 1.879 1.946
C(spill_type)[T.Recent] -1.3049 0.027 -48.162 0.000 -1.358 -1.252
C(Period)[T.Before 2021] -0.7085 0.037 -18.968 0.000 -0.782 -0.635
C(spill_type)[T.Recent]:C(Period)[T.Before 2021] 1.1178 0.049 23.044 0.000 1.023 1.213
====================================================================================================================
/home/dadams/miniconda3/lib/python3.12/site-packages/statsmodels/genmod/families/family.py:1367: ValueWarning: Negative binomial dispersion parameter alpha not set. Using default value alpha=1.0.
warnings.warn("Negative binomial dispersion parameter alpha not "
import numpy as np
# Exponentiate the log-link coefficients to express them as multiplicative
# rate ratios for the expected report delay.
np.exp(nb_model.params)
Intercept 6.768912 C(spill_type)[T.Recent] 0.271201 C(Period)[T.Before 2021] 0.492364 C(spill_type)[T.Recent]:C(Period)[T.Before 2021] 3.058085 dtype: float64
import matplotlib.pyplot as plt
import pandas as pd

# Exponentiated NB coefficients (rate ratios) from the fitted model above.
intercept = 6.768912
recent_effect = 0.271201
period_effect = 0.492364
interaction_effect = 3.058085

group_labels = [
    "Historical, After 2021",   # reference cell (intercept only)
    "Recent, After 2021",       # recent effect only
    "Historical, Before 2021",  # period effect only
    "Recent, Before 2021",      # recent x period x interaction
]

# Effects multiply on the response scale because the GLM uses a log link.
predicted_delays = [
    intercept,
    intercept * recent_effect,
    intercept * period_effect,
    intercept * recent_effect * period_effect * interaction_effect,
]

df_plot = pd.DataFrame({
    "Group": group_labels,
    "Predicted Delay (Days)": predicted_delays,
})

# Bar chart of predicted delays per group.
plt.figure(figsize=(10, 6))
bars = plt.bar(df_plot["Group"], df_plot["Predicted Delay (Days)"], alpha=0.8)
plt.title("Predicted Report Delay by Spill Type and Period", fontsize=14)
plt.ylabel("Predicted Delay (Days)", fontsize=12)
plt.xticks(rotation=15, ha='right')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()

# Label each bar with its predicted value.
for bar in bars:
    height = bar.get_height()
    plt.annotate(f"{height:.2f}",
                 xy=(bar.get_x() + bar.get_width() / 2, height),
                 xytext=(0, 3),
                 textcoords="offset points",
                 ha='center', va='bottom')
plt.show()

print("Predicted delays:")
for group, delay in zip(group_labels, predicted_delays):
    print(f"{group}: {delay:.2f} days")
Predicted delays: Historical, After 2021: 6.77 days Recent, After 2021: 1.84 days Historical, Before 2021: 3.33 days Recent, Before 2021: 2.76 days
Interrupted Time Series¶
import pandas as pd
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt

# 1. Aggregate to a monthly mean-delay series.
spills_gdf['report_month'] = spills_gdf['Initial Report Date'].dt.to_period('M')
monthly = (
    spills_gdf.groupby('report_month')['report_delay']
    .mean()
    .reset_index()
)
monthly['report_month'] = monthly['report_month'].dt.to_timestamp()

# 2. Interrupted-time-series design variables.
#    'time' is an approximate month index (days since the first month // 30).
monthly['time'] = (monthly['report_month'] - monthly['report_month'].min()).dt.days // 30
monthly['post_2021'] = (monthly['report_month'] >= pd.Timestamp('2021-01-01')).astype(int)
monthly['time_post'] = monthly['time'] * monthly['post_2021']

# 3. Segmented regression: level change (post_2021) and slope change (time_post).
its_model = smf.ols("report_delay ~ time + post_2021 + time_post", data=monthly).fit()
print(its_model.summary())

# 4. Overlay observed and fitted series around the 2021 break.
monthly['predicted'] = its_model.predict(monthly)
plt.figure(figsize=(12, 6))
plt.plot(monthly['report_month'], monthly['report_delay'], marker='o', label='Observed', alpha=0.6)
plt.plot(monthly['report_month'], monthly['predicted'], linestyle='--', color='red', label='Predicted (ITS)')
plt.axvline(x=pd.Timestamp('2021-01-01'), color='black', linestyle='--', label='Policy Change (2021)')
plt.title('Interrupted Time Series: Monthly Report Delay')
plt.xlabel('Month')
plt.ylabel('Mean Report Delay (Days)')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
OLS Regression Results
==============================================================================
Dep. Variable: report_delay R-squared: 0.216
Model: OLS Adj. R-squared: 0.189
Method: Least Squares F-statistic: 8.150
Date: Wed, 06 Aug 2025 Prob (F-statistic): 7.46e-05
Time: 09:14:03 Log-Likelihood: -248.17
No. Observations: 93 AIC: 504.3
Df Residuals: 89 BIC: 514.5
Df Model: 3
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 2.0214 1.081 1.870 0.065 -0.127 4.170
time 0.0280 0.045 0.617 0.539 -0.062 0.118
post_2021 -6.4546 2.524 -2.558 0.012 -11.469 -1.440
time_post 0.1062 0.056 1.893 0.062 -0.005 0.218
==============================================================================
Omnibus: 35.237 Durbin-Watson: 2.065
Prob(Omnibus): 0.000 Jarque-Bera (JB): 67.022
Skew: 1.515 Prob(JB): 2.79e-15
Kurtosis: 5.849 Cond. No. 511.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
ITS by Spill Type¶
import pandas as pd
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt

# Monthly mean delay per (month, spill type).
spills_gdf['report_month'] = spills_gdf['Initial Report Date'].dt.to_period('M')
monthly_by_type = (
    spills_gdf
    .groupby(['report_month', 'spill_type'])['report_delay']
    .mean()
    .reset_index()
)
monthly_by_type['report_month'] = monthly_by_type['report_month'].dt.to_timestamp()

# ITS design variables: approximate month index, post-2021 level shift,
# and post-2021 slope change.
monthly_by_type['time'] = (
    monthly_by_type['report_month'] - monthly_by_type['report_month'].min()
).dt.days // 30
monthly_by_type['post_2021'] = (
    monthly_by_type['report_month'] >= pd.Timestamp('2021-01-01')
).astype(int)
monthly_by_type['time_post'] = monthly_by_type['time'] * monthly_by_type['post_2021']

# Fit one segmented-regression model per spill type and print each summary.
its_results_by_type = {}
for stype in monthly_by_type['spill_type'].unique():
    subset = monthly_by_type[monthly_by_type['spill_type'] == stype].copy()
    model = smf.ols("report_delay ~ time + post_2021 + time_post", data=subset).fit()
    subset['predicted'] = model.predict(subset)
    its_results_by_type[stype] = (model, subset)
    print(f"\n=== ITS Model Summary for {stype} Spills ===\n")
    print(model.summary())
=== ITS Model Summary for Historical Spills ===
OLS Regression Results
==============================================================================
Dep. Variable: report_delay R-squared: 0.229
Model: OLS Adj. R-squared: 0.203
Method: Least Squares F-statistic: 8.814
Date: Wed, 06 Aug 2025 Prob (F-statistic): 3.54e-05
Time: 09:14:17 Log-Likelihood: -277.64
No. Observations: 93 AIC: 563.3
Df Residuals: 89 BIC: 573.4
Df Model: 3
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 3.3388 1.484 2.249 0.027 0.389 6.288
time -0.0264 0.062 -0.423 0.673 -0.150 0.097
post_2021 -10.3194 3.465 -2.978 0.004 -17.204 -3.435
time_post 0.2159 0.077 2.803 0.006 0.063 0.369
==============================================================================
Omnibus: 32.115 Durbin-Watson: 1.983
Prob(Omnibus): 0.000 Jarque-Bera (JB): 59.323
Skew: 1.383 Prob(JB): 1.31e-13
Kurtosis: 5.767 Cond. No. 511.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
=== ITS Model Summary for Recent Spills ===
OLS Regression Results
==============================================================================
Dep. Variable: report_delay R-squared: 0.013
Model: OLS Adj. R-squared: -0.020
Method: Least Squares F-statistic: 0.3852
Date: Wed, 06 Aug 2025 Prob (F-statistic): 0.764
Time: 09:14:17 Log-Likelihood: -259.51
No. Observations: 93 AIC: 527.0
Df Residuals: 89 BIC: 537.2
Df Model: 3
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 1.4880 1.221 1.218 0.226 -0.939 3.915
time 0.0414 0.051 0.808 0.421 -0.060 0.143
post_2021 2.6802 2.851 0.940 0.350 -2.985 8.345
time_post -0.0677 0.063 -1.068 0.288 -0.194 0.058
==============================================================================
Omnibus: 129.874 Durbin-Watson: 2.198
Prob(Omnibus): 0.000 Jarque-Bera (JB): 3176.252
Skew: 4.916 Prob(JB): 0.00
Kurtosis: 29.889 Cond. No. 511.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
import pandas as pd
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt

# 1. Monthly mean delay per (month, spill type).
#    (Recomputed here so this cell runs independently of the one above.)
spills_gdf['report_month'] = spills_gdf['Initial Report Date'].dt.to_period('M')
monthly_by_type = (
    spills_gdf
    .groupby(['report_month', 'spill_type'])['report_delay']
    .mean()
    .reset_index()
)
monthly_by_type['report_month'] = monthly_by_type['report_month'].dt.to_timestamp()

# 2. ITS design variables.
monthly_by_type['time'] = (
    monthly_by_type['report_month'] - monthly_by_type['report_month'].min()
).dt.days // 30
monthly_by_type['post_2021'] = (
    monthly_by_type['report_month'] >= pd.Timestamp('2021-01-01')
).astype(int)
monthly_by_type['time_post'] = monthly_by_type['time'] * monthly_by_type['post_2021']

# 3. One segmented-regression model per spill type.
its_results_by_type = {}
for stype in monthly_by_type['spill_type'].unique():
    subset = monthly_by_type[monthly_by_type['spill_type'] == stype].copy()
    fitted = smf.ols("report_delay ~ time + post_2021 + time_post", data=subset).fit()
    subset['predicted'] = fitted.predict(subset)
    its_results_by_type[stype] = (fitted, subset)

# 4. Stacked observed-vs-fitted panels, one per spill type.
fig, axs = plt.subplots(2, 1, figsize=(12, 10), sharex=True)
for ax, (stype, (fitted, subset)) in zip(axs, its_results_by_type.items()):
    ax.plot(subset['report_month'], subset['report_delay'], marker='o', label='Observed', alpha=0.6)
    ax.plot(subset['report_month'], subset['predicted'], linestyle='--', color='red', label='Predicted (ITS)')
    ax.axvline(x=pd.Timestamp('2021-01-01'), color='black', linestyle='--', label='Policy Change (2021)')
    ax.set_title(f"ITS Model for {stype} Spills")
    ax.set_ylabel("Mean Report Delay (Days)")
    ax.legend()
    ax.grid(True)
axs[1].set_xlabel("Month")
plt.tight_layout()
plt.show()
def classify_rurality(ruca_code):
    """Map a numeric RUCA code to a coarse rurality label.

    Codes <= 3 -> 'Urban', <= 6 -> 'Suburban', larger -> 'Rural';
    missing/NaN codes -> 'Unknown'.
    """
    if pd.isna(ruca_code):
        return 'Unknown'
    if ruca_code <= 3:
        return 'Urban'
    if ruca_code <= 6:
        return 'Suburban'
    return 'Rural'
# Attach a rurality label derived from the numeric RUCA code.
# BUG FIX: the original classified only `spills`, but the regression that uses
# C(rurality) is fit on `spills_gdf`, which would lack the column in
# straight-line execution (the printed output suggests it worked originally,
# possibly via out-of-order notebook cell runs). Classify both frames.
spills['ruca_code'] = pd.to_numeric(spills['ruca_code'], errors='coerce')
spills['rurality'] = spills['ruca_code'].apply(classify_rurality)
spills = spills[spills['rurality'] != 'Unknown']  # drop unclassifiable rows

spills_gdf['ruca_code'] = pd.to_numeric(spills_gdf['ruca_code'], errors='coerce')
spills_gdf['rurality'] = spills_gdf['ruca_code'].apply(classify_rurality)
spills_gdf = spills_gdf[spills_gdf['rurality'] != 'Unknown']
import statsmodels.formula.api as smf
import statsmodels.api as sm

# Columns are already named 'report_delay'/'spill_type' at this point, so the
# original rename is dropped: it mapped 'rurality' to itself (a no-op) and
# re-mapping 'Report Delay (Days)' -> 'report_delay' risked a duplicate label.

# Negative Binomial GLM with the full three-way interaction:
# spill type x period x rurality.
# NOTE(review): dispersion alpha is left at the statsmodels default (1.0),
# which triggers a ValueWarning — consider estimating alpha explicitly.
nb_model = smf.glm(
    formula="report_delay ~ C(spill_type) * C(Period) * C(rurality)",
    data=spills_gdf,
    family=sm.families.NegativeBinomial(),
).fit()

# View the results
print(nb_model.summary())
Generalized Linear Model Regression Results
==============================================================================
Dep. Variable: report_delay No. Observations: 11196
Model: GLM Df Residuals: 11184
Model Family: NegativeBinomial Df Model: 11
Link Function: Log Scale: 1.0000
Method: IRLS Log-Likelihood: -26264.
Date: Wed, 06 Aug 2025 Deviance: 29052.
Time: 09:45:31 Pearson chi2: 5.16e+05
No. Iterations: 9 Pseudo R-squ. (CS): 0.2386
Covariance Type: nonrobust
============================================================================================================================================
coef std err z P>|z| [0.025 0.975]
--------------------------------------------------------------------------------------------------------------------------------------------
Intercept 2.4066 0.039 62.491 0.000 2.331 2.482
C(spill_type)[T.Recent] -1.8821 0.048 -39.000 0.000 -1.977 -1.788
C(Period)[T.Before 2021] -1.0298 0.086 -11.979 0.000 -1.198 -0.861
C(rurality)[T.Suburban] -0.6149 0.086 -7.174 0.000 -0.783 -0.447
C(rurality)[T.Urban] -0.6602 0.043 -15.208 0.000 -0.745 -0.575
C(spill_type)[T.Recent]:C(Period)[T.Before 2021] 1.4185 0.096 14.839 0.000 1.231 1.606
C(spill_type)[T.Recent]:C(rurality)[T.Suburban] 0.1016 0.118 0.864 0.387 -0.129 0.332
C(spill_type)[T.Recent]:C(rurality)[T.Urban] 0.9560 0.062 15.433 0.000 0.835 1.077
C(Period)[T.Before 2021]:C(rurality)[T.Suburban] 1.1389 0.174 6.558 0.000 0.798 1.479
C(Period)[T.Before 2021]:C(rurality)[T.Urban] 0.3593 0.096 3.730 0.000 0.170 0.548
C(spill_type)[T.Recent]:C(Period)[T.Before 2021]:C(rurality)[T.Suburban] -0.1548 0.208 -0.743 0.457 -0.563 0.253
C(spill_type)[T.Recent]:C(Period)[T.Before 2021]:C(rurality)[T.Urban] -0.4859 0.117 -4.150 0.000 -0.715 -0.256
============================================================================================================================================
/home/dadams/miniconda3/lib/python3.12/site-packages/statsmodels/genmod/families/family.py:1367: ValueWarning: Negative binomial dispersion parameter alpha not set. Using default value alpha=1.0.
warnings.warn("Negative binomial dispersion parameter alpha not "
import numpy as np
# Rate ratios: exponentiate the log-link coefficients of the three-way model.
np.exp(nb_model.params)
Intercept 11.096599 C(spill_type)[T.Recent] 0.152271 C(Period)[T.Before 2021] 0.357070 C(rurality)[T.Suburban] 0.540706 C(rurality)[T.Urban] 0.516745 C(spill_type)[T.Recent]:C(Period)[T.Before 2021] 4.130812 C(spill_type)[T.Recent]:C(rurality)[T.Suburban] 1.106906 C(spill_type)[T.Recent]:C(rurality)[T.Urban] 2.601337 C(Period)[T.Before 2021]:C(rurality)[T.Suburban] 3.123185 C(Period)[T.Before 2021]:C(rurality)[T.Urban] 1.432260 C(spill_type)[T.Recent]:C(Period)[T.Before 2021]:C(rurality)[T.Suburban] 0.856609 C(spill_type)[T.Recent]:C(Period)[T.Before 2021]:C(rurality)[T.Urban] 0.615116 dtype: float64
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Exponentiated coefficients (rate ratios) of the three-way NB model, taken
# from np.exp(nb_model.params) above.
# BUG FIX: the original used these names before they were defined (they were
# only assigned in a later cell), and would also have picked up the stale
# two-way-model intercept (6.768912). Defining all twelve values here makes
# the cell self-contained and reproduces the printed summary table.
intercept = 11.096599
recent_effect = 0.152271
period_effect = 0.357070
suburban_effect = 0.540706
urban_effect = 0.516745
recent_period_interaction = 4.130812
recent_suburban_interaction = 1.106906
recent_urban_interaction = 2.601337
period_suburban_interaction = 3.123185
period_urban_interaction = 1.432260
three_way_suburban = 0.856609
three_way_urban = 0.615116

spill_types = ['Historical', 'Recent']
periods = ['After 2021', 'Before 2021']
rurality = ['Rural', 'Suburban', 'Urban']

# Multiply out main effects and interactions for every cell of the design
# (log link => effects are multiplicative on the response scale).
data = []
for spill in spill_types:
    for period in periods:
        for rural in rurality:
            delay = intercept
            if spill == 'Recent':
                delay *= recent_effect
            if period == 'Before 2021':
                delay *= period_effect
            if rural == 'Suburban':
                delay *= suburban_effect
            elif rural == 'Urban':
                delay *= urban_effect
            # Two-way interactions
            if spill == 'Recent' and period == 'Before 2021':
                delay *= recent_period_interaction
            if spill == 'Recent' and rural == 'Suburban':
                delay *= recent_suburban_interaction
            elif spill == 'Recent' and rural == 'Urban':
                delay *= recent_urban_interaction
            if period == 'Before 2021' and rural == 'Suburban':
                delay *= period_suburban_interaction
            elif period == 'Before 2021' and rural == 'Urban':
                delay *= period_urban_interaction
            # Three-way interactions
            if spill == 'Recent' and period == 'Before 2021' and rural == 'Suburban':
                delay *= three_way_suburban
            elif spill == 'Recent' and period == 'Before 2021' and rural == 'Urban':
                delay *= three_way_urban
            data.append([spill, period, rural, delay])

df = pd.DataFrame(data, columns=['Spill_Type', 'Period', 'Rurality', 'Predicted_Delay'])

# --- Grouped bar chart + heatmap of the predicted delays ---
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Subplot 1: bars grouped by (Period, Rurality), one bar per spill type.
df_pivot1 = df.pivot_table(values='Predicted_Delay',
                           index=['Period', 'Rurality'],
                           columns='Spill_Type')
x = np.arange(len(df_pivot1.index))
width = 0.35
bars1 = ax1.bar(x - width / 2, df_pivot1['Historical'], width,
                label='Historical', alpha=0.8, color='steelblue')
bars2 = ax1.bar(x + width / 2, df_pivot1['Recent'], width,
                label='Recent', alpha=0.8, color='orange')
ax1.set_xlabel('Period and Rurality')
ax1.set_ylabel('Predicted Delay (Days)')
ax1.set_title('Predicted Report Delay by Spill Type')
ax1.set_xticks(x)
ax1.set_xticklabels([f"{period}\n{rural}" for period, rural in df_pivot1.index],
                    rotation=0, ha='center')
ax1.legend()
ax1.grid(axis='y', linestyle='--', alpha=0.7)

# Value labels on bars.
for bars in [bars1, bars2]:
    for bar in bars:
        height = bar.get_height()
        ax1.annotate(f'{height:.1f}',
                     xy=(bar.get_x() + bar.get_width() / 2, height),
                     xytext=(0, 3),
                     textcoords="offset points",
                     ha='center', va='bottom', fontsize=9)

# Subplot 2: heatmap of the same predictions.
df_heatmap = df.pivot_table(values='Predicted_Delay',
                            index=['Spill_Type', 'Period'],
                            columns='Rurality')
im = ax2.imshow(df_heatmap.values, cmap='YlOrRd', aspect='auto')
ax2.set_xticks(range(len(df_heatmap.columns)))
ax2.set_xticklabels(df_heatmap.columns)
ax2.set_yticks(range(len(df_heatmap.index)))
ax2.set_yticklabels([f"{spill}\n{period}" for spill, period in df_heatmap.index])
ax2.set_title('Heatmap of Predicted Delays')
ax2.set_xlabel('Rurality')

# Cell annotations.
for i in range(len(df_heatmap.index)):
    for j in range(len(df_heatmap.columns)):
        ax2.text(j, i, f'{df_heatmap.iloc[i, j]:.1f}',
                 ha="center", va="center", color="black", fontweight='bold')

cbar = plt.colorbar(im, ax=ax2)
cbar.set_label('Predicted Delay (Days)')
plt.tight_layout()
plt.show()

# Print summary table
print("\nSummary Table:")
print(df.pivot_table(values='Predicted_Delay',
                     index=['Spill_Type', 'Period'],
                     columns='Rurality').round(1))
Summary Table:
Rurality Rural Suburban Urban
Spill_Type Period
Historical After 2021 11.1 6.0 5.7
Before 2021 4.0 6.7 2.9
Recent After 2021 1.7 1.0 2.3
Before 2021 2.5 4.0 3.0
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# Rate ratios from the three-way Negative Binomial model (exponentiated
# coefficients, see np.exp(nb_model.params) above).
intercept = 11.096599
recent_effect = 0.152271
period_effect = 0.357070
suburban_effect = 0.540706
urban_effect = 0.516745
recent_period_interaction = 4.130812
recent_suburban_interaction = 1.106906
recent_urban_interaction = 2.601337
period_suburban_interaction = 3.123185
period_urban_interaction = 1.432260
three_way_suburban = 0.856609
three_way_urban = 0.615116


def calculate_delay(spill_type, period, rurality):
    """Predicted mean report delay (days) for one spill_type/period/rurality cell.

    Multiplies the baseline rate by every main-effect and interaction rate
    ratio that applies to the cell (log link => multiplicative effects).
    """
    is_recent = spill_type == 'Recent'
    is_before = period == 'Before 2021'

    delay = intercept

    # Main effects
    if is_recent:
        delay *= recent_effect
    if is_before:
        delay *= period_effect
    if rurality == 'Suburban':
        delay *= suburban_effect
    elif rurality == 'Urban':
        delay *= urban_effect

    # Two-way interactions
    if is_recent and is_before:
        delay *= recent_period_interaction
    if is_recent and rurality == 'Suburban':
        delay *= recent_suburban_interaction
    elif is_recent and rurality == 'Urban':
        delay *= recent_urban_interaction
    if is_before and rurality == 'Suburban':
        delay *= period_suburban_interaction
    elif is_before and rurality == 'Urban':
        delay *= period_urban_interaction

    # Three-way interactions
    if is_recent and is_before and rurality == 'Suburban':
        delay *= three_way_suburban
    elif is_recent and is_before and rurality == 'Urban':
        delay *= three_way_urban

    return delay
# Compare predicted delays before vs after 2021, one panel per rurality class.
rurality_types = ['Rural', 'Suburban', 'Urban']
spill_types = ['Historical', 'Recent']

fig, axes = plt.subplots(1, 3, figsize=(18, 6))
colors = ['#2E86AB', '#A23B72']  # Historical (blue), Recent (red)

for idx, rurality in enumerate(rurality_types):
    ax = axes[idx]

    # Predicted delays for this rurality, ordered Before -> After on the axis.
    hist_vals = [calculate_delay('Historical', p, rurality)
                 for p in ('Before 2021', 'After 2021')]
    recent_vals = [calculate_delay('Recent', p, rurality)
                   for p in ('Before 2021', 'After 2021')]

    x = np.arange(2)
    width = 0.35
    bars1 = ax.bar(x - width / 2, hist_vals, width,
                   label='Historical', color=colors[0], alpha=0.8)
    bars2 = ax.bar(x + width / 2, recent_vals, width,
                   label='Recent', color=colors[1], alpha=0.8)

    ax.set_title(f'{rurality} Areas', fontsize=14, fontweight='bold')
    ax.set_ylabel('Predicted Delay (Days)' if idx == 0 else '')
    ax.set_xticks(x)
    ax.set_xticklabels(['Before 2021', 'After 2021'])
    ax.grid(axis='y', linestyle='--', alpha=0.3)

    # Value labels above every bar.
    for bar in list(bars1) + list(bars2):
        height = bar.get_height()
        ax.annotate(f'{height:.1f}',
                    xy=(bar.get_x() + bar.get_width() / 2, height),
                    xytext=(0, 3),
                    textcoords="offset points",
                    ha='center', va='bottom', fontsize=10, fontweight='bold')

    # Legend only on the first panel.
    if idx == 0:
        ax.legend(loc='upper right')

plt.suptitle('Predicted Report Delay: Before 2021 vs After 2021',
             fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

# Tabulate before/after predictions with absolute and percent change.
print("\nSummary: Predicted Delays (Days)")
print("=" * 50)
data_summary = []
for rurality in rurality_types:
    for spill in spill_types:
        before = calculate_delay(spill, 'Before 2021', rurality)
        after = calculate_delay(spill, 'After 2021', rurality)
        data_summary.append({
            'Rurality': rurality,
            'Spill_Type': spill,
            'Before_2021': round(before, 1),
            'After_2021': round(after, 1),
            'Change': round(after - before, 1),  # positive = increase after 2021
            'Pct_Change': round((after - before) / before * 100, 1),
        })
summary_df = pd.DataFrame(data_summary)
print(summary_df.to_string(index=False))

print(f"\nKey Insights:")
print(f"- Positive 'Change' means delays INCREASED after 2021")
print(f"- Negative 'Change' means delays DECREASED after 2021")
Summary: Predicted Delays (Days) ================================================== Rurality Spill_Type Before_2021 After_2021 Change Pct_Change Rural Historical 4.0 11.1 7.1 180.1 Rural Recent 2.5 1.7 -0.8 -32.2 Suburban Historical 6.7 6.0 -0.7 -10.3 Suburban Recent 4.0 1.0 -3.0 -74.7 Urban Historical 2.9 5.7 2.8 95.5 Urban Recent 3.0 2.3 -0.7 -23.0 Key Insights: - Positive 'Change' means delays INCREASED after 2021 - Negative 'Change' means delays DECREASED after 2021
Statistical Significance Interpretation¶
Overview¶
This model reports z-tests (not t-tests) because inference for GLM coefficients rests on the asymptotic (large-sample) normality of the maximum-likelihood estimates; with n=11,196 observations this approximation is very accurate. Each z-statistic tests whether the corresponding coefficient is significantly different from zero.
Individual Coefficients¶
| Variable | Coefficient | Std Error | z-value | p-value | Significant? | Interpretation |
|---|---|---|---|---|---|---|
| Intercept | 2.4066 | 0.039 | 62.49 | < 0.001 | ✅ YES | Historical spills in rural areas after 2021 have significant baseline delay |
| Recent | -1.8821 | 0.048 | -39.00 | < 0.001 | ✅ YES | Recent spills have significantly SHORTER delays than Historical |
| Before 2021 | -1.0298 | 0.086 | -11.98 | < 0.001 | ✅ YES | Before 2021 had significantly SHORTER delays than after 2021 |
| Suburban | -0.6149 | 0.086 | -7.17 | < 0.001 | ✅ YES | Suburban areas have significantly SHORTER delays than rural |
| Urban | -0.6602 | 0.043 | -15.21 | < 0.001 | ✅ YES | Urban areas have significantly SHORTER delays than rural |
Two-Way Interactions¶
| Interaction | Coefficient | Std Error | z-value | p-value | Significant? | Interpretation |
|---|---|---|---|---|---|---|
| Recent × Before 2021 | 1.4185 | 0.096 | 14.84 | < 0.001 | ✅ YES | The combination of Recent + Before 2021 has a significant positive interaction |
| Recent × Suburban | 0.1016 | 0.118 | 0.86 | 0.387 | ❌ NO | No significant interaction between Recent spills and Suburban areas |
| Recent × Urban | 0.9560 | 0.062 | 15.43 | < 0.001 | ✅ YES | Significant positive interaction between Recent spills and Urban areas |
| Before 2021 × Suburban | 1.1389 | 0.174 | 6.56 | < 0.001 | ✅ YES | Significant positive interaction between Before 2021 and Suburban areas |
| Before 2021 × Urban | 0.3593 | 0.096 | 3.73 | < 0.001 | ✅ YES | Significant positive interaction between Before 2021 and Urban areas |
Three-Way Interactions¶
| Three-Way Interaction | Coefficient | Std Error | z-value | p-value | Significant? | Interpretation |
|---|---|---|---|---|---|---|
| Recent × Before 2021 × Suburban | -0.1548 | 0.208 | -0.74 | 0.457 | ❌ NO | No significant three-way interaction for suburban areas |
| Recent × Before 2021 × Urban | -0.4859 | 0.117 | -4.15 | < 0.001 | ✅ YES | Significant negative three-way interaction for urban areas |
Key Statistical Insights¶
What's Statistically Significant:¶
- Main Effects: All main effects are highly significant (p < 0.001)
- Most Interactions: 4 out of 5 two-way interactions are significant
- Urban Three-Way: Only the Recent × Before 2021 × Urban interaction is significant
What's NOT Statistically Significant:¶
- Recent × Suburban: The interaction between Recent spills and Suburban areas (p = 0.387)
- Three-way Suburban: The three-way interaction involving Suburban areas (p = 0.457)
Practical Implications:¶
- Recent spills consistently have shorter delays across all contexts
- Before 2021 generally had shorter delays, but this varies by location type
- Urban areas show the most complex interaction patterns (significant in all interactions)
- Suburban areas behave more like rural areas for Recent spills (no significant difference)
Model Fit Statistics¶
- Pseudo R-squared (Cox–Snell): 0.2386 — a relative goodness-of-fit measure; unlike OLS R², it is not literally the share of variance explained
- Sample size: 11,196 observations (large sample = reliable estimates)
- Log-likelihood: -26,264 (used for model comparison)
Alpha Level Note¶
The model uses α = 0.05 as the significance threshold. All "significant" results have p < 0.001 except the non-significant ones noted above.
import pandas as pd
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import numpy as np

# STEP 1: Prepare the data with rurality
print("APPROACH 1: Separate ITS Models by Rurality")
print("=" * 50)

# Monthly mean delay per (month, spill type, rurality).
spills_gdf['report_month'] = spills_gdf['Initial Report Date'].dt.to_period('M')
monthly_by_type_rurality = (
    spills_gdf
    .groupby(['report_month', 'spill_type', 'rurality'])['report_delay']
    .mean()
    .reset_index()
)
monthly_by_type_rurality['report_month'] = (
    monthly_by_type_rurality['report_month'].dt.to_timestamp()
)

# ITS design variables: approximate month index, post-2021 level shift,
# and post-2021 slope change.
monthly_by_type_rurality['time'] = (
    monthly_by_type_rurality['report_month']
    - monthly_by_type_rurality['report_month'].min()
).dt.days // 30
monthly_by_type_rurality['post_2021'] = (
    monthly_by_type_rurality['report_month'] >= pd.Timestamp('2021-01-01')
).astype(int)
monthly_by_type_rurality['time_post'] = (
    monthly_by_type_rurality['time'] * monthly_by_type_rurality['post_2021']
)
# STEP 2: Fit one interrupted-time-series OLS per (spill type, rurality) cell.
# Successful fits land in its_results_detailed keyed by (type, rurality);
# every attempt (success, failure, skipped) is logged in full_results_summary.
its_results_detailed = {}
full_results_summary = []
for spill_kind in monthly_by_type_rurality['spill_type'].unique():
    for area_type in monthly_by_type_rurality['rurality'].unique():
        cell = monthly_by_type_rurality[
            (monthly_by_type_rurality['spill_type'] == spill_kind)
            & (monthly_by_type_rurality['rurality'] == area_type)
        ].copy()
        n_months = len(cell)
        if n_months < 8:  # need at least 8 monthly points for a reliable ITS
            print(f"\n⚠️ {spill_kind} Spills in {area_type} Areas: Insufficient data ({n_months} months)")
            full_results_summary.append({
                'Spill_Type': spill_kind,
                'Rurality': area_type,
                'N_months': n_months,
                'R_squared': None,
                'Model_Status': 'Insufficient Data'
            })
            continue
        try:
            # Segmented regression: level change (post_2021) + slope change (time_post).
            model = smf.ols("report_delay ~ time + post_2021 + time_post", data=cell).fit()
            cell['predicted'] = model.predict(cell)
            its_results_detailed[(spill_kind, area_type)] = (model, cell)
            full_results_summary.append({
                'Spill_Type': spill_kind,
                'Rurality': area_type,
                'N_months': n_months,
                'R_squared': round(model.rsquared, 3),
                'Model_Status': 'Success'
            })
            print(f"\n✅ {spill_kind} Spills in {area_type} Areas:")
            print(f" Data points: {n_months} months")
            print(f" R-squared: {model.rsquared:.3f}")
        except Exception as err:
            print(f"\n❌ {spill_kind} Spills in {area_type} Areas: Model failed - {str(err)[:50]}...")
            full_results_summary.append({
                'Spill_Type': spill_kind,
                'Rurality': area_type,
                'N_months': n_months,
                'R_squared': None,
                'Model_Status': 'Failed'
            })
print(f"\n\nSUCCESSFUL MODELS: {len(its_results_detailed)} out of {len(full_results_summary)} possible combinations")
# STEP 3: KEY METRICS FOR MISSION CHANGE EVALUATION
# For each fitted ITS model, report the level change at the intervention
# (post_2021 coefficient), the slope change afterwards (time_post), and the
# raw pre/post mean delays for practical context.
# NOTE(review): the notebook export appears to have lost indentation in this
# cell -- everything from the coefficient extraction through the prints looks
# like it belongs inside the for-loop body; restore indentation before running.
print("\n" + "="*70)
print("KEY METRICS: State Agency Mission Change Impact Analysis (2021)")
print("="*70)
mission_change_results = []
for (stype, rurality), (model, df) in its_results_detailed.items():
# Extract key coefficients and p-values
# .get(..., default) guards against a coefficient missing from the fit.
immediate_effect = model.params.get('post_2021', 0)
immediate_pval = model.pvalues.get('post_2021', 1)
slope_change = model.params.get('time_post', 0)
slope_pval = model.pvalues.get('time_post', 1)
# NOTE(review): time_trend is extracted but never used below.
time_trend = model.params.get('time', 0)
# Calculate practical significance
# Raw monthly-mean averages before/after the cutoff (0 when a side is empty).
pre_2021_mean = df[df['post_2021'] == 0]['report_delay'].mean() if len(df[df['post_2021'] == 0]) > 0 else 0
post_2021_mean = df[df['post_2021'] == 1]['report_delay'].mean() if len(df[df['post_2021'] == 1]) > 0 else 0
mission_change_results.append({
'Spill_Type': stype,
'Rurality': rurality,
'Immediate_Effect': round(immediate_effect, 2),
'Immediate_Significant': 'Yes' if immediate_pval < 0.05 else 'No',
'Slope_Change': round(slope_change, 4),
'Slope_Significant': 'Yes' if slope_pval < 0.05 else 'No',
'Pre_2021_Average': round(pre_2021_mean, 1),
'Post_2021_Average': round(post_2021_mean, 1),
'Overall_Change': round(post_2021_mean - pre_2021_mean, 1),
'R_squared': round(model.rsquared, 3)
})
print(f"\n📊 {stype} Spills in {rurality} Areas:")
print(f" Immediate Mission Change Effect: {immediate_effect:+.2f} days ({'✓ significant' if immediate_pval < 0.05 else '✗ not significant'} p={immediate_pval:.3f})")
print(f" Monthly Trend Change: {slope_change:+.4f} days/month ({'✓ significant' if slope_pval < 0.05 else '✗ not significant'} p={slope_pval:.3f})")
print(f" Pre-2021 Average: {pre_2021_mean:.1f} days")
print(f" Post-2021 Average: {post_2021_mean:.1f} days")
print(f" Overall Change: {post_2021_mean - pre_2021_mean:+.1f} days")
# STEP 4: Summary table
# Render the per-model metrics collected in STEP 3 as one flat table.
# NOTE(review): indentation was lost in the notebook export; the bodies of the
# if/for statements below appear to need re-indenting before running.
print(f"\n\n📋 SUMMARY TABLE: Mission Change Impact")
print("="*100)
results_df = pd.DataFrame(mission_change_results)
print(results_df.to_string(index=False))
# STEP 5: Policy interpretation
# Narrate only the statistically significant immediate effects and trend
# changes, then compare rural vs urban average changes for equity.
print(f"\n\n🏛️ POLICY INTERPRETATION:")
print("="*50)
significant_immediate = results_df[results_df['Immediate_Significant'] == 'Yes']
significant_slope = results_df[results_df['Slope_Significant'] == 'Yes']
print(f"📈 IMMEDIATE EFFECTS (right at mission change):")
if len(significant_immediate) > 0:
for _, row in significant_immediate.iterrows():
direction = "DECREASED" if row['Immediate_Effect'] < 0 else "INCREASED"
print(f" • {row['Spill_Type']} spills in {row['Rurality']} areas: {direction} by {abs(row['Immediate_Effect']):.1f} days")
else:
print(" • No statistically significant immediate effects detected")
print(f"\n📉 TREND CHANGES (ongoing impact over time):")
if len(significant_slope) > 0:
for _, row in significant_slope.iterrows():
# Monthly slope change is annualized (x12) for readability.
if row['Slope_Change'] < 0:
print(f" • {row['Spill_Type']} spills in {row['Rurality']} areas: IMPROVING by {abs(row['Slope_Change'])*12:.2f} days per year")
else:
print(f" • {row['Spill_Type']} spills in {row['Rurality']} areas: WORSENING by {row['Slope_Change']*12:.2f} days per year")
else:
print(" • No statistically significant trend changes detected")
print(f"\n🎯 GEOGRAPHIC EQUITY:")
# Suburban rows are intentionally excluded from this rural-vs-urban contrast.
rural_results = results_df[results_df['Rurality'] == 'Rural']
urban_results = results_df[results_df['Rurality'] == 'Urban']
if len(rural_results) > 0 and len(urban_results) > 0:
rural_avg_change = rural_results['Overall_Change'].mean()
urban_avg_change = urban_results['Overall_Change'].mean()
if abs(rural_avg_change - urban_avg_change) > 1: # More than 1 day difference
print(f" • Mission change may have created geographic disparities")
print(f" • Rural average change: {rural_avg_change:+.1f} days")
print(f" • Urban average change: {urban_avg_change:+.1f} days")
else:
print(f" • Mission change appears to have affected rural and urban areas similarly")
# STEP 6: Create visualization
def plot_its_results():
    """Draw one panel per fitted ITS model.

    Each panel shows the observed monthly mean delays (scatter), the fitted
    pre- and post-intervention trend segments, and a dashed vertical marker at
    the January 2021 mission change. Reads the module-level
    ``its_results_detailed`` dict populated in STEP 2.
    """
    n_models = len(its_results_detailed)
    if n_models == 0:
        print("No models to plot")
        return
    # Lay panels out in a grid of up to 3 columns.
    n_cols = min(3, n_models)
    n_rows = (n_models + n_cols - 1) // n_cols
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(6 * n_cols, 4 * n_rows))
    # Normalize to a flat 1-D sequence of Axes regardless of grid shape
    # (single Axes, 1-D row, or 2-D grid).
    panels = np.atleast_1d(axes).ravel()
    for panel, ((stype, rurality), (model, df)) in zip(panels, its_results_detailed.items()):
        # Observed monthly means.
        panel.scatter(df['report_month'], df['report_delay'], alpha=0.6, s=30, label='Actual', color='gray')
        # Fitted trend segments on either side of the intervention.
        pre_seg = df[df['post_2021'] == 0].sort_values('report_month')
        post_seg = df[df['post_2021'] == 1].sort_values('report_month')
        if len(pre_seg) > 0:
            panel.plot(pre_seg['report_month'], pre_seg['predicted'],
                       'r-', linewidth=3, label='Pre-Mission Change')
        if len(post_seg) > 0:
            panel.plot(post_seg['report_month'], post_seg['predicted'],
                       'b-', linewidth=3, label='Post-Mission Change')
        # Intervention marker.
        panel.axvline(x=pd.Timestamp('2021-01-01'), color='black',
                      linestyle='--', linewidth=2, alpha=0.8, label='Mission Change')
        panel.set_title(f'{stype} Spills - {rurality}', fontweight='bold')
        panel.set_ylabel('Report Delay (Days)')
        panel.legend(fontsize=8)
        panel.grid(True, alpha=0.3)
        panel.tick_params(axis='x', rotation=45)
    # Blank out any unused grid cells.
    for spare in panels[n_models:]:
        spare.set_visible(False)
    plt.tight_layout()
    plt.suptitle('State Agency Mission Change Impact: Interrupted Time Series Analysis',
                 fontsize=14, fontweight='bold', y=1.02)
    plt.show()
# Generate the plot
print(f"\n📊 Generating visualization...")
plot_its_results()
# Closing summary of what this cell produced.
# NOTE(review): the f-prefixes below are unnecessary (no placeholders).
print(f"\n✅ Analysis complete! You now have:")
print(f" • Separate ITS models for each spill type × rurality combination")
print(f" • Statistical significance testing for mission change effects")
print(f" • Policy interpretation focusing on geographic equity")
print(f" • Visual time series plots showing before/after trends")
APPROACH 1: Separate ITS Models by Rurality
==================================================
✅ Historical Spills in Rural Areas:
Data points: 90 months
R-squared: 0.048
✅ Historical Spills in Urban Areas:
Data points: 93 months
R-squared: 0.207
✅ Historical Spills in Suburban Areas:
Data points: 68 months
R-squared: 0.028
✅ Recent Spills in Rural Areas:
Data points: 93 months
R-squared: 0.020
✅ Recent Spills in Urban Areas:
Data points: 93 months
R-squared: 0.009
✅ Recent Spills in Suburban Areas:
Data points: 79 months
R-squared: 0.091
SUCCESSFUL MODELS: 6 out of 6 possible combinations
======================================================================
KEY METRICS: State Agency Mission Change Impact Analysis (2021)
======================================================================
📊 Historical Spills in Rural Areas:
Immediate Mission Change Effect: -3.20 days (✗ not significant p=0.758)
Monthly Trend Change: +0.1643 days/month (✗ not significant p=0.493)
Pre-2021 Average: 4.1 days
Post-2021 Average: 9.9 days
Overall Change: +5.8 days
📊 Historical Spills in Urban Areas:
Immediate Mission Change Effect: -11.75 days (✓ significant p=0.002)
Monthly Trend Change: +0.2301 days/month (✓ significant p=0.007)
Pre-2021 Average: 2.3 days
Post-2021 Average: 4.8 days
Overall Change: +2.5 days
📊 Historical Spills in Suburban Areas:
Immediate Mission Change Effect: -13.96 days (✗ not significant p=0.480)
Monthly Trend Change: +0.1509 days/month (✗ not significant p=0.793)
Pre-2021 Average: 5.3 days
Post-2021 Average: 8.7 days
Overall Change: +3.4 days
📊 Recent Spills in Rural Areas:
Immediate Mission Change Effect: +4.24 days (✗ not significant p=0.276)
Monthly Trend Change: -0.0387 days/month (✗ not significant p=0.653)
Pre-2021 Average: 1.7 days
Post-2021 Average: 2.5 days
Overall Change: +0.9 days
📊 Recent Spills in Urban Areas:
Immediate Mission Change Effect: +3.06 days (✗ not significant p=0.669)
Monthly Trend Change: -0.1301 days/month (✗ not significant p=0.414)
Pre-2021 Average: 3.3 days
Post-2021 Average: 2.9 days
Overall Change: -0.3 days
📊 Recent Spills in Suburban Areas:
Immediate Mission Change Effect: +3.15 days (✗ not significant p=0.709)
Monthly Trend Change: -0.3345 days/month (✗ not significant p=0.081)
Pre-2021 Average: 5.2 days
Post-2021 Average: 1.1 days
Overall Change: -4.1 days
📋 SUMMARY TABLE: Mission Change Impact
====================================================================================================
Spill_Type Rurality Immediate_Effect Immediate_Significant Slope_Change Slope_Significant Pre_2021_Average Post_2021_Average Overall_Change R_squared
Historical Rural -3.20 No 0.1643 No 4.1 9.9 5.8 0.048
Historical Urban -11.75 Yes 0.2301 Yes 2.3 4.8 2.5 0.207
Historical Suburban -13.96 No 0.1509 No 5.3 8.7 3.4 0.028
Recent Rural 4.24 No -0.0387 No 1.7 2.5 0.9 0.020
Recent Urban 3.06 No -0.1301 No 3.3 2.9 -0.3 0.009
Recent Suburban 3.15 No -0.3345 No 5.2 1.1 -4.1 0.091
🏛️ POLICY INTERPRETATION:
==================================================
📈 IMMEDIATE EFFECTS (right at mission change):
• Historical spills in Urban areas: DECREASED by 11.8 days
📉 TREND CHANGES (ongoing impact over time):
• Historical spills in Urban areas: WORSENING by 2.76 days per year
🎯 GEOGRAPHIC EQUITY:
• Mission change may have created geographic disparities
• Rural average change: +3.4 days
• Urban average change: +1.1 days
📊 Generating visualization...
✅ Analysis complete! You now have: • Separate ITS models for each spill type × rurality combination • Statistical significance testing for mission change effects • Policy interpretation focusing on geographic equity • Visual time series plots showing before/after trends
Key Findings - Mission Change Appears Problematic¶
Historical Spills (Legacy Cases):¶
Urban areas: The only statistically significant effect — delays initially DECREASED by 11.8 days, but then got WORSE by 2.8 days per year. Rural/Suburban areas: Large increases in delays (5.8 and 3.4 days respectively), but these were not statistically significant due to high variability.
Recent Spills (New Cases):¶
Mixed results: Rural and Urban got slightly worse, but Suburban dramatically improved (-4.1 days). No statistical significance: high variability makes it hard to detect clear patterns.
Major Policy Concerns¶
- Geographic Inequity Created
Rural areas were hit hardest (+3.4 days average increase), while Urban areas saw the smallest impact (+1.1 days average). The mission change may have disadvantaged rural communities.
- Historical Spills Got Much Worse
All rurality types saw increased delays for historical spills; Rural areas fared worst at +5.8 days. This suggests the agency may have deprioritized legacy case processing.
- Only One Clear Success
Historical spills in urban areas showed a significant immediate improvement, but a deteriorating trend. This might indicate an initial focus that wasn't sustained.
- Low Model Fit (R-squared values)
Most models explain <10% of variance. High month-to-month variability makes trends hard to detect, and may indicate inconsistent implementation of the mission change.
# Reconciliation: compare raw pre/post mean report delays, overall and within
# each rurality class, to sanity-check the ITS and GLM results against
# unadjusted averages.
cutoff = '2021-01-01'
pre_mission = spills_gdf[spills_gdf['Initial Report Date'] < cutoff]
post_mission = spills_gdf[spills_gdf['Initial Report Date'] >= cutoff]

pre_avg = pre_mission['report_delay'].mean()
post_avg = post_mission['report_delay'].mean()
print("RECONCILIATION CHECK:")
print(f"Pre-Mission Average: {pre_avg:.1f} days")
print(f"Post-Mission Average: {post_avg:.1f} days")
print(f"Overall Change: {post_avg - pre_avg:+.1f} days")

# Break the same comparison out by rurality to see whether the aggregate
# change masks diverging local patterns.
print("\nBy Rurality:")
for area in ('Rural', 'Suburban', 'Urban'):
    before_avg = pre_mission.loc[pre_mission['rurality'] == area, 'report_delay'].mean()
    after_avg = post_mission.loc[post_mission['rurality'] == area, 'report_delay'].mean()
    print(f"{area}: {before_avg:.1f} → {after_avg:.1f} days ({after_avg-before_avg:+.1f})")
RECONCILIATION CHECK: Pre-Mission Average: 2.5 days Post-Mission Average: 4.6 days Overall Change: +2.1 days By Rurality: Rural: 1.9 → 4.8 days (+2.9) Suburban: 4.6 → 2.8 days (-1.8) Urban: 2.8 → 4.7 days (+1.9)
Reconciliation check resolves the apparent contradiction in Neg Binomial and ITS.¶
The Results are 100% Consistent¶
What the Reconciliation Shows:¶
- Overall: Delays increased by +2.1 days after the mission change
- Rural: Got much worse (+2.9 days)
- Suburban: Actually improved (-1.8 days)
- Urban: Got worse (+1.9 days)
This Perfectly Matches the ITS Results:¶
- ITS showed: Rural +3.4 days, Urban +1.1 days, Suburban improved
- Reconciliation shows: Rural +2.9 days, Urban +1.9 days, Suburban -1.8 days
So Why Did the Negative Binomial Show Different Results?¶
The Negative Binomial GLM shows "Before 2021 had shorter delays" because:
Compositional Changes Post-2021:¶
Your post-2021 data likely has:
- More Recent spills (which process faster)
- Different geographic distribution
- More urban/suburban reporting (faster processing areas)
The GLM Controls for These Compositional Changes¶
- Raw averages: +2.1 days increase (what actually happened)
- GLM results: "Before 2021 shorter" (after controlling for spill type/location composition)
Translation:¶
ITS Analysis (What Actually Happened):
- "The 2021 mission change disrupted operations and increased delays by ~2 days on average"
Negative Binomial GLM (Compositional-Adjusted View):
- "If you had the same mix of spill types and locations in both periods, pre-2021 would still have been faster"
- "But post-2021 has more 'Recent' spills and different geographic patterns that partially offset the operational disruption"
Both Analyses Are Correct and Complementary:¶
- ITS: Shows the causal impact of the mission change (+2.1 days increase)
- GLM: Shows that even after controlling for compositional changes, pre-2021 was still inherently faster
Policy Interpretation:¶
The 2021 mission change:
- Caused real operational disruption (ITS shows this)
- Hit rural areas hardest (+2.9 days)
- Only suburban areas improved (-1.8 days)
- The disruption was partially masked by changes in what types of spills were reported where
Bottom line: The mission change had net negative effects on delay times, with rural communities bearing the brunt of the impact.
# Quick schema dump: list every column currently present on spills_gdf.
print("\nColumns in spills_gdf:")
print("\n".join(f"- {name}" for name in spills_gdf.columns))
Columns in spills_gdf: - Document # - Report - Operator - Operator # - Tracking # - Initial Report Date - Date of Discovery - spill_type - Qtr Qtr - Section - Township - range - meridian - Latitude - Longitude - Municipality - county - Facility Type - Facility ID - API County Code - API Sequence Number - Spilled outside of berms - More than five barrels spilled - Oil Spill Volume - Condensate Spill Volume - Flow Back Spill Volume - Produced Water Spill Volume - E&P Waste Spill Volume - Other Waste - Drilling Fluid Spill Volume - Current Land Use - Other Land Use - Weather Conditions - Surface Owner - Surface Owner Other - Waters of the State - Residence / Occupied Structure - livestock - Public Byway - Surface Water Supply Area - Spill Description - Supplemental Report Date - Oil BBLs Spilled - Oil BBLs Recovered - Oil Unknown - Condensate BBLs Spilled - Condensate BBLs Recovered - Condensate Unknown - Produced Water BBLs Spilled - Produced Water BBLs Recovered - Produced Water Unknown - Drilling Fluid BBLs Spilled - Drilling Fluid BBLs Recovered - Drilling Fluid Unknown - Flow Back Fluid BBLs Spilled - Flow Back Fluid BBLs Recovered - Flow Back Fluid Unkown - Other E&P Waste BBLS Spilled - Other E&P Waste BBLS Recovered - Other E&P Waste Unknown - Other E&P Waste - Spill Contained within Berm - Emergency Pit Constructed - soil - groundwater - Surface Water - Dry Drainage Feature - Surface Area Length - Surface Area Width - Depth of Impact in Feet - Depth of Impact in Inches - Area Depth Determined - Geology Description - Depth to Groundwater - Water wells in area - Water Wells - Water Wells None - Surface Water Near - Surface Water None - Wetlands - Wetlands None - Springs - Springs None - Livestock Near - Livestock None - Occupied Buildings - Occupied Buildings None - Additional Spill Details - Supplemental Report Date CA - Human Error - Equipment Failure - Historical Unkown - Other - Other Description - Root Cause - Preventative Measures - Soil Excavated - 
Offsite Disposal - Onsite Treatment - Other Disposition - Other Disposition Description - Ground Water Removed - Surface Water Removed - Corrective Actions Completed - Approved Form 27 - Form 27 Project Number - GEOID - TRACT_NAME - total_population - white_population - hispanic_population - median_household_income - poverty_population - unemployed_population - percent_white - percent_hispanic - percent_poverty - unemployment_rate - geometry - ruca_code - ruca_description - rurality - Report Year - Period - report_delay - report_month
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats
import statsmodels.formula.api as smf
# NOTE(review): sns, plt, np and smf are imported but not referenced in this
# cell's visible code -- presumably kept for parity with earlier cells;
# confirm before removing.
print("🏭 TESTING INDUSTRY RESISTANCE HYPOTHESIS")
print("=" * 60)
# First, let's check what Period values we actually have
# Sanity-check the 'Period' column before splitting the data on it below.
print("Checking data structure:")
print("Unique values in Period column:", spills_gdf['Period'].unique())
print("Period value counts:", spills_gdf['Period'].value_counts())
print("Sample of data:")
print(spills_gdf[['Operator', 'Period', 'report_delay']].head())
# 1. OPERATOR-LEVEL RESISTANCE PATTERNS
print("\n1️⃣ OPERATOR-LEVEL RESISTANCE ANALYSIS")
print("-" * 45)

# Per-operator spill count (proxy for company size) and mean report delay.
operator_stats = (
    spills_gdf
    .groupby('Operator')
    .agg({'Document #': 'count', 'report_delay': 'mean'})
    .rename(columns={'Document #': 'total_spills', 'report_delay': 'avg_delay'})
)
print(f"Total operators: {len(operator_stats)}")
print(f"Spills per operator - Min: {operator_stats['total_spills'].min()}, Max: {operator_stats['total_spills'].max()}")

# Bucket operators into size tiers by their total spill count.
size_bins = [0, 5, 20, 50, float('inf')]
size_labels = ['Small (1-5)', 'Medium (6-20)', 'Large (21-50)', 'Major (50+)']
operator_stats['size_category'] = pd.cut(operator_stats['total_spills'], bins=size_bins, labels=size_labels)
print("\nOperator size distribution:")
print(operator_stats['size_category'].value_counts())
# Get period values
period_values = sorted(spills_gdf['Period'].unique())
if len(period_values) >= 2:
before_period = period_values[0]
after_period = period_values[1]
print(f"\nAnalyzing periods: '{before_period}' vs '{after_period}'")
# Calculate average delays by operator and period
# NOTE(review): indentation was lost in the notebook export; everything from
# here down to the final print appears to belong inside the
# "if len(period_values) >= 2:" suite above -- restore before running.
# NOTE(review): pandas emits a FutureWarning here and at the other
# size_category groupbys -- pass observed=False explicitly (current behavior)
# or observed=True (future default) to silence it.
operator_period_stats = spills_gdf.groupby(['Operator', 'Period']).agg({
'report_delay': 'mean',
'Document #': 'count'
}).reset_index()
print(f"Operator-period combinations: {len(operator_period_stats)}")
# Add size categories by merging
operator_period_stats = operator_period_stats.merge(
operator_stats[['size_category']].reset_index(),
on='Operator',
how='left'
)
# Summary by size and period
size_period_summary = operator_period_stats.groupby(['size_category', 'Period']).agg({
'report_delay': ['mean', 'count'],
'Document #': 'sum'
}).round(2)
print("\nAverage delays by operator size and period:")
print(size_period_summary)
# 2. OPERATOR CHANGES ANALYSIS
print("\n2️⃣ OPERATOR DELAY CHANGES")
print("-" * 35)
# Pivot to get before/after for each operator
operator_pivot = operator_period_stats.pivot_table(
index='Operator',
columns='Period',
values='report_delay',
aggfunc='mean'
)
print(f"Operators with data in both periods: {len(operator_pivot.dropna())}")
# Only keep operators with data in both periods
complete_operators = operator_pivot.dropna()
if len(complete_operators) > 0:
# NOTE(review): the assignment below raises SettingWithCopyWarning (seen in
# the cell output) because dropna() returns a view-like slice -- use
# operator_pivot.dropna().copy() above. Also verify the before_period /
# after_period mapping set earlier: if those labels came from a plain
# lexicographic sort, '2021 and After' precedes 'Before 2021' and the sign of
# delay_change (and of the "worst/best performer" rankings) is inverted.
complete_operators['delay_change'] = complete_operators[after_period] - complete_operators[before_period]
# Add operator size info
complete_with_size = complete_operators.merge(
operator_stats[['size_category', 'total_spills']],
left_index=True,
right_index=True,
how='left'
)
print(f"Operators with size data: {len(complete_with_size)}")
# Top 10 worst performers
if len(complete_with_size) >= 10:
worst_performers = complete_with_size.nlargest(10, 'delay_change')
print(f"\n🚨 TOP 10 OPERATORS WITH BIGGEST DELAY INCREASES:")
for idx, (operator, data) in enumerate(worst_performers.iterrows(), 1):
print(f"{idx:2d}. {operator[:50]:50s} | Change: +{data['delay_change']:5.1f} days | Size: {data['size_category']}")
# Best performers
if len(complete_with_size) >= 10:
best_performers = complete_with_size.nsmallest(10, 'delay_change')
print(f"\n✅ TOP 10 OPERATORS WITH BIGGEST IMPROVEMENTS:")
for idx, (operator, data) in enumerate(best_performers.iterrows(), 1):
print(f"{idx:2d}. {operator[:50]:50s} | Change: {data['delay_change']:5.1f} days | Size: {data['size_category']}")
# 3. SIZE-BASED ANALYSIS
print("\n3️⃣ OPERATOR SIZE EFFECT ANALYSIS")
print("-" * 40)
size_changes = complete_with_size.groupby('size_category')['delay_change'].agg(['mean', 'count', 'std']).round(2)
print("Average delay changes by operator size:")
print(size_changes)
# Statistical test: Large vs Small operators
large_ops = complete_with_size[complete_with_size['size_category'].isin(['Large (21-50)', 'Major (50+)'])]
small_ops = complete_with_size[complete_with_size['size_category'].isin(['Small (1-5)', 'Medium (6-20)'])]
if len(large_ops) > 0 and len(small_ops) > 0:
# Two-sample t-test on per-operator delay changes (equal-variance default).
t_stat, p_val = stats.ttest_ind(large_ops['delay_change'], small_ops['delay_change'])
print(f"\n🔍 Large vs Small Operators:")
print(f" Large operators (n={len(large_ops)}): {large_ops['delay_change'].mean():+.2f} days average change")
print(f" Small operators (n={len(small_ops)}): {small_ops['delay_change'].mean():+.2f} days average change")
print(f" Statistical test: t={t_stat:.3f}, p={p_val:.4f} ({'significant' if p_val < 0.05 else 'not significant'})")
# 4. SPILL SIZE ANALYSIS
print("\n4️⃣ SPILL SIZE RESISTANCE ANALYSIS")
print("-" * 40)
# Create total volume
volume_cols = ['Oil Spill Volume', 'Condensate Spill Volume', 'Flow Back Spill Volume',
'Produced Water Spill Volume', 'E&P Waste Spill Volume', 'Drilling Fluid Spill Volume']
for col in volume_cols:
if col in spills_gdf.columns:
# Coerce non-numeric entries to NaN, then treat missing volume as zero.
spills_gdf[col] = pd.to_numeric(spills_gdf[col], errors='coerce').fillna(0)
# Sum available volume columns
available_vol_cols = [col for col in volume_cols if col in spills_gdf.columns]
if available_vol_cols:
spills_gdf['total_volume'] = spills_gdf[available_vol_cols].sum(axis=1)
# NOTE(review): pd.cut bins are left-open, so total_volume == 0 falls outside
# the (0, 1] bin and gets a NaN category. The cell output shows all-NaN
# 'change' values for every size bucket -- TODO investigate (zero-volume rows
# and/or empty Period x category cells).
spills_gdf['spill_size_category'] = pd.cut(spills_gdf['total_volume'],
bins=[0, 1, 10, 50, float('inf')],
labels=['Small (0-1 bbl)', 'Medium (1-10 bbl)',
'Large (10-50 bbl)', 'Major (50+ bbl)'])
size_period_analysis = spills_gdf.groupby(['spill_size_category', 'Period'])['report_delay'].mean().unstack()
if len(size_period_analysis.columns) == 2:
# NOTE(review): unstack() orders columns lexicographically, which puts
# '2021 and After' first -- so iloc[:,1] - iloc[:,0] computes Before minus
# After, i.e. the NEGATIVE of the intended delay change. Verify and fix.
size_period_analysis['change'] = size_period_analysis.iloc[:, 1] - size_period_analysis.iloc[:, 0]
print("Delay changes by spill size:")
print(size_period_analysis['change'].sort_values(ascending=False).round(2))
# 5. GEOGRAPHIC PATTERNS
print("\n5️⃣ GEOGRAPHIC RESISTANCE PATTERNS")
print("-" * 40)
county_analysis = spills_gdf.groupby(['county', 'Period'])['report_delay'].mean().unstack()
if len(county_analysis.columns) == 2:
# NOTE(review): same column-order caveat as above -- the 'change' sign looks
# inverted (the output lists negative values under "BIGGEST DELAY INCREASES").
county_analysis['change'] = county_analysis.iloc[:, 1] - county_analysis.iloc[:, 0]
county_analysis['total_spills'] = spills_gdf.groupby('county').size()
# Filter for counties with significant data
significant_counties = county_analysis[county_analysis['total_spills'] >= 20]
if len(significant_counties) > 0:
worst_counties = significant_counties.nlargest(10, 'change')
print("🚨 COUNTIES WITH BIGGEST DELAY INCREASES (min 20 spills):")
for county, data in worst_counties.iterrows():
print(f" {county:25s}: {data['change']:+5.1f} days ({data['total_spills']:3.0f} spills)")
# 6. SUMMARY ASSESSMENT
# Score three heuristic tests; each section above may or may not have run, so
# existence is probed via locals().
# NOTE(review): probing locals() is fragile -- stale variables from a previous
# notebook run would silently satisfy these checks.
print("\n6️⃣ RESISTANCE HYPOTHESIS ASSESSMENT")
print("-" * 45)
evidence_count = 0
total_tests = 0
# Test 1: Do major operators show up disproportionately in worst performers?
if 'worst_performers' in locals() and len(worst_performers) > 0:
total_tests += 1
major_in_worst = (worst_performers['size_category'] == 'Major (50+)').sum()
if major_in_worst >= 3: # 30% or more
evidence_count += 1
print("✅ Major operators overrepresented in worst performers")
else:
print("❌ Major operators not overrepresented in worst performers")
# Test 2: Do large operators have bigger average increases?
if 'large_ops' in locals() and 'small_ops' in locals() and len(large_ops) > 0 and len(small_ops) > 0:
total_tests += 1
if large_ops['delay_change'].mean() > small_ops['delay_change'].mean():
evidence_count += 1
print("✅ Large operators had bigger delay increases than small operators")
else:
print("❌ Small operators had bigger delay increases than large operators")
# Test 3: Do larger spills have bigger increases?
if 'size_period_analysis' in locals() and 'change' in size_period_analysis.columns:
total_tests += 1
# NOTE(review): when the 'change' column is all-NaN (as in the cell output),
# NaN > NaN evaluates False, so this test silently counts as "not supported"
# instead of being excluded -- TODO guard with a notna() check.
if size_period_analysis['change'].loc['Major (50+ bbl)'] > size_period_analysis['change'].mean():
evidence_count += 1
print("✅ Major spills had above-average delay increases")
else:
print("❌ Major spills did not have above-average delay increases")
if total_tests > 0:
resistance_score = (evidence_count / total_tests) * 100
print(f"\n📊 RESISTANCE HYPOTHESIS SCORE: {resistance_score:.0f}% ({evidence_count}/{total_tests} tests support)")
if resistance_score >= 67:
print("🚨 STRONG EVIDENCE for systematic industry resistance")
elif resistance_score >= 33:
print("⚠️ MODERATE EVIDENCE for industry resistance")
else:
print("✅ LIMITED EVIDENCE for systematic resistance")
else:
print("⚠️ Insufficient data to assess resistance hypothesis")
else:
print("⚠️ Need at least 2 time periods for analysis")
print(f"\n✅ Analysis complete!")
🏭 TESTING INDUSTRY RESISTANCE HYPOTHESIS
============================================================
Checking data structure:
Unique values in Period column: ['2021 and After' 'Before 2021']
Period value counts: Period
2021 and After 7397
Before 2021 3799
Name: count, dtype: int64
Sample of data:
Operator Period report_delay
28 PDC ENERGY INC 2021 and After 1
163 PDC ENERGY INC 2021 and After 3
226 NOBLE ENERGY INC Before 2021 2
334 NOBLE ENERGY INC Before 2021 2
342 NOBLE ENERGY INC Before 2021 0
1️⃣ OPERATOR-LEVEL RESISTANCE ANALYSIS
---------------------------------------------
Total operators: 117
Spills per operator - Min: 1, Max: 2371
Operator size distribution:
size_category
Small (1-5) 51
Medium (6-20) 29
Major (50+) 24
Large (21-50) 13
Name: count, dtype: int64
Analyzing periods: '2021 and After' vs 'Before 2021'
Operator-period combinations: 161
Average delays by operator size and period:
report_delay Document #
mean count sum
size_category Period
Small (1-5) 2021 and After 49.30 25 69
Before 2021 5.85 28 65
Medium (6-20) 2021 and After 10.74 19 137
Before 2021 4.43 24 216
Large (21-50) 2021 and After 7.99 11 259
Before 2021 1.89 8 170
Major (50+) 2021 and After 5.19 23 6932
Before 2021 3.52 23 3348
2️⃣ OPERATOR DELAY CHANGES
-----------------------------------
Operators with data in both periods: 44
Operators with size data: 44
🚨 TOP 10 OPERATORS WITH BIGGEST DELAY INCREASES:
1. KP KAUFFMAN COMPANY INC | Change: + 15.0 days | Size: Major (50+)
2. AKA ENERGY GROUP LLC | Change: + 6.6 days | Size: Medium (6-20)
3. HUNTER RIDGE ENERGY SERVICES LLC | Change: + 5.0 days | Size: Small (1-5)
4. EXTRACTION OIL & GAS INC | Change: + 5.0 days | Size: Major (50+)
5. RED HAWK PETROLEUM LLC | Change: + 3.8 days | Size: Medium (6-20)
6. PETRO MEX RESOURCES | Change: + 3.8 days | Size: Medium (6-20)
7. DCP OPERATING COMPANY LP | Change: + 3.6 days | Size: Major (50+)
8. ENERPLUS RESOURCES (USA) CORPORATION | Change: + 1.9 days | Size: Medium (6-20)
9. ANADARKO WATTENBERG OIL COMPLEX LLC | Change: + 1.8 days | Size: Medium (6-20)
10. GRAND RIVER GATHERING LLC | Change: + 1.7 days | Size: Major (50+)
✅ TOP 10 OPERATORS WITH BIGGEST IMPROVEMENTS:
1. PETRO OPERATING COMPANY LLC | Change: -80.9 days | Size: Medium (6-20)
2. FOUNDATION ENERGY MANAGEMENT LLC | Change: -63.0 days | Size: Large (21-50)
3. BLUE CHIP OIL INC | Change: -54.1 days | Size: Medium (6-20)
4. UTAH GAS OP LTD DBA UTAH GAS CORP | Change: -23.2 days | Size: Major (50+)
5. CHEVRON USA INC | Change: -17.3 days | Size: Major (50+)
6. BAYSWATER EXPLORATION & PRODUCTION LLC | Change: -9.7 days | Size: Major (50+)
7. GREAT WESTERN OPERATING COMPANY LLC | Change: -7.4 days | Size: Major (50+)
8. CRESTONE PEAK RESOURCES OPERATING LLC | Change: -3.8 days | Size: Major (50+)
9. NOBLE ENERGY INC | Change: -3.3 days | Size: Major (50+)
10. KERR MCGEE GATHERING LLC | Change: -3.0 days | Size: Major (50+)
3️⃣ OPERATOR SIZE EFFECT ANALYSIS
----------------------------------------
Average delay changes by operator size:
mean count std
size_category
Small (1-5) 3.00 2 2.83
Medium (6-20) -8.61 14 25.61
Large (21-50) -10.17 6 25.90
Major (50+) -2.05 22 7.59
🔍 Large vs Small Operators:
Large operators (n=28): -3.79 days average change
Small operators (n=16): -7.16 days average change
Statistical test: t=0.596, p=0.5543 (not significant)
4️⃣ SPILL SIZE RESISTANCE ANALYSIS
----------------------------------------
Delay changes by spill size:
spill_size_category
Small (0-1 bbl) NaN
Medium (1-10 bbl) NaN
Large (10-50 bbl) NaN
Major (50+ bbl) NaN
Name: change, dtype: float64
5️⃣ GEOGRAPHIC RESISTANCE PATTERNS
----------------------------------------
🚨 COUNTIES WITH BIGGEST DELAY INCREASES (min 20 spills):
GARFIELD : -0.1 days (1437 spills)
WELD : -1.3 days (9041 spills)
RIO BLANCO : -6.2 days (718 spills)
6️⃣ RESISTANCE HYPOTHESIS ASSESSMENT
---------------------------------------------
✅ Major operators overrepresented in worst performers
✅ Large operators had bigger delay increases than small operators
❌ Major spills did not have above-average delay increases
📊 RESISTANCE HYPOTHESIS SCORE: 67% (2/3 tests support)
⚠️ MODERATE EVIDENCE for industry resistance
✅ Analysis complete!
/tmp/ipykernel_1241247/2502002660.py:62: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
size_period_summary = operator_period_stats.groupby(['size_category', 'Period']).agg({
/tmp/ipykernel_1241247/2502002660.py:88: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
complete_operators['delay_change'] = complete_operators[after_period] - complete_operators[before_period]
/tmp/ipykernel_1241247/2502002660.py:118: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
size_changes = complete_with_size.groupby('size_category')['delay_change'].agg(['mean', 'count', 'std']).round(2)
/tmp/ipykernel_1241247/2502002660.py:155: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
size_period_analysis = spills_gdf.groupby(['spill_size_category', 'Period'])['report_delay'].mean().unstack()