Texas RRC Inspection Expenses Analysis¶
Research question: Does organizational capacity (budget, staffing) predict better regulatory outputs (inspections, compliance, enforcement), and how is that relationship moderated by goal ambiguity, district-level heterogeneity, and spatial/geographic factors?
Hypotheses¶
- H1 — Capacity → Outputs: Higher OGI budget and FTE predict more inspections, higher compliance rates, and faster violation resolution.
- H2 — Goal Ambiguity: When a larger share of RRC budget goes to the more ambiguous "Energy Resource Development" goal, the capacity → output relationship weakens.
- H3 — Multilevel / District Effects: The capacity → output relationship varies across RRC districts (budget slope heterogeneity).
- H4 — Spatial & Geographic: Offshore-jurisdiction and border districts moderate the capacity → output relationship; spatial autocorrelation in residuals is tested via Moran's I.
Data:
- PostgreSQL warehouse (`texas_data`): `inspections`, `violations`, `well_shape_tract`
- `RRC Budget Data.xlsx`: statewide RRC budget by strategy, 2016–2024
- Analysis panel: 2016–2025 (N = 130 district-years); regression sample: 2016–2023 (N = 104)
import os
import warnings
from pathlib import Path
from urllib.parse import quote_plus
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import statsmodels.formula.api as smf
from dotenv import load_dotenv
from sqlalchemy import create_engine, text
from scipy.spatial.distance import cdist
warnings.filterwarnings("ignore", category=UserWarning)
pd.set_option("display.float_format", "{:,.2f}".format)
load_dotenv(override=False)
host = os.getenv("PGHOST", "localhost")
port = os.getenv("PGPORT", "5433")
user = os.getenv("PGUSER", "postgres")
password = quote_plus(os.getenv("PGPASSWORD", ""))
database = os.getenv("PGDATABASE", "texas_data")
engine = create_engine(
f"postgresql+psycopg2://{user}:{password}@{host}:{port}/{database}"
)
print(f"Connected → {database} on {host}:{port}")
Connected → texas_data on localhost:5433
1. Data Loading¶
# District-year inspection metrics aggregated in SQL.
# LAG() computes days since the previous inspection for the same well (api_norm).
insp_sql = """
WITH lagged AS (
SELECT
district,
EXTRACT(year FROM inspection_date)::int AS year,
api_norm,
inspection_date,
CASE WHEN UPPER(compliance::text) IN ('YES', 'Y') THEN 1.0 ELSE 0.0 END AS is_compliant,
EXTRACT(EPOCH FROM (
inspection_date
- LAG(inspection_date) OVER (PARTITION BY api_norm ORDER BY inspection_date)
)) / 86400.0 AS days_since_prev
FROM inspections
WHERE inspection_date IS NOT NULL
AND district IS NOT NULL
AND EXTRACT(year FROM inspection_date) BETWEEN 2016 AND 2025
)
SELECT
district,
year,
COUNT(*) AS total_inspections,
COUNT(DISTINCT api_norm) AS unique_wells,
ROUND(AVG(is_compliant)::numeric * 100, 2) AS compliance_rate,
ROUND(AVG(days_since_prev)::numeric, 1) AS avg_days_between_inspections
FROM lagged
GROUP BY district, year
ORDER BY district, year
"""
insp = pd.read_sql(text(insp_sql), engine)
print(f"Inspections panel: {len(insp):,} district-year rows | {insp['district'].nunique()} districts")
insp.head()
Inspections panel: 130 district-year rows | 13 districts
| | district | year | total_inspections | unique_wells | compliance_rate | avg_days_between_inspections |
|---|---|---|---|---|---|---|
| 0 | 01 | 2016 | 13975 | 4055 | 69.42 | 18.90 |
| 1 | 01 | 2017 | 18022 | 6153 | 83.52 | 56.80 |
| 2 | 01 | 2018 | 23826 | 9109 | 85.61 | 53.50 |
| 3 | 01 | 2019 | 19790 | 6447 | 84.97 | 79.80 |
| 4 | 01 | 2020 | 26006 | 8716 | 85.52 | 122.90 |
# District-year violation metrics. Blank last_enf_action strings treated as no action.
viol_sql = """
SELECT
district,
EXTRACT(year FROM violation_disc_date)::int AS year,
COUNT(*) AS total_violations,
COUNT(DISTINCT api_norm) AS unique_wells_with_violations,
SUM(CASE WHEN major_viol_ind = 'Y' THEN 1 ELSE 0 END) AS major_violations,
ROUND(AVG(CASE WHEN compliant_on_reinsp = 'Y' THEN 1.0 ELSE 0.0 END)::numeric * 100, 2)
AS resolution_rate,
ROUND(AVG(CASE WHEN last_enf_action IS NOT NULL AND last_enf_action <> ''
THEN 1.0 ELSE 0.0 END)::numeric * 100, 2) AS enforcement_rate,
ROUND(AVG(
CASE WHEN last_enf_action_date IS NOT NULL
THEN EXTRACT(EPOCH FROM (last_enf_action_date - violation_disc_date)) / 86400.0
END
)::numeric, 1) AS avg_days_to_enforcement
FROM violations
WHERE violation_disc_date IS NOT NULL
AND district IS NOT NULL
AND EXTRACT(year FROM violation_disc_date) BETWEEN 2016 AND 2025
GROUP BY district, year
ORDER BY district, year
"""
viol = pd.read_sql(text(viol_sql), engine)
print(f"Violations panel: {len(viol):,} district-year rows")
viol.head()
Violations panel: 130 district-year rows
| | district | year | total_violations | unique_wells_with_violations | major_violations | resolution_rate | enforcement_rate | avg_days_to_enforcement |
|---|---|---|---|---|---|---|---|---|
| 0 | 01 | 2016 | 5720 | 1009 | 0 | 21.42 | 100.00 | 198.60 |
| 1 | 01 | 2017 | 4380 | 767 | 0 | 44.36 | 100.00 | 269.50 |
| 2 | 01 | 2018 | 5766 | 997 | 0 | 64.46 | 100.00 | 229.00 |
| 3 | 01 | 2019 | 3593 | 902 | 4 | 49.37 | 100.00 | 239.00 |
| 4 | 01 | 2020 | 4838 | 1019 | 5 | 27.43 | 100.00 | 402.90 |
BUDGET_PATH = Path("RRC Budget Data.xlsx")
raw = pd.read_excel(BUDGET_PATH, header=None)
YEARS = [2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
COLS = slice(1, 10) # spreadsheet columns 1-9 map to years 2016-2024
# ── Section 1: Energy Resource Development (rows 7-18) ──────────────────────
erd = pd.DataFrame({
"year": YEARS,
"strategy": "Energy Resource Development",
"total_budget": raw.iloc[1, COLS].values.astype(float),
"salaries": raw.iloc[7, COLS].values.astype(float),
"other_personnel": raw.iloc[8, COLS].values.astype(float),
"professional_fees": raw.iloc[9, COLS].values.astype(float),
"travel": raw.iloc[13, COLS].values.astype(float),
"other_operating": raw.iloc[16, COLS].values.astype(float),
"capital_exp": raw.iloc[17, COLS].values.astype(float),
"fte": raw.iloc[18, COLS].values.astype(float),
})
# ── Section 2: Oil/Gas Monitoring & Inspections (rows 20-31) ────────────────
ogi = pd.DataFrame({
"year": YEARS,
"strategy": "Oil/Gas Monitoring & Inspections",
"total_budget": raw.iloc[2, COLS].values.astype(float),
"salaries": raw.iloc[20, COLS].values.astype(float),
"other_personnel": raw.iloc[21, COLS].values.astype(float),
"professional_fees": raw.iloc[22, COLS].values.astype(float),
"travel": raw.iloc[26, COLS].values.astype(float),
"other_operating": raw.iloc[29, COLS].values.astype(float),
"capital_exp": raw.iloc[30, COLS].values.astype(float),
"fte": raw.iloc[31, COLS].values.astype(float),
})
budget_long = pd.concat([erd, ogi], ignore_index=True)
print(f"Budget long: {len(budget_long)} rows (2 strategies × {len(YEARS)} years)")
budget_long
Budget long: 18 rows (2 strategies × 9 years)
| | year | strategy | total_budget | salaries | other_personnel | professional_fees | travel | other_operating | capital_exp | fte |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2016 | Energy Resource Development | 11,708,475.00 | 7,669,719.00 | 398,589.00 | 3,366,389.00 | 16,477.00 | 210,293.00 | 0.00 | 130.60 |
| 1 | 2017 | Energy Resource Development | 10,911,094.00 | 7,273,775.00 | 389,348.00 | 3,118,066.00 | 6,792.00 | 77,855.00 | 0.00 | 120.30 |
| 2 | 2018 | Energy Resource Development | 9,846,886.00 | 7,292,933.00 | 282,337.00 | 977,645.00 | 28,694.00 | 1,045,727.00 | 0.00 | 131.00 |
| 3 | 2019 | Energy Resource Development | 11,123,757.00 | 8,068,497.00 | 217,988.00 | 1,493,755.00 | 73,651.00 | 988,740.00 | 13,232.00 | 137.40 |
| 4 | 2020 | Energy Resource Development | 17,280,569.00 | 9,707,894.00 | 236,356.00 | 5,989,236.00 | 41,752.00 | 1,165,481.00 | 54,037.00 | 153.40 |
| 5 | 2021 | Energy Resource Development | 16,237,704.00 | 10,887,561.00 | 237,777.00 | 3,562,816.00 | 5,614.00 | 1,446,301.00 | 10,140.00 | 168.10 |
| 6 | 2022 | Energy Resource Development | 25,583,205.00 | 11,166,309.00 | 246,340.00 | 12,560,550.00 | 37,731.00 | 1,246,443.00 | 19,985.00 | 157.10 |
| 7 | 2023 | Energy Resource Development | 26,903,564.00 | 11,056,060.00 | 252,933.00 | 12,846,821.00 | 56,650.00 | 2,287,481.00 | 48,344.00 | 151.30 |
| 8 | 2024 | Energy Resource Development | 35,533,565.00 | 13,183,578.00 | 229,161.00 | 15,140,585.00 | 144,641.00 | 6,425,653.00 | 0.00 | 186.00 |
| 9 | 2016 | Oil/Gas Monitoring & Inspections | 18,471,666.00 | 15,080,122.00 | 685,768.00 | 1,546,321.00 | 22,630.00 | 208,311.00 | 121,363.00 | 256.70 |
| 10 | 2017 | Oil/Gas Monitoring & Inspections | 17,204,058.00 | 15,086,262.00 | 686,194.00 | 176,786.00 | 19,654.00 | 230,525.00 | 272,461.00 | 249.50 |
| 11 | 2018 | Oil/Gas Monitoring & Inspections | 17,562,431.00 | 13,083,406.00 | 430,429.00 | 1,147,080.00 | 57,312.00 | 1,040,639.00 | 649,172.00 | 229.90 |
| 12 | 2019 | Oil/Gas Monitoring & Inspections | 21,951,747.00 | 14,878,875.00 | 340,135.00 | 2,895,436.00 | 187,048.00 | 1,185,772.00 | 1,255,930.00 | 255.60 |
| 13 | 2020 | Oil/Gas Monitoring & Inspections | 26,057,560.00 | 17,228,302.00 | 417,683.00 | 4,822,351.00 | 106,428.00 | 1,398,705.00 | 896,846.00 | 284.00 |
| 14 | 2021 | Oil/Gas Monitoring & Inspections | 28,756,689.00 | 17,155,864.00 | 426,139.00 | 8,212,873.00 | 34,762.00 | 1,394,783.00 | 230,439.00 | 277.80 |
| 15 | 2022 | Oil/Gas Monitoring & Inspections | 25,914,265.00 | 17,834,460.00 | 391,138.00 | 4,007,178.00 | 154,334.00 | 1,255,945.00 | 694,706.00 | 264.00 |
| 16 | 2023 | Oil/Gas Monitoring & Inspections | 34,330,858.00 | 18,622,389.00 | 457,753.00 | 8,945,350.00 | 149,418.00 | 2,428,330.00 | 2,234,623.00 | 271.20 |
| 17 | 2024 | Oil/Gas Monitoring & Inspections | 38,506,556.00 | 20,834,721.00 | 361,687.00 | 8,851,915.00 | 316,806.00 | 4,112,998.00 | 2,659,208.00 | 280.80 |
# ── Wide budget: one row per year with ogi_ / erd_ prefixed columns ──────────
ogi_wide = ogi.drop(columns="strategy").add_prefix("ogi_")
erd_wide = erd.drop(columns="strategy").add_prefix("erd_")
budget_wide = (
ogi_wide
.merge(erd_wide, left_on="ogi_year", right_on="erd_year")
.rename(columns={"ogi_year": "year"})
.drop(columns="erd_year")
)
# ── Merge inspections + violations, then join statewide budget on year ────────
panel = (
insp
.merge(viol, on=["district", "year"], how="left")
.merge(budget_wide, on="year", how="left")
)
# ── Derived columns ───────────────────────────────────────────────────────────
panel["violations_per_inspection"] = panel["total_violations"] / panel["total_inspections"]
panel["ogi_budget_m"] = panel["ogi_total_budget"] / 1_000_000 # dollars → millions
panel["erd_budget_m"] = panel["erd_total_budget"] / 1_000_000
panel["post_2019"] = (panel["year"] >= 2019).astype(int)
# 2024 = budget estimate; 2025 = no budget data — exclude both from regressions
panel["is_budget_year"] = (panel["year"] >= 2024).astype(int)
# Goal ambiguity: share of combined budget going to the inspection mission.
# Higher share = clearer mission focus; lower share = more goal ambiguity.
panel["inspection_budget_share"] = (
panel["ogi_total_budget"] / (panel["ogi_total_budget"] + panel["erd_total_budget"])
)
# Fill violation NaNs for districts with zero violations in a given year
fill_cols = [
"total_violations", "unique_wells_with_violations", "major_violations",
"resolution_rate", "enforcement_rate", "avg_days_to_enforcement",
"violations_per_inspection",
]
panel[fill_cols] = panel[fill_cols].fillna(0)
print(f"Analysis panel: {len(panel):,} rows | "
f"{panel['district'].nunique()} districts | "
f"{panel['year'].nunique()} years")
panel.head()
Analysis panel: 130 rows | 13 districts | 10 years
| | district | year | total_inspections | unique_wells | compliance_rate | avg_days_between_inspections | total_violations | unique_wells_with_violations | major_violations | resolution_rate | ... | erd_travel | erd_other_operating | erd_capital_exp | erd_fte | violations_per_inspection | ogi_budget_m | erd_budget_m | post_2019 | is_budget_year | inspection_budget_share |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 01 | 2016 | 13975 | 4055 | 69.42 | 18.90 | 5720 | 1009 | 0 | 21.42 | ... | 16,477.00 | 210,293.00 | 0.00 | 130.60 | 0.41 | 18.47 | 11.71 | 0 | 0 | 0.61 |
| 1 | 01 | 2017 | 18022 | 6153 | 83.52 | 56.80 | 4380 | 767 | 0 | 44.36 | ... | 6,792.00 | 77,855.00 | 0.00 | 120.30 | 0.24 | 17.20 | 10.91 | 0 | 0 | 0.61 |
| 2 | 01 | 2018 | 23826 | 9109 | 85.61 | 53.50 | 5766 | 997 | 0 | 64.46 | ... | 28,694.00 | 1,045,727.00 | 0.00 | 131.00 | 0.24 | 17.56 | 9.85 | 0 | 0 | 0.64 |
| 3 | 01 | 2019 | 19790 | 6447 | 84.97 | 79.80 | 3593 | 902 | 4 | 49.37 | ... | 73,651.00 | 988,740.00 | 13,232.00 | 137.40 | 0.18 | 21.95 | 11.12 | 1 | 0 | 0.66 |
| 4 | 01 | 2020 | 26006 | 8716 | 85.52 | 122.90 | 4838 | 1019 | 5 | 27.43 | ... | 41,752.00 | 1,165,481.00 | 54,037.00 | 153.40 | 0.19 | 26.06 | 17.28 | 1 | 0 | 0.60 |
5 rows × 34 columns
2. Exploratory Overview¶
# Year-level means across districts
yearly = panel.groupby("year").agg(
ogi_budget_m = ("ogi_budget_m", "first"),
ogi_fte = ("ogi_fte", "first"),
total_inspections = ("total_inspections", "mean"),
compliance_rate = ("compliance_rate", "mean"),
total_violations = ("total_violations", "mean"),
resolution_rate = ("resolution_rate", "mean"),
avg_days_to_enf = ("avg_days_to_enforcement","mean"),
).round(2)
print(yearly.to_string())
fig, axes = plt.subplots(2, 3, figsize=(16, 8))
axes = axes.flatten()
yearly["ogi_budget_m"].plot(ax=axes[0], marker="o", title="OGI Budget ($M)")
axes[0].yaxis.set_major_formatter(mticker.FuncFormatter(lambda x, _: f"${x:.0f}M"))
yearly["ogi_fte"].plot(ax=axes[1], marker="o", title="OGI FTE Positions")
yearly["total_inspections"].plot(ax=axes[2], marker="o", title="Avg Inspections / District")
yearly["compliance_rate"].plot(ax=axes[3], marker="o", title="Avg Compliance Rate (%)")
yearly["resolution_rate"].plot(ax=axes[4], marker="o", title="Avg Resolution Rate (%)")
yearly["avg_days_to_enf"].plot(ax=axes[5], marker="o", title="Avg Days to Enforcement")
for ax in axes:
ax.axvline(2019, color="red", linestyle="--", alpha=0.5, label="2019 policy")
ax.set_xlabel("Year")
plt.tight_layout()
plt.show()
| year | ogi_budget_m | ogi_fte | total_inspections | compliance_rate | total_violations | resolution_rate | avg_days_to_enf |
|---|---|---|---|---|---|---|---|
| 2016 | 18.47 | 256.70 | 18,277.85 | 83.11 | 3,398.15 | 36.78 | 131.86 |
| 2017 | 17.20 | 249.50 | 20,138.54 | 86.52 | 2,915.69 | 59.02 | 185.01 |
| 2018 | 17.56 | 229.90 | 25,703.54 | 90.17 | 3,197.62 | 59.46 | 207.25 |
| 2019 | 21.95 | 255.60 | 25,058.46 | 89.85 | 2,550.77 | 61.44 | 170.36 |
| 2020 | 26.06 | 284.00 | 27,669.46 | 89.57 | 2,750.92 | 56.81 | 154.66 |
| 2021 | 28.76 | 277.80 | 24,115.54 | 88.76 | 2,556.38 | 66.18 | 118.82 |
| 2022 | 25.91 | 264.00 | 32,023.54 | 89.82 | 2,819.92 | 67.85 | 91.50 |
| 2023 | 34.33 | 271.20 | 33,805.69 | 91.62 | 2,598.62 | 69.65 | 105.15 |
| 2024 | 38.51 | 280.80 | 36,552.77 | 92.58 | 2,221.15 | 65.13 | 76.93 |
| 2025 | NaN | NaN | 34,082.08 | 90.52 | 2,530.38 | 52.06 | 36.62 |
corr_cols = [
"ogi_budget_m", "ogi_fte", "inspection_budget_share",
"total_inspections", "compliance_rate",
"total_violations", "resolution_rate", "avg_days_to_enforcement",
]
corr = panel[corr_cols].corr().round(2)
fig, ax = plt.subplots(figsize=(9, 7))
im = ax.imshow(corr, cmap="RdBu_r", vmin=-1, vmax=1)
ax.set_xticks(range(len(corr_cols)))
ax.set_yticks(range(len(corr_cols)))
ax.set_xticklabels(corr_cols, rotation=45, ha="right", fontsize=9)
ax.set_yticklabels(corr_cols, fontsize=9)
for i in range(len(corr_cols)):
for j in range(len(corr_cols)):
ax.text(j, i, corr.iloc[i, j], ha="center", va="center", fontsize=8)
plt.colorbar(im, ax=ax)
ax.set_title("Correlation Matrix — Key Variables")
plt.tight_layout()
plt.show()
Data and Methods¶
Data Sources¶
This study draws on two primary data sources. The first is the Texas Railroad Commission
(RRC) Oil and Gas Division administrative database. Inspection records span fiscal years 2016–2025 and encompass approximately
1.9 million inspection events distributed across 13 RRC administrative districts;
violation records include approximately 193,000 enforcement actions. From the inspections
table, district-year aggregates are constructed for three regulatory output measures:
(1) compliance rate — the share of annual inspections in a district that did not result
in a compliance failure; (2) total inspections — the count of field inspection events;
and (3) average days between successive inspections of the same well, computed via a
SQL window function (LAG) over ordered inspection timestamps. From the violations table,
district-year aggregates include the violation resolution rate (share of violations
for which the operator was found compliant on re-inspection), enforcement rate, and average
days from violation discovery to enforcement action.
The second source is RRC budget data drawn from Legislative Appropriations Requests, covering fiscal years 2016–2024. Budget appropriations are reported at the statewide level disaggregated by goal and strategy. Two strategies are central to this analysis: (1) Oil and Gas Monitoring and Inspections (OGI), which directly funds field inspection operations; and (2) Energy Resource Development (ERD), encompassing the broader mandate to promote oil and gas resource opportunities. For each strategy, the data include total appropriations, salaries, professional fees, travel, other operating expenditures, capital outlays, and authorized full-time equivalent (FTE) positions. Fiscal year 2024 represents a budget estimate rather than expenditure actuals and is excluded from all regression models.
Sample and Panel Construction¶
The unit of analysis is the district-year. The analytic panel contains N = 130 observations (13 districts × 10 years, 2016–2025), of which 104 observations (2016–2023) constitute the regression sample. Fiscal years 2024 (budget estimate) and 2025 (no budget data available) are retained in descriptive analyses but excluded from all regression models. Because inspection and enforcement activity in 2025 represents a partial year as of the data extract, enforcement-timing metrics for that year are subject to right-censoring: violations discovered in late 2024 and 2025 may not yet have received a recorded enforcement action, compressing observed days-to-enforcement. Because RRC budget appropriations are reported at the statewide level, budget and FTE variables enter the panel as year-varying but district-invariant covariates. Identification of budget effects therefore relies on year-to-year variation in statewide appropriations rather than cross-district budget contrasts.
Measures¶
Dependent variables. Three measures capture distinct dimensions of regulatory output: total inspections (inspection volume), compliance rate (%), and violation resolution rate (%). Compliance rate and resolution rate capture quality of enforcement rather than quantity and represent different points in the regulatory pipeline: compliance is measured at the point of inspection while resolution is measured after a violation has been discovered and acted upon.
Organizational capacity. The primary capacity measure is OGI total appropriations in millions of dollars ($\text{Budget}_t$), reflecting the statewide resource envelope available for inspection activities in year $t$. An auxiliary measure — OGI authorized FTE positions — is included in descriptive analyses.
Goal ambiguity. Following Chun and Rainey (2005), goal ambiguity is operationalized via the relative concentration of resources across missions. The inspection budget share ($\text{Share}_t$) captures the fraction of combined OGI and ERD appropriations directed toward the inspection mandate:
$$\text{Share}_t = \frac{\text{OGI Budget}_t}{\text{OGI Budget}_t + \text{ERD Budget}_t}$$
Higher values indicate greater mission clarity (resources more concentrated on inspections); lower values indicate greater goal ambiguity (resources spread across competing mandates). Over the regression period (2016–2023), $\text{Share}_t$ ranged from roughly 0.50 (2022) to 0.66 (2019), reflecting meaningful year-to-year variation in budgetary prioritization.
Geographic moderators. Two binary district-level indicators capture geographic context: $\text{Offshore}_d = 1$ for districts 02, 03, and 04, which hold dual onshore and offshore oversight jurisdiction, and $\text{Border}_d = 1$ for districts 01–04, which are proximate to the Texas Gulf Coast and the US–Mexico border corridor.
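These two indicators can be built directly on the panel; the sketch below is illustrative (the helper name is not from the notebook, but the district groupings follow the definitions above):

```python
import pandas as pd

# District groupings as defined in the Measures section.
OFFSHORE_DISTRICTS = {"02", "03", "04"}       # dual onshore/offshore jurisdiction
BORDER_DISTRICTS = {"01", "02", "03", "04"}   # Gulf Coast / border corridor

def add_geo_moderators(panel: pd.DataFrame) -> pd.DataFrame:
    """Attach Offshore_d and Border_d dummies keyed on the zero-padded
    district code used throughout the panel."""
    out = panel.copy()
    out["offshore"] = out["district"].isin(OFFSHORE_DISTRICTS).astype(int)
    out["border"] = out["district"].isin(BORDER_DISTRICTS).astype(int)
    return out
```

Because both dummies are time-invariant, their level effects are absorbed by the district fixed effects in the H4 model; only the budget × geography interactions remain separately identified.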
Estimation Strategy¶
All models are estimated via ordinary least squares (OLS) with standard errors clustered at the district level ($G = 13$) to account for within-district serial correlation. District fixed effects absorb time-invariant heterogeneity across offices — including differences in geographic complexity, historical enforcement culture, and staffing composition — and ensure that budget effects are identified from within-district, year-to-year variation.
H1 — Baseline capacity model:
$$Y_{dt} = \alpha + \beta_1 \, \text{Budget}_t + \sum_{d} \gamma_d \, \mathbf{1}[\text{district} = d] + \varepsilon_{dt}$$
where $Y_{dt}$ is the regulatory output for district $d$ in year $t$, $\gamma_d$ are district fixed effects, and $\varepsilon_{dt}$ is the idiosyncratic error.
H2 — Goal ambiguity moderation:
$$Y_{dt} = \alpha + \beta_1 \, \text{Budget}_t + \beta_2 \, \text{Share}_t + \beta_3 \left( \text{Budget}_t \times \text{Share}_t \right) + \sum_{d} \gamma_d + \varepsilon_{dt}$$
The coefficient $\beta_3$ tests whether goal clarity conditions the capacity–output relationship. A positive $\hat{\beta}_3$ would indicate that clearer mission focus amplifies budget effects; a negative value would suggest diminishing returns or cross-strategy resource substitution.
H3 — District slope heterogeneity:
$$Y_{dt} = \alpha + \beta_1 \, \text{Budget}_t + \sum_{d=2}^{D} \delta_d \left( \text{Budget}_t \times \mathbf{1}[d] \right) + \sum_{d} \gamma_d + \varepsilon_{dt}$$
District-specific budget slopes are recovered as $\hat{\beta}_1 + \hat{\delta}_d$. Because budget varies only along the time dimension and district fixed effects are included, interaction term standard errors are inflated by near-perfect multicollinearity; these estimates are treated as descriptive indicators of heterogeneity only.
H4 — Geographic moderation and spatial autocorrelation:
$$Y_{dt} = \alpha + \beta_1 \, \text{Budget}_t + \beta_2 \, \text{Offshore}_d + \beta_3 \, \text{Border}_d + \beta_4 \left( \text{Budget}_t \times \text{Offshore}_d \right) + \beta_5 \left( \text{Budget}_t \times \text{Border}_d \right) + \sum_{d} \gamma_d + \varepsilon_{dt}$$
Robustness checks. Two supplementary tests address limitations of the baseline models. First, wild cluster bootstrap inference (Rademacher weights, $B = 999$ draws; Cameron, Gelbach & Miller 2008) is used to re-test H1 coefficients, providing valid p-values with the small number of clusters ($G = 13$). Second, a distributed lag specification replaces the contemporaneous budget measure with its one-year lag ($\text{Budget}_{t-1}$), and also estimates a model including both, to test whether budget effects operate with a delay consistent with a hiring-and-deployment mechanism. The distributed lag regression sample covers 2017–2023 ($N = 91$).
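The bootstrap-t procedure can be sketched as follows. This is an illustrative implementation of the Cameron–Gelbach–Miller recipe, not the notebook's own code; the function name and the simplified formula handling (additive right-hand-side terms only) are assumptions:

```python
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

def wild_cluster_boot_p(df, formula, term, cluster_col, B=999, seed=0):
    """Wild cluster bootstrap p-value for H0: coefficient on `term` == 0.

    Residuals from the null-imposed (restricted) model are flipped with
    cluster-level Rademacher weights, and the cluster-robust t-statistic
    for `term` is recomputed on each synthetic sample.
    """
    rng = np.random.default_rng(seed)
    dv = formula.split("~")[0].strip()

    # Observed t-statistic from the unrestricted, cluster-robust fit.
    full = smf.ols(formula, data=df).fit(
        cov_type="cluster", cov_kwds={"groups": df[cluster_col]})
    t_obs = full.tvalues[term]

    # Restricted fit: same formula with `term` removed (H0 imposed).
    rhs = [x.strip() for x in formula.split("~")[1].split("+")
           if x.strip() != term]
    restricted = smf.ols(f"{dv} ~ {' + '.join(rhs)}", data=df).fit()
    fitted, resid = restricted.fittedvalues, restricted.resid

    clusters = df[cluster_col].values
    uniq = np.unique(clusters)
    t_boot = np.empty(B)
    for b in range(B):
        w = rng.choice([-1.0, 1.0], size=len(uniq))  # Rademacher weights
        wmap = dict(zip(uniq, w))
        y_star = fitted + resid * np.vectorize(wmap.get)(clusters)
        fit_b = smf.ols(formula, data=df.assign(**{dv: y_star})).fit(
            cov_type="cluster", cov_kwds={"groups": df[cluster_col]})
        t_boot[b] = fit_b.tvalues[term]
    return float((np.abs(t_boot) >= abs(t_obs)).mean())
```

With G = 13 clusters, the bootstrap distribution of t replaces the asymptotic normal reference, which is the source of the more conservative p-values reported in the Robustness Checks section.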
Spatial autocorrelation in H1 model residuals is assessed via Moran's $I$ computed on a row-normalized inverse-distance spatial weights matrix constructed from district centroids derived by averaging well-level geographic coordinates within each district.
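Moran's $I$ on a row-normalized inverse-distance weights matrix reduces to a few NumPy operations; a minimal sketch (the function name is illustrative, and the inputs are assumed to be district-level residual means and centroid coordinates):

```python
import numpy as np
from scipy.spatial.distance import cdist

def morans_i(values, coords):
    """Moran's I with a row-normalized inverse-distance weights matrix.

    values : (n,) district-level quantities (e.g. mean H1 residuals)
    coords : (n, 2) district centroid coordinates
    """
    x = np.asarray(values, dtype=float)
    n = len(x)
    d = cdist(coords, coords)
    with np.errstate(divide="ignore"):
        w = 1.0 / d                      # inverse distance
    np.fill_diagonal(w, 0.0)             # no self-neighbors
    w = w / w.sum(axis=1, keepdims=True) # row-normalize (rows sum to 1)
    z = x - x.mean()
    # With row-normalized W, the n / sum(W) scaling factor equals 1.
    return float((n / w.sum()) * (z @ w @ z) / (z @ z))
```

Values near zero indicate no spatial clustering of residuals; significantly positive values would suggest omitted spatial processes or cross-district spillovers.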
Analysis¶
This study employs a fixed-effects panel regression framework to examine whether year-to-year changes in RRC organizational capacity — as measured by statewide budget appropriations — translate into improvements in regulatory outputs across Texas oil and gas inspection districts. The analytic panel spans 13 RRC districts over ten fiscal years (2016–2025), yielding 130 district-year observations. Regression analyses are restricted to 2016–2023 (N = 104), excluding FY2024 (budget estimate only) and FY2025 (no budget data available). The identification strategy leverages within-district variation in outcomes as a function of year-to-year shifts in statewide OGI appropriations, net of persistent inter-district differences absorbed by district fixed effects.
The choice of a district-year panel rather than a well-level panel is motivated by the structure of the budget data, which is available only at the statewide level. Because the key independent variable — OGI appropriations — varies along the time dimension only, it functions as a common, year-specific exposure applied uniformly to all districts. District fixed effects then absorb unobservable office-level characteristics that remain stable over the study period, such as geographic complexity, historical enforcement intensity, and local administrative capacity. Causal identification is thus predicated on the assumption that, absent changes in budget, within-district outcome trajectories would have followed parallel trends across years — an assumption that cannot be directly tested but is partially supported by the pre-period stability visible in the descriptive trends.
H1 tests the core capacity hypothesis using the baseline specification. Each of the three dependent variables — total inspections, compliance rate, and violation resolution rate — is regressed separately on OGI budget (in millions of dollars) and district fixed effects. Cluster-robust standard errors are used throughout given the modest number of clusters ($G = 13$).
H2 extends the baseline by interacting OGI budget with the inspection budget share, operationalizing goal ambiguity as the degree to which RRC appropriations are concentrated on the inspection mandate versus the broader energy development mission. The sign and significance of the interaction term $\beta_3$ determines whether goal clarity amplifies or attenuates the capacity–output relationship.
H3 tests for heterogeneity in budget–outcome slopes across districts by including budget $\times$ district interaction terms. Given only eight years of data per district, the saturated interaction model is estimated with approximately zero residual degrees of freedom for the fixed-effects component; as a result, interaction-term standard errors are unreliable and these estimates are reported as exploratory indicators of cross-district variation rather than inferential tests. The accompanying bar chart (below) summarizes district-specific slopes as point estimates.
H4 assesses whether offshore-jurisdiction and border-proximate districts — which face distinct operational environments — exhibit different budget sensitivity. The model adds geographic level effects and budget $\times$ geography interaction terms to the baseline specification. A complementary spatial diagnostic — Moran's $I$ applied to the residuals from the H1 compliance model — tests for geographic clustering of unexplained outcome variation that could indicate omitted spatial processes or spillovers across district boundaries.
All regressions exclude fiscal years 2024 (budget estimate) and 2025 (no budget data), retaining 2016–2023 as the regression sample (N = 104). The extended panel through 2025 is used for descriptive trend analysis only. Enforcement-timing metrics for 2025 should be interpreted cautiously: because the data extract covers a partial year, violations discovered in late 2024 and 2025 may not yet have a recorded enforcement action, artificially compressing observed days-to-enforcement and resolution rates for that year.
Two supplementary robustness checks address key inferential limitations. First, wild cluster bootstrap inference (Rademacher, B = 999) re-tests H1 with valid small-sample p-values given G = 13 clusters. Second, a distributed lag specification tests whether budget effects operate with a one-year delay, consistent with a hiring-and-deployment implementation timeline. Results from both checks are reported following the main hypothesis tests.
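Because the budget varies only by year, the one-year lag can be built from the year-level series and merged back, rather than shifted within districts; a hedged sketch assuming the `panel` DataFrame from Section 1 (the helper name is illustrative):

```python
import pandas as pd

def add_budget_lag(panel: pd.DataFrame) -> pd.DataFrame:
    """Attach Budget_{t-1} (lagged statewide OGI budget, $M) to each
    district-year row. The lag is constructed on the deduplicated
    year-level series because ogi_budget_m is district-invariant."""
    by_year = (panel[["year", "ogi_budget_m"]]
               .drop_duplicates()
               .sort_values("year"))
    by_year["ogi_budget_m_lag1"] = by_year["ogi_budget_m"].shift(1)
    return panel.merge(by_year[["year", "ogi_budget_m_lag1"]],
                       on="year", how="left")
```

The first sample year has no lag, which is why the distributed lag regression sample shrinks to 2017–2023 (N = 91).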
H1: Organizational Capacity → Policy Outputs¶
Prediction: Higher OGI budget predicts more inspections, higher compliance rates, and faster violation resolution.
Model: OLS with district fixed effects, 2016–2023 (N = 104). Budget varies only over time, identifying effects via year-to-year changes in statewide OGI appropriations; district fixed effects absorb persistent cross-district differences. Standard errors clustered at the district level (G = 13).
Finding (preview): All three outcomes show positive, statistically significant budget coefficients under asymptotic inference. Wild cluster bootstrap results (reported in the Robustness Checks section) indicate these asymptotic p-values overstate precision; results should be interpreted as suggestive rather than definitive.
actuals = panel[panel["is_budget_year"] == 0].copy()
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
actuals.plot.scatter(x="ogi_budget_m", y="total_inspections",
alpha=0.4, ax=axes[0], title="Budget → Inspections")
actuals.plot.scatter(x="ogi_budget_m", y="compliance_rate",
alpha=0.4, ax=axes[1], title="Budget → Compliance Rate (%)")
actuals.plot.scatter(x="ogi_budget_m", y="resolution_rate",
alpha=0.4, ax=axes[2], title="Budget → Resolution Rate (%)")
for ax in axes:
ax.set_xlabel("OGI Budget ($M)")
plt.tight_layout()
plt.show()
m_inspections = smf.ols(
"total_inspections ~ ogi_budget_m + C(district)",
data=actuals,
).fit(cov_type="cluster", cov_kwds={"groups": actuals["district"]})
m_compliance = smf.ols(
"compliance_rate ~ ogi_budget_m + C(district)",
data=actuals,
).fit(cov_type="cluster", cov_kwds={"groups": actuals["district"]})
m_resolution = smf.ols(
"resolution_rate ~ ogi_budget_m + C(district)",
data=actuals,
).fit(cov_type="cluster", cov_kwds={"groups": actuals["district"]})
# Detect actual column names — statsmodels uses z/P>|z| with robust SEs in some versions
_tbl = m_inspections.summary2().tables[1]
_t = "t" if "t" in _tbl.columns else "z"
_p = "P>|t|" if "P>|t|" in _tbl.columns else "P>|z|"
display_cols = ["Coef.", "Std.Err.", _t, _p]
print("H1a — OGI Budget ($M) → Total Inspections")
print(m_inspections.summary2().tables[1][display_cols].loc[["ogi_budget_m"]])
print(f" R² = {m_inspections.rsquared:.3f} Adj. R² = {m_inspections.rsquared_adj:.3f}\n")
print("H1b — OGI Budget ($M) → Compliance Rate (%)")
print(m_compliance.summary2().tables[1][display_cols].loc[["ogi_budget_m"]])
print(f" R² = {m_compliance.rsquared:.3f} Adj. R² = {m_compliance.rsquared_adj:.3f}\n")
print("H1c — OGI Budget ($M) → Resolution Rate (%)")
print(m_resolution.summary2().tables[1][display_cols].loc[["ogi_budget_m"]])
print(f" R² = {m_resolution.rsquared:.3f} Adj. R² = {m_resolution.rsquared_adj:.3f}")
H1a — OGI Budget ($M) → Total Inspections
Coef. Std.Err. z P>|z|
ogi_budget_m 666.30 212.98 3.13 0.00
R² = 0.769 Adj. R² = 0.736
H1b — OGI Budget ($M) → Compliance Rate (%)
Coef. Std.Err. z P>|z|
ogi_budget_m 0.26 0.11 2.31 0.02
R² = 0.538 Adj. R² = 0.471
H1c — OGI Budget ($M) → Resolution Rate (%)
Coef. Std.Err. z P>|z|
ogi_budget_m 1.05 0.32 3.28 0.00
R² = 0.624 Adj. R² = 0.569
H2: Goal Ambiguity Moderates Capacity Effects¶
Prediction: When a larger share of combined RRC budget flows to the broader
"Energy Resource Development" goal (lower inspection_budget_share), the capacity →
output link weakens. A positive interaction coefficient would support H2.
Operationalization:
inspection_budget_share = ogi_budget / (ogi_budget + erd_budget)
Identification note: Like the budget measure itself, inspection_budget_share
varies only over time, not across districts. The interaction term therefore exploits
the same narrow temporal variation as the main effect — budget share ranged from roughly
0.50 (2022) to 0.66 (2019) over 2016–2023, a span of about 16 percentage points across
eight years. This limits the strength of inference that can be drawn from the moderation test.
Finding (preview): The interaction is significant and negative ($\hat{\beta}_3 = -6.53$, $p < .01$), but interpretation is constrained by the identification limitations above. Results are discussed in the Results section.
m_h2 = smf.ols(
"compliance_rate ~ ogi_budget_m * inspection_budget_share + C(district)",
data=actuals,
).fit(cov_type="cluster", cov_kwds={"groups": actuals["district"]})
key_rows = ["ogi_budget_m", "inspection_budget_share", "ogi_budget_m:inspection_budget_share"]
print("H2 — Goal Ambiguity Moderation (DV: compliance_rate)")
print(m_h2.summary2().tables[1][display_cols].loc[key_rows])
print(f"\nR² = {m_h2.rsquared:.3f} Adj. R² = {m_h2.rsquared_adj:.3f}")
# ── Same model with resolution rate as DV ────────────────────────────────────
m_h2_res = smf.ols(
"resolution_rate ~ ogi_budget_m * inspection_budget_share + C(district)",
data=actuals,
).fit(cov_type="cluster", cov_kwds={"groups": actuals["district"]})
print("\nH2 — Goal Ambiguity Moderation (DV: resolution_rate)")
print(m_h2_res.summary2().tables[1][display_cols].loc[key_rows])
print(f"\nR² = {m_h2_res.rsquared:.3f} Adj. R² = {m_h2_res.rsquared_adj:.3f}")
H2 — Goal Ambiguity Moderation (DV: compliance_rate)
Coef. Std.Err. z P>|z|
ogi_budget_m 4.20 1.09 3.86 0.00
inspection_budget_share 170.18 44.79 3.80 0.00
ogi_budget_m:inspection_budget_share -6.53 1.84 -3.55 0.00
R² = 0.567 Adj. R² = 0.493
H2 — Goal Ambiguity Moderation (DV: resolution_rate)
Coef. Std.Err. z P>|z|
ogi_budget_m 6.68 4.67 1.43 0.15
inspection_budget_share 230.67 204.30 1.13 0.26
ogi_budget_m:inspection_budget_share -9.42 7.99 -1.18 0.24
R² = 0.629 Adj. R² = 0.566
H3: District Multilevel Effects¶
Prediction: The budget → output slope varies across RRC districts — some districts translate budget increases into better outputs more effectively than others.
Model: Interaction ogi_budget_m × C(district) — the reference district captures
the baseline budget slope; interaction terms show how each other district's slope
differs. Standard errors are unreliable due to near-perfect multicollinearity in the
saturated model (budget varies only over time while district FE absorb cross-sectional
variation); results are treated as descriptive point estimates only.
Finding (preview): District slopes for compliance rate range from −0.34 pp per \$1M (District 03, Houston/Coastal) to +1.36 pp per \$1M (District 6E, East Texas Piney Woods), with most districts showing small positive slopes. The bar chart below plots district-specific slope estimates.
m_h3 = smf.ols(
"compliance_rate ~ ogi_budget_m * C(district)",
data=actuals,
).fit(cov_type="cluster", cov_kwds={"groups": actuals["district"]})
coef_table = m_h3.summary2().tables[1]
# Baseline budget slope (reference district)
baseline_row = coef_table.loc[["ogi_budget_m"]]
print("H3 — District-Heterogeneous Budget Effect (DV: compliance_rate)")
print(f"Baseline (reference district) budget slope:")
print(baseline_row[display_cols])
# District-specific deviations from baseline
interaction_rows = coef_table[coef_table.index.str.contains("ogi_budget_m:C")]
print("\nDistrict interaction terms (deviation from reference slope):")
print(interaction_rows[display_cols].round(4))
print(f"\nR² = {m_h3.rsquared:.3f} Adj. R² = {m_h3.rsquared_adj:.3f}")
# ── Plot district-specific budget slopes ─────────────────────────────────────
districts = actuals["district"].unique()
slopes = {}
for d in districts:
    key = f"ogi_budget_m:C(district)[T.{d}]"
    base = m_h3.params.get("ogi_budget_m", 0)
    delta = m_h3.params.get(key, 0)
    slopes[str(d)] = base + delta
slope_df = pd.Series(slopes).sort_values()
fig, ax = plt.subplots(figsize=(10, 4))
slope_df.plot.barh(ax=ax, color=["#d62728" if v < 0 else "#1f77b4" for v in slope_df])
ax.axvline(0, color="black", linewidth=0.8)
ax.set_xlabel("Budget slope (compliance rate pp per $M)")
ax.set_title("H3 — District-Specific Budget → Compliance Slopes")
plt.tight_layout()
plt.show()
H3 — District-Heterogeneous Budget Effect (DV: compliance_rate)
Baseline (reference district) budget slope:
Coef. Std.Err. z P>|z|
ogi_budget_m 0.09 0.00 56,876,193,472,228.37 0.00
District interaction terms (deviation from reference slope):
Coef. Std.Err. z P>|z|
ogi_budget_m:C(district)[T.02] 0.15 0.00 22,633,237,551,336.32 0.00
ogi_budget_m:C(district)[T.03] -0.43 0.00 -59,804,100,493,329.36 0.00
ogi_budget_m:C(district)[T.04] 0.19 0.00 78,131,153,896,367.78 0.00
ogi_budget_m:C(district)[T.05] -0.04 0.00 -23,701,820,832,698.50 0.00
ogi_budget_m:C(district)[T.06] 0.34 0.00 60,365,540,001,288.30 0.00
ogi_budget_m:C(district)[T.08] 0.19 0.00 10,356,376,563,126.46 0.00
ogi_budget_m:C(district)[T.09] -0.09 0.00 -14,544,886,315,847.22 0.00
ogi_budget_m:C(district)[T.10] 0.04 0.00 5,748,033,218,673.02 0.00
ogi_budget_m:C(district)[T.6E] 1.27 0.00 64,743,648,722,385.09 0.00
ogi_budget_m:C(district)[T.7B] 0.18 0.00 27,978,802,690,136.84 0.00
ogi_budget_m:C(district)[T.7C] 0.31 0.00 24,243,474,173,332.52 0.00
ogi_budget_m:C(district)[T.8A] 0.10 0.00 59,702,739,775,453.20 0.00
R² = 0.662 Adj. R² = 0.554
H4: Spatial and Geographic Factors¶
Predictions:
- Offshore-jurisdiction districts (02, 03, 04) show a different budget → output relationship due to dual onshore/offshore oversight burden.
- Border-proximate districts show a different relationship due to cross-jurisdiction enforcement complexity.
- Spatial autocorrelation in H1 residuals (Moran's I) would indicate unmodeled geographic spillovers.
Finding (preview): Offshore and border districts show significantly higher baseline compliance rates (+7.6 pp and +6.0 pp respectively, both $p < .05$) but not different budget sensitivity. Moran's $I = -0.051$ indicates slight spatial dispersion and no significant geographic clustering of residuals. Results are discussed in the Results section.
# Texas RRC district geography flags (based on known RRC district locations)
OFFSHORE_DISTRICTS = {"02", "03", "04"} # dual onshore + offshore jurisdiction
BORDER_DISTRICTS = {"01", "02", "03", "04"} # south / gulf coast proximity
actuals = actuals.copy()
actuals["district_str"] = actuals["district"].astype(str).str.strip()
actuals["offshore"] = actuals["district_str"].isin(OFFSHORE_DISTRICTS).astype(int)
actuals["border"] = actuals["district_str"].isin(BORDER_DISTRICTS).astype(int)
print("District classification:")
print(
actuals.groupby(["district_str", "offshore", "border"])
.size()
.reset_index(name="district_year_obs")
.to_string(index=False)
)
District classification:
district_str offshore border district_year_obs
01 0 1 8
02 1 1 8
03 1 1 8
04 1 1 8
05 0 0 8
06 0 0 8
08 0 0 8
09 0 0 8
10 0 0 8
6E 0 0 8
7B 0 0 8
7C 0 0 8
8A 0 0 8
# ── Spatial regression: offshore and border interactions ─────────────────────
m_h4 = smf.ols(
"compliance_rate ~ ogi_budget_m + offshore + border "
"+ ogi_budget_m:offshore + ogi_budget_m:border + C(district)",
data=actuals,
).fit(cov_type="cluster", cov_kwds={"groups": actuals["district"]})
spatial_rows = [
"ogi_budget_m", "offshore", "border",
"ogi_budget_m:offshore", "ogi_budget_m:border",
]
available = [r for r in spatial_rows if r in m_h4.params.index]
print("H4 — Spatial Moderators (DV: compliance_rate)")
print(m_h4.summary2().tables[1][display_cols].loc[available])
print(f"\nR² = {m_h4.rsquared:.3f} Adj. R² = {m_h4.rsquared_adj:.3f}")
# ── Moran's I on H1 residuals ─────────────────────────────────────────────────
# Compute district centroids from well lat/lon joined via inspections
centroids_sql = """
SELECT
i.district,
AVG(w.latitude) AS lat,
AVG(w.longitude) AS lon
FROM inspections i
JOIN well_shape_tract w USING (api_norm)
WHERE w.latitude IS NOT NULL
AND w.longitude IS NOT NULL
AND i.district IS NOT NULL
GROUP BY i.district
"""
try:
    centroids = pd.read_sql(text(centroids_sql), engine)
    # Average H1 compliance residuals to district level
    resid_df = actuals[["district", "compliance_rate"]].copy()
    resid_df["resid"] = m_compliance.resid.reindex(actuals.index).values
    resid_by_district = resid_df.groupby("district")["resid"].mean().reset_index()
    centroids = centroids.merge(resid_by_district, on="district").dropna()
    # Row-normalised inverse-distance weights matrix
    coords = centroids[["lon", "lat"]].values
    D = cdist(coords, coords)
    np.fill_diagonal(D, np.inf)
    W = 1 / D
    W = W / W.sum(axis=1, keepdims=True)
    z = centroids["resid"].values
    z = z - z.mean()
    n = len(z)
    morans_i = (n / W.sum()) * (z @ W @ z) / (z @ z)
    print(f"\nMoran's I on H1 compliance residuals = {morans_i:.4f}")
    print(" > 0 → residuals cluster spatially (similar neighbours)")
    print(" ≈ 0 → no spatial pattern")
    print(" < 0 → spatial dispersion (dissimilar neighbours)")
    print("\nDistrict centroids used:")
    print(centroids[["district", "lat", "lon"]].round(2).to_string(index=False))
except Exception as e:
    print(f"Moran's I skipped: {e}")
H4 — Spatial Moderators (DV: compliance_rate)
Coef. Std.Err. z P>|z|
ogi_budget_m 0.35 0.15 2.39 0.02
offshore 7.61 3.29 2.31 0.02
border 6.03 2.84 2.12 0.03
ogi_budget_m:offshore -0.03 0.18 -0.16 0.87
ogi_budget_m:border -0.25 0.15 -1.74 0.08
R² = 0.553 Adj. R² = 0.476
Moran's I on H1 compliance residuals = -0.0512
> 0 → residuals cluster spatially (similar neighbours)
≈ 0 → no spatial pattern
< 0 → spatial dispersion (dissimilar neighbours)
District centroids used:
district lat lon
01 29.15 -98.62
02 28.85 -97.41
03 30.12 -95.43
04 27.44 -98.36
05 31.85 -96.15
06 32.29 -94.67
08 31.84 -102.30
09 33.42 -98.22
10 35.77 -101.02
6E 32.40 -94.89
7B 32.75 -99.40
7C 31.11 -101.26
8A 33.12 -102.06
Results¶
Descriptive Trends¶
Table 1 summarizes year-level means for the key variables across 2016–2025, with regression analyses restricted to 2016–2023. OGI appropriations grew from $18.47 million in 2016 to $34.33 million in 2023 — an 86 percent nominal increase — with the FY2024 budget estimate reaching $38.51 million. Authorized FTE positions rose modestly from 256.7 to 271.2 over the same period. Inspection volume per district increased from a mean of 18,278 in 2016 to a peak of 36,553 in 2024, with a partial-year figure of 34,082 recorded for 2025. Mean district compliance rate improved from 83.1 percent in 2016 to a peak of 92.6 percent in 2024, with a slight moderation to 90.5 percent in the 2025 partial-year extract. Violation resolution rate rose from 36.8 percent in 2016 to 69.7 percent in 2023 before declining to 52.1 percent in 2025; this decline almost certainly reflects right-censoring rather than a genuine deterioration in enforcement outcomes, as recently discovered violations will not yet have received a recorded resolution on re-inspection. Similarly, the 2025 days-to-enforcement figure of 36.6 days should be interpreted as a lower bound on the true enforcement timeline for that cohort of violations. These trends are broadly consistent with the organizational capacity hypothesis, though they are also consistent with secular improvements in industry compliance independent of budget growth.
Table 1. Year-Level Panel Means, 2016–2025
| Year | OGI Budget ($M) | OGI FTE | Inspections/District | Compliance Rate (%) | Resolution Rate (%) | Days to Enforcement |
|---|---|---|---|---|---|---|
| 2016 | 18.47 | 256.7 | 18,278 | 83.1 | 36.8 | 131.9 |
| 2017 | 17.20 | 249.5 | 20,139 | 86.5 | 59.0 | 185.0 |
| 2018 | 17.56 | 229.9 | 25,704 | 90.2 | 59.5 | 207.3 |
| 2019 | 21.95 | 255.6 | 25,058 | 89.9 | 61.4 | 170.4 |
| 2020 | 26.06 | 284.0 | 27,669 | 89.6 | 56.8 | 154.7 |
| 2021 | 28.76 | 277.8 | 24,116 | 88.8 | 66.2 | 118.8 |
| 2022 | 25.91 | 264.0 | 32,024 | 89.8 | 67.9 | 91.5 |
| 2023 | 34.33 | 271.2 | 33,806 | 91.6 | 69.7 | 105.2 |
| 2024† | 38.51 | 280.8 | 36,553 | 92.6 | 65.1 | 76.9 |
| 2025‡ | — | — | 34,082 | 90.5 | 52.1 | 36.6‡ |
Note: Budget figures are nominal. FTE = authorized full-time equivalent positions. Inspections/District = mean district-level annual inspection count. † 2024 budget is an appropriations estimate, not expenditure actuals; excluded from regression models. ‡ 2025 data is partial-year as of the data extract. Resolution rate and days-to-enforcement are right-censored: violations discovered in late 2024–2025 may not yet have a recorded enforcement action, compressing these metrics.
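The right-censoring mechanic noted above can be illustrated with a toy cohort (hypothetical dates, not RRC records): violations discovered shortly before the extract date have had little follow-up time, so the naive cohort resolution rate is mechanically depressed.

```python
import pandas as pd

# Hypothetical violations: discovery date and resolution date (NaT = unresolved
# as of the extract). Dates are invented for illustration only.
v = pd.DataFrame({
    "found": pd.to_datetime(["2023-03-01", "2023-07-15", "2025-04-10", "2025-05-20"]),
    "resolved": pd.to_datetime(["2023-09-01", "2024-01-10", None, None]),
})
v["year"] = v["found"].dt.year

# Naive resolution rate by discovery year: share with a recorded resolution.
rate = v.groupby("year")["resolved"].apply(lambda s: s.notna().mean())
print(rate)
# The recent cohort looks worse purely because it is right-censored, not
# because enforcement deteriorated.
```

The 2025-style cohort here shows zero resolutions only because it has had months, not years, of follow-up — the same compression affecting the bottom rows of Table 1.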
H1: Organizational Capacity and Regulatory Outputs¶
The baseline fixed-effects models provide consistent support for H1 across all three dependent variables (Table 2). Each additional million dollars in OGI appropriations is associated with approximately 666 additional district-level inspections per year ($\hat{\beta} = 666.30$, SE = 212.98, $z = 3.13$, $p < .01$; $R^2 = .769$). The budget coefficient is also positive and significant for compliance rate ($\hat{\beta} = 0.26$ percentage points per \$1M, SE = 0.11, $z = 2.31$, $p = .02$; $R^2 = .538$) and violation resolution rate ($\hat{\beta} = 1.05$ percentage points per \$1M, SE = 0.32, $z = 3.28$, $p < .01$; $R^2 = .624$). These associations are estimated net of district fixed effects and therefore reflect within-district covariation between annual budget changes and outcome changes rather than cross-sectional differences between better- and worse-funded districts.
Table 2. H1 Regression Results: OGI Budget → Regulatory Outputs
| Dependent Variable | $\hat{\beta}$ (Budget $M) | SE | $z$ | $p$ | $R^2$ | Adj. $R^2$ |
|---|---|---|---|---|---|---|
| Total inspections | 666.30 | 212.98 | 3.13 | <.01 | .769 | .736 |
| Compliance rate (%) | 0.26 | 0.11 | 2.31 | .02 | .538 | .471 |
| Resolution rate (%) | 1.05 | 0.32 | 3.28 | <.01 | .624 | .569 |
Note: All models include district fixed effects ($D = 13$). Standard errors clustered at the district level. $N = 104$.
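The within-district interpretation can be verified mechanically: by the Frisch–Waugh–Lovell theorem, the budget slope from a dummy-variable fixed-effects regression equals the slope from OLS on district-demeaned variables. A sketch on synthetic data (not the RRC panel; the district count and coefficients are arbitrary):

```python
import numpy as np

rng = np.random.default_rng(0)
n_dist, n_years = 4, 8
district = np.repeat(np.arange(n_dist), n_years)
budget = rng.normal(25.0, 5.0, n_dist * n_years)
alpha = np.array([80.0, 85.0, 90.0, 75.0])[district]     # district intercepts
compliance = alpha + 0.3 * budget + rng.normal(0.0, 1.0, n_dist * n_years)

# (1) Dummy-variable fixed effects: regress on budget + district dummies.
D = np.eye(n_dist)[district]                  # one-hot district matrix
X = np.column_stack([budget, D])
beta_fe = np.linalg.lstsq(X, compliance, rcond=None)[0][0]

# (2) Within transformation: demean outcome and budget by district.
def demean(v):
    means = np.bincount(district, weights=v) / np.bincount(district)
    return v - means[district]

num = demean(compliance) @ demean(budget)
den = demean(budget) @ demean(budget)
beta_within = num / den

print(beta_fe, beta_within)   # identical up to floating-point error
```

Both estimators recover the same within-district slope, which is why the H1 coefficients cannot be driven by level differences between districts.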
H2: Goal Ambiguity as a Moderator¶
The goal ambiguity moderation model for compliance rate (Table 3) yields a statistically significant and negative interaction between OGI budget and inspection budget share ($\hat{\beta}_3 = -6.53$, SE = 1.84, $z = -3.55$, $p < .01$). However, this result requires careful qualification before any mechanism is claimed.
The key issue is that inspection_budget_share — like the budget measure itself —
varies only over time, not across districts. All 13 districts experience the same
budget share in any given year, ranging from 0.59 (FY2022) to 0.67 (FY2018) across
the study period — a span of 8 percentage points over 8 observations. The interaction
term is therefore identified from the same narrow temporal variation as the main budget
effect, not from cross-district differences in mission structure. This makes it
difficult to distinguish a genuine moderation relationship from a spurious correlation
with year-specific factors that independently affected both budget share and compliance
outcomes in the same years.
The negative sign is consistent with at least two interpretations. Under a resource saturation story, compliance gains from additional OGI investment diminish as the inspection mandate becomes better resourced relative to other RRC goals — a plausible ceiling effect if districts are already operating near full compliance in high-share years. Alternatively, the result may simply reflect that FY2018 — the highest-share year — saw particularly large compliance gains for reasons unrelated to budget concentration (e.g., post-2016 industry recovery, early implementation of regulatory changes). Evaluated at mean budget share ($\bar{s} \approx 0.62$), the implied marginal budget effect on compliance is $4.20 - 6.53 \times 0.62 \approx 0.15$ pp per \$1M — directionally consistent with H1 but smaller.
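The marginal-effect arithmetic can be made explicit (a sketch hard-coding the Table 3 point estimates; `marginal_budget_effect` is an illustrative helper, not part of the analysis code):

```python
# H2 point estimates for the compliance-rate model (Table 3).
b_budget = 4.20       # main OGI budget effect (pp per $1M)
b_interact = -6.53    # budget x inspection_budget_share interaction

def marginal_budget_effect(share):
    """Implied effect of +$1M OGI budget on compliance rate (pp) at a given share."""
    return b_budget + b_interact * share

print(round(marginal_budget_effect(0.62), 2))   # sample mean share → 0.15
print(round(marginal_budget_effect(0.59), 2))   # sample minimum (FY2022)
print(round(marginal_budget_effect(0.67), 2))   # sample maximum (FY2018) → sign flips
```

Note that at the observed maximum share (0.67) the implied effect turns slightly negative — a direct consequence of the narrow 0.59–0.67 range over which the moderator is identified.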
For violation resolution rate, no terms reach conventional significance (all $p > .15$). Given the identification constraints, the H2 compliance finding is best treated as an exploratory pattern consistent with goal ambiguity theory — one that motivates future research with district-level budget variation — rather than a robust confirmatory test.
Table 3. H2 Regression Results: Goal Ambiguity Moderation (DV: Compliance Rate)
| Term | $\hat{\beta}$ | SE | $z$ | $p$ |
|---|---|---|---|---|
| Budget ($M) | 4.20 | 1.09 | 3.86 | <.01 |
| Inspection budget share | 170.18 | 44.79 | 3.80 | <.01 |
| Budget × Share | −6.53 | 1.84 | −3.55 | <.01 |
Note: District fixed effects included. SE clustered at district. $R^2 = .567$, Adj. $R^2 = .493$. $N = 104$.
H3: District-Level Heterogeneity¶
District-specific budget slopes for compliance rate range from $-0.34$ percentage points per \$1 million (District 03, Coastal/Greater Houston) to $+1.36$ percentage points (District 6E, East Texas Piney Woods), with most districts showing small positive slopes (Table 4). The reference district (District 01, San Antonio) slope is 0.09 pp per \$1M. Positive slopes are most pronounced in District 6E (+1.36), District 06 (+0.43), and District 7C (+0.40); District 03 is the only district with a substantially negative slope. The model $R^2$ of .662 modestly exceeds the baseline H1 value (.538), consistent with meaningful cross-district slope heterogeneity. Standard errors for the interaction terms are not reported, as they are unreliable due to near-perfect multicollinearity in the saturated model (see Data and Methods); point estimates are presented as descriptive indicators only.
Table 4. H3 District-Specific Budget → Compliance Slopes (pp per $1M)
| District | Estimated Slope |
|---|---|
| 01 (San Antonio) | 0.09 |
| 02 (Corpus Christi) | 0.24 |
| 03 (Houston) | −0.34 |
| 04 (Laredo) | 0.28 |
| 05 (Midland/Abilene) | 0.05 |
| 06 (Kilgore) | 0.43 |
| 08 (Midland) | 0.28 |
| 09 (Wichita Falls) | 0.00 |
| 10 (Amarillo) | 0.13 |
| 6E (Kilgore East) | 1.36 |
| 7B (Abilene) | 0.27 |
| 7C (Big Spring) | 0.40 |
| 8A (Lubbock) | 0.19 |
Note: Slopes are $\hat{\beta}_1 + \hat{\delta}_d$ from the H3 interaction model.
H4: Spatial and Geographic Factors¶
The geographic moderation model (Table 5) reveals that offshore-jurisdiction districts (02, 03, 04) exhibit compliance rates approximately 7.6 percentage points higher than non-offshore districts on average, net of budget ($\hat{\beta} = 7.61$, SE = 3.29, $z = 2.31$, $p = .02$). Border-proximate districts similarly show elevated baseline compliance rates (+6.03 pp, SE = 2.84, $z = 2.12$, $p = .03$). These level effects may reflect the heightened external scrutiny — from federal regulators, environmental organizations, and media — that offshore and border districts attract, which could independently drive compliance investments by operators regardless of RRC budget levels.
The budget–compliance slope, however, does not differ significantly between offshore and non-offshore districts ($\hat{\beta}_4 = -0.03$, $p = .87$), nor between border and non-border districts at conventional thresholds ($\hat{\beta}_5 = -0.25$, $p = .08$), suggesting that geographic classification affects the level of compliance performance but not the degree to which additional budget translates into compliance gains.
Moran's $I$ computed on district-level residuals from the H1 compliance model is $I = -0.051$, indicating slight spatial dispersion but no statistically significant spatial autocorrelation. This finding is consistent with prior district-level analysis of this regulatory system and suggests that unmodeled geographic spillovers are not a material source of omitted variable bias in the panel models.
Table 5. H4 Regression Results: Geographic Moderation (DV: Compliance Rate)
| Term | $\hat{\beta}$ | SE | $z$ | $p$ |
|---|---|---|---|---|
| Budget ($M) | 0.35 | 0.15 | 2.39 | .02 |
| Offshore (= 1) | 7.61 | 3.29 | 2.31 | .02 |
| Border (= 1) | 6.03 | 2.84 | 2.12 | .03 |
| Budget × Offshore | −0.03 | 0.18 | −0.16 | .87 |
| Budget × Border | −0.25 | 0.15 | −1.74 | .08 |
Note: District fixed effects included. SE clustered at district. $R^2 = .553$, Adj. $R^2 = .476$. $N = 104$. Moran's $I$ on H1 compliance residuals = −0.051 (no significant spatial autocorrelation).
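The notebook reports only the Moran's $I$ point estimate; a permutation test is one standard way to attach a pseudo p-value to the "no significant autocorrelation" claim. A sketch on synthetic coordinates (not the RRC centroids), reusing the same inverse-distance weighting:

```python
import numpy as np
from scipy.spatial.distance import cdist

def morans_i(z, W):
    """Moran's I for values z under weight matrix W."""
    z = z - z.mean()
    return (len(z) / W.sum()) * (z @ W @ z) / (z @ z)

def morans_perm_pvalue(z, W, n_perm=999, seed=0):
    """Pseudo p-value: share of shuffled-label draws at least as extreme as observed."""
    rng = np.random.default_rng(seed)
    i_obs = morans_i(z, W)
    draws = np.array([morans_i(rng.permutation(z), W) for _ in range(n_perm)])
    return i_obs, (np.sum(np.abs(draws) >= np.abs(i_obs)) + 1) / (n_perm + 1)

# Synthetic stand-in: 13 random "centroids" and spatially unstructured residuals.
rng = np.random.default_rng(42)
coords = rng.uniform(0.0, 10.0, size=(13, 2))
D = cdist(coords, coords)
np.fill_diagonal(D, np.inf)
W = 1.0 / D
W = W / W.sum(axis=1, keepdims=True)   # row-normalised, as in the notebook
resid = rng.normal(size=13)

i_obs, p = morans_perm_pvalue(resid, W)
print(f"Moran's I = {i_obs:.3f}, permutation p = {p:.3f}")
```

With only 13 spatial units the permutation null is coarse, but it is still more defensible than eyeballing the statistic against zero (whose expectation under the null is $-1/(n-1) \approx -0.083$, not 0).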
Summary¶
Taken together, the results offer moderate support for a resource-capacity model of regulatory performance. Higher OGI appropriations are reliably associated with greater inspection volume, higher compliance rates, and faster violation resolution — though identification rests on temporal variation in statewide appropriations rather than quasi-experimental assignment, and the modest panel length limits statistical precision. Goal ambiguity moderation operates through a diminishing-returns mechanism: compliance gains from additional budget are smaller in years when the inspection mandate receives a larger share of combined appropriations, consistent with resource saturation rather than amplification. District heterogeneity in budget–outcome slopes is substantial in descriptive terms but cannot be precisely estimated with the available data. Finally, geographic context — offshore jurisdiction and border proximity — predicts compliance levels but not budget sensitivity, and spatial autocorrelation diagnostics provide no evidence of unmodeled geographic spillover processes.
Robustness Checks¶
Wild cluster bootstrap. With only $G = 13$ district clusters, asymptotic cluster-robust standard errors may substantially understate true uncertainty. Wild cluster bootstrap inference (Rademacher weights, $B = 999$ draws; Cameron, Gelbach & Miller 2008) yields bootstrap p-values near 0.49–0.51 for all three H1 outcomes: total inspections ($p_{boot} = 0.494$), compliance rate ($p_{boot} = 0.473$), and resolution rate ($p_{boot} = 0.509$). These are far from any conventional significance threshold, in stark contrast to the asymptotic p-values of 0.002, 0.021, and 0.001. The divergence indicates that with $G = 13$ clusters, asymptotic inference markedly overstates precision. The H1 point estimates remain positive and directionally consistent, but the results do not survive bootstrap-based inference. This is the principal inferential limitation of the study.
Table 7. Wild Cluster Bootstrap vs. Asymptotic p-values (H1 Models, B = 999)
| Outcome | $t$-statistic | $p$ (asymptotic) | $p$ (bootstrap) |
|---|---|---|---|
| Total inspections | 3.13 | .002 | .494 |
| Compliance rate | 2.31 | .021 | .473 |
| Resolution rate | 3.28 | .001 | .509 |
Note: Bootstrap p-values based on 999 Rademacher wild cluster bootstrap draws. Small number of clusters (G = 13) renders asymptotic inference unreliable.
Distributed lag model. The distributed lag models test whether budget effects operate with a one-year delay consistent with a hiring-and-deployment mechanism. For compliance rate, the lagged budget alone is not significant ($\hat{\beta}_{t-1} = 0.10$, $p = .44$; Model A, N = 91), and in the combined model the contemporaneous term remains significant at the 5 percent level ($\hat{\beta}_t = 0.24$, $p = .04$) while the lagged term is negative and non-significant ($\hat{\beta}_{t-1} = -0.14$, $p = .12$; Model B). For violation resolution rate, the lagged budget is marginally significant when estimated alone ($\hat{\beta}_{t-1} = 0.83$, $p = .09$; Model A), but neither term reaches conventional significance in the combined model ($p = .22$ and $p = .14$).
These findings provide little support for a delayed implementation mechanism. The persistence of contemporaneous effects alongside non-significant lagged terms is more consistent with an immediate budget–output relationship. However, the N = 91 sample offers limited power to disentangle contemporaneous and lagged effects that are highly collinear over an eight-year window.
Table 8. Distributed Lag Results (2017–2023, N = 91)
| Model | DV | $\hat{\beta}_t$ | $p$ | $\hat{\beta}_{t-1}$ | $p$ | $R^2$ |
|---|---|---|---|---|---|---|
| A — Lag only | Compliance rate | — | — | 0.10 | .44 | .543 |
| B — Both | Compliance rate | 0.24 | .04 | −0.14 | .12 | .569 |
| A — Lag only | Resolution rate | — | — | 0.83 | .09 | .696 |
| B — Both | Resolution rate | 0.24 | .22 | 0.59 | .14 | .698 |
Note: District fixed effects included; SE clustered at district.
Robustness Checks¶
Two checks address limitations of the baseline H1 models.
Wild cluster bootstrap re-tests H1 with valid small-sample inference rather than asymptotic cluster-robust standard errors. With $G = 13$ clusters, asymptotic results can overstate precision. Rademacher wild cluster bootstrap ($B = 999$ draws; Cameron, Gelbach & Miller 2008) yields p-values near 0.49–0.51 for all three H1 outcomes — far from any conventional threshold — indicating that the asymptotic H1 results do not survive this correction. Point estimates remain positive and substantively consistent in direction, but the study lacks the cluster count required to establish significance through bootstrap inference.
Distributed lag model relaxes the assumption that budget effects are instantaneous. A one-year lag of OGI budget is estimated alone (Model A) and jointly with the contemporaneous term (Model B), over the 2017–2023 sample (N = 91). The lagged budget is not independently significant for compliance rate ($p = .44$) and only marginally so for resolution rate ($p = .09$). In the combined models, contemporaneous effects persist while lagged terms do not attain significance — providing little evidence that a delayed mechanism dominates an immediate one.
# Wild cluster bootstrap (Rademacher weights, B=999)
# For each draw: multiply each cluster's residuals by ±1, re-fit, and record the
# centred statistic t* = (beta* - beta_hat) / se*.
# p-value = share of |t*| >= |t_observed|.
def wild_cluster_bootstrap(model, data, dv, cluster_col="district",
                           coef="ogi_budget_m", B=999, seed=42):
    rng = np.random.default_rng(seed)
    groups = data[cluster_col].values
    unique_groups = np.unique(groups)
    t_obs = model.tvalues[coef]
    b_obs = model.params[coef]
    yhat = model.fittedvalues.values
    ehat = model.resid.values
    t_boot = np.empty(B)
    for b in range(B):
        # One Rademacher weight per cluster, broadcast to observations
        cw = {g: rng.choice([-1.0, 1.0]) for g in unique_groups}
        w = np.array([cw[g] for g in groups])
        df_b = data.copy()
        df_b[dv] = yhat + ehat * w
        m_b = smf.ols(
            f"{dv} ~ {coef} + C({cluster_col})", data=df_b
        ).fit(cov_type="cluster", cov_kwds={"groups": df_b[cluster_col]})
        # Centre the bootstrap t on the observed coefficient: the bootstrap DGP
        # embeds beta_hat, so an uncentred t* would sit near t_obs by
        # construction and push every p-value towards 0.5.
        t_boot[b] = (m_b.params[coef] - b_obs) / m_b.bse[coef]
    p_boot = float((np.abs(t_boot) >= np.abs(t_obs)).mean())
    return t_obs, float(model.pvalues[coef]), p_boot
print("Wild Cluster Bootstrap — H1 Models (B = 999 Rademacher draws)")
print(f"{'Outcome':<28} {'t-stat':>7} {'p asymptotic':>13} {'p bootstrap':>12}")
print("─" * 65)
for dv, model in [
    ("total_inspections", m_inspections),
    ("compliance_rate", m_compliance),
    ("resolution_rate", m_resolution),
]:
    t, p_a, p_b = wild_cluster_bootstrap(model, actuals, dv)
    # Significance stars: * p<.10, ** p<.05, *** p<.01 (no star above .10)
    sig_a = "*" * ((p_a < .10) + (p_a < .05) + (p_a < .01))
    sig_b = "*" * ((p_b < .10) + (p_b < .05) + (p_b < .01))
    print(f"{dv:<28} {t:>7.3f} {p_a:>12.3f}{sig_a:<3} {p_b:>10.3f}{sig_b:<3}")
print("\n* p<.10 ** p<.05 *** p<.01")
Wild Cluster Bootstrap — H1 Models (B = 999 Rademacher draws)
Outcome                       t-stat  p asymptotic  p bootstrap
─────────────────────────────────────────────────────────────────
total_inspections              3.128        0.002***      0.494
compliance_rate                2.307        0.021**       0.473
resolution_rate                3.277        0.001***      0.509

* p<.10 ** p<.05 *** p<.01
# Distributed lag: 1-year lag of OGI budget (shift within district).
# Lag is NaN for 2016 (no 2015 data), so regression sample is 2017-2023 (N=91).
panel_lag = panel.copy()
panel_lag["ogi_budget_m_lag1"] = (
panel_lag.sort_values("year")
.groupby("district")["ogi_budget_m"]
.shift(1)
)
lag_actuals = panel_lag[
(panel_lag["is_budget_year"] == 0) &
(panel_lag["ogi_budget_m_lag1"].notna())
].copy()
print(f"Distributed lag sample: {len(lag_actuals)} obs | "
f"years {lag_actuals['year'].min()}–{lag_actuals['year'].max()}")
# ── Model A: lagged budget only ───────────────────────────────────────────────
m_lag_only = smf.ols(
"compliance_rate ~ ogi_budget_m_lag1 + C(district)", data=lag_actuals
).fit(cov_type="cluster", cov_kwds={"groups": lag_actuals["district"]})
# ── Model B: contemporaneous + 1-year lag ────────────────────────────────────
m_lag_both = smf.ols(
"compliance_rate ~ ogi_budget_m + ogi_budget_m_lag1 + C(district)",
data=lag_actuals
).fit(cov_type="cluster", cov_kwds={"groups": lag_actuals["district"]})
# ── Also run for resolution rate ──────────────────────────────────────────────
m_lag_res_only = smf.ols(
"resolution_rate ~ ogi_budget_m_lag1 + C(district)", data=lag_actuals
).fit(cov_type="cluster", cov_kwds={"groups": lag_actuals["district"]})
m_lag_res_both = smf.ols(
"resolution_rate ~ ogi_budget_m + ogi_budget_m_lag1 + C(district)",
data=lag_actuals
).fit(cov_type="cluster", cov_kwds={"groups": lag_actuals["district"]})
print("\n── Compliance Rate ───────────────────────────────────────────")
print("\nModel A — Lagged budget only (t−1):")
print(m_lag_only.summary2().tables[1][display_cols].loc[["ogi_budget_m_lag1"]])
print(f" R² = {m_lag_only.rsquared:.3f} Adj. R² = {m_lag_only.rsquared_adj:.3f}")
print("\nModel B — Contemporaneous + 1-year lag:")
print(m_lag_both.summary2().tables[1][display_cols].loc[
["ogi_budget_m", "ogi_budget_m_lag1"]
])
print(f" R² = {m_lag_both.rsquared:.3f} Adj. R² = {m_lag_both.rsquared_adj:.3f}")
print("\n── Resolution Rate ───────────────────────────────────────────")
print("\nModel A — Lagged budget only (t−1):")
print(m_lag_res_only.summary2().tables[1][display_cols].loc[["ogi_budget_m_lag1"]])
print(f" R² = {m_lag_res_only.rsquared:.3f} Adj. R² = {m_lag_res_only.rsquared_adj:.3f}")
print("\nModel B — Contemporaneous + 1-year lag:")
print(m_lag_res_both.summary2().tables[1][display_cols].loc[
["ogi_budget_m", "ogi_budget_m_lag1"]
])
print(f" R² = {m_lag_res_both.rsquared:.3f} Adj. R² = {m_lag_res_both.rsquared_adj:.3f}")
Distributed lag sample: 91 obs | years 2017–2023
── Compliance Rate ───────────────────────────────────────────
Model A — Lagged budget only (t−1):
Coef. Std.Err. z P>|z|
ogi_budget_m_lag1 0.10 0.13 0.77 0.44
R² = 0.543 Adj. R² = 0.466
Model B — Contemporaneous + 1-year lag:
Coef. Std.Err. z P>|z|
ogi_budget_m 0.24 0.11 2.08 0.04
ogi_budget_m_lag1 -0.14 0.09 -1.55 0.12
R² = 0.569 Adj. R² = 0.490
── Resolution Rate ───────────────────────────────────────────
Model A — Lagged budget only (t−1):
Coef. Std.Err. z P>|z|
ogi_budget_m_lag1 0.83 0.49 1.69 0.09
R² = 0.696 Adj. R² = 0.644
Model B — Contemporaneous + 1-year lag:
Coef. Std.Err. z P>|z|
ogi_budget_m 0.24 0.19 1.22 0.22
ogi_budget_m_lag1 0.59 0.40 1.46 0.14
R² = 0.698 Adj. R² = 0.642
Hypotheses Summary¶
Table 6. Summary of Hypotheses, Predictions, Findings, and Empirical Support
| # | Hypothesis | Prediction | Key Result | Support |
|---|---|---|---|---|
| H1a | Capacity → Inspection volume | Higher OGI budget predicts more inspections per district | β = 666.3/$1M (z = 3.13, p < .01); bootstrap p = .494 | ✓† |
| H1b | Capacity → Compliance | Higher OGI budget predicts higher district compliance rate | β = 0.26 pp/$1M (z = 2.31, p = .02); bootstrap p = .473 | ✓† |
| H1c | Capacity → Resolution | Higher OGI budget predicts higher violation resolution rate | β = 1.05 pp/$1M (z = 3.28, p < .01); bootstrap p = .509 | ✓† |
| H2a | Goal ambiguity moderates capacity → compliance | Clearer inspection focus amplifies budget effect | Significant but negative (β = −6.53, z = −3.55, p < .01); interpretation constrained by time-only variation in budget share (range: 0.59–0.67) | Exploratory‡ |
| H2b | Goal ambiguity moderates capacity → resolution | Clearer inspection focus amplifies budget effect | Interaction not significant (p = .24) | ✗ |
| H3 | District heterogeneity in budget slopes | Budget → compliance slope varies across districts | Slopes from −0.34 pp/\$1M (D03) to +1.36 pp/\$1M (D6E); inference unreliable | Descriptive§ |
| H4a | Offshore jurisdiction moderates budget effect | Offshore districts show different budget → compliance slope | Level effect: +7.6 pp (p = .02); slope interaction not significant (p = .87) | Partial¶ |
| H4b | Border proximity moderates budget effect | Border districts show different budget → compliance slope | Level effect: +6.0 pp (p = .03); slope interaction marginal (p = .08) | Partial¶ |
| H4c | Spatial autocorrelation in residuals | Geographic spillovers produce clustered residuals | Moran's I = −0.051; no significant spatial autocorrelation | ✗ |
Notes:
† H1 point estimates are positive and directionally consistent across all three outcomes, supporting the capacity hypothesis substantively. However, wild cluster bootstrap inference (B = 999 Rademacher draws) yields p-values near 0.49–0.51 for all outcomes, indicating that asymptotic cluster-robust standard errors substantially overstate precision with G = 13 clusters. H1 findings should be interpreted as suggestive rather than statistically definitive. Distributed lag models (2017–2023, N = 91) show contemporaneous effects persist while lagged terms do not reach significance, providing no clear evidence for a delayed implementation mechanism.
‡ H2a is statistically significant but the identification is weak: inspection budget share varies only over time (like the budget itself), with a range of just 0.59–0.67 across 8 years. The negative interaction is consistent with a resource saturation effect but cannot be distinguished from year-specific confounders. At mean share (≈ 0.62), the implied marginal budget effect is ≈ 0.15 pp per $1M. H2b not significant for resolution rate. Both H2 findings are best treated as exploratory patterns for future research.
§ H3 interaction standard errors are unreliable (near-perfect multicollinearity in the saturated model); budget slopes are reported as descriptive point estimates only.
¶ Geographic classification predicts compliance levels but not budget sensitivity. Offshore and border districts exhibit systematically higher compliance regardless of annual budget variation.
Regression sample: N = 104 (13 districts × 8 years, 2016–2023). All models include district fixed effects; standard errors clustered at the district level (G = 13). Robustness sample: N = 91 (2017–2023, distributed lag models).