texas-inspection-expenses/texas_inspection_expenses.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "dc6818b8",
   "metadata": {},
   "source": [
    "# Texas RRC Inspection Expenses Analysis\n",
    "\n",
    "**Research question:** Does organizational capacity (budget, staffing) predict better regulatory outputs (inspections, compliance, enforcement), and how is that relationship moderated by goal ambiguity, district-level heterogeneity, and spatial/geographic factors?\n",
    "\n",
    "## Hypotheses\n",
    "- **H1 — Capacity → Outputs:** Higher OGI budget and FTE predict more inspections, higher compliance rates, and faster violation resolution.\n",
    "- **H2 — Goal Ambiguity:** When a larger share of RRC budget goes to the more ambiguous \"Energy Resource Development\" goal, the capacity → output relationship weakens.\n",
    "- **H3 — Multilevel / District Effects:** The capacity → output relationship varies across RRC districts (budget slope heterogeneity).\n",
    "- **H4 — Spatial & Geographic:** Offshore-jurisdiction and border districts moderate the capacity → output relationship; spatial autocorrelation in residuals is tested via Moran's I.\n",
    "\n",
    "**Data:**\n",
    "- PostgreSQL warehouse (`texas_data`): `inspections`, `violations`, `well_shape_tract`\n",
    "- `RRC Budget Data.xlsx`: statewide RRC budget by strategy, 2016–2024\n",
    "- Analysis period: 2016–2023 (2024 is budget estimate, excluded from regressions)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "49de2b5c",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import warnings\n",
    "from pathlib import Path\n",
    "from urllib.parse import quote_plus\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import matplotlib.ticker as mticker\n",
    "import statsmodels.formula.api as smf\n",
    "from dotenv import load_dotenv\n",
    "from sqlalchemy import create_engine, text\n",
    "from scipy.spatial.distance import cdist\n",
    "\n",
    "warnings.filterwarnings(\"ignore\", category=UserWarning)\n",
    "pd.set_option(\"display.float_format\", \"{:,.2f}\".format)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "08420da3",
   "metadata": {},
   "outputs": [],
   "source": [
    "load_dotenv(override=False)\n",
    "\n",
    "host     = os.getenv(\"PGHOST\",     \"localhost\")\n",
    "port     = os.getenv(\"PGPORT\",     \"5432\")\n",
    "user     = os.getenv(\"PGUSER\",     \"postgres\")\n",
    "password = quote_plus(os.getenv(\"PGPASSWORD\", \"\"))\n",
    "database = os.getenv(\"PGDATABASE\", \"texas_data\")\n",
    "\n",
    "engine = create_engine(\n",
    "    f\"postgresql+psycopg2://{user}:{password}@{host}:{port}/{database}\"\n",
    ")\n",
    "print(f\"Connected → {database} on {host}:{port}\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b4e6c44f",
   "metadata": {},
   "source": [
    "## 1. Data Loading"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "43886f13",
   "metadata": {},
   "outputs": [],
   "source": [
    "# District-year inspection metrics aggregated in SQL.\n",
    "# LAG() computes days since the previous inspection for the same well (api_norm).\n",
    "insp_sql = \"\"\"\n",
    "WITH lagged AS (\n",
    "    SELECT\n",
    "        district,\n",
    "        EXTRACT(year FROM inspection_date)::int                                    AS year,\n",
    "        api_norm,\n",
    "        inspection_date,\n",
    "        CASE WHEN UPPER(compliance::text) IN ('YES', 'Y') THEN 1.0 ELSE 0.0 END   AS is_compliant,\n",
    "        EXTRACT(EPOCH FROM (\n",
    "            inspection_date\n",
    "            - LAG(inspection_date) OVER (PARTITION BY api_norm ORDER BY inspection_date)\n",
    "        )) / 86400.0                                                               AS days_since_prev\n",
    "    FROM inspections\n",
    "    WHERE inspection_date IS NOT NULL\n",
    "      AND district        IS NOT NULL\n",
    "      AND EXTRACT(year FROM inspection_date) BETWEEN 2016 AND 2024\n",
    ")\n",
    "SELECT\n",
    "    district,\n",
    "    year,\n",
    "    COUNT(*)                                       AS total_inspections,\n",
    "    COUNT(DISTINCT api_norm)                       AS unique_wells,\n",
    "    ROUND(AVG(is_compliant)::numeric * 100, 2)     AS compliance_rate,\n",
    "    ROUND(AVG(days_since_prev)::numeric, 1)        AS avg_days_between_inspections\n",
    "FROM lagged\n",
    "GROUP BY district, year\n",
    "ORDER BY district, year\n",
    "\"\"\"\n",
    "\n",
    "insp = pd.read_sql(text(insp_sql), engine)\n",
    "print(f\"Inspections panel: {len(insp):,} district-year rows | {insp['district'].nunique()} districts\")\n",
    "insp.head()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3841e2f5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# District-year violation metrics. Blank last_enf_action strings treated as no action.\n",
    "viol_sql = \"\"\"\n",
    "SELECT\n",
    "    district,\n",
    "    EXTRACT(year FROM violation_disc_date)::int                                        AS year,\n",
    "    COUNT(*)                                                                            AS total_violations,\n",
    "    COUNT(DISTINCT api_norm)                                                            AS unique_wells_with_violations,\n",
    "    SUM(CASE WHEN major_viol_ind = 'Y' THEN 1 ELSE 0 END)                             AS major_violations,\n",
    "    ROUND(AVG(CASE WHEN compliant_on_reinsp = 'Y' THEN 1.0 ELSE 0.0 END)::numeric * 100, 2)\n",
    "                                                                                        AS resolution_rate,\n",
    "    ROUND(AVG(CASE WHEN last_enf_action IS NOT NULL AND last_enf_action <> ''\n",
    "              THEN 1.0 ELSE 0.0 END)::numeric * 100, 2)                                AS enforcement_rate,\n",
    "    ROUND(AVG(\n",
    "        CASE WHEN last_enf_action_date IS NOT NULL\n",
    "        THEN EXTRACT(EPOCH FROM (last_enf_action_date - violation_disc_date)) / 86400.0\n",
    "        END\n",
    "    )::numeric, 1)                                                                      AS avg_days_to_enforcement\n",
    "FROM violations\n",
    "WHERE violation_disc_date IS NOT NULL\n",
    "  AND district             IS NOT NULL\n",
    "  AND EXTRACT(year FROM violation_disc_date) BETWEEN 2016 AND 2024\n",
    "GROUP BY district, year\n",
    "ORDER BY district, year\n",
    "\"\"\"\n",
    "\n",
    "viol = pd.read_sql(text(viol_sql), engine)\n",
    "print(f\"Violations panel: {len(viol):,} district-year rows\")\n",
    "viol.head()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9e196cac",
   "metadata": {},
   "outputs": [],
   "source": [
    "BUDGET_PATH = Path(\"RRC Budget Data.xlsx\")\n",
    "raw  = pd.read_excel(BUDGET_PATH, header=None)\n",
    "YEARS = [2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]\n",
    "COLS  = slice(1, 10)   # spreadsheet columns 1-9 map to years 2016-2024\n",
    "\n",
    "# ── Section 1: Energy Resource Development (rows 7-18) ──────────────────────\n",
    "erd = pd.DataFrame({\n",
    "    \"year\":              YEARS,\n",
    "    \"strategy\":          \"Energy Resource Development\",\n",
    "    \"total_budget\":      raw.iloc[1,  COLS].values.astype(float),\n",
    "    \"salaries\":          raw.iloc[7,  COLS].values.astype(float),\n",
    "    \"other_personnel\":   raw.iloc[8,  COLS].values.astype(float),\n",
    "    \"professional_fees\": raw.iloc[9,  COLS].values.astype(float),\n",
    "    \"travel\":            raw.iloc[13, COLS].values.astype(float),\n",
    "    \"other_operating\":   raw.iloc[16, COLS].values.astype(float),\n",
    "    \"capital_exp\":       raw.iloc[17, COLS].values.astype(float),\n",
    "    \"fte\":               raw.iloc[18, COLS].values.astype(float),\n",
    "})\n",
    "\n",
    "# ── Section 2: Oil/Gas Monitoring & Inspections (rows 20-31) ────────────────\n",
    "ogi = pd.DataFrame({\n",
    "    \"year\":              YEARS,\n",
    "    \"strategy\":          \"Oil/Gas Monitoring & Inspections\",\n",
    "    \"total_budget\":      raw.iloc[2,  COLS].values.astype(float),\n",
    "    \"salaries\":          raw.iloc[20, COLS].values.astype(float),\n",
    "    \"other_personnel\":   raw.iloc[21, COLS].values.astype(float),\n",
    "    \"professional_fees\": raw.iloc[22, COLS].values.astype(float),\n",
    "    \"travel\":            raw.iloc[26, COLS].values.astype(float),\n",
    "    \"other_operating\":   raw.iloc[29, COLS].values.astype(float),\n",
    "    \"capital_exp\":       raw.iloc[30, COLS].values.astype(float),\n",
    "    \"fte\":               raw.iloc[31, COLS].values.astype(float),\n",
    "})\n",
    "\n",
    "budget_long = pd.concat([erd, ogi], ignore_index=True)\n",
    "print(f\"Budget long: {len(budget_long)} rows  (2 strategies × {len(YEARS)} years)\")\n",
    "budget_long\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "896d152b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ── Wide budget: one row per year with ogi_ / erd_ prefixed columns ──────────\n",
    "ogi_wide = ogi.drop(columns=\"strategy\").add_prefix(\"ogi_\")\n",
    "erd_wide = erd.drop(columns=\"strategy\").add_prefix(\"erd_\")\n",
    "\n",
    "budget_wide = (\n",
    "    ogi_wide\n",
    "    .merge(erd_wide, left_on=\"ogi_year\", right_on=\"erd_year\")\n",
    "    .rename(columns={\"ogi_year\": \"year\"})\n",
    "    .drop(columns=\"erd_year\")\n",
    ")\n",
    "\n",
    "# ── Merge inspections + violations, then join statewide budget on year ────────\n",
    "panel = (\n",
    "    insp\n",
    "    .merge(viol, on=[\"district\", \"year\"], how=\"left\")\n",
    "    .merge(budget_wide, on=\"year\", how=\"left\")\n",
    ")\n",
    "\n",
    "# ── Derived columns ───────────────────────────────────────────────────────────\n",
    "panel[\"violations_per_inspection\"] = panel[\"total_violations\"] / panel[\"total_inspections\"]\n",
    "panel[\"ogi_budget_m\"]              = panel[\"ogi_total_budget\"] / 1_000_000   # dollars → millions\n",
    "panel[\"erd_budget_m\"]              = panel[\"erd_total_budget\"] / 1_000_000\n",
    "panel[\"post_2019\"]                 = (panel[\"year\"] >= 2019).astype(int)\n",
    "panel[\"is_budget_year\"]            = (panel[\"year\"] == 2024).astype(int)\n",
    "\n",
    "# Goal ambiguity: share of combined budget going to the inspection mission.\n",
    "# Higher share = clearer mission focus; lower share = more goal ambiguity.\n",
    "panel[\"inspection_budget_share\"] = (\n",
    "    panel[\"ogi_total_budget\"] / (panel[\"ogi_total_budget\"] + panel[\"erd_total_budget\"])\n",
    ")\n",
    "\n",
    "# Fill violation NaNs for districts with zero violations in a given year\n",
    "fill_cols = [\n",
    "    \"total_violations\", \"unique_wells_with_violations\", \"major_violations\",\n",
    "    \"resolution_rate\", \"enforcement_rate\", \"avg_days_to_enforcement\",\n",
    "    \"violations_per_inspection\",\n",
    "]\n",
    "panel[fill_cols] = panel[fill_cols].fillna(0)\n",
    "\n",
    "print(f\"Analysis panel: {len(panel):,} rows | \"\n",
    "      f\"{panel['district'].nunique()} districts | \"\n",
    "      f\"{panel['year'].nunique()} years\")\n",
    "panel.head()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3558bae7",
   "metadata": {},
   "source": [
    "## 2. Exploratory Overview"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "92921756",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Year-level means across districts\n",
    "yearly = panel.groupby(\"year\").agg(\n",
    "    ogi_budget_m          = (\"ogi_budget_m\",           \"first\"),\n",
    "    ogi_fte               = (\"ogi_fte\",                \"first\"),\n",
    "    total_inspections     = (\"total_inspections\",      \"mean\"),\n",
    "    compliance_rate       = (\"compliance_rate\",        \"mean\"),\n",
    "    total_violations      = (\"total_violations\",       \"mean\"),\n",
    "    resolution_rate       = (\"resolution_rate\",        \"mean\"),\n",
    "    avg_days_to_enf       = (\"avg_days_to_enforcement\",\"mean\"),\n",
    ").round(2)\n",
    "\n",
    "print(yearly.to_string())\n",
    "\n",
    "fig, axes = plt.subplots(2, 3, figsize=(16, 8))\n",
    "axes = axes.flatten()\n",
    "\n",
    "yearly[\"ogi_budget_m\"].plot(ax=axes[0], marker=\"o\", title=\"OGI Budget ($M)\")\n",
    "axes[0].yaxis.set_major_formatter(mticker.FuncFormatter(lambda x, _: f\"${x:.0f}M\"))\n",
    "\n",
    "yearly[\"ogi_fte\"].plot(ax=axes[1], marker=\"o\", title=\"OGI FTE Positions\")\n",
    "\n",
    "yearly[\"total_inspections\"].plot(ax=axes[2], marker=\"o\", title=\"Avg Inspections / District\")\n",
    "\n",
    "yearly[\"compliance_rate\"].plot(ax=axes[3], marker=\"o\", title=\"Avg Compliance Rate (%)\")\n",
    "\n",
    "yearly[\"resolution_rate\"].plot(ax=axes[4], marker=\"o\", title=\"Avg Resolution Rate (%)\")\n",
    "\n",
    "yearly[\"avg_days_to_enf\"].plot(ax=axes[5], marker=\"o\", title=\"Avg Days to Enforcement\")\n",
    "\n",
    "for ax in axes:\n",
    "    ax.axvline(2019, color=\"red\", linestyle=\"--\", alpha=0.5, label=\"2019 policy\")\n",
    "    ax.set_xlabel(\"Year\")\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5d2671c9",
   "metadata": {},
   "outputs": [],
   "source": [
    "corr_cols = [\n",
    "    \"ogi_budget_m\", \"ogi_fte\", \"inspection_budget_share\",\n",
    "    \"total_inspections\", \"compliance_rate\",\n",
    "    \"total_violations\", \"resolution_rate\", \"avg_days_to_enforcement\",\n",
    "]\n",
    "corr = panel[corr_cols].corr().round(2)\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(9, 7))\n",
    "im = ax.imshow(corr, cmap=\"RdBu_r\", vmin=-1, vmax=1)\n",
    "ax.set_xticks(range(len(corr_cols)))\n",
    "ax.set_yticks(range(len(corr_cols)))\n",
    "ax.set_xticklabels(corr_cols, rotation=45, ha=\"right\", fontsize=9)\n",
    "ax.set_yticklabels(corr_cols, fontsize=9)\n",
    "for i in range(len(corr_cols)):\n",
    "    for j in range(len(corr_cols)):\n",
    "        ax.text(j, i, corr.iloc[i, j], ha=\"center\", va=\"center\", fontsize=8)\n",
    "plt.colorbar(im, ax=ax)\n",
    "ax.set_title(\"Correlation Matrix — Key Variables\")\n",
    "plt.tight_layout()\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3ca1410a",
   "metadata": {},
   "source": [
    "## H1: Organizational Capacity → Policy Outputs\n",
    "\n",
    "**Prediction:** Higher OGI budget and FTE predict more inspections, higher compliance rates, and faster violation resolution.\n",
    "\n",
    "**Model:** OLS with district fixed effects and year 2016–2023 (excluding 2024 budget estimate). Budget varies only over time, so it identifies via year-to-year changes in statewide RRC allocations; district FE absorbs persistent cross-district differences.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "463387d3",
   "metadata": {},
   "outputs": [],
   "source": [
    "actuals = panel[panel[\"is_budget_year\"] == 0].copy()\n",
    "\n",
    "fig, axes = plt.subplots(1, 3, figsize=(15, 4))\n",
    "\n",
    "actuals.plot.scatter(x=\"ogi_budget_m\", y=\"total_inspections\",\n",
    "                     alpha=0.4, ax=axes[0], title=\"Budget → Inspections\")\n",
    "actuals.plot.scatter(x=\"ogi_budget_m\", y=\"compliance_rate\",\n",
    "                     alpha=0.4, ax=axes[1], title=\"Budget → Compliance Rate (%)\")\n",
    "actuals.plot.scatter(x=\"ogi_budget_m\", y=\"resolution_rate\",\n",
    "                     alpha=0.4, ax=axes[2], title=\"Budget → Resolution Rate (%)\")\n",
    "\n",
    "for ax in axes:\n",
    "    ax.set_xlabel(\"OGI Budget ($M)\")\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "535fc4eb",
   "metadata": {},
   "outputs": [],
   "source": [
    "m_inspections = smf.ols(\n",
    "    \"total_inspections ~ ogi_budget_m + C(district)\",\n",
    "    data=actuals,\n",
    ").fit(cov_type=\"cluster\", cov_kwds={\"groups\": actuals[\"district\"]})\n",
    "\n",
    "m_compliance = smf.ols(\n",
    "    \"compliance_rate ~ ogi_budget_m + C(district)\",\n",
    "    data=actuals,\n",
    ").fit(cov_type=\"cluster\", cov_kwds={\"groups\": actuals[\"district\"]})\n",
    "\n",
    "m_resolution = smf.ols(\n",
    "    \"resolution_rate ~ ogi_budget_m + C(district)\",\n",
    "    data=actuals,\n",
    ").fit(cov_type=\"cluster\", cov_kwds={\"groups\": actuals[\"district\"]})\n",
    "\n",
    "display_cols = [\"Coef.\", \"Std.Err.\", \"t\", \"P>|t|\"]\n",
    "\n",
    "print(\"H1a — OGI Budget ($M) → Total Inspections\")\n",
    "print(m_inspections.summary2().tables[1][display_cols].loc[[\"ogi_budget_m\"]])\n",
    "print(f\"  R² = {m_inspections.rsquared:.3f}  Adj. R² = {m_inspections.rsquared_adj:.3f}\\n\")\n",
    "\n",
    "print(\"H1b — OGI Budget ($M) → Compliance Rate (%)\")\n",
    "print(m_compliance.summary2().tables[1][display_cols].loc[[\"ogi_budget_m\"]])\n",
    "print(f\"  R² = {m_compliance.rsquared:.3f}  Adj. R² = {m_compliance.rsquared_adj:.3f}\\n\")\n",
    "\n",
    "print(\"H1c — OGI Budget ($M) → Resolution Rate (%)\")\n",
    "print(m_resolution.summary2().tables[1][display_cols].loc[[\"ogi_budget_m\"]])\n",
    "print(f\"  R² = {m_resolution.rsquared:.3f}  Adj. R² = {m_resolution.rsquared_adj:.3f}\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "56add68a",
   "metadata": {},
   "source": [
    "## H2: Goal Ambiguity Moderates Capacity Effects\n",
    "\n",
    "**Prediction:** When a larger share of the combined RRC budget flows to the broader \"Energy Resource Development\" goal (lower `inspection_budget_share`), the capacity → output link weakens.\n",
    "\n",
    "**Operationalization:**\n",
    "`inspection_budget_share = ogi_budget / (ogi_budget + erd_budget)`\n",
    "\n",
    "A negative interaction coefficient `ogi_budget_m × inspection_budget_share` would be unexpected (higher share → weaker effect). A positive coefficient supports H2 — clearer mission focus amplifies the budget → compliance relationship.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "24187ce8",
   "metadata": {},
   "outputs": [],
   "source": [
    "m_h2 = smf.ols(\n",
    "    \"compliance_rate ~ ogi_budget_m * inspection_budget_share + C(district)\",\n",
    "    data=actuals,\n",
    ").fit(cov_type=\"cluster\", cov_kwds={\"groups\": actuals[\"district\"]})\n",
    "\n",
    "key_rows = [\"ogi_budget_m\", \"inspection_budget_share\", \"ogi_budget_m:inspection_budget_share\"]\n",
    "print(\"H2 — Goal Ambiguity Moderation (DV: compliance_rate)\")\n",
    "print(m_h2.summary2().tables[1][display_cols].loc[key_rows])\n",
    "print(f\"\\nR² = {m_h2.rsquared:.3f}  Adj. R² = {m_h2.rsquared_adj:.3f}\")\n",
    "\n",
    "# ── Same model with resolution rate as DV ────────────────────────────────────\n",
    "m_h2_res = smf.ols(\n",
    "    \"resolution_rate ~ ogi_budget_m * inspection_budget_share + C(district)\",\n",
    "    data=actuals,\n",
    ").fit(cov_type=\"cluster\", cov_kwds={\"groups\": actuals[\"district\"]})\n",
    "\n",
    "print(\"\\nH2 — Goal Ambiguity Moderation (DV: resolution_rate)\")\n",
    "print(m_h2_res.summary2().tables[1][display_cols].loc[key_rows])\n",
    "print(f\"\\nR² = {m_h2_res.rsquared:.3f}  Adj. R² = {m_h2_res.rsquared_adj:.3f}\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b6583857",
   "metadata": {},
   "source": [
    "## H3: District Multilevel Effects\n",
    "\n",
    "**Prediction:** The budget → output slope varies across RRC districts — some districts translate budget increases into better outputs more effectively than others.\n",
    "\n",
    "**Model:** Interaction `ogi_budget_m × C(district)` — the reference district captures the baseline budget slope; interaction terms show how each other district's slope differs.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "151faefd",
   "metadata": {},
   "outputs": [],
   "source": [
    "m_h3 = smf.ols(\n",
    "    \"compliance_rate ~ ogi_budget_m * C(district)\",\n",
    "    data=actuals,\n",
    ").fit(cov_type=\"cluster\", cov_kwds={\"groups\": actuals[\"district\"]})\n",
    "\n",
    "coef_table = m_h3.summary2().tables[1]\n",
    "\n",
    "# Baseline budget slope (reference district)\n",
    "baseline_row = coef_table.loc[[\"ogi_budget_m\"]]\n",
    "print(\"H3 — District-Heterogeneous Budget Effect (DV: compliance_rate)\")\n",
    "print(f\"Baseline (reference district) budget slope:\")\n",
    "print(baseline_row[display_cols])\n",
    "\n",
    "# District-specific deviations from baseline\n",
    "interaction_rows = coef_table[coef_table.index.str.contains(\"ogi_budget_m:C\")]\n",
    "print(\"\\nDistrict interaction terms (deviation from reference slope):\")\n",
    "print(interaction_rows[display_cols].round(4))\n",
    "print(f\"\\nR² = {m_h3.rsquared:.3f}  Adj. R² = {m_h3.rsquared_adj:.3f}\")\n",
    "\n",
    "# ── Plot district-specific budget slopes ─────────────────────────────────────\n",
    "districts = actuals[\"district\"].unique()\n",
    "slopes = {}\n",
    "for d in districts:\n",
    "    key = f\"ogi_budget_m:C(district)[T.{d}]\"\n",
    "    base = m_h3.params.get(\"ogi_budget_m\", 0)\n",
    "    delta = m_h3.params.get(key, 0)\n",
    "    slopes[str(d)] = base + delta\n",
    "\n",
    "slope_df = pd.Series(slopes).sort_values()\n",
    "fig, ax = plt.subplots(figsize=(10, 4))\n",
    "slope_df.plot.barh(ax=ax, color=[\"#d62728\" if v < 0 else \"#1f77b4\" for v in slope_df])\n",
    "ax.axvline(0, color=\"black\", linewidth=0.8)\n",
    "ax.set_xlabel(\"Budget slope (compliance rate pp per $M)\")\n",
    "ax.set_title(\"H3 — District-Specific Budget → Compliance Slopes\")\n",
    "plt.tight_layout()\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5bb27b3e",
   "metadata": {},
   "source": [
    "## H4: Spatial and Geographic Factors\n",
    "\n",
    "**Predictions:**\n",
    "- Offshore-jurisdiction districts (02, 03, 04) show a different budget → output relationship due to dual onshore/offshore oversight burden.\n",
    "- Border-proximate districts show a different relationship due to cross-jurisdiction enforcement complexity.\n",
    "- Spatial autocorrelation in H1 residuals (Moran's I) would indicate unmodeled geographic spillovers.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d6e56f00",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Texas RRC district geography flags (based on known RRC district locations)\n",
    "OFFSHORE_DISTRICTS = {\"02\", \"03\", \"04\"}          # dual onshore + offshore jurisdiction\n",
    "BORDER_DISTRICTS   = {\"01\", \"02\", \"03\", \"04\"}    # south / gulf coast proximity\n",
    "\n",
    "actuals = actuals.copy()\n",
    "actuals[\"district_str\"] = actuals[\"district\"].astype(str).str.strip()\n",
    "actuals[\"offshore\"]     = actuals[\"district_str\"].isin(OFFSHORE_DISTRICTS).astype(int)\n",
    "actuals[\"border\"]       = actuals[\"district_str\"].isin(BORDER_DISTRICTS).astype(int)\n",
    "\n",
    "print(\"District classification:\")\n",
    "print(\n",
    "    actuals.groupby([\"district_str\", \"offshore\", \"border\"])\n",
    "           .size()\n",
    "           .reset_index(name=\"district_year_obs\")\n",
    "           .to_string(index=False)\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "74686bfe",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ── Spatial regression: offshore and border interactions ─────────────────────\n",
    "m_h4 = smf.ols(\n",
    "    \"compliance_rate ~ ogi_budget_m + offshore + border \"\n",
    "    \"+ ogi_budget_m:offshore + ogi_budget_m:border + C(district)\",\n",
    "    data=actuals,\n",
    ").fit(cov_type=\"cluster\", cov_kwds={\"groups\": actuals[\"district\"]})\n",
    "\n",
    "spatial_rows = [\n",
    "    \"ogi_budget_m\", \"offshore\", \"border\",\n",
    "    \"ogi_budget_m:offshore\", \"ogi_budget_m:border\",\n",
    "]\n",
    "available = [r for r in spatial_rows if r in m_h4.params.index]\n",
    "print(\"H4 — Spatial Moderators (DV: compliance_rate)\")\n",
    "print(m_h4.summary2().tables[1][display_cols].loc[available])\n",
    "print(f\"\\nR² = {m_h4.rsquared:.3f}  Adj. R² = {m_h4.rsquared_adj:.3f}\")\n",
    "\n",
    "# ── Moran's I on H1 residuals ─────────────────────────────────────────────────\n",
    "# Compute district centroids from well lat/lon joined via inspections\n",
    "centroids_sql = \"\"\"\n",
    "SELECT\n",
    "    i.district,\n",
    "    AVG(w.latitude)  AS lat,\n",
    "    AVG(w.longitude) AS lon\n",
    "FROM inspections i\n",
    "JOIN well_shape_tract w USING (api_norm)\n",
    "WHERE w.latitude  IS NOT NULL\n",
    "  AND w.longitude IS NOT NULL\n",
    "  AND i.district  IS NOT NULL\n",
    "GROUP BY i.district\n",
    "\"\"\"\n",
    "\n",
    "try:\n",
    "    centroids = pd.read_sql(text(centroids_sql), engine)\n",
    "\n",
    "    # Average H1 compliance residuals to district level\n",
    "    resid_df = actuals[[\"district\", \"compliance_rate\"]].copy()\n",
    "    resid_df[\"resid\"] = m_compliance.resid.reindex(actuals.index).values\n",
    "    resid_by_district = resid_df.groupby(\"district\")[\"resid\"].mean().reset_index()\n",
    "\n",
    "    centroids = centroids.merge(resid_by_district, on=\"district\").dropna()\n",
    "\n",
    "    # Row-normalised inverse-distance weights matrix\n",
    "    coords = centroids[[\"lon\", \"lat\"]].values\n",
    "    D = cdist(coords, coords)\n",
    "    np.fill_diagonal(D, np.inf)\n",
    "    W = 1 / D\n",
    "    W = W / W.sum(axis=1, keepdims=True)\n",
    "\n",
    "    z = centroids[\"resid\"].values\n",
    "    z = z - z.mean()\n",
    "    n = len(z)\n",
    "    morans_i = (n / W.sum()) * (z @ W @ z) / (z @ z)\n",
    "\n",
    "    print(f\"\\nMoran's I on H1 compliance residuals = {morans_i:.4f}\")\n",
    "    print(\"  > 0  → residuals cluster spatially (similar neighbours)\")\n",
    "    print(\"  ≈ 0  → no spatial pattern\")\n",
    "    print(\"  < 0  → spatial dispersion (dissimilar neighbours)\")\n",
    "\n",
    "    print(\"\\nDistrict centroids used:\")\n",
    "    print(centroids[[\"district\", \"lat\", \"lon\"]].round(2).to_string(index=False))\n",
    "\n",
    "except Exception as e:\n",
    "    print(f\"Moran's I skipped: {e}\")\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.9.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}