commit 8038a3fab4ae3aa19181b8f14f8d1fb301e2f871 Author: dadams Date: Wed Feb 25 12:17:56 2026 -0800 Initial commit: texas inspection expenses analysis notebook Jupyter notebook analyzing RRC budget (2016-2024) against inspection/violation outcomes across Texas RRC districts. Tests four hypotheses: organizational capacity, goal ambiguity, district multilevel effects, and spatial factors. Co-Authored-By: Claude Sonnet 4.6 diff --git a/RRC Budget Data.xlsx b/RRC Budget Data.xlsx new file mode 100644 index 0000000..d35f8a6 Binary files /dev/null and b/RRC Budget Data.xlsx differ diff --git a/texas_inspection_expenses.ipynb b/texas_inspection_expenses.ipynb new file mode 100644 index 0000000..403ff46 --- /dev/null +++ b/texas_inspection_expenses.ipynb @@ -0,0 +1,634 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "dc6818b8", + "metadata": {}, + "source": [ + "# Texas RRC Inspection Expenses Analysis\n", + "\n", + "**Research question:** Does organizational capacity (budget, staffing) predict better regulatory outputs (inspections, compliance, enforcement), and how is that relationship moderated by goal ambiguity, district-level heterogeneity, and spatial/geographic factors?\n", + "\n", + "## Hypotheses\n", + "- **H1 — Capacity → Outputs:** Higher OGI budget and FTE predict more inspections, higher compliance rates, and faster violation resolution.\n", + "- **H2 — Goal Ambiguity:** When a larger share of RRC budget goes to the more ambiguous \"Energy Resource Development\" goal, the capacity → output relationship weakens.\n", + "- **H3 — Multilevel / District Effects:** The capacity → output relationship varies across RRC districts (budget slope heterogeneity).\n", + "- **H4 — Spatial & Geographic:** Offshore-jurisdiction and border districts moderate the capacity → output relationship; spatial autocorrelation in residuals is tested via Moran's I.\n", + "\n", + "**Data:**\n", + "- PostgreSQL warehouse (`texas_data`): `inspections`, `violations`, `well_shape_tract`\n", + "- `RRC Budget Data.xlsx`: statewide RRC budget by strategy, 2016–2024\n", + "- Analysis period: 2016–2023 (2024 is budget estimate, excluded from regressions)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "49de2b5c", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import warnings\n", + "from pathlib import Path\n", + "from urllib.parse import quote_plus\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import matplotlib.ticker as mticker\n", + "import statsmodels.formula.api as smf\n", + "from dotenv import load_dotenv\n", + "from sqlalchemy import create_engine, text\n", + "from scipy.spatial.distance import cdist\n", + "\n", + "warnings.filterwarnings(\"ignore\", category=UserWarning)\n", + "pd.set_option(\"display.float_format\", \"{:,.2f}\".format)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "08420da3", + "metadata": {}, + "outputs": [], + "source": [ + "load_dotenv(override=False)\n", + "\n", + "host = os.getenv(\"PGHOST\", \"localhost\")\n", + "port = os.getenv(\"PGPORT\", \"5432\")\n", + "user = os.getenv(\"PGUSER\", \"postgres\")\n", + "password = quote_plus(os.getenv(\"PGPASSWORD\", \"\"))\n", + "database = os.getenv(\"PGDATABASE\", \"texas_data\")\n", + "\n", + "engine = create_engine(\n", + " f\"postgresql+psycopg2://{user}:{password}@{host}:{port}/{database}\"\n", + ")\n", + "print(f\"Connected → {database} on {host}:{port}\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "b4e6c44f", + "metadata": {}, + "source": [ + "## 1. Data Loading" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "43886f13", + "metadata": {}, + "outputs": [], + "source": [ + "# District-year inspection metrics aggregated in SQL.\n", + "# LAG() computes days since the previous inspection for the same well (api_norm).\n", + "insp_sql = \"\"\"\n", + "WITH lagged AS (\n", + " SELECT\n", + " district,\n", + " EXTRACT(year FROM inspection_date)::int AS year,\n", + " api_norm,\n", + " inspection_date,\n", + " CASE WHEN UPPER(compliance::text) IN ('YES', 'Y') THEN 1.0 ELSE 0.0 END AS is_compliant,\n", + " EXTRACT(EPOCH FROM (\n", + " inspection_date\n", + " - LAG(inspection_date) OVER (PARTITION BY api_norm ORDER BY inspection_date)\n", + " )) / 86400.0 AS days_since_prev\n", + " FROM inspections\n", + " WHERE inspection_date IS NOT NULL\n", + " AND district IS NOT NULL\n", + " AND EXTRACT(year FROM inspection_date) BETWEEN 2016 AND 2024\n", + ")\n", + "SELECT\n", + " district,\n", + " year,\n", + " COUNT(*) AS total_inspections,\n", + " COUNT(DISTINCT api_norm) AS unique_wells,\n", + " ROUND(AVG(is_compliant)::numeric * 100, 2) AS compliance_rate,\n", + " ROUND(AVG(days_since_prev)::numeric, 1) AS avg_days_between_inspections\n", + "FROM lagged\n", + "GROUP BY district, year\n", + "ORDER BY district, year\n", + "\"\"\"\n", + "\n", + "insp = pd.read_sql(text(insp_sql), engine)\n", + "print(f\"Inspections panel: {len(insp):,} district-year rows | {insp['district'].nunique()} districts\")\n", + "insp.head()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3841e2f5", + "metadata": {}, + "outputs": [], + "source": [ + "# District-year violation metrics. Blank last_enf_action strings treated as no action.\n", + "viol_sql = \"\"\"\n", + "SELECT\n", + " district,\n", + " EXTRACT(year FROM violation_disc_date)::int AS year,\n", + " COUNT(*) AS total_violations,\n", + " COUNT(DISTINCT api_norm) AS unique_wells_with_violations,\n", + " SUM(CASE WHEN major_viol_ind = 'Y' THEN 1 ELSE 0 END) AS major_violations,\n", + " ROUND(AVG(CASE WHEN compliant_on_reinsp = 'Y' THEN 1.0 ELSE 0.0 END)::numeric * 100, 2)\n", + " AS resolution_rate,\n", + " ROUND(AVG(CASE WHEN last_enf_action IS NOT NULL AND last_enf_action <> ''\n", + " THEN 1.0 ELSE 0.0 END)::numeric * 100, 2) AS enforcement_rate,\n", + " ROUND(AVG(\n", + " CASE WHEN last_enf_action_date IS NOT NULL\n", + " THEN EXTRACT(EPOCH FROM (last_enf_action_date - violation_disc_date)) / 86400.0\n", + " END\n", + " )::numeric, 1) AS avg_days_to_enforcement\n", + "FROM violations\n", + "WHERE violation_disc_date IS NOT NULL\n", + " AND district IS NOT NULL\n", + " AND EXTRACT(year FROM violation_disc_date) BETWEEN 2016 AND 2024\n", + "GROUP BY district, year\n", + "ORDER BY district, year\n", + "\"\"\"\n", + "\n", + "viol = pd.read_sql(text(viol_sql), engine)\n", + "print(f\"Violations panel: {len(viol):,} district-year rows\")\n", + "viol.head()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9e196cac", + "metadata": {}, + "outputs": [], + "source": [ + "BUDGET_PATH = Path(\"RRC Budget Data.xlsx\")\n", + "raw = pd.read_excel(BUDGET_PATH, header=None)\n", + "YEARS = [2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]\n", + "COLS = slice(1, 10) # spreadsheet columns 1-9 map to years 2016-2024\n", + "\n", + "# ── Section 1: Energy Resource Development (rows 7-18) ──────────────────────\n", + "erd = pd.DataFrame({\n", + " \"year\": YEARS,\n", + " \"strategy\": \"Energy Resource Development\",\n", + " \"total_budget\": raw.iloc[1, COLS].values.astype(float),\n", + " \"salaries\": raw.iloc[7, COLS].values.astype(float),\n", + " \"other_personnel\": raw.iloc[8, COLS].values.astype(float),\n", + " \"professional_fees\": raw.iloc[9, COLS].values.astype(float),\n", + " \"travel\": raw.iloc[13, COLS].values.astype(float),\n", + " \"other_operating\": raw.iloc[16, COLS].values.astype(float),\n", + " \"capital_exp\": raw.iloc[17, COLS].values.astype(float),\n", + " \"fte\": raw.iloc[18, COLS].values.astype(float),\n", + "})\n", + "\n", + "# ── Section 2: Oil/Gas Monitoring & Inspections (rows 20-31) ────────────────\n", + "ogi = pd.DataFrame({\n", + " \"year\": YEARS,\n", + " \"strategy\": \"Oil/Gas Monitoring & Inspections\",\n", + " \"total_budget\": raw.iloc[2, COLS].values.astype(float),\n", + " \"salaries\": raw.iloc[20, COLS].values.astype(float),\n", + " \"other_personnel\": raw.iloc[21, COLS].values.astype(float),\n", + " \"professional_fees\": raw.iloc[22, COLS].values.astype(float),\n", + " \"travel\": raw.iloc[26, COLS].values.astype(float),\n", + " \"other_operating\": raw.iloc[29, COLS].values.astype(float),\n", + " \"capital_exp\": raw.iloc[30, COLS].values.astype(float),\n", + " \"fte\": raw.iloc[31, COLS].values.astype(float),\n", + "})\n", + "\n", + "budget_long = pd.concat([erd, ogi], ignore_index=True)\n", + "print(f\"Budget long: {len(budget_long)} rows (2 strategies × {len(YEARS)} years)\")\n", + "budget_long\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "896d152b", + "metadata": {}, + "outputs": [], + "source": [ + "# ── Wide budget: one row per year with ogi_ / erd_ prefixed columns ──────────\n", + "ogi_wide = ogi.drop(columns=\"strategy\").add_prefix(\"ogi_\")\n", + "erd_wide = erd.drop(columns=\"strategy\").add_prefix(\"erd_\")\n", + "\n", + "budget_wide = (\n", + " ogi_wide\n", + " .merge(erd_wide, left_on=\"ogi_year\", right_on=\"erd_year\")\n", + " .rename(columns={\"ogi_year\": \"year\"})\n", + " .drop(columns=\"erd_year\")\n", + ")\n", + "\n", + "# ── Merge inspections + violations, then join statewide budget on year ────────\n", + "panel = (\n", + " insp\n", + " .merge(viol, on=[\"district\", \"year\"], how=\"left\")\n", + " .merge(budget_wide, on=\"year\", how=\"left\")\n", + ")\n", + "\n", + "# ── Derived columns ───────────────────────────────────────────────────────────\n", + "panel[\"violations_per_inspection\"] = panel[\"total_violations\"] / panel[\"total_inspections\"]\n", + "panel[\"ogi_budget_m\"] = panel[\"ogi_total_budget\"] / 1_000_000 # dollars → millions\n", + "panel[\"erd_budget_m\"] = panel[\"erd_total_budget\"] / 1_000_000\n", + "panel[\"post_2019\"] = (panel[\"year\"] >= 2019).astype(int)\n", + "panel[\"is_budget_year\"] = (panel[\"year\"] == 2024).astype(int)\n", + "\n", + "# Goal ambiguity: share of combined budget going to the inspection mission.\n", + "# Higher share = clearer mission focus; lower share = more goal ambiguity.\n", + "panel[\"inspection_budget_share\"] = (\n", + " panel[\"ogi_total_budget\"] / (panel[\"ogi_total_budget\"] + panel[\"erd_total_budget\"])\n", + ")\n", + "\n", + "# Fill violation NaNs for districts with zero violations in a given year\n", + "fill_cols = [\n", + " \"total_violations\", \"unique_wells_with_violations\", \"major_violations\",\n", + " \"resolution_rate\", \"enforcement_rate\", \"avg_days_to_enforcement\",\n", + " \"violations_per_inspection\",\n", + "]\n", + "panel[fill_cols] = panel[fill_cols].fillna(0)\n", + "\n", + "print(f\"Analysis panel: {len(panel):,} rows | \"\n", + " f\"{panel['district'].nunique()} districts | \"\n", + " f\"{panel['year'].nunique()} years\")\n", + "panel.head()\n" + ] + }, + { + "cell_type": "markdown", + "id": "3558bae7", + "metadata": {}, + "source": [ + "## 2. Exploratory Overview" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "92921756", + "metadata": {}, + "outputs": [], + "source": [ + "# Year-level means across districts\n", + "yearly = panel.groupby(\"year\").agg(\n", + " ogi_budget_m = (\"ogi_budget_m\", \"first\"),\n", + " ogi_fte = (\"ogi_fte\", \"first\"),\n", + " total_inspections = (\"total_inspections\", \"mean\"),\n", + " compliance_rate = (\"compliance_rate\", \"mean\"),\n", + " total_violations = (\"total_violations\", \"mean\"),\n", + " resolution_rate = (\"resolution_rate\", \"mean\"),\n", + " avg_days_to_enf = (\"avg_days_to_enforcement\",\"mean\"),\n", + ").round(2)\n", + "\n", + "print(yearly.to_string())\n", + "\n", + "fig, axes = plt.subplots(2, 3, figsize=(16, 8))\n", + "axes = axes.flatten()\n", + "\n", + "yearly[\"ogi_budget_m\"].plot(ax=axes[0], marker=\"o\", title=\"OGI Budget ($M)\")\n", + "axes[0].yaxis.set_major_formatter(mticker.FuncFormatter(lambda x, _: f\"${x:.0f}M\"))\n", + "\n", + "yearly[\"ogi_fte\"].plot(ax=axes[1], marker=\"o\", title=\"OGI FTE Positions\")\n", + "\n", + "yearly[\"total_inspections\"].plot(ax=axes[2], marker=\"o\", title=\"Avg Inspections / District\")\n", + "\n", + "yearly[\"compliance_rate\"].plot(ax=axes[3], marker=\"o\", title=\"Avg Compliance Rate (%)\")\n", + "\n", + "yearly[\"resolution_rate\"].plot(ax=axes[4], marker=\"o\", title=\"Avg Resolution Rate (%)\")\n", + "\n", + "yearly[\"avg_days_to_enf\"].plot(ax=axes[5], marker=\"o\", title=\"Avg Days to Enforcement\")\n", + "\n", + "for ax in axes:\n", + " ax.axvline(2019, color=\"red\", linestyle=\"--\", alpha=0.5, label=\"2019 policy\")\n", + " ax.set_xlabel(\"Year\")\n", + "\n", + "plt.tight_layout()\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d2671c9", + "metadata": {}, + "outputs": [], + "source": [ + "corr_cols = [\n", + " \"ogi_budget_m\", \"ogi_fte\", \"inspection_budget_share\",\n", + " \"total_inspections\", \"compliance_rate\",\n", + " \"total_violations\", \"resolution_rate\", \"avg_days_to_enforcement\",\n", + "]\n", + "corr = panel[corr_cols].corr().round(2)\n", + "\n", + "fig, ax = plt.subplots(figsize=(9, 7))\n", + "im = ax.imshow(corr, cmap=\"RdBu_r\", vmin=-1, vmax=1)\n", + "ax.set_xticks(range(len(corr_cols)))\n", + "ax.set_yticks(range(len(corr_cols)))\n", + "ax.set_xticklabels(corr_cols, rotation=45, ha=\"right\", fontsize=9)\n", + "ax.set_yticklabels(corr_cols, fontsize=9)\n", + "for i in range(len(corr_cols)):\n", + " for j in range(len(corr_cols)):\n", + " ax.text(j, i, corr.iloc[i, j], ha=\"center\", va=\"center\", fontsize=8)\n", + "plt.colorbar(im, ax=ax)\n", + "ax.set_title(\"Correlation Matrix — Key Variables\")\n", + "plt.tight_layout()\n", + "plt.show()\n" + ] + }, + { + "cell_type": "markdown", + "id": "3ca1410a", + "metadata": {}, + "source": [ + "## H1: Organizational Capacity → Policy Outputs\n", + "\n", + "**Prediction:** Higher OGI budget and FTE predict more inspections, higher compliance rates, and faster violation resolution.\n", + "\n", + "**Model:** OLS with district fixed effects and year 2016–2023 (excluding 2024 budget estimate). Budget varies only over time, so it identifies via year-to-year changes in statewide RRC allocations; district FE absorbs persistent cross-district differences.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "463387d3", + "metadata": {}, + "outputs": [], + "source": [ + "actuals = panel[panel[\"is_budget_year\"] == 0].copy()\n", + "\n", + "fig, axes = plt.subplots(1, 3, figsize=(15, 4))\n", + "\n", + "actuals.plot.scatter(x=\"ogi_budget_m\", y=\"total_inspections\",\n", + " alpha=0.4, ax=axes[0], title=\"Budget → Inspections\")\n", + "actuals.plot.scatter(x=\"ogi_budget_m\", y=\"compliance_rate\",\n", + " alpha=0.4, ax=axes[1], title=\"Budget → Compliance Rate (%)\")\n", + "actuals.plot.scatter(x=\"ogi_budget_m\", y=\"resolution_rate\",\n", + " alpha=0.4, ax=axes[2], title=\"Budget → Resolution Rate (%)\")\n", + "\n", + "for ax in axes:\n", + " ax.set_xlabel(\"OGI Budget ($M)\")\n", + "\n", + "plt.tight_layout()\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "535fc4eb", + "metadata": {}, + "outputs": [], + "source": [ + "m_inspections = smf.ols(\n", + " \"total_inspections ~ ogi_budget_m + C(district)\",\n", + " data=actuals,\n", + ").fit(cov_type=\"cluster\", cov_kwds={\"groups\": actuals[\"district\"]})\n", + "\n", + "m_compliance = smf.ols(\n", + " \"compliance_rate ~ ogi_budget_m + C(district)\",\n", + " data=actuals,\n", + ").fit(cov_type=\"cluster\", cov_kwds={\"groups\": actuals[\"district\"]})\n", + "\n", + "m_resolution = smf.ols(\n", + " \"resolution_rate ~ ogi_budget_m + C(district)\",\n", + " data=actuals,\n", + ").fit(cov_type=\"cluster\", cov_kwds={\"groups\": actuals[\"district\"]})\n", + "\n", + "display_cols = [\"Coef.\", \"Std.Err.\", \"t\", \"P>|t|\"]\n", + "\n", + "print(\"H1a — OGI Budget ($M) → Total Inspections\")\n", + "print(m_inspections.summary2().tables[1][display_cols].loc[[\"ogi_budget_m\"]])\n", + "print(f\" R² = {m_inspections.rsquared:.3f} Adj. R² = {m_inspections.rsquared_adj:.3f}\\n\")\n", + "\n", + "print(\"H1b — OGI Budget ($M) → Compliance Rate (%)\")\n", + "print(m_compliance.summary2().tables[1][display_cols].loc[[\"ogi_budget_m\"]])\n", + "print(f\" R² = {m_compliance.rsquared:.3f} Adj. R² = {m_compliance.rsquared_adj:.3f}\\n\")\n", + "\n", + "print(\"H1c — OGI Budget ($M) → Resolution Rate (%)\")\n", + "print(m_resolution.summary2().tables[1][display_cols].loc[[\"ogi_budget_m\"]])\n", + "print(f\" R² = {m_resolution.rsquared:.3f} Adj. R² = {m_resolution.rsquared_adj:.3f}\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "56add68a", + "metadata": {}, + "source": [ + "## H2: Goal Ambiguity Moderates Capacity Effects\n", + "\n", + "**Prediction:** When a larger share of the combined RRC budget flows to the broader \"Energy Resource Development\" goal (lower `inspection_budget_share`), the capacity → output link weakens.\n", + "\n", + "**Operationalization:**\n", + "`inspection_budget_share = ogi_budget / (ogi_budget + erd_budget)`\n", + "\n", + "A negative interaction coefficient `ogi_budget_m × inspection_budget_share` would be unexpected (higher share → weaker effect). A positive coefficient supports H2 — clearer mission focus amplifies the budget → compliance relationship.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "24187ce8", + "metadata": {}, + "outputs": [], + "source": [ + "m_h2 = smf.ols(\n", + " \"compliance_rate ~ ogi_budget_m * inspection_budget_share + C(district)\",\n", + " data=actuals,\n", + ").fit(cov_type=\"cluster\", cov_kwds={\"groups\": actuals[\"district\"]})\n", + "\n", + "key_rows = [\"ogi_budget_m\", \"inspection_budget_share\", \"ogi_budget_m:inspection_budget_share\"]\n", + "print(\"H2 — Goal Ambiguity Moderation (DV: compliance_rate)\")\n", + "print(m_h2.summary2().tables[1][display_cols].loc[key_rows])\n", + "print(f\"\\nR² = {m_h2.rsquared:.3f} Adj. R² = {m_h2.rsquared_adj:.3f}\")\n", + "\n", + "# ── Same model with resolution rate as DV ────────────────────────────────────\n", + "m_h2_res = smf.ols(\n", + " \"resolution_rate ~ ogi_budget_m * inspection_budget_share + C(district)\",\n", + " data=actuals,\n", + ").fit(cov_type=\"cluster\", cov_kwds={\"groups\": actuals[\"district\"]})\n", + "\n", + "print(\"\\nH2 — Goal Ambiguity Moderation (DV: resolution_rate)\")\n", + "print(m_h2_res.summary2().tables[1][display_cols].loc[key_rows])\n", + "print(f\"\\nR² = {m_h2_res.rsquared:.3f} Adj. R² = {m_h2_res.rsquared_adj:.3f}\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "b6583857", + "metadata": {}, + "source": [ + "## H3: District Multilevel Effects\n", + "\n", + "**Prediction:** The budget → output slope varies across RRC districts — some districts translate budget increases into better outputs more effectively than others.\n", + "\n", + "**Model:** Interaction `ogi_budget_m × C(district)` — the reference district captures the baseline budget slope; interaction terms show how each other district's slope differs.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "151faefd", + "metadata": {}, + "outputs": [], + "source": [ + "m_h3 = smf.ols(\n", + " \"compliance_rate ~ ogi_budget_m * C(district)\",\n", + " data=actuals,\n", + ").fit(cov_type=\"cluster\", cov_kwds={\"groups\": actuals[\"district\"]})\n", + "\n", + "coef_table = m_h3.summary2().tables[1]\n", + "\n", + "# Baseline budget slope (reference district)\n", + "baseline_row = coef_table.loc[[\"ogi_budget_m\"]]\n", + "print(\"H3 — District-Heterogeneous Budget Effect (DV: compliance_rate)\")\n", + "print(f\"Baseline (reference district) budget slope:\")\n", + "print(baseline_row[display_cols])\n", + "\n", + "# District-specific deviations from baseline\n", + "interaction_rows = coef_table[coef_table.index.str.contains(\"ogi_budget_m:C\")]\n", + "print(\"\\nDistrict interaction terms (deviation from reference slope):\")\n", + "print(interaction_rows[display_cols].round(4))\n", + "print(f\"\\nR² = {m_h3.rsquared:.3f} Adj. R² = {m_h3.rsquared_adj:.3f}\")\n", + "\n", + "# ── Plot district-specific budget slopes ─────────────────────────────────────\n", + "districts = actuals[\"district\"].unique()\n", + "slopes = {}\n", + "for d in districts:\n", + " key = f\"ogi_budget_m:C(district)[T.{d}]\"\n", + " base = m_h3.params.get(\"ogi_budget_m\", 0)\n", + " delta = m_h3.params.get(key, 0)\n", + " slopes[str(d)] = base + delta\n", + "\n", + "slope_df = pd.Series(slopes).sort_values()\n", + "fig, ax = plt.subplots(figsize=(10, 4))\n", + "slope_df.plot.barh(ax=ax, color=[\"#d62728\" if v < 0 else \"#1f77b4\" for v in slope_df])\n", + "ax.axvline(0, color=\"black\", linewidth=0.8)\n", + "ax.set_xlabel(\"Budget slope (compliance rate pp per $M)\")\n", + "ax.set_title(\"H3 — District-Specific Budget → Compliance Slopes\")\n", + "plt.tight_layout()\n", + "plt.show()\n" + ] + }, + { + "cell_type": "markdown", + "id": "5bb27b3e", + "metadata": {}, + "source": [ + "## H4: Spatial and Geographic Factors\n", + "\n", + "**Predictions:**\n", + "- Offshore-jurisdiction districts (02, 03, 04) show a different budget → output relationship due to dual onshore/offshore oversight burden.\n", + "- Border-proximate districts show a different relationship due to cross-jurisdiction enforcement complexity.\n", + "- Spatial autocorrelation in H1 residuals (Moran's I) would indicate unmodeled geographic spillovers.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6e56f00", + "metadata": {}, + "outputs": [], + "source": [ + "# Texas RRC district geography flags (based on known RRC district locations)\n", + "OFFSHORE_DISTRICTS = {\"02\", \"03\", \"04\"} # dual onshore + offshore jurisdiction\n", + "BORDER_DISTRICTS = {\"01\", \"02\", \"03\", \"04\"} # south / gulf coast proximity\n", + "\n", + "actuals = actuals.copy()\n", + "actuals[\"district_str\"] = actuals[\"district\"].astype(str).str.strip()\n", + "actuals[\"offshore\"] = actuals[\"district_str\"].isin(OFFSHORE_DISTRICTS).astype(int)\n", + "actuals[\"border\"] = actuals[\"district_str\"].isin(BORDER_DISTRICTS).astype(int)\n", + "\n", + "print(\"District classification:\")\n", + "print(\n", + " actuals.groupby([\"district_str\", \"offshore\", \"border\"])\n", + " .size()\n", + " .reset_index(name=\"district_year_obs\")\n", + " .to_string(index=False)\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "74686bfe", + "metadata": {}, + "outputs": [], + "source": [ + "# ── Spatial regression: offshore and border interactions ─────────────────────\n", + "m_h4 = smf.ols(\n", + " \"compliance_rate ~ ogi_budget_m + offshore + border \"\n", + " \"+ ogi_budget_m:offshore + ogi_budget_m:border + C(district)\",\n", + " data=actuals,\n", + ").fit(cov_type=\"cluster\", cov_kwds={\"groups\": actuals[\"district\"]})\n", + "\n", + "spatial_rows = [\n", + " \"ogi_budget_m\", \"offshore\", \"border\",\n", + " \"ogi_budget_m:offshore\", \"ogi_budget_m:border\",\n", + "]\n", + "available = [r for r in spatial_rows if r in m_h4.params.index]\n", + "print(\"H4 — Spatial Moderators (DV: compliance_rate)\")\n", + "print(m_h4.summary2().tables[1][display_cols].loc[available])\n", + "print(f\"\\nR² = {m_h4.rsquared:.3f} Adj. R² = {m_h4.rsquared_adj:.3f}\")\n", + "\n", + "# ── Moran's I on H1 residuals ─────────────────────────────────────────────────\n", + "# Compute district centroids from well lat/lon joined via inspections\n", + "centroids_sql = \"\"\"\n", + "SELECT\n", + " i.district,\n", + " AVG(w.latitude) AS lat,\n", + " AVG(w.longitude) AS lon\n", + "FROM inspections i\n", + "JOIN well_shape_tract w USING (api_norm)\n", + "WHERE w.latitude IS NOT NULL\n", + " AND w.longitude IS NOT NULL\n", + " AND i.district IS NOT NULL\n", + "GROUP BY i.district\n", + "\"\"\"\n", + "\n", + "try:\n", + " centroids = pd.read_sql(text(centroids_sql), engine)\n", + "\n", + " # Average H1 compliance residuals to district level\n", + " resid_df = actuals[[\"district\", \"compliance_rate\"]].copy()\n", + " resid_df[\"resid\"] = m_compliance.resid.reindex(actuals.index).values\n", + " resid_by_district = resid_df.groupby(\"district\")[\"resid\"].mean().reset_index()\n", + "\n", + " centroids = centroids.merge(resid_by_district, on=\"district\").dropna()\n", + "\n", + " # Row-normalised inverse-distance weights matrix\n", + " coords = centroids[[\"lon\", \"lat\"]].values\n", + " D = cdist(coords, coords)\n", + " np.fill_diagonal(D, np.inf)\n", + " W = 1 / D\n", + " W = W / W.sum(axis=1, keepdims=True)\n", + "\n", + " z = centroids[\"resid\"].values\n", + " z = z - z.mean()\n", + " n = len(z)\n", + " morans_i = (n / W.sum()) * (z @ W @ z) / (z @ z)\n", + "\n", + " print(f\"\\nMoran's I on H1 compliance residuals = {morans_i:.4f}\")\n", + " print(\" > 0 → residuals cluster spatially (similar neighbours)\")\n", + " print(\" ≈ 0 → no spatial pattern\")\n", + " print(\" < 0 → spatial dispersion (dissimilar neighbours)\")\n", + "\n", + " print(\"\\nDistrict centroids used:\")\n", + " print(centroids[[\"district\", \"lat\", \"lon\"]].round(2).to_string(index=False))\n", + "\n", + "except Exception as e:\n", + " print(f\"Moran's I skipped: {e}\")\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.9.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}