Files
texas-inspection-expenses/texas_inspection_expenses.ipynb
dadams 8038a3fab4 Initial commit: texas inspection expenses analysis notebook
Jupyter notebook analyzing RRC budget (2016-2024) against inspection/violation
outcomes across Texas RRC districts. Tests four hypotheses: organizational
capacity, goal ambiguity, district multilevel effects, and spatial factors.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-25 12:17:56 -08:00

635 lines
27 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"cell_type": "markdown",
"id": "dc6818b8",
"metadata": {},
"source": [
"# Texas RRC Inspection Expenses Analysis\n",
"\n",
"**Research question:** Does organizational capacity (budget, staffing) predict better regulatory outputs (inspections, compliance, enforcement), and how is that relationship moderated by goal ambiguity, district-level heterogeneity, and spatial/geographic factors?\n",
"\n",
"## Hypotheses\n",
"- **H1 — Capacity → Outputs:** Higher OGI budget and FTE predict more inspections, higher compliance rates, and faster violation resolution.\n",
"- **H2 — Goal Ambiguity:** When a larger share of RRC budget goes to the more ambiguous \"Energy Resource Development\" goal, the capacity → output relationship weakens.\n",
"- **H3 — Multilevel / District Effects:** The capacity → output relationship varies across RRC districts (budget slope heterogeneity).\n",
"- **H4 — Spatial & Geographic:** Offshore-jurisdiction and border districts moderate the capacity → output relationship; spatial autocorrelation in residuals is tested via Moran's I.\n",
"\n",
"**Data:**\n",
"- PostgreSQL warehouse (`texas_data`): `inspections`, `violations`, `well_shape_tract`\n",
"- `RRC Budget Data.xlsx`: statewide RRC budget by strategy, 20162024\n",
"- Analysis period: 20162023 (2024 is budget estimate, excluded from regressions)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "49de2b5c",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import warnings\n",
"from pathlib import Path\n",
"from urllib.parse import quote_plus\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import matplotlib.ticker as mticker\n",
"import statsmodels.formula.api as smf\n",
"from dotenv import load_dotenv\n",
"from sqlalchemy import create_engine, text\n",
"from scipy.spatial.distance import cdist\n",
"\n",
"warnings.filterwarnings(\"ignore\", category=UserWarning)\n",
"pd.set_option(\"display.float_format\", \"{:,.2f}\".format)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "08420da3",
"metadata": {},
"outputs": [],
"source": [
"load_dotenv(override=False)\n",
"\n",
"host = os.getenv(\"PGHOST\", \"localhost\")\n",
"port = os.getenv(\"PGPORT\", \"5432\")\n",
"user = os.getenv(\"PGUSER\", \"postgres\")\n",
"password = quote_plus(os.getenv(\"PGPASSWORD\", \"\"))\n",
"database = os.getenv(\"PGDATABASE\", \"texas_data\")\n",
"\n",
"engine = create_engine(\n",
" f\"postgresql+psycopg2://{user}:{password}@{host}:{port}/{database}\"\n",
")\n",
"print(f\"Connected → {database} on {host}:{port}\")\n"
]
},
{
"cell_type": "markdown",
"id": "b4e6c44f",
"metadata": {},
"source": [
"## 1. Data Loading"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "43886f13",
"metadata": {},
"outputs": [],
"source": [
"# District-year inspection metrics aggregated in SQL.\n",
"# LAG() computes days since the previous inspection for the same well (api_norm).\n",
"insp_sql = \"\"\"\n",
"WITH lagged AS (\n",
" SELECT\n",
" district,\n",
" EXTRACT(year FROM inspection_date)::int AS year,\n",
" api_norm,\n",
" inspection_date,\n",
" CASE WHEN UPPER(compliance::text) IN ('YES', 'Y') THEN 1.0 ELSE 0.0 END AS is_compliant,\n",
" EXTRACT(EPOCH FROM (\n",
" inspection_date\n",
" - LAG(inspection_date) OVER (PARTITION BY api_norm ORDER BY inspection_date)\n",
" )) / 86400.0 AS days_since_prev\n",
" FROM inspections\n",
" WHERE inspection_date IS NOT NULL\n",
" AND district IS NOT NULL\n",
" AND EXTRACT(year FROM inspection_date) BETWEEN 2016 AND 2024\n",
")\n",
"SELECT\n",
" district,\n",
" year,\n",
" COUNT(*) AS total_inspections,\n",
" COUNT(DISTINCT api_norm) AS unique_wells,\n",
" ROUND(AVG(is_compliant)::numeric * 100, 2) AS compliance_rate,\n",
" ROUND(AVG(days_since_prev)::numeric, 1) AS avg_days_between_inspections\n",
"FROM lagged\n",
"GROUP BY district, year\n",
"ORDER BY district, year\n",
"\"\"\"\n",
"\n",
"insp = pd.read_sql(text(insp_sql), engine)\n",
"print(f\"Inspections panel: {len(insp):,} district-year rows | {insp['district'].nunique()} districts\")\n",
"insp.head()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3841e2f5",
"metadata": {},
"outputs": [],
"source": [
"# District-year violation metrics. Blank last_enf_action strings treated as no action.\n",
"viol_sql = \"\"\"\n",
"SELECT\n",
" district,\n",
" EXTRACT(year FROM violation_disc_date)::int AS year,\n",
" COUNT(*) AS total_violations,\n",
" COUNT(DISTINCT api_norm) AS unique_wells_with_violations,\n",
" SUM(CASE WHEN major_viol_ind = 'Y' THEN 1 ELSE 0 END) AS major_violations,\n",
" ROUND(AVG(CASE WHEN compliant_on_reinsp = 'Y' THEN 1.0 ELSE 0.0 END)::numeric * 100, 2)\n",
" AS resolution_rate,\n",
" ROUND(AVG(CASE WHEN last_enf_action IS NOT NULL AND last_enf_action <> ''\n",
" THEN 1.0 ELSE 0.0 END)::numeric * 100, 2) AS enforcement_rate,\n",
" ROUND(AVG(\n",
" CASE WHEN last_enf_action_date IS NOT NULL\n",
" THEN EXTRACT(EPOCH FROM (last_enf_action_date - violation_disc_date)) / 86400.0\n",
" END\n",
" )::numeric, 1) AS avg_days_to_enforcement\n",
"FROM violations\n",
"WHERE violation_disc_date IS NOT NULL\n",
" AND district IS NOT NULL\n",
" AND EXTRACT(year FROM violation_disc_date) BETWEEN 2016 AND 2024\n",
"GROUP BY district, year\n",
"ORDER BY district, year\n",
"\"\"\"\n",
"\n",
"viol = pd.read_sql(text(viol_sql), engine)\n",
"print(f\"Violations panel: {len(viol):,} district-year rows\")\n",
"viol.head()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9e196cac",
"metadata": {},
"outputs": [],
"source": [
"BUDGET_PATH = Path(\"RRC Budget Data.xlsx\")\n",
"raw = pd.read_excel(BUDGET_PATH, header=None)\n",
"YEARS = [2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]\n",
"COLS = slice(1, 10) # spreadsheet columns 1-9 map to years 2016-2024\n",
"\n",
"# ── Section 1: Energy Resource Development (rows 7-18) ──────────────────────\n",
"erd = pd.DataFrame({\n",
" \"year\": YEARS,\n",
" \"strategy\": \"Energy Resource Development\",\n",
" \"total_budget\": raw.iloc[1, COLS].values.astype(float),\n",
" \"salaries\": raw.iloc[7, COLS].values.astype(float),\n",
" \"other_personnel\": raw.iloc[8, COLS].values.astype(float),\n",
" \"professional_fees\": raw.iloc[9, COLS].values.astype(float),\n",
" \"travel\": raw.iloc[13, COLS].values.astype(float),\n",
" \"other_operating\": raw.iloc[16, COLS].values.astype(float),\n",
" \"capital_exp\": raw.iloc[17, COLS].values.astype(float),\n",
" \"fte\": raw.iloc[18, COLS].values.astype(float),\n",
"})\n",
"\n",
"# ── Section 2: Oil/Gas Monitoring & Inspections (rows 20-31) ────────────────\n",
"ogi = pd.DataFrame({\n",
" \"year\": YEARS,\n",
" \"strategy\": \"Oil/Gas Monitoring & Inspections\",\n",
" \"total_budget\": raw.iloc[2, COLS].values.astype(float),\n",
" \"salaries\": raw.iloc[20, COLS].values.astype(float),\n",
" \"other_personnel\": raw.iloc[21, COLS].values.astype(float),\n",
" \"professional_fees\": raw.iloc[22, COLS].values.astype(float),\n",
" \"travel\": raw.iloc[26, COLS].values.astype(float),\n",
" \"other_operating\": raw.iloc[29, COLS].values.astype(float),\n",
" \"capital_exp\": raw.iloc[30, COLS].values.astype(float),\n",
" \"fte\": raw.iloc[31, COLS].values.astype(float),\n",
"})\n",
"\n",
"budget_long = pd.concat([erd, ogi], ignore_index=True)\n",
"print(f\"Budget long: {len(budget_long)} rows (2 strategies × {len(YEARS)} years)\")\n",
"budget_long\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "896d152b",
"metadata": {},
"outputs": [],
"source": [
"# ── Wide budget: one row per year with ogi_ / erd_ prefixed columns ──────────\n",
"ogi_wide = ogi.drop(columns=\"strategy\").add_prefix(\"ogi_\")\n",
"erd_wide = erd.drop(columns=\"strategy\").add_prefix(\"erd_\")\n",
"\n",
"budget_wide = (\n",
" ogi_wide\n",
" .merge(erd_wide, left_on=\"ogi_year\", right_on=\"erd_year\")\n",
" .rename(columns={\"ogi_year\": \"year\"})\n",
" .drop(columns=\"erd_year\")\n",
")\n",
"\n",
"# ── Merge inspections + violations, then join statewide budget on year ────────\n",
"panel = (\n",
" insp\n",
" .merge(viol, on=[\"district\", \"year\"], how=\"left\")\n",
" .merge(budget_wide, on=\"year\", how=\"left\")\n",
")\n",
"\n",
"# ── Derived columns ───────────────────────────────────────────────────────────\n",
"panel[\"violations_per_inspection\"] = panel[\"total_violations\"] / panel[\"total_inspections\"]\n",
"panel[\"ogi_budget_m\"] = panel[\"ogi_total_budget\"] / 1_000_000 # dollars → millions\n",
"panel[\"erd_budget_m\"] = panel[\"erd_total_budget\"] / 1_000_000\n",
"panel[\"post_2019\"] = (panel[\"year\"] >= 2019).astype(int)\n",
"panel[\"is_budget_year\"] = (panel[\"year\"] == 2024).astype(int)\n",
"\n",
"# Goal ambiguity: share of combined budget going to the inspection mission.\n",
"# Higher share = clearer mission focus; lower share = more goal ambiguity.\n",
"panel[\"inspection_budget_share\"] = (\n",
" panel[\"ogi_total_budget\"] / (panel[\"ogi_total_budget\"] + panel[\"erd_total_budget\"])\n",
")\n",
"\n",
"# Fill violation NaNs for districts with zero violations in a given year\n",
"fill_cols = [\n",
" \"total_violations\", \"unique_wells_with_violations\", \"major_violations\",\n",
" \"resolution_rate\", \"enforcement_rate\", \"avg_days_to_enforcement\",\n",
" \"violations_per_inspection\",\n",
"]\n",
"panel[fill_cols] = panel[fill_cols].fillna(0)\n",
"\n",
"print(f\"Analysis panel: {len(panel):,} rows | \"\n",
" f\"{panel['district'].nunique()} districts | \"\n",
" f\"{panel['year'].nunique()} years\")\n",
"panel.head()\n"
]
},
{
"cell_type": "markdown",
"id": "3558bae7",
"metadata": {},
"source": [
"## 2. Exploratory Overview"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "92921756",
"metadata": {},
"outputs": [],
"source": [
"# Year-level means across districts\n",
"yearly = panel.groupby(\"year\").agg(\n",
" ogi_budget_m = (\"ogi_budget_m\", \"first\"),\n",
" ogi_fte = (\"ogi_fte\", \"first\"),\n",
" total_inspections = (\"total_inspections\", \"mean\"),\n",
" compliance_rate = (\"compliance_rate\", \"mean\"),\n",
" total_violations = (\"total_violations\", \"mean\"),\n",
" resolution_rate = (\"resolution_rate\", \"mean\"),\n",
" avg_days_to_enf = (\"avg_days_to_enforcement\",\"mean\"),\n",
").round(2)\n",
"\n",
"print(yearly.to_string())\n",
"\n",
"fig, axes = plt.subplots(2, 3, figsize=(16, 8))\n",
"axes = axes.flatten()\n",
"\n",
"yearly[\"ogi_budget_m\"].plot(ax=axes[0], marker=\"o\", title=\"OGI Budget ($M)\")\n",
"axes[0].yaxis.set_major_formatter(mticker.FuncFormatter(lambda x, _: f\"${x:.0f}M\"))\n",
"\n",
"yearly[\"ogi_fte\"].plot(ax=axes[1], marker=\"o\", title=\"OGI FTE Positions\")\n",
"\n",
"yearly[\"total_inspections\"].plot(ax=axes[2], marker=\"o\", title=\"Avg Inspections / District\")\n",
"\n",
"yearly[\"compliance_rate\"].plot(ax=axes[3], marker=\"o\", title=\"Avg Compliance Rate (%)\")\n",
"\n",
"yearly[\"resolution_rate\"].plot(ax=axes[4], marker=\"o\", title=\"Avg Resolution Rate (%)\")\n",
"\n",
"yearly[\"avg_days_to_enf\"].plot(ax=axes[5], marker=\"o\", title=\"Avg Days to Enforcement\")\n",
"\n",
"for ax in axes:\n",
" ax.axvline(2019, color=\"red\", linestyle=\"--\", alpha=0.5, label=\"2019 policy\")\n",
" ax.set_xlabel(\"Year\")\n",
"\n",
"plt.tight_layout()\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5d2671c9",
"metadata": {},
"outputs": [],
"source": [
"corr_cols = [\n",
" \"ogi_budget_m\", \"ogi_fte\", \"inspection_budget_share\",\n",
" \"total_inspections\", \"compliance_rate\",\n",
" \"total_violations\", \"resolution_rate\", \"avg_days_to_enforcement\",\n",
"]\n",
"corr = panel[corr_cols].corr().round(2)\n",
"\n",
"fig, ax = plt.subplots(figsize=(9, 7))\n",
"im = ax.imshow(corr, cmap=\"RdBu_r\", vmin=-1, vmax=1)\n",
"ax.set_xticks(range(len(corr_cols)))\n",
"ax.set_yticks(range(len(corr_cols)))\n",
"ax.set_xticklabels(corr_cols, rotation=45, ha=\"right\", fontsize=9)\n",
"ax.set_yticklabels(corr_cols, fontsize=9)\n",
"for i in range(len(corr_cols)):\n",
" for j in range(len(corr_cols)):\n",
" ax.text(j, i, corr.iloc[i, j], ha=\"center\", va=\"center\", fontsize=8)\n",
"plt.colorbar(im, ax=ax)\n",
"ax.set_title(\"Correlation Matrix — Key Variables\")\n",
"plt.tight_layout()\n",
"plt.show()\n"
]
},
{
"cell_type": "markdown",
"id": "3ca1410a",
"metadata": {},
"source": [
"## H1: Organizational Capacity → Policy Outputs\n",
"\n",
"**Prediction:** Higher OGI budget and FTE predict more inspections, higher compliance rates, and faster violation resolution.\n",
"\n",
"**Model:** OLS with district fixed effects and year 20162023 (excluding 2024 budget estimate). Budget varies only over time, so it identifies via year-to-year changes in statewide RRC allocations; district FE absorbs persistent cross-district differences.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "463387d3",
"metadata": {},
"outputs": [],
"source": [
"actuals = panel[panel[\"is_budget_year\"] == 0].copy()\n",
"\n",
"fig, axes = plt.subplots(1, 3, figsize=(15, 4))\n",
"\n",
"actuals.plot.scatter(x=\"ogi_budget_m\", y=\"total_inspections\",\n",
" alpha=0.4, ax=axes[0], title=\"Budget → Inspections\")\n",
"actuals.plot.scatter(x=\"ogi_budget_m\", y=\"compliance_rate\",\n",
" alpha=0.4, ax=axes[1], title=\"Budget → Compliance Rate (%)\")\n",
"actuals.plot.scatter(x=\"ogi_budget_m\", y=\"resolution_rate\",\n",
" alpha=0.4, ax=axes[2], title=\"Budget → Resolution Rate (%)\")\n",
"\n",
"for ax in axes:\n",
" ax.set_xlabel(\"OGI Budget ($M)\")\n",
"\n",
"plt.tight_layout()\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "535fc4eb",
"metadata": {},
"outputs": [],
"source": [
"m_inspections = smf.ols(\n",
" \"total_inspections ~ ogi_budget_m + C(district)\",\n",
" data=actuals,\n",
").fit(cov_type=\"cluster\", cov_kwds={\"groups\": actuals[\"district\"]})\n",
"\n",
"m_compliance = smf.ols(\n",
" \"compliance_rate ~ ogi_budget_m + C(district)\",\n",
" data=actuals,\n",
").fit(cov_type=\"cluster\", cov_kwds={\"groups\": actuals[\"district\"]})\n",
"\n",
"m_resolution = smf.ols(\n",
" \"resolution_rate ~ ogi_budget_m + C(district)\",\n",
" data=actuals,\n",
").fit(cov_type=\"cluster\", cov_kwds={\"groups\": actuals[\"district\"]})\n",
"\n",
"display_cols = [\"Coef.\", \"Std.Err.\", \"t\", \"P>|t|\"]\n",
"\n",
"print(\"H1a — OGI Budget ($M) → Total Inspections\")\n",
"print(m_inspections.summary2().tables[1][display_cols].loc[[\"ogi_budget_m\"]])\n",
"print(f\" R² = {m_inspections.rsquared:.3f} Adj. R² = {m_inspections.rsquared_adj:.3f}\\n\")\n",
"\n",
"print(\"H1b — OGI Budget ($M) → Compliance Rate (%)\")\n",
"print(m_compliance.summary2().tables[1][display_cols].loc[[\"ogi_budget_m\"]])\n",
"print(f\" R² = {m_compliance.rsquared:.3f} Adj. R² = {m_compliance.rsquared_adj:.3f}\\n\")\n",
"\n",
"print(\"H1c — OGI Budget ($M) → Resolution Rate (%)\")\n",
"print(m_resolution.summary2().tables[1][display_cols].loc[[\"ogi_budget_m\"]])\n",
"print(f\" R² = {m_resolution.rsquared:.3f} Adj. R² = {m_resolution.rsquared_adj:.3f}\")\n"
]
},
{
"cell_type": "markdown",
"id": "56add68a",
"metadata": {},
"source": [
"## H2: Goal Ambiguity Moderates Capacity Effects\n",
"\n",
"**Prediction:** When a larger share of the combined RRC budget flows to the broader \"Energy Resource Development\" goal (lower `inspection_budget_share`), the capacity → output link weakens.\n",
"\n",
"**Operationalization:**\n",
"`inspection_budget_share = ogi_budget / (ogi_budget + erd_budget)`\n",
"\n",
"A negative interaction coefficient `ogi_budget_m × inspection_budget_share` would be unexpected (higher share → weaker effect). A positive coefficient supports H2 — clearer mission focus amplifies the budget → compliance relationship.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "24187ce8",
"metadata": {},
"outputs": [],
"source": [
"m_h2 = smf.ols(\n",
" \"compliance_rate ~ ogi_budget_m * inspection_budget_share + C(district)\",\n",
" data=actuals,\n",
").fit(cov_type=\"cluster\", cov_kwds={\"groups\": actuals[\"district\"]})\n",
"\n",
"key_rows = [\"ogi_budget_m\", \"inspection_budget_share\", \"ogi_budget_m:inspection_budget_share\"]\n",
"print(\"H2 — Goal Ambiguity Moderation (DV: compliance_rate)\")\n",
"print(m_h2.summary2().tables[1][display_cols].loc[key_rows])\n",
"print(f\"\\nR² = {m_h2.rsquared:.3f} Adj. R² = {m_h2.rsquared_adj:.3f}\")\n",
"\n",
"# ── Same model with resolution rate as DV ────────────────────────────────────\n",
"m_h2_res = smf.ols(\n",
" \"resolution_rate ~ ogi_budget_m * inspection_budget_share + C(district)\",\n",
" data=actuals,\n",
").fit(cov_type=\"cluster\", cov_kwds={\"groups\": actuals[\"district\"]})\n",
"\n",
"print(\"\\nH2 — Goal Ambiguity Moderation (DV: resolution_rate)\")\n",
"print(m_h2_res.summary2().tables[1][display_cols].loc[key_rows])\n",
"print(f\"\\nR² = {m_h2_res.rsquared:.3f} Adj. R² = {m_h2_res.rsquared_adj:.3f}\")\n"
]
},
{
"cell_type": "markdown",
"id": "b6583857",
"metadata": {},
"source": [
"## H3: District Multilevel Effects\n",
"\n",
"**Prediction:** The budget → output slope varies across RRC districts — some districts translate budget increases into better outputs more effectively than others.\n",
"\n",
"**Model:** Interaction `ogi_budget_m × C(district)` — the reference district captures the baseline budget slope; interaction terms show how each other district's slope differs.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "151faefd",
"metadata": {},
"outputs": [],
"source": [
"m_h3 = smf.ols(\n",
" \"compliance_rate ~ ogi_budget_m * C(district)\",\n",
" data=actuals,\n",
").fit(cov_type=\"cluster\", cov_kwds={\"groups\": actuals[\"district\"]})\n",
"\n",
"coef_table = m_h3.summary2().tables[1]\n",
"\n",
"# Baseline budget slope (reference district)\n",
"baseline_row = coef_table.loc[[\"ogi_budget_m\"]]\n",
"print(\"H3 — District-Heterogeneous Budget Effect (DV: compliance_rate)\")\n",
"print(f\"Baseline (reference district) budget slope:\")\n",
"print(baseline_row[display_cols])\n",
"\n",
"# District-specific deviations from baseline\n",
"interaction_rows = coef_table[coef_table.index.str.contains(\"ogi_budget_m:C\")]\n",
"print(\"\\nDistrict interaction terms (deviation from reference slope):\")\n",
"print(interaction_rows[display_cols].round(4))\n",
"print(f\"\\nR² = {m_h3.rsquared:.3f} Adj. R² = {m_h3.rsquared_adj:.3f}\")\n",
"\n",
"# ── Plot district-specific budget slopes ─────────────────────────────────────\n",
"districts = actuals[\"district\"].unique()\n",
"slopes = {}\n",
"for d in districts:\n",
" key = f\"ogi_budget_m:C(district)[T.{d}]\"\n",
" base = m_h3.params.get(\"ogi_budget_m\", 0)\n",
" delta = m_h3.params.get(key, 0)\n",
" slopes[str(d)] = base + delta\n",
"\n",
"slope_df = pd.Series(slopes).sort_values()\n",
"fig, ax = plt.subplots(figsize=(10, 4))\n",
"slope_df.plot.barh(ax=ax, color=[\"#d62728\" if v < 0 else \"#1f77b4\" for v in slope_df])\n",
"ax.axvline(0, color=\"black\", linewidth=0.8)\n",
"ax.set_xlabel(\"Budget slope (compliance rate pp per $M)\")\n",
"ax.set_title(\"H3 — District-Specific Budget → Compliance Slopes\")\n",
"plt.tight_layout()\n",
"plt.show()\n"
]
},
{
"cell_type": "markdown",
"id": "5bb27b3e",
"metadata": {},
"source": [
"## H4: Spatial and Geographic Factors\n",
"\n",
"**Predictions:**\n",
"- Offshore-jurisdiction districts (02, 03, 04) show a different budget → output relationship due to dual onshore/offshore oversight burden.\n",
"- Border-proximate districts show a different relationship due to cross-jurisdiction enforcement complexity.\n",
"- Spatial autocorrelation in H1 residuals (Moran's I) would indicate unmodeled geographic spillovers.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d6e56f00",
"metadata": {},
"outputs": [],
"source": [
"# Texas RRC district geography flags (based on known RRC district locations)\n",
"OFFSHORE_DISTRICTS = {\"02\", \"03\", \"04\"} # dual onshore + offshore jurisdiction\n",
"BORDER_DISTRICTS = {\"01\", \"02\", \"03\", \"04\"} # south / gulf coast proximity\n",
"\n",
"actuals = actuals.copy()\n",
"actuals[\"district_str\"] = actuals[\"district\"].astype(str).str.strip()\n",
"actuals[\"offshore\"] = actuals[\"district_str\"].isin(OFFSHORE_DISTRICTS).astype(int)\n",
"actuals[\"border\"] = actuals[\"district_str\"].isin(BORDER_DISTRICTS).astype(int)\n",
"\n",
"print(\"District classification:\")\n",
"print(\n",
" actuals.groupby([\"district_str\", \"offshore\", \"border\"])\n",
" .size()\n",
" .reset_index(name=\"district_year_obs\")\n",
" .to_string(index=False)\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "74686bfe",
"metadata": {},
"outputs": [],
"source": [
"# ── Spatial regression: offshore and border interactions ─────────────────────\n",
"m_h4 = smf.ols(\n",
" \"compliance_rate ~ ogi_budget_m + offshore + border \"\n",
" \"+ ogi_budget_m:offshore + ogi_budget_m:border + C(district)\",\n",
" data=actuals,\n",
").fit(cov_type=\"cluster\", cov_kwds={\"groups\": actuals[\"district\"]})\n",
"\n",
"spatial_rows = [\n",
" \"ogi_budget_m\", \"offshore\", \"border\",\n",
" \"ogi_budget_m:offshore\", \"ogi_budget_m:border\",\n",
"]\n",
"available = [r for r in spatial_rows if r in m_h4.params.index]\n",
"print(\"H4 — Spatial Moderators (DV: compliance_rate)\")\n",
"print(m_h4.summary2().tables[1][display_cols].loc[available])\n",
"print(f\"\\nR² = {m_h4.rsquared:.3f} Adj. R² = {m_h4.rsquared_adj:.3f}\")\n",
"\n",
"# ── Moran's I on H1 residuals ─────────────────────────────────────────────────\n",
"# Compute district centroids from well lat/lon joined via inspections\n",
"centroids_sql = \"\"\"\n",
"SELECT\n",
" i.district,\n",
" AVG(w.latitude) AS lat,\n",
" AVG(w.longitude) AS lon\n",
"FROM inspections i\n",
"JOIN well_shape_tract w USING (api_norm)\n",
"WHERE w.latitude IS NOT NULL\n",
" AND w.longitude IS NOT NULL\n",
" AND i.district IS NOT NULL\n",
"GROUP BY i.district\n",
"\"\"\"\n",
"\n",
"try:\n",
" centroids = pd.read_sql(text(centroids_sql), engine)\n",
"\n",
" # Average H1 compliance residuals to district level\n",
" resid_df = actuals[[\"district\", \"compliance_rate\"]].copy()\n",
" resid_df[\"resid\"] = m_compliance.resid.reindex(actuals.index).values\n",
" resid_by_district = resid_df.groupby(\"district\")[\"resid\"].mean().reset_index()\n",
"\n",
" centroids = centroids.merge(resid_by_district, on=\"district\").dropna()\n",
"\n",
" # Row-normalised inverse-distance weights matrix\n",
" coords = centroids[[\"lon\", \"lat\"]].values\n",
" D = cdist(coords, coords)\n",
" np.fill_diagonal(D, np.inf)\n",
" W = 1 / D\n",
" W = W / W.sum(axis=1, keepdims=True)\n",
"\n",
" z = centroids[\"resid\"].values\n",
" z = z - z.mean()\n",
" n = len(z)\n",
" morans_i = (n / W.sum()) * (z @ W @ z) / (z @ z)\n",
"\n",
" print(f\"\\nMoran's I on H1 compliance residuals = {morans_i:.4f}\")\n",
" print(\" > 0 → residuals cluster spatially (similar neighbours)\")\n",
" print(\" ≈ 0 → no spatial pattern\")\n",
" print(\" < 0 → spatial dispersion (dissimilar neighbours)\")\n",
"\n",
" print(\"\\nDistrict centroids used:\")\n",
" print(centroids[[\"district\", \"lat\", \"lon\"]].round(2).to_string(index=False))\n",
"\n",
"except Exception as e:\n",
" print(f\"Moran's I skipped: {e}\")\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.9.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}