Initial commit: texas inspection expenses analysis notebook

Jupyter notebook analyzing RRC budget (2016-2024) against inspection/violation
outcomes across Texas RRC districts. Tests four hypotheses: organizational
capacity, goal ambiguity, district multilevel effects, and spatial factors.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-25 12:17:56 -08:00
commit 8038a3fab4
2 changed files with 634 additions and 0 deletions

View File

@@ -0,0 +1,634 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "dc6818b8",
"metadata": {},
"source": [
"# Texas RRC Inspection Expenses Analysis\n",
"\n",
"**Research question:** Does organizational capacity (budget, staffing) predict better regulatory outputs (inspections, compliance, enforcement), and how is that relationship moderated by goal ambiguity, district-level heterogeneity, and spatial/geographic factors?\n",
"\n",
"## Hypotheses\n",
"- **H1 — Capacity → Outputs:** Higher OGI budget and FTE predict more inspections, higher compliance rates, and faster violation resolution.\n",
"- **H2 — Goal Ambiguity:** When a larger share of RRC budget goes to the more ambiguous \"Energy Resource Development\" goal, the capacity → output relationship weakens.\n",
"- **H3 — Multilevel / District Effects:** The capacity → output relationship varies across RRC districts (budget slope heterogeneity).\n",
"- **H4 — Spatial & Geographic:** Offshore-jurisdiction and border districts moderate the capacity → output relationship; spatial autocorrelation in residuals is tested via Moran's I.\n",
"\n",
"**Data:**\n",
"- PostgreSQL warehouse (`texas_data`): `inspections`, `violations`, `well_shape_tract`\n",
"- `RRC Budget Data.xlsx`: statewide RRC budget by strategy, 20162024\n",
"- Analysis period: 20162023 (2024 is budget estimate, excluded from regressions)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "49de2b5c",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import warnings\n",
"from pathlib import Path\n",
"from urllib.parse import quote_plus\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import matplotlib.ticker as mticker\n",
"import statsmodels.formula.api as smf\n",
"from dotenv import load_dotenv\n",
"from sqlalchemy import create_engine, text\n",
"from scipy.spatial.distance import cdist\n",
"\n",
"warnings.filterwarnings(\"ignore\", category=UserWarning)\n",
"pd.set_option(\"display.float_format\", \"{:,.2f}\".format)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "08420da3",
"metadata": {},
"outputs": [],
"source": [
"load_dotenv(override=False)\n",
"\n",
"host = os.getenv(\"PGHOST\", \"localhost\")\n",
"port = os.getenv(\"PGPORT\", \"5432\")\n",
"user = os.getenv(\"PGUSER\", \"postgres\")\n",
"password = quote_plus(os.getenv(\"PGPASSWORD\", \"\"))\n",
"database = os.getenv(\"PGDATABASE\", \"texas_data\")\n",
"\n",
"engine = create_engine(\n",
" f\"postgresql+psycopg2://{user}:{password}@{host}:{port}/{database}\"\n",
")\n",
"print(f\"Connected → {database} on {host}:{port}\")\n"
]
},
{
"cell_type": "markdown",
"id": "b4e6c44f",
"metadata": {},
"source": [
"## 1. Data Loading"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "43886f13",
"metadata": {},
"outputs": [],
"source": [
"# District-year inspection metrics aggregated in SQL.\n",
"# LAG() computes days since the previous inspection for the same well (api_norm).\n",
"insp_sql = \"\"\"\n",
"WITH lagged AS (\n",
" SELECT\n",
" district,\n",
" EXTRACT(year FROM inspection_date)::int AS year,\n",
" api_norm,\n",
" inspection_date,\n",
" CASE WHEN UPPER(compliance::text) IN ('YES', 'Y') THEN 1.0 ELSE 0.0 END AS is_compliant,\n",
" EXTRACT(EPOCH FROM (\n",
" inspection_date\n",
" - LAG(inspection_date) OVER (PARTITION BY api_norm ORDER BY inspection_date)\n",
" )) / 86400.0 AS days_since_prev\n",
" FROM inspections\n",
" WHERE inspection_date IS NOT NULL\n",
" AND district IS NOT NULL\n",
" AND EXTRACT(year FROM inspection_date) BETWEEN 2016 AND 2024\n",
")\n",
"SELECT\n",
" district,\n",
" year,\n",
" COUNT(*) AS total_inspections,\n",
" COUNT(DISTINCT api_norm) AS unique_wells,\n",
" ROUND(AVG(is_compliant)::numeric * 100, 2) AS compliance_rate,\n",
" ROUND(AVG(days_since_prev)::numeric, 1) AS avg_days_between_inspections\n",
"FROM lagged\n",
"GROUP BY district, year\n",
"ORDER BY district, year\n",
"\"\"\"\n",
"\n",
"insp = pd.read_sql(text(insp_sql), engine)\n",
"print(f\"Inspections panel: {len(insp):,} district-year rows | {insp['district'].nunique()} districts\")\n",
"insp.head()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3841e2f5",
"metadata": {},
"outputs": [],
"source": [
"# District-year violation metrics. Blank last_enf_action strings treated as no action.\n",
"viol_sql = \"\"\"\n",
"SELECT\n",
" district,\n",
" EXTRACT(year FROM violation_disc_date)::int AS year,\n",
" COUNT(*) AS total_violations,\n",
" COUNT(DISTINCT api_norm) AS unique_wells_with_violations,\n",
" SUM(CASE WHEN major_viol_ind = 'Y' THEN 1 ELSE 0 END) AS major_violations,\n",
" ROUND(AVG(CASE WHEN compliant_on_reinsp = 'Y' THEN 1.0 ELSE 0.0 END)::numeric * 100, 2)\n",
" AS resolution_rate,\n",
" ROUND(AVG(CASE WHEN last_enf_action IS NOT NULL AND last_enf_action <> ''\n",
" THEN 1.0 ELSE 0.0 END)::numeric * 100, 2) AS enforcement_rate,\n",
" ROUND(AVG(\n",
" CASE WHEN last_enf_action_date IS NOT NULL\n",
" THEN EXTRACT(EPOCH FROM (last_enf_action_date - violation_disc_date)) / 86400.0\n",
" END\n",
" )::numeric, 1) AS avg_days_to_enforcement\n",
"FROM violations\n",
"WHERE violation_disc_date IS NOT NULL\n",
" AND district IS NOT NULL\n",
" AND EXTRACT(year FROM violation_disc_date) BETWEEN 2016 AND 2024\n",
"GROUP BY district, year\n",
"ORDER BY district, year\n",
"\"\"\"\n",
"\n",
"viol = pd.read_sql(text(viol_sql), engine)\n",
"print(f\"Violations panel: {len(viol):,} district-year rows\")\n",
"viol.head()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9e196cac",
"metadata": {},
"outputs": [],
"source": [
"BUDGET_PATH = Path(\"RRC Budget Data.xlsx\")\n",
"raw = pd.read_excel(BUDGET_PATH, header=None)\n",
"YEARS = [2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]\n",
"COLS = slice(1, 10) # spreadsheet columns 1-9 map to years 2016-2024\n",
"\n",
"# ── Section 1: Energy Resource Development (rows 7-18) ──────────────────────\n",
"erd = pd.DataFrame({\n",
" \"year\": YEARS,\n",
" \"strategy\": \"Energy Resource Development\",\n",
" \"total_budget\": raw.iloc[1, COLS].values.astype(float),\n",
" \"salaries\": raw.iloc[7, COLS].values.astype(float),\n",
" \"other_personnel\": raw.iloc[8, COLS].values.astype(float),\n",
" \"professional_fees\": raw.iloc[9, COLS].values.astype(float),\n",
" \"travel\": raw.iloc[13, COLS].values.astype(float),\n",
" \"other_operating\": raw.iloc[16, COLS].values.astype(float),\n",
" \"capital_exp\": raw.iloc[17, COLS].values.astype(float),\n",
" \"fte\": raw.iloc[18, COLS].values.astype(float),\n",
"})\n",
"\n",
"# ── Section 2: Oil/Gas Monitoring & Inspections (rows 20-31) ────────────────\n",
"ogi = pd.DataFrame({\n",
" \"year\": YEARS,\n",
" \"strategy\": \"Oil/Gas Monitoring & Inspections\",\n",
" \"total_budget\": raw.iloc[2, COLS].values.astype(float),\n",
" \"salaries\": raw.iloc[20, COLS].values.astype(float),\n",
" \"other_personnel\": raw.iloc[21, COLS].values.astype(float),\n",
" \"professional_fees\": raw.iloc[22, COLS].values.astype(float),\n",
" \"travel\": raw.iloc[26, COLS].values.astype(float),\n",
" \"other_operating\": raw.iloc[29, COLS].values.astype(float),\n",
" \"capital_exp\": raw.iloc[30, COLS].values.astype(float),\n",
" \"fte\": raw.iloc[31, COLS].values.astype(float),\n",
"})\n",
"\n",
"budget_long = pd.concat([erd, ogi], ignore_index=True)\n",
"print(f\"Budget long: {len(budget_long)} rows (2 strategies × {len(YEARS)} years)\")\n",
"budget_long\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "896d152b",
"metadata": {},
"outputs": [],
"source": [
"# ── Wide budget: one row per year with ogi_ / erd_ prefixed columns ──────────\n",
"ogi_wide = ogi.drop(columns=\"strategy\").add_prefix(\"ogi_\")\n",
"erd_wide = erd.drop(columns=\"strategy\").add_prefix(\"erd_\")\n",
"\n",
"budget_wide = (\n",
" ogi_wide\n",
" .merge(erd_wide, left_on=\"ogi_year\", right_on=\"erd_year\")\n",
" .rename(columns={\"ogi_year\": \"year\"})\n",
" .drop(columns=\"erd_year\")\n",
")\n",
"\n",
"# ── Merge inspections + violations, then join statewide budget on year ────────\n",
"panel = (\n",
" insp\n",
" .merge(viol, on=[\"district\", \"year\"], how=\"left\")\n",
" .merge(budget_wide, on=\"year\", how=\"left\")\n",
")\n",
"\n",
"# ── Derived columns ───────────────────────────────────────────────────────────\n",
"panel[\"violations_per_inspection\"] = panel[\"total_violations\"] / panel[\"total_inspections\"]\n",
"panel[\"ogi_budget_m\"] = panel[\"ogi_total_budget\"] / 1_000_000 # dollars → millions\n",
"panel[\"erd_budget_m\"] = panel[\"erd_total_budget\"] / 1_000_000\n",
"panel[\"post_2019\"] = (panel[\"year\"] >= 2019).astype(int)\n",
"panel[\"is_budget_year\"] = (panel[\"year\"] == 2024).astype(int)\n",
"\n",
"# Goal ambiguity: share of combined budget going to the inspection mission.\n",
"# Higher share = clearer mission focus; lower share = more goal ambiguity.\n",
"panel[\"inspection_budget_share\"] = (\n",
" panel[\"ogi_total_budget\"] / (panel[\"ogi_total_budget\"] + panel[\"erd_total_budget\"])\n",
")\n",
"\n",
"# Fill violation NaNs for districts with zero violations in a given year\n",
"fill_cols = [\n",
" \"total_violations\", \"unique_wells_with_violations\", \"major_violations\",\n",
" \"resolution_rate\", \"enforcement_rate\", \"avg_days_to_enforcement\",\n",
" \"violations_per_inspection\",\n",
"]\n",
"panel[fill_cols] = panel[fill_cols].fillna(0)\n",
"\n",
"print(f\"Analysis panel: {len(panel):,} rows | \"\n",
" f\"{panel['district'].nunique()} districts | \"\n",
" f\"{panel['year'].nunique()} years\")\n",
"panel.head()\n"
]
},
{
"cell_type": "markdown",
"id": "3558bae7",
"metadata": {},
"source": [
"## 2. Exploratory Overview"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "92921756",
"metadata": {},
"outputs": [],
"source": [
"# Year-level means across districts\n",
"yearly = panel.groupby(\"year\").agg(\n",
" ogi_budget_m = (\"ogi_budget_m\", \"first\"),\n",
" ogi_fte = (\"ogi_fte\", \"first\"),\n",
" total_inspections = (\"total_inspections\", \"mean\"),\n",
" compliance_rate = (\"compliance_rate\", \"mean\"),\n",
" total_violations = (\"total_violations\", \"mean\"),\n",
" resolution_rate = (\"resolution_rate\", \"mean\"),\n",
" avg_days_to_enf = (\"avg_days_to_enforcement\",\"mean\"),\n",
").round(2)\n",
"\n",
"print(yearly.to_string())\n",
"\n",
"fig, axes = plt.subplots(2, 3, figsize=(16, 8))\n",
"axes = axes.flatten()\n",
"\n",
"yearly[\"ogi_budget_m\"].plot(ax=axes[0], marker=\"o\", title=\"OGI Budget ($M)\")\n",
"axes[0].yaxis.set_major_formatter(mticker.FuncFormatter(lambda x, _: f\"${x:.0f}M\"))\n",
"\n",
"yearly[\"ogi_fte\"].plot(ax=axes[1], marker=\"o\", title=\"OGI FTE Positions\")\n",
"\n",
"yearly[\"total_inspections\"].plot(ax=axes[2], marker=\"o\", title=\"Avg Inspections / District\")\n",
"\n",
"yearly[\"compliance_rate\"].plot(ax=axes[3], marker=\"o\", title=\"Avg Compliance Rate (%)\")\n",
"\n",
"yearly[\"resolution_rate\"].plot(ax=axes[4], marker=\"o\", title=\"Avg Resolution Rate (%)\")\n",
"\n",
"yearly[\"avg_days_to_enf\"].plot(ax=axes[5], marker=\"o\", title=\"Avg Days to Enforcement\")\n",
"\n",
"for ax in axes:\n",
" ax.axvline(2019, color=\"red\", linestyle=\"--\", alpha=0.5, label=\"2019 policy\")\n",
" ax.set_xlabel(\"Year\")\n",
"\n",
"plt.tight_layout()\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5d2671c9",
"metadata": {},
"outputs": [],
"source": [
"corr_cols = [\n",
" \"ogi_budget_m\", \"ogi_fte\", \"inspection_budget_share\",\n",
" \"total_inspections\", \"compliance_rate\",\n",
" \"total_violations\", \"resolution_rate\", \"avg_days_to_enforcement\",\n",
"]\n",
"corr = panel[corr_cols].corr().round(2)\n",
"\n",
"fig, ax = plt.subplots(figsize=(9, 7))\n",
"im = ax.imshow(corr, cmap=\"RdBu_r\", vmin=-1, vmax=1)\n",
"ax.set_xticks(range(len(corr_cols)))\n",
"ax.set_yticks(range(len(corr_cols)))\n",
"ax.set_xticklabels(corr_cols, rotation=45, ha=\"right\", fontsize=9)\n",
"ax.set_yticklabels(corr_cols, fontsize=9)\n",
"for i in range(len(corr_cols)):\n",
" for j in range(len(corr_cols)):\n",
" ax.text(j, i, corr.iloc[i, j], ha=\"center\", va=\"center\", fontsize=8)\n",
"plt.colorbar(im, ax=ax)\n",
"ax.set_title(\"Correlation Matrix — Key Variables\")\n",
"plt.tight_layout()\n",
"plt.show()\n"
]
},
{
"cell_type": "markdown",
"id": "3ca1410a",
"metadata": {},
"source": [
"## H1: Organizational Capacity → Policy Outputs\n",
"\n",
"**Prediction:** Higher OGI budget and FTE predict more inspections, higher compliance rates, and faster violation resolution.\n",
"\n",
"**Model:** OLS with district fixed effects and year 20162023 (excluding 2024 budget estimate). Budget varies only over time, so it identifies via year-to-year changes in statewide RRC allocations; district FE absorbs persistent cross-district differences.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "463387d3",
"metadata": {},
"outputs": [],
"source": [
"actuals = panel[panel[\"is_budget_year\"] == 0].copy()\n",
"\n",
"fig, axes = plt.subplots(1, 3, figsize=(15, 4))\n",
"\n",
"actuals.plot.scatter(x=\"ogi_budget_m\", y=\"total_inspections\",\n",
" alpha=0.4, ax=axes[0], title=\"Budget → Inspections\")\n",
"actuals.plot.scatter(x=\"ogi_budget_m\", y=\"compliance_rate\",\n",
" alpha=0.4, ax=axes[1], title=\"Budget → Compliance Rate (%)\")\n",
"actuals.plot.scatter(x=\"ogi_budget_m\", y=\"resolution_rate\",\n",
" alpha=0.4, ax=axes[2], title=\"Budget → Resolution Rate (%)\")\n",
"\n",
"for ax in axes:\n",
" ax.set_xlabel(\"OGI Budget ($M)\")\n",
"\n",
"plt.tight_layout()\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "535fc4eb",
"metadata": {},
"outputs": [],
"source": [
"m_inspections = smf.ols(\n",
" \"total_inspections ~ ogi_budget_m + C(district)\",\n",
" data=actuals,\n",
").fit(cov_type=\"cluster\", cov_kwds={\"groups\": actuals[\"district\"]})\n",
"\n",
"m_compliance = smf.ols(\n",
" \"compliance_rate ~ ogi_budget_m + C(district)\",\n",
" data=actuals,\n",
").fit(cov_type=\"cluster\", cov_kwds={\"groups\": actuals[\"district\"]})\n",
"\n",
"m_resolution = smf.ols(\n",
" \"resolution_rate ~ ogi_budget_m + C(district)\",\n",
" data=actuals,\n",
").fit(cov_type=\"cluster\", cov_kwds={\"groups\": actuals[\"district\"]})\n",
"\n",
"display_cols = [\"Coef.\", \"Std.Err.\", \"t\", \"P>|t|\"]\n",
"\n",
"print(\"H1a — OGI Budget ($M) → Total Inspections\")\n",
"print(m_inspections.summary2().tables[1][display_cols].loc[[\"ogi_budget_m\"]])\n",
"print(f\" R² = {m_inspections.rsquared:.3f} Adj. R² = {m_inspections.rsquared_adj:.3f}\\n\")\n",
"\n",
"print(\"H1b — OGI Budget ($M) → Compliance Rate (%)\")\n",
"print(m_compliance.summary2().tables[1][display_cols].loc[[\"ogi_budget_m\"]])\n",
"print(f\" R² = {m_compliance.rsquared:.3f} Adj. R² = {m_compliance.rsquared_adj:.3f}\\n\")\n",
"\n",
"print(\"H1c — OGI Budget ($M) → Resolution Rate (%)\")\n",
"print(m_resolution.summary2().tables[1][display_cols].loc[[\"ogi_budget_m\"]])\n",
"print(f\" R² = {m_resolution.rsquared:.3f} Adj. R² = {m_resolution.rsquared_adj:.3f}\")\n"
]
},
{
"cell_type": "markdown",
"id": "56add68a",
"metadata": {},
"source": [
"## H2: Goal Ambiguity Moderates Capacity Effects\n",
"\n",
"**Prediction:** When a larger share of the combined RRC budget flows to the broader \"Energy Resource Development\" goal (lower `inspection_budget_share`), the capacity → output link weakens.\n",
"\n",
"**Operationalization:**\n",
"`inspection_budget_share = ogi_budget / (ogi_budget + erd_budget)`\n",
"\n",
"A negative interaction coefficient `ogi_budget_m × inspection_budget_share` would be unexpected (higher share → weaker effect). A positive coefficient supports H2 — clearer mission focus amplifies the budget → compliance relationship.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "24187ce8",
"metadata": {},
"outputs": [],
"source": [
"m_h2 = smf.ols(\n",
" \"compliance_rate ~ ogi_budget_m * inspection_budget_share + C(district)\",\n",
" data=actuals,\n",
").fit(cov_type=\"cluster\", cov_kwds={\"groups\": actuals[\"district\"]})\n",
"\n",
"key_rows = [\"ogi_budget_m\", \"inspection_budget_share\", \"ogi_budget_m:inspection_budget_share\"]\n",
"print(\"H2 — Goal Ambiguity Moderation (DV: compliance_rate)\")\n",
"print(m_h2.summary2().tables[1][display_cols].loc[key_rows])\n",
"print(f\"\\nR² = {m_h2.rsquared:.3f} Adj. R² = {m_h2.rsquared_adj:.3f}\")\n",
"\n",
"# ── Same model with resolution rate as DV ────────────────────────────────────\n",
"m_h2_res = smf.ols(\n",
" \"resolution_rate ~ ogi_budget_m * inspection_budget_share + C(district)\",\n",
" data=actuals,\n",
").fit(cov_type=\"cluster\", cov_kwds={\"groups\": actuals[\"district\"]})\n",
"\n",
"print(\"\\nH2 — Goal Ambiguity Moderation (DV: resolution_rate)\")\n",
"print(m_h2_res.summary2().tables[1][display_cols].loc[key_rows])\n",
"print(f\"\\nR² = {m_h2_res.rsquared:.3f} Adj. R² = {m_h2_res.rsquared_adj:.3f}\")\n"
]
},
{
"cell_type": "markdown",
"id": "b6583857",
"metadata": {},
"source": [
"## H3: District Multilevel Effects\n",
"\n",
"**Prediction:** The budget → output slope varies across RRC districts — some districts translate budget increases into better outputs more effectively than others.\n",
"\n",
"**Model:** Interaction `ogi_budget_m × C(district)` — the reference district captures the baseline budget slope; interaction terms show how each other district's slope differs.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "151faefd",
"metadata": {},
"outputs": [],
"source": [
"m_h3 = smf.ols(\n",
" \"compliance_rate ~ ogi_budget_m * C(district)\",\n",
" data=actuals,\n",
").fit(cov_type=\"cluster\", cov_kwds={\"groups\": actuals[\"district\"]})\n",
"\n",
"coef_table = m_h3.summary2().tables[1]\n",
"\n",
"# Baseline budget slope (reference district)\n",
"baseline_row = coef_table.loc[[\"ogi_budget_m\"]]\n",
"print(\"H3 — District-Heterogeneous Budget Effect (DV: compliance_rate)\")\n",
"print(f\"Baseline (reference district) budget slope:\")\n",
"print(baseline_row[display_cols])\n",
"\n",
"# District-specific deviations from baseline\n",
"interaction_rows = coef_table[coef_table.index.str.contains(\"ogi_budget_m:C\")]\n",
"print(\"\\nDistrict interaction terms (deviation from reference slope):\")\n",
"print(interaction_rows[display_cols].round(4))\n",
"print(f\"\\nR² = {m_h3.rsquared:.3f} Adj. R² = {m_h3.rsquared_adj:.3f}\")\n",
"\n",
"# ── Plot district-specific budget slopes ─────────────────────────────────────\n",
"districts = actuals[\"district\"].unique()\n",
"slopes = {}\n",
"for d in districts:\n",
" key = f\"ogi_budget_m:C(district)[T.{d}]\"\n",
" base = m_h3.params.get(\"ogi_budget_m\", 0)\n",
" delta = m_h3.params.get(key, 0)\n",
" slopes[str(d)] = base + delta\n",
"\n",
"slope_df = pd.Series(slopes).sort_values()\n",
"fig, ax = plt.subplots(figsize=(10, 4))\n",
"slope_df.plot.barh(ax=ax, color=[\"#d62728\" if v < 0 else \"#1f77b4\" for v in slope_df])\n",
"ax.axvline(0, color=\"black\", linewidth=0.8)\n",
"ax.set_xlabel(\"Budget slope (compliance rate pp per $M)\")\n",
"ax.set_title(\"H3 — District-Specific Budget → Compliance Slopes\")\n",
"plt.tight_layout()\n",
"plt.show()\n"
]
},
{
"cell_type": "markdown",
"id": "5bb27b3e",
"metadata": {},
"source": [
"## H4: Spatial and Geographic Factors\n",
"\n",
"**Predictions:**\n",
"- Offshore-jurisdiction districts (02, 03, 04) show a different budget → output relationship due to dual onshore/offshore oversight burden.\n",
"- Border-proximate districts show a different relationship due to cross-jurisdiction enforcement complexity.\n",
"- Spatial autocorrelation in H1 residuals (Moran's I) would indicate unmodeled geographic spillovers.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d6e56f00",
"metadata": {},
"outputs": [],
"source": [
"# Texas RRC district geography flags (based on known RRC district locations)\n",
"OFFSHORE_DISTRICTS = {\"02\", \"03\", \"04\"} # dual onshore + offshore jurisdiction\n",
"BORDER_DISTRICTS = {\"01\", \"02\", \"03\", \"04\"} # south / gulf coast proximity\n",
"\n",
"actuals = actuals.copy()\n",
"actuals[\"district_str\"] = actuals[\"district\"].astype(str).str.strip()\n",
"actuals[\"offshore\"] = actuals[\"district_str\"].isin(OFFSHORE_DISTRICTS).astype(int)\n",
"actuals[\"border\"] = actuals[\"district_str\"].isin(BORDER_DISTRICTS).astype(int)\n",
"\n",
"print(\"District classification:\")\n",
"print(\n",
" actuals.groupby([\"district_str\", \"offshore\", \"border\"])\n",
" .size()\n",
" .reset_index(name=\"district_year_obs\")\n",
" .to_string(index=False)\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "74686bfe",
"metadata": {},
"outputs": [],
"source": [
"# ── Spatial regression: offshore and border interactions ─────────────────────\n",
"m_h4 = smf.ols(\n",
" \"compliance_rate ~ ogi_budget_m + offshore + border \"\n",
" \"+ ogi_budget_m:offshore + ogi_budget_m:border + C(district)\",\n",
" data=actuals,\n",
").fit(cov_type=\"cluster\", cov_kwds={\"groups\": actuals[\"district\"]})\n",
"\n",
"spatial_rows = [\n",
" \"ogi_budget_m\", \"offshore\", \"border\",\n",
" \"ogi_budget_m:offshore\", \"ogi_budget_m:border\",\n",
"]\n",
"available = [r for r in spatial_rows if r in m_h4.params.index]\n",
"print(\"H4 — Spatial Moderators (DV: compliance_rate)\")\n",
"print(m_h4.summary2().tables[1][display_cols].loc[available])\n",
"print(f\"\\nR² = {m_h4.rsquared:.3f} Adj. R² = {m_h4.rsquared_adj:.3f}\")\n",
"\n",
"# ── Moran's I on H1 residuals ─────────────────────────────────────────────────\n",
"# Compute district centroids from well lat/lon joined via inspections\n",
"centroids_sql = \"\"\"\n",
"SELECT\n",
" i.district,\n",
" AVG(w.latitude) AS lat,\n",
" AVG(w.longitude) AS lon\n",
"FROM inspections i\n",
"JOIN well_shape_tract w USING (api_norm)\n",
"WHERE w.latitude IS NOT NULL\n",
" AND w.longitude IS NOT NULL\n",
" AND i.district IS NOT NULL\n",
"GROUP BY i.district\n",
"\"\"\"\n",
"\n",
"try:\n",
" centroids = pd.read_sql(text(centroids_sql), engine)\n",
"\n",
" # Average H1 compliance residuals to district level\n",
" resid_df = actuals[[\"district\", \"compliance_rate\"]].copy()\n",
" resid_df[\"resid\"] = m_compliance.resid.reindex(actuals.index).values\n",
" resid_by_district = resid_df.groupby(\"district\")[\"resid\"].mean().reset_index()\n",
"\n",
" centroids = centroids.merge(resid_by_district, on=\"district\").dropna()\n",
"\n",
" # Row-normalised inverse-distance weights matrix\n",
" coords = centroids[[\"lon\", \"lat\"]].values\n",
" D = cdist(coords, coords)\n",
" np.fill_diagonal(D, np.inf)\n",
" W = 1 / D\n",
" W = W / W.sum(axis=1, keepdims=True)\n",
"\n",
" z = centroids[\"resid\"].values\n",
" z = z - z.mean()\n",
" n = len(z)\n",
" morans_i = (n / W.sum()) * (z @ W @ z) / (z @ z)\n",
"\n",
" print(f\"\\nMoran's I on H1 compliance residuals = {morans_i:.4f}\")\n",
" print(\" > 0 → residuals cluster spatially (similar neighbours)\")\n",
" print(\" ≈ 0 → no spatial pattern\")\n",
" print(\" < 0 → spatial dispersion (dissimilar neighbours)\")\n",
"\n",
" print(\"\\nDistrict centroids used:\")\n",
" print(centroids[[\"district\", \"lat\", \"lon\"]].round(2).to_string(index=False))\n",
"\n",
"except Exception as e:\n",
" print(f\"Moran's I skipped: {e}\")\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.9.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}