{ "cells": [ { "cell_type": "markdown", "id": "dc6818b8", "metadata": {}, "source": [ "# Texas RRC Inspection Expenses Analysis\n", "\n", "**Research question:** Does organizational capacity (budget, staffing) predict better regulatory outputs (inspections, compliance, enforcement), and how is that relationship moderated by goal ambiguity, district-level heterogeneity, and spatial/geographic factors?\n", "\n", "## Hypotheses\n", "- **H1 — Capacity → Outputs:** Higher OGI budget and FTE predict more inspections, higher compliance rates, and faster violation resolution.\n", "- **H2 — Goal Ambiguity:** When a larger share of RRC budget goes to the more ambiguous \"Energy Resource Development\" goal, the capacity → output relationship weakens.\n", "- **H3 — Multilevel / District Effects:** The capacity → output relationship varies across RRC districts (budget slope heterogeneity).\n", "- **H4 — Spatial & Geographic:** Offshore-jurisdiction and border districts moderate the capacity → output relationship; spatial autocorrelation in residuals is tested via Moran's I.\n", "\n", "**Data:**\n", "- PostgreSQL warehouse (`texas_data`): `inspections`, `violations`, `well_shape_tract`\n", "- `RRC Budget Data.xlsx`: statewide RRC budget by strategy, 2016–2024\n", "- Analysis period: 2016–2023 (2024 is budget estimate, excluded from regressions)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "49de2b5c", "metadata": {}, "outputs": [], "source": [ "import os\n", "import warnings\n", "from pathlib import Path\n", "from urllib.parse import quote_plus\n", "\n", "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import matplotlib.ticker as mticker\n", "import statsmodels.formula.api as smf\n", "from dotenv import load_dotenv\n", "from sqlalchemy import create_engine, text\n", "from scipy.spatial.distance import cdist\n", "\n", "warnings.filterwarnings(\"ignore\", category=UserWarning)\n", "pd.set_option(\"display.float_format\", \"{:,.2f}\".format)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "08420da3", "metadata": {}, "outputs": [], "source": [ "load_dotenv(override=False)\n", "\n", "host = os.getenv(\"PGHOST\", \"localhost\")\n", "port = os.getenv(\"PGPORT\", \"5432\")\n", "user = os.getenv(\"PGUSER\", \"postgres\")\n", "password = quote_plus(os.getenv(\"PGPASSWORD\", \"\"))\n", "database = os.getenv(\"PGDATABASE\", \"texas_data\")\n", "\n", "engine = create_engine(\n", " f\"postgresql+psycopg2://{user}:{password}@{host}:{port}/{database}\"\n", ")\n", "print(f\"Connected → {database} on {host}:{port}\")\n" ] }, { "cell_type": "markdown", "id": "b4e6c44f", "metadata": {}, "source": [ "## 1. Data Loading" ] }, { "cell_type": "code", "execution_count": null, "id": "43886f13", "metadata": {}, "outputs": [], "source": [ "# District-year inspection metrics aggregated in SQL.\n", "# LAG() computes days since the previous inspection for the same well (api_norm).\n", "insp_sql = \"\"\"\n", "WITH lagged AS (\n", " SELECT\n", " district,\n", " EXTRACT(year FROM inspection_date)::int AS year,\n", " api_norm,\n", " inspection_date,\n", " CASE WHEN UPPER(compliance::text) IN ('YES', 'Y') THEN 1.0 ELSE 0.0 END AS is_compliant,\n", " EXTRACT(EPOCH FROM (\n", " inspection_date\n", " - LAG(inspection_date) OVER (PARTITION BY api_norm ORDER BY inspection_date)\n", " )) / 86400.0 AS days_since_prev\n", " FROM inspections\n", " WHERE inspection_date IS NOT NULL\n", " AND district IS NOT NULL\n", " AND EXTRACT(year FROM inspection_date) BETWEEN 2016 AND 2024\n", ")\n", "SELECT\n", " district,\n", " year,\n", " COUNT(*) AS total_inspections,\n", " COUNT(DISTINCT api_norm) AS unique_wells,\n", " ROUND(AVG(is_compliant)::numeric * 100, 2) AS compliance_rate,\n", " ROUND(AVG(days_since_prev)::numeric, 1) AS avg_days_between_inspections\n", "FROM lagged\n", "GROUP BY district, year\n", "ORDER BY district, year\n", "\"\"\"\n", "\n", "insp = pd.read_sql(text(insp_sql), engine)\n", "print(f\"Inspections panel: {len(insp):,} district-year rows | {insp['district'].nunique()} districts\")\n", "insp.head()\n" ] }, { "cell_type": "code", "execution_count": null, "id": "3841e2f5", "metadata": {}, "outputs": [], "source": [ "# District-year violation metrics. Blank last_enf_action strings treated as no action.\n", "viol_sql = \"\"\"\n", "SELECT\n", " district,\n", " EXTRACT(year FROM violation_disc_date)::int AS year,\n", " COUNT(*) AS total_violations,\n", " COUNT(DISTINCT api_norm) AS unique_wells_with_violations,\n", " SUM(CASE WHEN major_viol_ind = 'Y' THEN 1 ELSE 0 END) AS major_violations,\n", " ROUND(AVG(CASE WHEN compliant_on_reinsp = 'Y' THEN 1.0 ELSE 0.0 END)::numeric * 100, 2)\n", " AS resolution_rate,\n", " ROUND(AVG(CASE WHEN last_enf_action IS NOT NULL AND last_enf_action <> ''\n", " THEN 1.0 ELSE 0.0 END)::numeric * 100, 2) AS enforcement_rate,\n", " ROUND(AVG(\n", " CASE WHEN last_enf_action_date IS NOT NULL\n", " THEN EXTRACT(EPOCH FROM (last_enf_action_date - violation_disc_date)) / 86400.0\n", " END\n", " )::numeric, 1) AS avg_days_to_enforcement\n", "FROM violations\n", "WHERE violation_disc_date IS NOT NULL\n", " AND district IS NOT NULL\n", " AND EXTRACT(year FROM violation_disc_date) BETWEEN 2016 AND 2024\n", "GROUP BY district, year\n", "ORDER BY district, year\n", "\"\"\"\n", "\n", "viol = pd.read_sql(text(viol_sql), engine)\n", "print(f\"Violations panel: {len(viol):,} district-year rows\")\n", "viol.head()\n" ] }, { "cell_type": "code", "execution_count": null, "id": "9e196cac", "metadata": {}, "outputs": [], "source": [ "BUDGET_PATH = Path(\"RRC Budget Data.xlsx\")\n", "raw = pd.read_excel(BUDGET_PATH, header=None)\n", "YEARS = [2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]\n", "COLS = slice(1, 10) # spreadsheet columns 1-9 map to years 2016-2024\n", "\n", "# ── Section 1: Energy Resource Development (rows 7-18) ──────────────────────\n", "erd = pd.DataFrame({\n", " \"year\": YEARS,\n", " \"strategy\": \"Energy Resource Development\",\n", " \"total_budget\": raw.iloc[1, COLS].values.astype(float),\n", " \"salaries\": raw.iloc[7, COLS].values.astype(float),\n", " \"other_personnel\": raw.iloc[8, COLS].values.astype(float),\n", " \"professional_fees\": raw.iloc[9, COLS].values.astype(float),\n", " \"travel\": raw.iloc[13, COLS].values.astype(float),\n", " \"other_operating\": raw.iloc[16, COLS].values.astype(float),\n", " \"capital_exp\": raw.iloc[17, COLS].values.astype(float),\n", " \"fte\": raw.iloc[18, COLS].values.astype(float),\n", "})\n", "\n", "# ── Section 2: Oil/Gas Monitoring & Inspections (rows 20-31) ────────────────\n", "ogi = pd.DataFrame({\n", " \"year\": YEARS,\n", " \"strategy\": \"Oil/Gas Monitoring & Inspections\",\n", " \"total_budget\": raw.iloc[2, COLS].values.astype(float),\n", " \"salaries\": raw.iloc[20, COLS].values.astype(float),\n", " \"other_personnel\": raw.iloc[21, COLS].values.astype(float),\n", " \"professional_fees\": raw.iloc[22, COLS].values.astype(float),\n", " \"travel\": raw.iloc[26, COLS].values.astype(float),\n", " \"other_operating\": raw.iloc[29, COLS].values.astype(float),\n", " \"capital_exp\": raw.iloc[30, COLS].values.astype(float),\n", " \"fte\": raw.iloc[31, COLS].values.astype(float),\n", "})\n", "\n", "budget_long = pd.concat([erd, ogi], ignore_index=True)\n", "print(f\"Budget long: {len(budget_long)} rows (2 strategies × {len(YEARS)} years)\")\n", "budget_long\n" ] }, { "cell_type": "code", "execution_count": null, "id": "896d152b", "metadata": {}, "outputs": [], "source": [ "# ── Wide budget: one row per year with ogi_ / erd_ prefixed columns ──────────\n", "ogi_wide = ogi.drop(columns=\"strategy\").add_prefix(\"ogi_\")\n", "erd_wide = erd.drop(columns=\"strategy\").add_prefix(\"erd_\")\n", "\n", "budget_wide = (\n", " ogi_wide\n", " .merge(erd_wide, left_on=\"ogi_year\", right_on=\"erd_year\")\n", " .rename(columns={\"ogi_year\": \"year\"})\n", " .drop(columns=\"erd_year\")\n", ")\n", "\n", "# ── Merge inspections + violations, then join statewide budget on year ────────\n", "panel = (\n", " insp\n", " .merge(viol, on=[\"district\", \"year\"], how=\"left\")\n", " .merge(budget_wide, on=\"year\", how=\"left\")\n", ")\n", "\n", "# ── Derived columns ───────────────────────────────────────────────────────────\n", "panel[\"violations_per_inspection\"] = panel[\"total_violations\"] / panel[\"total_inspections\"]\n", "panel[\"ogi_budget_m\"] = panel[\"ogi_total_budget\"] / 1_000_000 # dollars → millions\n", "panel[\"erd_budget_m\"] = panel[\"erd_total_budget\"] / 1_000_000\n", "panel[\"post_2019\"] = (panel[\"year\"] >= 2019).astype(int)\n", "panel[\"is_budget_year\"] = (panel[\"year\"] == 2024).astype(int)\n", "\n", "# Goal ambiguity: share of combined budget going to the inspection mission.\n", "# Higher share = clearer mission focus; lower share = more goal ambiguity.\n", "panel[\"inspection_budget_share\"] = (\n", " panel[\"ogi_total_budget\"] / (panel[\"ogi_total_budget\"] + panel[\"erd_total_budget\"])\n", ")\n", "\n", "# Fill violation NaNs for districts with zero violations in a given year\n", "fill_cols = [\n", " \"total_violations\", \"unique_wells_with_violations\", \"major_violations\",\n", " \"resolution_rate\", \"enforcement_rate\", \"avg_days_to_enforcement\",\n", " \"violations_per_inspection\",\n", "]\n", "panel[fill_cols] = panel[fill_cols].fillna(0)\n", "\n", "print(f\"Analysis panel: {len(panel):,} rows | \"\n", " f\"{panel['district'].nunique()} districts | \"\n", " f\"{panel['year'].nunique()} years\")\n", "panel.head()\n" ] }, { "cell_type": "markdown", "id": "3558bae7", "metadata": {}, "source": [ "## 2. Exploratory Overview" ] }, { "cell_type": "code", "execution_count": null, "id": "92921756", "metadata": {}, "outputs": [], "source": [ "# Year-level means across districts\n", "yearly = panel.groupby(\"year\").agg(\n", " ogi_budget_m = (\"ogi_budget_m\", \"first\"),\n", " ogi_fte = (\"ogi_fte\", \"first\"),\n", " total_inspections = (\"total_inspections\", \"mean\"),\n", " compliance_rate = (\"compliance_rate\", \"mean\"),\n", " total_violations = (\"total_violations\", \"mean\"),\n", " resolution_rate = (\"resolution_rate\", \"mean\"),\n", " avg_days_to_enf = (\"avg_days_to_enforcement\",\"mean\"),\n", ").round(2)\n", "\n", "print(yearly.to_string())\n", "\n", "fig, axes = plt.subplots(2, 3, figsize=(16, 8))\n", "axes = axes.flatten()\n", "\n", "yearly[\"ogi_budget_m\"].plot(ax=axes[0], marker=\"o\", title=\"OGI Budget ($M)\")\n", "axes[0].yaxis.set_major_formatter(mticker.FuncFormatter(lambda x, _: f\"${x:.0f}M\"))\n", "\n", "yearly[\"ogi_fte\"].plot(ax=axes[1], marker=\"o\", title=\"OGI FTE Positions\")\n", "\n", "yearly[\"total_inspections\"].plot(ax=axes[2], marker=\"o\", title=\"Avg Inspections / District\")\n", "\n", "yearly[\"compliance_rate\"].plot(ax=axes[3], marker=\"o\", title=\"Avg Compliance Rate (%)\")\n", "\n", "yearly[\"resolution_rate\"].plot(ax=axes[4], marker=\"o\", title=\"Avg Resolution Rate (%)\")\n", "\n", "yearly[\"avg_days_to_enf\"].plot(ax=axes[5], marker=\"o\", title=\"Avg Days to Enforcement\")\n", "\n", "for ax in axes:\n", " ax.axvline(2019, color=\"red\", linestyle=\"--\", alpha=0.5, label=\"2019 policy\")\n", " ax.set_xlabel(\"Year\")\n", "\n", "plt.tight_layout()\n", "plt.show()\n" ] }, { "cell_type": "code", "execution_count": null, "id": "5d2671c9", "metadata": {}, "outputs": [], "source": [ "corr_cols = [\n", " \"ogi_budget_m\", \"ogi_fte\", \"inspection_budget_share\",\n", " \"total_inspections\", \"compliance_rate\",\n", " \"total_violations\", \"resolution_rate\", \"avg_days_to_enforcement\",\n", "]\n", "corr = panel[corr_cols].corr().round(2)\n", "\n", "fig, ax = plt.subplots(figsize=(9, 7))\n", "im = ax.imshow(corr, cmap=\"RdBu_r\", vmin=-1, vmax=1)\n", "ax.set_xticks(range(len(corr_cols)))\n", "ax.set_yticks(range(len(corr_cols)))\n", "ax.set_xticklabels(corr_cols, rotation=45, ha=\"right\", fontsize=9)\n", "ax.set_yticklabels(corr_cols, fontsize=9)\n", "for i in range(len(corr_cols)):\n", " for j in range(len(corr_cols)):\n", " ax.text(j, i, corr.iloc[i, j], ha=\"center\", va=\"center\", fontsize=8)\n", "plt.colorbar(im, ax=ax)\n", "ax.set_title(\"Correlation Matrix — Key Variables\")\n", "plt.tight_layout()\n", "plt.show()\n" ] }, { "cell_type": "markdown", "id": "3ca1410a", "metadata": {}, "source": [ "## H1: Organizational Capacity → Policy Outputs\n", "\n", "**Prediction:** Higher OGI budget and FTE predict more inspections, higher compliance rates, and faster violation resolution.\n", "\n", "**Model:** OLS with district fixed effects and year 2016–2023 (excluding 2024 budget estimate). Budget varies only over time, so it identifies via year-to-year changes in statewide RRC allocations; district FE absorbs persistent cross-district differences.\n" ] }, { "cell_type": "code", "execution_count": null, "id": "463387d3", "metadata": {}, "outputs": [], "source": [ "actuals = panel[panel[\"is_budget_year\"] == 0].copy()\n", "\n", "fig, axes = plt.subplots(1, 3, figsize=(15, 4))\n", "\n", "actuals.plot.scatter(x=\"ogi_budget_m\", y=\"total_inspections\",\n", " alpha=0.4, ax=axes[0], title=\"Budget → Inspections\")\n", "actuals.plot.scatter(x=\"ogi_budget_m\", y=\"compliance_rate\",\n", " alpha=0.4, ax=axes[1], title=\"Budget → Compliance Rate (%)\")\n", "actuals.plot.scatter(x=\"ogi_budget_m\", y=\"resolution_rate\",\n", " alpha=0.4, ax=axes[2], title=\"Budget → Resolution Rate (%)\")\n", "\n", "for ax in axes:\n", " ax.set_xlabel(\"OGI Budget ($M)\")\n", "\n", "plt.tight_layout()\n", "plt.show()\n" ] }, { "cell_type": "code", "execution_count": null, "id": "535fc4eb", "metadata": {}, "outputs": [], "source": [ "m_inspections = smf.ols(\n", " \"total_inspections ~ ogi_budget_m + C(district)\",\n", " data=actuals,\n", ").fit(cov_type=\"cluster\", cov_kwds={\"groups\": actuals[\"district\"]})\n", "\n", "m_compliance = smf.ols(\n", " \"compliance_rate ~ ogi_budget_m + C(district)\",\n", " data=actuals,\n", ").fit(cov_type=\"cluster\", cov_kwds={\"groups\": actuals[\"district\"]})\n", "\n", "m_resolution = smf.ols(\n", " \"resolution_rate ~ ogi_budget_m + C(district)\",\n", " data=actuals,\n", ").fit(cov_type=\"cluster\", cov_kwds={\"groups\": actuals[\"district\"]})\n", "\n", "display_cols = [\"Coef.\", \"Std.Err.\", \"t\", \"P>|t|\"]\n", "\n", "print(\"H1a — OGI Budget ($M) → Total Inspections\")\n", "print(m_inspections.summary2().tables[1][display_cols].loc[[\"ogi_budget_m\"]])\n", "print(f\" R² = {m_inspections.rsquared:.3f} Adj. R² = {m_inspections.rsquared_adj:.3f}\\n\")\n", "\n", "print(\"H1b — OGI Budget ($M) → Compliance Rate (%)\")\n", "print(m_compliance.summary2().tables[1][display_cols].loc[[\"ogi_budget_m\"]])\n", "print(f\" R² = {m_compliance.rsquared:.3f} Adj. R² = {m_compliance.rsquared_adj:.3f}\\n\")\n", "\n", "print(\"H1c — OGI Budget ($M) → Resolution Rate (%)\")\n", "print(m_resolution.summary2().tables[1][display_cols].loc[[\"ogi_budget_m\"]])\n", "print(f\" R² = {m_resolution.rsquared:.3f} Adj. R² = {m_resolution.rsquared_adj:.3f}\")\n" ] }, { "cell_type": "markdown", "id": "56add68a", "metadata": {}, "source": [ "## H2: Goal Ambiguity Moderates Capacity Effects\n", "\n", "**Prediction:** When a larger share of the combined RRC budget flows to the broader \"Energy Resource Development\" goal (lower `inspection_budget_share`), the capacity → output link weakens.\n", "\n", "**Operationalization:**\n", "`inspection_budget_share = ogi_budget / (ogi_budget + erd_budget)`\n", "\n", "A negative interaction coefficient `ogi_budget_m × inspection_budget_share` would be unexpected (higher share → weaker effect). A positive coefficient supports H2 — clearer mission focus amplifies the budget → compliance relationship.\n" ] }, { "cell_type": "code", "execution_count": null, "id": "24187ce8", "metadata": {}, "outputs": [], "source": [ "m_h2 = smf.ols(\n", " \"compliance_rate ~ ogi_budget_m * inspection_budget_share + C(district)\",\n", " data=actuals,\n", ").fit(cov_type=\"cluster\", cov_kwds={\"groups\": actuals[\"district\"]})\n", "\n", "key_rows = [\"ogi_budget_m\", \"inspection_budget_share\", \"ogi_budget_m:inspection_budget_share\"]\n", "print(\"H2 — Goal Ambiguity Moderation (DV: compliance_rate)\")\n", "print(m_h2.summary2().tables[1][display_cols].loc[key_rows])\n", "print(f\"\\nR² = {m_h2.rsquared:.3f} Adj. R² = {m_h2.rsquared_adj:.3f}\")\n", "\n", "# ── Same model with resolution rate as DV ────────────────────────────────────\n", "m_h2_res = smf.ols(\n", " \"resolution_rate ~ ogi_budget_m * inspection_budget_share + C(district)\",\n", " data=actuals,\n", ").fit(cov_type=\"cluster\", cov_kwds={\"groups\": actuals[\"district\"]})\n", "\n", "print(\"\\nH2 — Goal Ambiguity Moderation (DV: resolution_rate)\")\n", "print(m_h2_res.summary2().tables[1][display_cols].loc[key_rows])\n", "print(f\"\\nR² = {m_h2_res.rsquared:.3f} Adj. R² = {m_h2_res.rsquared_adj:.3f}\")\n" ] }, { "cell_type": "markdown", "id": "b6583857", "metadata": {}, "source": [ "## H3: District Multilevel Effects\n", "\n", "**Prediction:** The budget → output slope varies across RRC districts — some districts translate budget increases into better outputs more effectively than others.\n", "\n", "**Model:** Interaction `ogi_budget_m × C(district)` — the reference district captures the baseline budget slope; interaction terms show how each other district's slope differs.\n" ] }, { "cell_type": "code", "execution_count": null, "id": "151faefd", "metadata": {}, "outputs": [], "source": [ "m_h3 = smf.ols(\n", " \"compliance_rate ~ ogi_budget_m * C(district)\",\n", " data=actuals,\n", ").fit(cov_type=\"cluster\", cov_kwds={\"groups\": actuals[\"district\"]})\n", "\n", "coef_table = m_h3.summary2().tables[1]\n", "\n", "# Baseline budget slope (reference district)\n", "baseline_row = coef_table.loc[[\"ogi_budget_m\"]]\n", "print(\"H3 — District-Heterogeneous Budget Effect (DV: compliance_rate)\")\n", "print(f\"Baseline (reference district) budget slope:\")\n", "print(baseline_row[display_cols])\n", "\n", "# District-specific deviations from baseline\n", "interaction_rows = coef_table[coef_table.index.str.contains(\"ogi_budget_m:C\")]\n", "print(\"\\nDistrict interaction terms (deviation from reference slope):\")\n", "print(interaction_rows[display_cols].round(4))\n", "print(f\"\\nR² = {m_h3.rsquared:.3f} Adj. R² = {m_h3.rsquared_adj:.3f}\")\n", "\n", "# ── Plot district-specific budget slopes ─────────────────────────────────────\n", "districts = actuals[\"district\"].unique()\n", "slopes = {}\n", "for d in districts:\n", " key = f\"ogi_budget_m:C(district)[T.{d}]\"\n", " base = m_h3.params.get(\"ogi_budget_m\", 0)\n", " delta = m_h3.params.get(key, 0)\n", " slopes[str(d)] = base + delta\n", "\n", "slope_df = pd.Series(slopes).sort_values()\n", "fig, ax = plt.subplots(figsize=(10, 4))\n", "slope_df.plot.barh(ax=ax, color=[\"#d62728\" if v < 0 else \"#1f77b4\" for v in slope_df])\n", "ax.axvline(0, color=\"black\", linewidth=0.8)\n", "ax.set_xlabel(\"Budget slope (compliance rate pp per $M)\")\n", "ax.set_title(\"H3 — District-Specific Budget → Compliance Slopes\")\n", "plt.tight_layout()\n", "plt.show()\n" ] }, { "cell_type": "markdown", "id": "5bb27b3e", "metadata": {}, "source": [ "## H4: Spatial and Geographic Factors\n", "\n", "**Predictions:**\n", "- Offshore-jurisdiction districts (02, 03, 04) show a different budget → output relationship due to dual onshore/offshore oversight burden.\n", "- Border-proximate districts show a different relationship due to cross-jurisdiction enforcement complexity.\n", "- Spatial autocorrelation in H1 residuals (Moran's I) would indicate unmodeled geographic spillovers.\n" ] }, { "cell_type": "code", "execution_count": null, "id": "d6e56f00", "metadata": {}, "outputs": [], "source": [ "# Texas RRC district geography flags (based on known RRC district locations)\n", "OFFSHORE_DISTRICTS = {\"02\", \"03\", \"04\"} # dual onshore + offshore jurisdiction\n", "BORDER_DISTRICTS = {\"01\", \"02\", \"03\", \"04\"} # south / gulf coast proximity\n", "\n", "actuals = actuals.copy()\n", "actuals[\"district_str\"] = actuals[\"district\"].astype(str).str.strip()\n", "actuals[\"offshore\"] = actuals[\"district_str\"].isin(OFFSHORE_DISTRICTS).astype(int)\n", "actuals[\"border\"] = actuals[\"district_str\"].isin(BORDER_DISTRICTS).astype(int)\n", "\n", "print(\"District classification:\")\n", "print(\n", " actuals.groupby([\"district_str\", \"offshore\", \"border\"])\n", " .size()\n", " .reset_index(name=\"district_year_obs\")\n", " .to_string(index=False)\n", ")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "74686bfe", "metadata": {}, "outputs": [], "source": [ "# ── Spatial regression: offshore and border interactions ─────────────────────\n", "m_h4 = smf.ols(\n", " \"compliance_rate ~ ogi_budget_m + offshore + border \"\n", " \"+ ogi_budget_m:offshore + ogi_budget_m:border + C(district)\",\n", " data=actuals,\n", ").fit(cov_type=\"cluster\", cov_kwds={\"groups\": actuals[\"district\"]})\n", "\n", "spatial_rows = [\n", " \"ogi_budget_m\", \"offshore\", \"border\",\n", " \"ogi_budget_m:offshore\", \"ogi_budget_m:border\",\n", "]\n", "available = [r for r in spatial_rows if r in m_h4.params.index]\n", "print(\"H4 — Spatial Moderators (DV: compliance_rate)\")\n", "print(m_h4.summary2().tables[1][display_cols].loc[available])\n", "print(f\"\\nR² = {m_h4.rsquared:.3f} Adj. R² = {m_h4.rsquared_adj:.3f}\")\n", "\n", "# ── Moran's I on H1 residuals ─────────────────────────────────────────────────\n", "# Compute district centroids from well lat/lon joined via inspections\n", "centroids_sql = \"\"\"\n", "SELECT\n", " i.district,\n", " AVG(w.latitude) AS lat,\n", " AVG(w.longitude) AS lon\n", "FROM inspections i\n", "JOIN well_shape_tract w USING (api_norm)\n", "WHERE w.latitude IS NOT NULL\n", " AND w.longitude IS NOT NULL\n", " AND i.district IS NOT NULL\n", "GROUP BY i.district\n", "\"\"\"\n", "\n", "try:\n", " centroids = pd.read_sql(text(centroids_sql), engine)\n", "\n", " # Average H1 compliance residuals to district level\n", " resid_df = actuals[[\"district\", \"compliance_rate\"]].copy()\n", " resid_df[\"resid\"] = m_compliance.resid.reindex(actuals.index).values\n", " resid_by_district = resid_df.groupby(\"district\")[\"resid\"].mean().reset_index()\n", "\n", " centroids = centroids.merge(resid_by_district, on=\"district\").dropna()\n", "\n", " # Row-normalised inverse-distance weights matrix\n", " coords = centroids[[\"lon\", \"lat\"]].values\n", " D = cdist(coords, coords)\n", " np.fill_diagonal(D, np.inf)\n", " W = 1 / D\n", " W = W / W.sum(axis=1, keepdims=True)\n", "\n", " z = centroids[\"resid\"].values\n", " z = z - z.mean()\n", " n = len(z)\n", " morans_i = (n / W.sum()) * (z @ W @ z) / (z @ z)\n", "\n", " print(f\"\\nMoran's I on H1 compliance residuals = {morans_i:.4f}\")\n", " print(\" > 0 → residuals cluster spatially (similar neighbours)\")\n", " print(\" ≈ 0 → no spatial pattern\")\n", " print(\" < 0 → spatial dispersion (dissimilar neighbours)\")\n", "\n", " print(\"\\nDistrict centroids used:\")\n", " print(centroids[[\"district\", \"lat\", \"lon\"]].round(2).to_string(index=False))\n", "\n", "except Exception as e:\n", " print(f\"Moran's I skipped: {e}\")\n" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "name": "python", "version": "3.9.0" } }, "nbformat": 4, "nbformat_minor": 5 }