From 8038a3fab4ae3aa19181b8f14f8d1fb301e2f871 Mon Sep 17 00:00:00 2001 From: dadams Date: Wed, 25 Feb 2026 12:17:56 -0800 Subject: [PATCH] Initial commit: texas inspection expenses analysis notebook Jupyter notebook analyzing RRC budget (2016-2024) against inspection/violation outcomes across Texas RRC districts. Tests four hypotheses: organizational capacity, goal ambiguity, district multilevel effects, and spatial factors. Co-Authored-By: Claude Sonnet 4.6 --- RRC Budget Data.xlsx | Bin 0 -> 12144 bytes texas_inspection_expenses.ipynb | 634 ++++++++++++++++++++++++++++++++ 2 files changed, 634 insertions(+) create mode 100644 RRC Budget Data.xlsx create mode 100644 texas_inspection_expenses.ipynb diff --git a/RRC Budget Data.xlsx b/RRC Budget Data.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..d35f8a667490974533026c1233a5965d55ec6183 GIT binary patch literal 12144 zcmeHtg;!il_I2a#Zb1SB4H`7KJ86QuJB_=$1_|yS+$DHJ6EwI6C%C%={qf$+d~YT* z-(N6Oz1FSOtM5MdcGcN+PSx4Uaxk#C0C)f*005u_7#(L>=s^Jhaj*aY4ge8aTMT69 zY-;DM|Ix$V)JgZPyR8jHJ}fj{E&%%F{{OZA;t{Aw99D#|VoKadKZtEI%P!RkBfdKd z?8j#KAPDaL+*f9-oo#9PoE>qGDVm38#a)X%w&cZrHeyj{XWJAQ-q!|1jS3h5tLl+( zvwZ76r0F6cN^sQDJjul+5oROO*N-vF0HiuJLx7y(>msrh#1?q?g8my>3;k+Xt9>n~ zWm+Y?1B=~jifSuZJVT{Zm>Ux~(^i)BUV0ZB%rdZ1R9;K?u6jkvlo~4ZM!*)wyA-)j zC%Ug3GX}=cE3Bb$9q7xWtMF}|+^-ZkiR(%uFUz$0;;b3Fymh=iyXZbeT`J$Ta^YF% zGizrH2T%r1&I6RKqZ;rl+3m)BdWG-` zv&EHoxfud|zYGk1$$Nh6Pw)W1^D_)U`Cn++sLJ~K^2OKWUs#9oLQ8!|QyV9iw|`v!N5}tS4*t_$ zuSif(g0P~7o=8804cyEu$6<=fxrs=(QGWFGms!GSj47ZdS?+p6iusW^2u8}U!}nom zaru4B{s86mPtJ-cY+U}=O>UK;DR&Mo2n@82$&wBg>%Ca6GgmX$X;QKtbgmt-4CP-- z^5h0rUrSD%iPvI|y;Ubh!YLvR#TQ5m(CSmrTrs+>f|?OkIjRh;Y39z||2&rAHKC!9iUqG_jZwBEAC-r`i4FKdJfjQe(VJN@a{x%i ziG9mHCe=@qd*!FqzjFIjnsQ2$O6Z3Ngd7Z?Db{UwxPyo`*y z4T~Gd(b@7(RjAs63V1M|d?z0fdfnzyg`rFK7~1z38p|0*n%r_22r`ZESk3tr z%B-I6KgmchjO-;I=q1h*jh)`6rS)$=3>TMH642ri98J@jf3mG@!lVJ^6kj(>e1cvC zrZ>@4N(>iYGpbytM6LSq>!bX7pzE!N15~CjB+cpu*D^57D!b6~%?6s&n~1$jJx3~6 zWY;Q-WLut|cgbY4qXQi=i&86fV-C}>XfQ;oLbN*I;wy@YcgrK)tc0y2Rn~|Za@5Hf z#HXNoUst#I9*NkN)8`L=QIo#Y5L+PmsZD~k%JvWr6U2WATUAg=tS6n<3(SGyTuBfd z%Hc_k!=RZbom=Ci#*n-2QNXyV*u4E*(dSih1{KZI6)h>O)uBd3xAJDv% z;%3(?(pCHjW$4wOsqNW)Rf|T-^$pkejf`i-^D~94R%*II2_*U8`g!af)DSyyuv*WO!MogR}9KKto9M07Uy|O9YAMt04%rq~0+N9b$NFLOehEB&163*9B@VR;_|#7PW!*u`DOqI)9o{(x6!nLm1w1U5}xlY5Bl=7ULE=+1!L7jHDv zzja#1w%isxZ`>U;lB0oeA$9pc7Ha0~bpcNE;Z%n4DK5&j`)r#-gjx>hNIgZz&G>N< zhxgZUIoq%OL?@{)E?_i2Kb?P0QS!h4LNvq3)3^=3b~WkqZf@1ru_JKxxwF5__psXU z^txMx22tLpXAx$uW)Y@(cj)Ql5PBsszdF892{NB^;NutY<;od*(lYTw*Ix5uqB|zB z>(eWCj~o2<2mk(^p#g%4DHkulw7!HfVgMr43q$`F$^MzA|A=Z(FG=PL&;Pqy1yDh* zmld-ExVN7)uHME2I61?{UR+ThwtSo2}7%{#;hC*3@Fs&WTfBq<{kP^ z7@Vta#*#>ESWg_A6Mi_*$i)Z@qR~<1t3W9@9PGpWgYqL3^mHej7Quu8rfgiE?OQq; zqOgKuD)Gyf$xu$KQC1grE2nX?p_f&CxM9=(E?DKTGO(e!nIfIuZ0os2Z1EaCC(wg@ z;(`cC<~^DRU}ATglJT}%;u`V1{lQGZW7N94;0|?*v|R30**gL6P-lv;0V#4n`UIz~ z{IoXJlXYj&`yu+S#dD_N%M-KB!Ty=mmzn-|m%-4V)cuJ90N}F&0E91h{C!b(vM@Du zc4GOnWBX%u&(!{5_qqko2P4i8^Ku2-tqyfjM2^8$*q`E{o+)LqN18zj2tqtRX{lfs zY!9>TPw$pja=Vybcf66iyH*l%U0iA2w_lpRZ|fut_pM&2KIk8L8rFR#_sj7`eq1?g zSP-gey6S8_c&_;LD75^1hS9uw00H&yl<{4K8~U{$2<0fUa6{G|9(9h?`b8drN0ytV zf`@8v?l#tkPj4Md)M7`Z+K!Zw)qXq+>R1S5mvKoiZ=Oik|OtS*+8e`I(k6gla zPm$i>-&++20Ygc;lRHu~mV&AMoD_lJp#*7EAB{SyI4K(CXxri2>_t=9M&=tey?ja-DnOnMwqJwLpb~hy4}~SIK{yDLz1`6vkr*rn8W-zZ z9$%%1y=+{c`5xoY&mGeXZw@&NMtFfqH;PwI54%-LU3&9|R|WWa8EHkARD_`QLn3ip zU}O*YS2N~R4Q}^WOksc1$T%Rw5i_F05QQF`A(^yQAH^r>cK@R9D&ciptxWUdfg(DT z4KCn1dCCPXg*bJ5T7q-2G#%))hc?Fua)%cGg=j7DHw4&`&giy*BJJ3c8p;ncD3)7P z9WFx_%Kq#jbebA_Xvm==nKk0UTs;&4fVcjK+o(n;mLDApV{kMIF>M?0q1+lw;tWFS zudj4c4U9fYFDr4TJ~O7?ViBMc9+pTn1m(FCd|iFqf@>h5-SmnM-DhJLagin<_e>PQdk3tnQ0^}CSPf{Q^pd)3lXCWN}cMi zauLbz!x?)9i6$lVth8<8cl`wVEW=F^A$ODO8^bqr zaEE5yl(?|4o*6Vj7DRLtzN$L3%9GKt!&HGSOke{^=KQ3Pw*&lnbc7VR3%OEuP+9jB zPLmeD=z|hE5y7mcfFn9~k?f#tbHGLe6h@N)Aj5h%0qP?q4sZQNtmSI|SeJ+s z6nIjKtnjlx?c<<0m=c)41wm5alV%L*@iGVR%1Tlxi0mN?7o!?QDbvLytjXn_lCaih zT}0qowl&hBgL@xj0}PpY`-W0c+v96{VV{xnHi;N4HcKem58Ont;)bGs)rd#!hH4uU zUJ}yW^R3OPDB7)QnAz~4%s6q`u%2mxmkbW&xa5h(zJ33SX0ZKaSCU}6EfeO8gvfeS z64cON$@I&MDOuy1flj~QF{0>F4?$X4mMdg-^Lwp{kxD8i1#{||M7lwJUBuRT2%P~M z$|wbzXDJQLKo1_slk)q-YKm}%Q89L$LHz5%1Wi}D^7Yhc}&Z5*rM)&~AT z2@y%kCS`!Z=-dt&{5c#9&m=38*3{8`+@ZgnP^epNtly%c=wN!FlVVh=Lh*h;Ba-$7(FnmA>-@a_3aA=t zG0kx1fHKfrFyvb;;80%$wzIuA)K@v*fgw~I#DYTtd;$$&irjtQxS-Ks68nXoSeOpS z&lR1QE6+3aj4Psm6JHUC0ys#zVa}~oNd?qAMBA##0^4x#r)xxZa-a)q-Gy=bgm=Q; z=&<|ar(r4Zf)*1Hjtow5YQrgvnY0d#965NGVT zz)53Vk>SZC8$pA89z$GUTpVsP6so9%fBz}>7WJei4!gEAfhbmDStS!HM>v(fQJrw6 zXoFnoVL!p5{aLY#!Z4EIL$LLxQ#gy0R8dl93sv3HO^wZ!Fi)4y(mj?#< z@cQ;o`7RfySzTXmwKIo2ws>VDnkU(1?QJZ!^eXGB`{itt3FLp}&WzJAYB;tAzz zn|2vEMqJo(c1l|_MnOWx7{%dFCPo=0GN8Uod>Orj6;a7&R;%j_@Gbv7e;zad{hA&LEnk9z372PhrlHCmltW&KBk_ z>s!nG4U27W@Z1>wLtagyZXWC8&+o86t?%~c(IK8PxKrzNJE%rRp&1Pti;KjgK*I8R z*$}Z3BxUOL47trRtq)s`R9r=T(sPN(g^GklJfetER2i(nNTPWMj6^-adNykuM|4K| zZ-KtdJGpjl$#kr+%O1cf1a`Pk`)N+FIB3vK4kLNRDvkRDPte|~5;pl>OazB(C;gK4j zG)uW}ze9r(0|z>ci(%Ue<0X@dTxkQj98#QB_ey*}@R`LjyVFQ)5o$q~_aT?R{MT2I3WvVEn_GqueNu~A}9M^4rRxVMT6nPt3 z&LDXe6^WIhK~{`{Hl%VE>L`uEQW^LSB_C-gm!I01kVY)7OF30vk+)|mhT$5UDDZvc zH^xX_igQ>{HCW7yQh;_f+PDYDmGB)ONyaB!R_QCNM2T96vcDF z@BYvb3ArYj{94vm%ssDJk5WH}QtXjtwG04h^MY}MzVnSHc|&p|q44L)#BOe9v<|;0 z@KUr{Z$VZtzyTw?0Nq{a&J-B=Gy`8xB-Agfo$Mg}=g#+280wi^)}E?QA5WCdIy?`* zJbUMTS-$twIcduo4+pjBsp*#qRio>3-5Z0n1>Ujr(L;C}l!m_f`}&pX|KxX6LV6JM z$yZq>p~cK7XGd~`+e8TkazqG_w}*odXny@v`k{C-l)AWu#FKH z3VCN*E&$qcLM~v%lP(h8!XLh6GnUQZwZe1wdq>Ii^~lJMC*cGmHx`cR6M?7gj{bh` zSf`uM<*9inRMcBCb128Ruc!v2$1Lj(J(XxTWEaJN>n5Lwl%(gharS&1Pc~wYf5PC! zr&7czZ9*M;#sl|68X3%4qqoSi6JUL{?XisEnvFEGP`r^mjn&Y6x-?_h6y7JU5l#zg zG3`d@MyTyAFVc~-eN{-DhT)(cZLl(uAv}C|KaSXv3`08(XQqp>opv#d)B=2Eq``2T zckpX_tbCI9Wto6X+$EcQq(xb2ns&ifdtq$zXE$-EJ_ou6^a9zGXutqYkvF%ihg-ii3u5@Tnl+*?%1zI$c9Aeoz?MDQ_ z5nF2$3?Y}Yx(o7h4sf$DHvMIt zO5wrmGIQ_R=+DDt8BN9p5>D2g+B=ioXL%xO=fC`dZMO?K{s=a5R(9_-+c8~33OUF4 z`OPv>3fnm`(5pe|z%sb9LvB^ht-UH_tBYIHxaMunva@a=YD+CD^)+|X2rP$nAe8}a zYW{2WRyeHT$B^(9^_ck~`sNw}+jV%l=nBs_#e)ZtIXFzPT@M~sIT&hJU-7!+I_g=f zN%hJduh)G;8b>gz6I7rV2km~gGTYk8k(@ktHvDK#2BD-c25vXyI&=j3#03_Y80NOq zYY7ye;hY@ZGtn8FTa66Fb5@cgRlmDnLru?4w3N5I_qT*ix$`i9Mrp>KAE4--0hL6K zQe}$z(Xr}&FZ!;^LL=J}+!Rm5_4v_Iq1jTf@~5%j*g{ru*QPmbq6XfEFATBL51xjy zkD+!==OQB@vE>@wbhU#}|Ktv&h)EN!i1Z+FYmQrD>{h31UkztcrUk9XK2`o=fR%PN z*o!-``KoHJ#5_9pHNv|)oqKwxtG>1c-jkWB`{=bD zL2Xw-OIIZ`{Jq6BoB<&8(P8sN?>p3leoRFybte8JGo+=oNp=CezGf9c{hbtfvk2OY zmbbF?AD>d0FthYsI>uFbx5N59-hr`&Gtvpv=GDeEkA4Jt)(=b6!?od!4$WI%l`*#` z&6C#Kg(z50NNDTYDo(pp+L_Iw4`(kF8XD7oU1+3g7=#O*fa@#+anOU>~9u z=-RP*7o+Ec$pw$swM?kG-`J{e$?e%FTRl+%S0K5DalgJJtR&$uKG5k+2(iS}-dPn& zJ<9@7{2U3G%OZ*`U-I5iez(HCDh#{+1fl@P3p0|!UDXhvGXxoqS4=|;>nH7IJd;G` zNkawZ8!|Rlc3!ozuH_1me(H~@nIr{N9p4Zq>2-0Ha4r`3@}Nsz1R1{%OzFdu4zIIsAxe4sN`^{(H^hXh+w%LSgrPU+2f@8hGE*BdD@ zxVxcg>DEbWYxDi#r8)#UmDms03>UP7Gs$Iv$dwv_+!cK+Q{;Z()@@ja=jm)w0MQww z&ZeexEa1+Z{N$a6x_q~6X-px0v6;mF5UbYoaU3r7)wCU_a4hq9JOuztq)JkjF#EtEZv;7DP8(w9 z0?eWup{hooGnE_uyQ|%S=Yt4W>iSXp8ZyO_O;u%(HRw1Qw%o~L0n8>2hlku$ZI@{l@Rt-(V5<#ibWyw-wdGHJ zA42iGnJJwft5$DgnwZ4{Z#L$t?t*HkplrN!AtM042CnzrIiovcU!pFUE@_{WPvxzxYO#ahK;~^{?G*N+Xznro1W94?ZgQ+`6#@u)D27H)z zG-ZF7BN=dF`o?!HrqB7OLf@PC6bX^9gt6N@AMnv^Vzb@X@2L@6zXWjzrfa@UTeHe2 zw+#&iN@L4+t;7B>3p~tm*(3Omlq3-P)G_{2!g>FqRlxl_B`pjcO-)ps9WCw5|72!G zqBdxO71w_?{uV>ZBegY*D?qP_Ef=|4S)Fq9XK-b#C97>=uUT!-tX~&SE^&b~fo<${ zi{H*<*OMB44Sv#S5~)TOM?W+rN8`X21T6b=aL)^+L1v4JIT07;K*r+0bKdT%k;}gc zh*2SBtR}EH`=fwP#iv+JG~cAOjQ!nh#VebRGV#{&SkctFnzm0)rYV+hnFl^HkAV65 zYZ||I&Gsz%I+Iz?&;6m1c? zvP@?dE^Z5GA{nFw9iQ*%9#{iJasQlL9(!AGAPc4*VGIY>%k`_=>Qe2}+EL@bGlSGs zR55HaH)YRy6q$V1)&jp*m$0f~BvvH6Iw~DD?1Lt*w;%|Ee^0F^uL%|z{J960S<}gK zj=IEzqZRZ)c&mwi*Ci-4$||d%CMbcdU;jsgHW}jQcqM_{?qCUyW+ZewR(QpH-YZ0* zLN3!P3>)XgP2M*G1|ua%6%WlLKRE8fXq&a)e_BeFKk-zEk~Qsh>i;y-J7(;xA>Z%7 z2i9GUpz@J$!+0Gzku*gJ-RUGA4dO z(p-K6miw2oL&B|~XF3)|PNX&;0>K0AUYtV(;jhb2XtveMcWePiv%C+@kD%r!tNWWb z?ta#0$e^sUQ$>(knruhU=jo-W*T(l@8((7?@|Biign*@4b$$FEHba4j%h>=aILatlTGsXCO|%I%*&uhA5ePNAJgk1@ z9I5dh_q0Rp7D=w^2OSl(0U9YqukNLo$W03RZ*+UEoiflsWeee;ZsB1W86_RqYDgFi z60fmUNLCbxZ(d+7*$5zY?WO#>)^<^bUx6$CIpN9U;VdybopH*X#(x@xo{+&NS8sfZ zExYGs`P}()VNPIm(DfgqvoM`Ep8H5w*yS@kWj+dfs?&Qh^NGU zS&5kk+t_f1i)u}^lbY(-W5L_)``pk_!TEcgGd`BG_VmhO60VF|N6%t++>$9f$KaT& zWdr?aRW0ox2T*q0#xZ7b4SB<8O54-R@yWjzT<0^c*79FmTj9mEG5^`M4ejm!=h-hV z{r8snSpl^8hu#Y2njCeRgm+mPH6Y(ej{CC404l-NT(Q_VugN65BK~kLw&}y%p6fyk zIg6e^A+?j2)qVv;Trjjz>#bvwp?&S?`d2lKFpzrAM@iH?e?EAfa*l-7#*@`F-AHEI zYw0u%SgrEN7F|n)svdX?$|H&__F{OAN$2TWrVdEkZZ-*OMfJslNP+yT&~6KQo1-NYqzT7*eLN zFM~g_-qbC9j2a~P;Vg7)Dp=G|N!;@@-?zgzkJ-1RRjr7!Y~m$&)-6!v!m zzwhV%GLS&=r-8qAcfUh_&pZEu(op{m`g<1oyM=#^+kfE!fH!XdfPahPzr+7^ZTx3A eG{Zl^|GrWx%fY?OE&zc1^6`I3mlv4+`1XJHx!Z35 literal 0 HcmV?d00001 diff --git a/texas_inspection_expenses.ipynb b/texas_inspection_expenses.ipynb new file mode 100644 index 0000000..403ff46 --- /dev/null +++ b/texas_inspection_expenses.ipynb @@ -0,0 +1,634 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "dc6818b8", + "metadata": {}, + "source": [ + "# Texas RRC Inspection Expenses Analysis\n", + "\n", + "**Research question:** Does organizational capacity (budget, staffing) predict better regulatory outputs (inspections, compliance, enforcement), and how is that relationship moderated by goal ambiguity, district-level heterogeneity, and spatial/geographic factors?\n", + "\n", + "## Hypotheses\n", + "- **H1 — Capacity → Outputs:** Higher OGI budget and FTE predict more inspections, higher compliance rates, and faster violation resolution.\n", + "- **H2 — Goal Ambiguity:** When a larger share of RRC budget goes to the more ambiguous \"Energy Resource Development\" goal, the capacity → output relationship weakens.\n", + "- **H3 — Multilevel / District Effects:** The capacity → output relationship varies across RRC districts (budget slope heterogeneity).\n", + "- **H4 — Spatial & Geographic:** Offshore-jurisdiction and border districts moderate the capacity → output relationship; spatial autocorrelation in residuals is tested via Moran's I.\n", + "\n", + "**Data:**\n", + "- PostgreSQL warehouse (`texas_data`): `inspections`, `violations`, `well_shape_tract`\n", + "- `RRC Budget Data.xlsx`: statewide RRC budget by strategy, 2016–2024\n", + "- Analysis period: 2016–2023 (2024 is budget estimate, excluded from regressions)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "49de2b5c", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import warnings\n", + "from pathlib import Path\n", + "from urllib.parse import quote_plus\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import matplotlib.ticker as mticker\n", + "import statsmodels.formula.api as smf\n", + "from dotenv import load_dotenv\n", + "from sqlalchemy import create_engine, text\n", + "from scipy.spatial.distance import cdist\n", + "\n", + "warnings.filterwarnings(\"ignore\", category=UserWarning)\n", + "pd.set_option(\"display.float_format\", \"{:,.2f}\".format)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "08420da3", + "metadata": {}, + "outputs": [], + "source": [ + "load_dotenv(override=False)\n", + "\n", + "host = os.getenv(\"PGHOST\", \"localhost\")\n", + "port = os.getenv(\"PGPORT\", \"5432\")\n", + "user = os.getenv(\"PGUSER\", \"postgres\")\n", + "password = quote_plus(os.getenv(\"PGPASSWORD\", \"\"))\n", + "database = os.getenv(\"PGDATABASE\", \"texas_data\")\n", + "\n", + "engine = create_engine(\n", + " f\"postgresql+psycopg2://{user}:{password}@{host}:{port}/{database}\"\n", + ")\n", + "print(f\"Connected → {database} on {host}:{port}\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "b4e6c44f", + "metadata": {}, + "source": [ + "## 1. Data Loading" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "43886f13", + "metadata": {}, + "outputs": [], + "source": [ + "# District-year inspection metrics aggregated in SQL.\n", + "# LAG() computes days since the previous inspection for the same well (api_norm).\n", + "insp_sql = \"\"\"\n", + "WITH lagged AS (\n", + " SELECT\n", + " district,\n", + " EXTRACT(year FROM inspection_date)::int AS year,\n", + " api_norm,\n", + " inspection_date,\n", + " CASE WHEN UPPER(compliance::text) IN ('YES', 'Y') THEN 1.0 ELSE 0.0 END AS is_compliant,\n", + " EXTRACT(EPOCH FROM (\n", + " inspection_date\n", + " - LAG(inspection_date) OVER (PARTITION BY api_norm ORDER BY inspection_date)\n", + " )) / 86400.0 AS days_since_prev\n", + " FROM inspections\n", + " WHERE inspection_date IS NOT NULL\n", + " AND district IS NOT NULL\n", + " AND EXTRACT(year FROM inspection_date) BETWEEN 2016 AND 2024\n", + ")\n", + "SELECT\n", + " district,\n", + " year,\n", + " COUNT(*) AS total_inspections,\n", + " COUNT(DISTINCT api_norm) AS unique_wells,\n", + " ROUND(AVG(is_compliant)::numeric * 100, 2) AS compliance_rate,\n", + " ROUND(AVG(days_since_prev)::numeric, 1) AS avg_days_between_inspections\n", + "FROM lagged\n", + "GROUP BY district, year\n", + "ORDER BY district, year\n", + "\"\"\"\n", + "\n", + "insp = pd.read_sql(text(insp_sql), engine)\n", + "print(f\"Inspections panel: {len(insp):,} district-year rows | {insp['district'].nunique()} districts\")\n", + "insp.head()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3841e2f5", + "metadata": {}, + "outputs": [], + "source": [ + "# District-year violation metrics. Blank last_enf_action strings treated as no action.\n", + "viol_sql = \"\"\"\n", + "SELECT\n", + " district,\n", + " EXTRACT(year FROM violation_disc_date)::int AS year,\n", + " COUNT(*) AS total_violations,\n", + " COUNT(DISTINCT api_norm) AS unique_wells_with_violations,\n", + " SUM(CASE WHEN major_viol_ind = 'Y' THEN 1 ELSE 0 END) AS major_violations,\n", + " ROUND(AVG(CASE WHEN compliant_on_reinsp = 'Y' THEN 1.0 ELSE 0.0 END)::numeric * 100, 2)\n", + " AS resolution_rate,\n", + " ROUND(AVG(CASE WHEN last_enf_action IS NOT NULL AND last_enf_action <> ''\n", + " THEN 1.0 ELSE 0.0 END)::numeric * 100, 2) AS enforcement_rate,\n", + " ROUND(AVG(\n", + " CASE WHEN last_enf_action_date IS NOT NULL\n", + " THEN EXTRACT(EPOCH FROM (last_enf_action_date - violation_disc_date)) / 86400.0\n", + " END\n", + " )::numeric, 1) AS avg_days_to_enforcement\n", + "FROM violations\n", + "WHERE violation_disc_date IS NOT NULL\n", + " AND district IS NOT NULL\n", + " AND EXTRACT(year FROM violation_disc_date) BETWEEN 2016 AND 2024\n", + "GROUP BY district, year\n", + "ORDER BY district, year\n", + "\"\"\"\n", + "\n", + "viol = pd.read_sql(text(viol_sql), engine)\n", + "print(f\"Violations panel: {len(viol):,} district-year rows\")\n", + "viol.head()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9e196cac", + "metadata": {}, + "outputs": [], + "source": [ + "BUDGET_PATH = Path(\"RRC Budget Data.xlsx\")\n", + "raw = pd.read_excel(BUDGET_PATH, header=None)\n", + "YEARS = [2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]\n", + "COLS = slice(1, 10) # spreadsheet columns 1-9 map to years 2016-2024\n", + "\n", + "# ── Section 1: Energy Resource Development (rows 7-18) ──────────────────────\n", + "erd = pd.DataFrame({\n", + " \"year\": YEARS,\n", + " \"strategy\": \"Energy Resource Development\",\n", + " \"total_budget\": raw.iloc[1, COLS].values.astype(float),\n", + " \"salaries\": raw.iloc[7, COLS].values.astype(float),\n", + " \"other_personnel\": raw.iloc[8, COLS].values.astype(float),\n", + " \"professional_fees\": raw.iloc[9, COLS].values.astype(float),\n", + " \"travel\": raw.iloc[13, COLS].values.astype(float),\n", + " \"other_operating\": raw.iloc[16, COLS].values.astype(float),\n", + " \"capital_exp\": raw.iloc[17, COLS].values.astype(float),\n", + " \"fte\": raw.iloc[18, COLS].values.astype(float),\n", + "})\n", + "\n", + "# ── Section 2: Oil/Gas Monitoring & Inspections (rows 20-31) ────────────────\n", + "ogi = pd.DataFrame({\n", + " \"year\": YEARS,\n", + " \"strategy\": \"Oil/Gas Monitoring & Inspections\",\n", + " \"total_budget\": raw.iloc[2, COLS].values.astype(float),\n", + " \"salaries\": raw.iloc[20, COLS].values.astype(float),\n", + " \"other_personnel\": raw.iloc[21, COLS].values.astype(float),\n", + " \"professional_fees\": raw.iloc[22, COLS].values.astype(float),\n", + " \"travel\": raw.iloc[26, COLS].values.astype(float),\n", + " \"other_operating\": raw.iloc[29, COLS].values.astype(float),\n", + " \"capital_exp\": raw.iloc[30, COLS].values.astype(float),\n", + " \"fte\": raw.iloc[31, COLS].values.astype(float),\n", + "})\n", + "\n", + "budget_long = pd.concat([erd, ogi], ignore_index=True)\n", + "print(f\"Budget long: {len(budget_long)} rows (2 strategies × {len(YEARS)} years)\")\n", + "budget_long\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "896d152b", + "metadata": {}, + "outputs": [], + "source": [ + "# ── Wide budget: one row per year with ogi_ / erd_ prefixed columns ──────────\n", + "ogi_wide = ogi.drop(columns=\"strategy\").add_prefix(\"ogi_\")\n", + "erd_wide = erd.drop(columns=\"strategy\").add_prefix(\"erd_\")\n", + "\n", + "budget_wide = (\n", + " ogi_wide\n", + " .merge(erd_wide, left_on=\"ogi_year\", right_on=\"erd_year\")\n", + " .rename(columns={\"ogi_year\": \"year\"})\n", + " .drop(columns=\"erd_year\")\n", + ")\n", + "\n", + "# ── Merge inspections + violations, then join statewide budget on year ────────\n", + "panel = (\n", + " insp\n", + " .merge(viol, on=[\"district\", \"year\"], how=\"left\")\n", + " .merge(budget_wide, on=\"year\", how=\"left\")\n", + ")\n", + "\n", + "# ── Derived columns ───────────────────────────────────────────────────────────\n", + "panel[\"violations_per_inspection\"] = panel[\"total_violations\"] / panel[\"total_inspections\"]\n", + "panel[\"ogi_budget_m\"] = panel[\"ogi_total_budget\"] / 1_000_000 # dollars → millions\n", + "panel[\"erd_budget_m\"] = panel[\"erd_total_budget\"] / 1_000_000\n", + "panel[\"post_2019\"] = (panel[\"year\"] >= 2019).astype(int)\n", + "panel[\"is_budget_year\"] = (panel[\"year\"] == 2024).astype(int)\n", + "\n", + "# Goal ambiguity: share of combined budget going to the inspection mission.\n", + "# Higher share = clearer mission focus; lower share = more goal ambiguity.\n", + "panel[\"inspection_budget_share\"] = (\n", + " panel[\"ogi_total_budget\"] / (panel[\"ogi_total_budget\"] + panel[\"erd_total_budget\"])\n", + ")\n", + "\n", + "# Fill violation NaNs for districts with zero violations in a given year\n", + "fill_cols = [\n", + " \"total_violations\", \"unique_wells_with_violations\", \"major_violations\",\n", + " \"resolution_rate\", \"enforcement_rate\", \"avg_days_to_enforcement\",\n", + " \"violations_per_inspection\",\n", + "]\n", + "panel[fill_cols] = panel[fill_cols].fillna(0)\n", + "\n", + "print(f\"Analysis panel: {len(panel):,} rows | \"\n", + " f\"{panel['district'].nunique()} districts | \"\n", + " f\"{panel['year'].nunique()} years\")\n", + "panel.head()\n" + ] + }, + { + "cell_type": "markdown", + "id": "3558bae7", + "metadata": {}, + "source": [ + "## 2. Exploratory Overview" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "92921756", + "metadata": {}, + "outputs": [], + "source": [ + "# Year-level means across districts\n", + "yearly = panel.groupby(\"year\").agg(\n", + " ogi_budget_m = (\"ogi_budget_m\", \"first\"),\n", + " ogi_fte = (\"ogi_fte\", \"first\"),\n", + " total_inspections = (\"total_inspections\", \"mean\"),\n", + " compliance_rate = (\"compliance_rate\", \"mean\"),\n", + " total_violations = (\"total_violations\", \"mean\"),\n", + " resolution_rate = (\"resolution_rate\", \"mean\"),\n", + " avg_days_to_enf = (\"avg_days_to_enforcement\",\"mean\"),\n", + ").round(2)\n", + "\n", + "print(yearly.to_string())\n", + "\n", + "fig, axes = plt.subplots(2, 3, figsize=(16, 8))\n", + "axes = axes.flatten()\n", + "\n", + "yearly[\"ogi_budget_m\"].plot(ax=axes[0], marker=\"o\", title=\"OGI Budget ($M)\")\n", + "axes[0].yaxis.set_major_formatter(mticker.FuncFormatter(lambda x, _: f\"${x:.0f}M\"))\n", + "\n", + "yearly[\"ogi_fte\"].plot(ax=axes[1], marker=\"o\", title=\"OGI FTE Positions\")\n", + "\n", + "yearly[\"total_inspections\"].plot(ax=axes[2], marker=\"o\", title=\"Avg Inspections / District\")\n", + "\n", + "yearly[\"compliance_rate\"].plot(ax=axes[3], marker=\"o\", title=\"Avg Compliance Rate (%)\")\n", + "\n", + "yearly[\"resolution_rate\"].plot(ax=axes[4], marker=\"o\", title=\"Avg Resolution Rate (%)\")\n", + "\n", + "yearly[\"avg_days_to_enf\"].plot(ax=axes[5], marker=\"o\", title=\"Avg Days to Enforcement\")\n", + "\n", + "for ax in axes:\n", + " ax.axvline(2019, color=\"red\", linestyle=\"--\", alpha=0.5, label=\"2019 policy\")\n", + " ax.set_xlabel(\"Year\")\n", + "\n", + "plt.tight_layout()\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d2671c9", + "metadata": {}, + "outputs": [], + "source": [ + "corr_cols = [\n", + " \"ogi_budget_m\", \"ogi_fte\", \"inspection_budget_share\",\n", + " \"total_inspections\", \"compliance_rate\",\n", + " \"total_violations\", \"resolution_rate\", \"avg_days_to_enforcement\",\n", + "]\n", + "corr = panel[corr_cols].corr().round(2)\n", + "\n", + "fig, ax = plt.subplots(figsize=(9, 7))\n", + "im = ax.imshow(corr, cmap=\"RdBu_r\", vmin=-1, vmax=1)\n", + "ax.set_xticks(range(len(corr_cols)))\n", + "ax.set_yticks(range(len(corr_cols)))\n", + "ax.set_xticklabels(corr_cols, rotation=45, ha=\"right\", fontsize=9)\n", + "ax.set_yticklabels(corr_cols, fontsize=9)\n", + "for i in range(len(corr_cols)):\n", + " for j in range(len(corr_cols)):\n", + " ax.text(j, i, corr.iloc[i, j], ha=\"center\", va=\"center\", fontsize=8)\n", + "plt.colorbar(im, ax=ax)\n", + "ax.set_title(\"Correlation Matrix — Key Variables\")\n", + "plt.tight_layout()\n", + "plt.show()\n" + ] + }, + { + "cell_type": "markdown", + "id": "3ca1410a", + "metadata": {}, + "source": [ + "## H1: Organizational Capacity → Policy Outputs\n", + "\n", + "**Prediction:** Higher OGI budget and FTE predict more inspections, higher compliance rates, and faster violation resolution.\n", + "\n", + "**Model:** OLS with district fixed effects and year 2016–2023 (excluding 2024 budget estimate). Budget varies only over time, so it identifies via year-to-year changes in statewide RRC allocations; district FE absorbs persistent cross-district differences.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "463387d3", + "metadata": {}, + "outputs": [], + "source": [ + "actuals = panel[panel[\"is_budget_year\"] == 0].copy()\n", + "\n", + "fig, axes = plt.subplots(1, 3, figsize=(15, 4))\n", + "\n", + "actuals.plot.scatter(x=\"ogi_budget_m\", y=\"total_inspections\",\n", + " alpha=0.4, ax=axes[0], title=\"Budget → Inspections\")\n", + "actuals.plot.scatter(x=\"ogi_budget_m\", y=\"compliance_rate\",\n", + " alpha=0.4, ax=axes[1], title=\"Budget → Compliance Rate (%)\")\n", + "actuals.plot.scatter(x=\"ogi_budget_m\", y=\"resolution_rate\",\n", + " alpha=0.4, ax=axes[2], title=\"Budget → Resolution Rate (%)\")\n", + "\n", + "for ax in axes:\n", + " ax.set_xlabel(\"OGI Budget ($M)\")\n", + "\n", + "plt.tight_layout()\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "535fc4eb", + "metadata": {}, + "outputs": [], + "source": [ + "m_inspections = smf.ols(\n", + " \"total_inspections ~ ogi_budget_m + C(district)\",\n", + " data=actuals,\n", + ").fit(cov_type=\"cluster\", cov_kwds={\"groups\": actuals[\"district\"]})\n", + "\n", + "m_compliance = smf.ols(\n", + " \"compliance_rate ~ ogi_budget_m + C(district)\",\n", + " data=actuals,\n", + ").fit(cov_type=\"cluster\", cov_kwds={\"groups\": actuals[\"district\"]})\n", + "\n", + "m_resolution = smf.ols(\n", + " \"resolution_rate ~ ogi_budget_m + C(district)\",\n", + " data=actuals,\n", + ").fit(cov_type=\"cluster\", cov_kwds={\"groups\": actuals[\"district\"]})\n", + "\n", + "display_cols = [\"Coef.\", \"Std.Err.\", \"t\", \"P>|t|\"]\n", + "\n", + "print(\"H1a — OGI Budget ($M) → Total Inspections\")\n", + "print(m_inspections.summary2().tables[1][display_cols].loc[[\"ogi_budget_m\"]])\n", + "print(f\" R² = {m_inspections.rsquared:.3f} Adj. R² = {m_inspections.rsquared_adj:.3f}\\n\")\n", + "\n", + "print(\"H1b — OGI Budget ($M) → Compliance Rate (%)\")\n", + "print(m_compliance.summary2().tables[1][display_cols].loc[[\"ogi_budget_m\"]])\n", + "print(f\" R² = {m_compliance.rsquared:.3f} Adj. R² = {m_compliance.rsquared_adj:.3f}\\n\")\n", + "\n", + "print(\"H1c — OGI Budget ($M) → Resolution Rate (%)\")\n", + "print(m_resolution.summary2().tables[1][display_cols].loc[[\"ogi_budget_m\"]])\n", + "print(f\" R² = {m_resolution.rsquared:.3f} Adj. R² = {m_resolution.rsquared_adj:.3f}\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "56add68a", + "metadata": {}, + "source": [ + "## H2: Goal Ambiguity Moderates Capacity Effects\n", + "\n", + "**Prediction:** When a larger share of the combined RRC budget flows to the broader \"Energy Resource Development\" goal (lower `inspection_budget_share`), the capacity → output link weakens.\n", + "\n", + "**Operationalization:**\n", + "`inspection_budget_share = ogi_budget / (ogi_budget + erd_budget)`\n", + "\n", + "A negative interaction coefficient `ogi_budget_m × inspection_budget_share` would be unexpected (higher share → weaker effect). A positive coefficient supports H2 — clearer mission focus amplifies the budget → compliance relationship.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "24187ce8", + "metadata": {}, + "outputs": [], + "source": [ + "m_h2 = smf.ols(\n", + " \"compliance_rate ~ ogi_budget_m * inspection_budget_share + C(district)\",\n", + " data=actuals,\n", + ").fit(cov_type=\"cluster\", cov_kwds={\"groups\": actuals[\"district\"]})\n", + "\n", + "key_rows = [\"ogi_budget_m\", \"inspection_budget_share\", \"ogi_budget_m:inspection_budget_share\"]\n", + "print(\"H2 — Goal Ambiguity Moderation (DV: compliance_rate)\")\n", + "print(m_h2.summary2().tables[1][display_cols].loc[key_rows])\n", + "print(f\"\\nR² = {m_h2.rsquared:.3f} Adj. R² = {m_h2.rsquared_adj:.3f}\")\n", + "\n", + "# ── Same model with resolution rate as DV ────────────────────────────────────\n", + "m_h2_res = smf.ols(\n", + " \"resolution_rate ~ ogi_budget_m * inspection_budget_share + C(district)\",\n", + " data=actuals,\n", + ").fit(cov_type=\"cluster\", cov_kwds={\"groups\": actuals[\"district\"]})\n", + "\n", + "print(\"\\nH2 — Goal Ambiguity Moderation (DV: resolution_rate)\")\n", + "print(m_h2_res.summary2().tables[1][display_cols].loc[key_rows])\n", + "print(f\"\\nR² = {m_h2_res.rsquared:.3f} Adj. R² = {m_h2_res.rsquared_adj:.3f}\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "b6583857", + "metadata": {}, + "source": [ + "## H3: District Multilevel Effects\n", + "\n", + "**Prediction:** The budget → output slope varies across RRC districts — some districts translate budget increases into better outputs more effectively than others.\n", + "\n", + "**Model:** Interaction `ogi_budget_m × C(district)` — the reference district captures the baseline budget slope; interaction terms show how each other district's slope differs.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "151faefd", + "metadata": {}, + "outputs": [], + "source": [ + "m_h3 = smf.ols(\n", + " \"compliance_rate ~ ogi_budget_m * C(district)\",\n", + " data=actuals,\n", + ").fit(cov_type=\"cluster\", cov_kwds={\"groups\": actuals[\"district\"]})\n", + "\n", + "coef_table = m_h3.summary2().tables[1]\n", + "\n", + "# Baseline budget slope (reference district)\n", + "baseline_row = coef_table.loc[[\"ogi_budget_m\"]]\n", + "print(\"H3 — District-Heterogeneous Budget Effect (DV: compliance_rate)\")\n", + "print(f\"Baseline (reference district) budget slope:\")\n", + "print(baseline_row[display_cols])\n", + "\n", + "# District-specific deviations from baseline\n", + "interaction_rows = coef_table[coef_table.index.str.contains(\"ogi_budget_m:C\")]\n", + "print(\"\\nDistrict interaction terms (deviation from reference slope):\")\n", + "print(interaction_rows[display_cols].round(4))\n", + "print(f\"\\nR² = {m_h3.rsquared:.3f} Adj. R² = {m_h3.rsquared_adj:.3f}\")\n", + "\n", + "# ── Plot district-specific budget slopes ─────────────────────────────────────\n", + "districts = actuals[\"district\"].unique()\n", + "slopes = {}\n", + "for d in districts:\n", + " key = f\"ogi_budget_m:C(district)[T.{d}]\"\n", + " base = m_h3.params.get(\"ogi_budget_m\", 0)\n", + " delta = m_h3.params.get(key, 0)\n", + " slopes[str(d)] = base + delta\n", + "\n", + "slope_df = pd.Series(slopes).sort_values()\n", + "fig, ax = plt.subplots(figsize=(10, 4))\n", + "slope_df.plot.barh(ax=ax, color=[\"#d62728\" if v < 0 else \"#1f77b4\" for v in slope_df])\n", + "ax.axvline(0, color=\"black\", linewidth=0.8)\n", + "ax.set_xlabel(\"Budget slope (compliance rate pp per $M)\")\n", + "ax.set_title(\"H3 — District-Specific Budget → Compliance Slopes\")\n", + "plt.tight_layout()\n", + "plt.show()\n" + ] + }, + { + "cell_type": "markdown", + "id": "5bb27b3e", + "metadata": {}, + "source": [ + "## H4: Spatial and Geographic Factors\n", + "\n", + "**Predictions:**\n", + "- Offshore-jurisdiction districts (02, 03, 04) show a different budget → output relationship due to dual onshore/offshore oversight burden.\n", + "- Border-proximate districts show a different relationship due to cross-jurisdiction enforcement complexity.\n", + "- Spatial autocorrelation in H1 residuals (Moran's I) would indicate unmodeled geographic spillovers.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6e56f00", + "metadata": {}, + "outputs": [], + "source": [ + "# Texas RRC district geography flags (based on known RRC district locations)\n", + "OFFSHORE_DISTRICTS = {\"02\", \"03\", \"04\"} # dual onshore + offshore jurisdiction\n", + "BORDER_DISTRICTS = {\"01\", \"02\", \"03\", \"04\"} # south / gulf coast proximity\n", + "\n", + "actuals = actuals.copy()\n", + "actuals[\"district_str\"] = actuals[\"district\"].astype(str).str.strip()\n", + "actuals[\"offshore\"] = actuals[\"district_str\"].isin(OFFSHORE_DISTRICTS).astype(int)\n", + "actuals[\"border\"] = actuals[\"district_str\"].isin(BORDER_DISTRICTS).astype(int)\n", + "\n", + "print(\"District classification:\")\n", + "print(\n", + " actuals.groupby([\"district_str\", \"offshore\", \"border\"])\n", + " .size()\n", + " .reset_index(name=\"district_year_obs\")\n", + " .to_string(index=False)\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "74686bfe", + "metadata": {}, + "outputs": [], + "source": [ + "# ── Spatial regression: offshore and border interactions ─────────────────────\n", + "m_h4 = smf.ols(\n", + " \"compliance_rate ~ ogi_budget_m + offshore + border \"\n", + " \"+ ogi_budget_m:offshore + ogi_budget_m:border + C(district)\",\n", + " data=actuals,\n", + ").fit(cov_type=\"cluster\", cov_kwds={\"groups\": actuals[\"district\"]})\n", + "\n", + "spatial_rows = [\n", + " \"ogi_budget_m\", \"offshore\", \"border\",\n", + " \"ogi_budget_m:offshore\", \"ogi_budget_m:border\",\n", + "]\n", + "available = [r for r in spatial_rows if r in m_h4.params.index]\n", + "print(\"H4 — Spatial Moderators (DV: compliance_rate)\")\n", + "print(m_h4.summary2().tables[1][display_cols].loc[available])\n", + "print(f\"\\nR² = {m_h4.rsquared:.3f} Adj. R² = {m_h4.rsquared_adj:.3f}\")\n", + "\n", + "# ── Moran's I on H1 residuals ─────────────────────────────────────────────────\n", + "# Compute district centroids from well lat/lon joined via inspections\n", + "centroids_sql = \"\"\"\n", + "SELECT\n", + " i.district,\n", + " AVG(w.latitude) AS lat,\n", + " AVG(w.longitude) AS lon\n", + "FROM inspections i\n", + "JOIN well_shape_tract w USING (api_norm)\n", + "WHERE w.latitude IS NOT NULL\n", + " AND w.longitude IS NOT NULL\n", + " AND i.district IS NOT NULL\n", + "GROUP BY i.district\n", + "\"\"\"\n", + "\n", + "try:\n", + " centroids = pd.read_sql(text(centroids_sql), engine)\n", + "\n", + " # Average H1 compliance residuals to district level\n", + " resid_df = actuals[[\"district\", \"compliance_rate\"]].copy()\n", + " resid_df[\"resid\"] = m_compliance.resid.reindex(actuals.index).values\n", + " resid_by_district = resid_df.groupby(\"district\")[\"resid\"].mean().reset_index()\n", + "\n", + " centroids = centroids.merge(resid_by_district, on=\"district\").dropna()\n", + "\n", + " # Row-normalised inverse-distance weights matrix\n", + " coords = centroids[[\"lon\", \"lat\"]].values\n", + " D = cdist(coords, coords)\n", + " np.fill_diagonal(D, np.inf)\n", + " W = 1 / D\n", + " W = W / W.sum(axis=1, keepdims=True)\n", + "\n", + " z = centroids[\"resid\"].values\n", + " z = z - z.mean()\n", + " n = len(z)\n", + " morans_i = (n / W.sum()) * (z @ W @ z) / (z @ z)\n", + "\n", + " print(f\"\\nMoran's I on H1 compliance residuals = {morans_i:.4f}\")\n", + " print(\" > 0 → residuals cluster spatially (similar neighbours)\")\n", + " print(\" ≈ 0 → no spatial pattern\")\n", + " print(\" < 0 → spatial dispersion (dissimilar neighbours)\")\n", + "\n", + " print(\"\\nDistrict centroids used:\")\n", + " print(centroids[[\"district\", \"lat\", \"lon\"]].round(2).to_string(index=False))\n", + "\n", + "except Exception as e:\n", + " print(f\"Moran's I skipped: {e}\")\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.9.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}