{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Well Analyzer Notebook Templates\n", "Use these cells as starting points for future analysis notebooks." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1. Imports & Environment" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os\n", "import sys\n", "from pathlib import Path\n", "\n", "import pandas as pd\n", "\n", "# Put the parent directory on sys.path so the project-local `analysis` package\n", "# imports below. `sys` is imported explicitly instead of being reached through\n", "# `os.sys`, which only works as a side effect of how the os module is written.\n", "# NOTE(review): assumes the kernel's working directory is one level below the\n", "# repository root - confirm for your setup.\n", "repo_root = Path('..').resolve()\n", "if str(repo_root) not in sys.path:\n", "    sys.path.insert(0, str(repo_root))\n", "\n", "from analysis.well_analyzer import WellAnalyzer\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2. Instantiate the analyzer" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2025-11-08 20:31:14,124 - INFO - Connecting to Postgres\n", "2025-11-08 20:32:36,129 - INFO - Loaded 1010431 wells from public.well_enriched_all_plus\n", "2025-11-08 20:32:55,260 - INFO - Loaded 2151839 inspections from public.inspections\n", "2025-11-08 20:32:58,951 - INFO - Loaded 242899 violations from public.violations\n" ] }, { "data": { "text/plain": [ "{'total_wells': 1010431,\n", " 'unique_census_tracts': 2981,\n", " 'total_inspections': 2151839,\n", " 'total_violations': 242899}" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "analyzer = WellAnalyzer(chunk_size=50_000)\n", "summary_stats = analyzer.get_summary_stats()\n", "summary_stats\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3. Summary stats as DataFrame" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
 value
total_wells1,010,431.00
unique_census_tracts2,981.00
total_inspections2,151,839.00
total_violations242,899.00
\n" ], "text/plain": [ "" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Turn the one-row stats frame on its side, name the single column 'value',\n", "# and render it with thousands separators and two decimals.\n", "(\n", "    pd.DataFrame([summary_stats])\n", "    .T\n", "    .rename(columns={0: 'value'})\n", "    .style\n", "    .format({'value': '{:,.2f}'})\n", ")\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 4. Inspection analysis helpers" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "application/vnd.microsoft.datawrangler.viewer.v0+json": { "columns": [ { "name": "index", "rawType": "object", "type": "string" }, { "name": "value", "rawType": "float64", "type": "float" } ], "ref": "648d5bab-d6a9-4e87-95a4-d81a3087ad63", "rows": [ [ "total_inspections", "2151839.0" ], [ "unique_wells_inspected", "483352.0" ], [ "overall_compliance_rate", "89.15035000295096" ], [ "avg_days_between_inspections", "548.291420910082" ], [ "median_days_between_inspections", "322.0" ] ], "shape": { "columns": 1, "rows": 5 } }, "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
value
total_inspections2.151839e+06
unique_wells_inspected4.833520e+05
overall_compliance_rate8.915035e+01
avg_days_between_inspections5.482914e+02
median_days_between_inspections3.220000e+02
\n", "
" ], "text/plain": [ " value\n", "total_inspections 2.151839e+06\n", "unique_wells_inspected 4.833520e+05\n", "overall_compliance_rate 8.915035e+01\n", "avg_days_between_inspections 5.482914e+02\n", "median_days_between_inspections 3.220000e+02" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "inspection_analysis = analyzer.analyze_inspection_patterns()\n", "\n", "# Show the 'overall_statistics' entry as a one-column table named 'value'.\n", "pd.Series(inspection_analysis['overall_statistics'], name='value').to_frame()\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 5. Violations slice" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "application/vnd.microsoft.datawrangler.viewer.v0+json": { "columns": [ { "name": "index", "rawType": "int64", "type": "integer" }, { "name": "canonical_api10", "rawType": "object", "type": "string" }, { "name": "violation_disc_date", "rawType": "datetime64[ns]", "type": "datetime" }, { "name": "violated_rule", "rawType": "object", "type": "string" }, { "name": "major_viol_ind", "rawType": "object", "type": "string" } ], "ref": "7a3969db-0981-48f8-846b-31946d7c0e64", "rows": [ [ "0", "4233530876", "2017-09-19 00:00:00", "SWR 91(d)(1)", "N" ], [ "1", "4233532284", "2017-07-26 00:00:00", "SWR 91(d)(1)", "N" ], [ "2", "4233532284", "2017-09-13 00:00:00", "SWR 91(d)(1)", "N" ], [ "3", "4210300169", "2017-10-25 00:00:00", "SWR 91(d)(1)", "N" ], [ "4", "4222736906", "2016-02-02 00:00:00", "SWR 91(d)(1)", "N" ] ], "shape": { "columns": 4, "rows": 5 } }, "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
canonical_api10violation_disc_dateviolated_rulemajor_viol_ind
042335308762017-09-19SWR 91(d)(1)N
142335322842017-07-26SWR 91(d)(1)N
242335322842017-09-13SWR 91(d)(1)N
342103001692017-10-25SWR 91(d)(1)N
442227369062016-02-02SWR 91(d)(1)N
\n", "
" ], "text/plain": [ " canonical_api10 violation_disc_date violated_rule major_viol_ind\n", "0 4233530876 2017-09-19 SWR 91(d)(1) N\n", "1 4233532284 2017-07-26 SWR 91(d)(1) N\n", "2 4233532284 2017-09-13 SWR 91(d)(1) N\n", "3 4210300169 2017-10-25 SWR 91(d)(1) N\n", "4 4222736906 2016-02-02 SWR 91(d)(1) N" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Keep only the four violation columns this template works with.\n", "violations_df = analyzer.data['violations'][\n", "    [\n", "        'canonical_api10',\n", "        'violation_disc_date',\n", "        'violated_rule',\n", "        'major_viol_ind',\n", "    ]\n", "]\n", "violations_df.head()\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 6. Environmental-justice aggregation" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "application/vnd.microsoft.datawrangler.viewer.v0+json": { "columns": [ { "name": "index", "rawType": "object", "type": "string" }, { "name": "high", "rawType": "float64", "type": "float" }, { "name": "low", "rawType": "float64", "type": "float" } ], "ref": "5ab7f2d3-3c34-4a70-83f8-23a6929c2e30", "rows": [ [ "avg_inspections", "5.668602168650754", "5.8424998043529195" ], [ "avg_violations", "0.6798208564386784", "0.7418603711839766" ], [ "major_violations", "0.01963439404197698", "0.02503382949932341" ], [ "avg_compliance_rate", "91.80852389969775", "91.22339687712729" ], [ "avg_days_between_inspections", "757.1575143021564", "732.3780043474643" ], [ "reinspection_compliance_rate", "13.868187092556035", "14.88125749212477" ], [ "wells_in_tract", "307.38253215978335", "374.30311231393773" ] ], "shape": { "columns": 2, "rows": 7 } }, "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
highlow
avg_inspections5.6686025.842500
avg_violations0.6798210.741860
major_violations0.0196340.025034
avg_compliance_rate91.80852491.223397
avg_days_between_inspections757.157514732.378004
reinspection_compliance_rate13.86818714.881257
wells_in_tract307.382532374.303112
\n", "
" ], "text/plain": [ " high low\n", "avg_inspections 5.668602 5.842500\n", "avg_violations 0.679821 0.741860\n", "major_violations 0.019634 0.025034\n", "avg_compliance_rate 91.808524 91.223397\n", "avg_days_between_inspections 757.157514 732.378004\n", "reinspection_compliance_rate 13.868187 14.881257\n", "wells_in_tract 307.382532 374.303112" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ej = analyzer.analyze_environmental_justice()\n", "pd.DataFrame(ej['high_vulnerability_vs_low']).T\n" ] }, { "cell_type": "markdown", "id": "c233c217", "metadata": {}, "source": [ "## 7. District comparisons\n", "Group inspections by district (alphanumeric-safe) to see volume + compliance deltas." ] }, { "cell_type": "code", "execution_count": null, "id": "b4ada35c", "metadata": {}, "outputs": [], "source": [ "insp = analyzer.data['inspections'].copy()\n", "if 'district' not in insp.columns:\n", "    raise KeyError('district column missing in inspections data')\n", "\n", "# Label missing districts BEFORE casting to text: astype(str) turns NaN into the\n", "# literal string 'nan', so a fillna('Unknown') applied afterwards never matches.\n", "district_known = insp['district'].notna()\n", "insp['district_str'] = insp['district'].astype(object).where(district_known, 'Unknown').astype(str)\n", "\n", "\n", "def compliance_pct(flags):\n", "    # Percent of the group's rows whose compliance value equals 'Yes'.\n", "    # NaN is returned only when the group has no recorded compliance value at all;\n", "    # a group that has values but no 'Yes' is a genuine 0%, not missing data.\n", "    if not flags.notna().any():\n", "        return float('nan')\n", "    return (flags == 'Yes').mean() * 100\n", "\n", "\n", "# One row per district label, largest inspection volume first.\n", "summary = (\n", "    insp.groupby('district_str')\n", "    .agg(\n", "        inspections=('district_str', 'size'),\n", "        unique_wells=('canonical_api10', 'nunique'),\n", "        compliance_rate=('compliance', compliance_pct),\n", "    )\n", "    .sort_values('inspections', ascending=False)\n", ")\n", "summary.head(15)\n" ] } ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.13.7" } }, "nbformat": 4, "nbformat_minor": 5 }