updated analysis with offshore considerations

This commit is contained in:
2026-02-18 17:07:03 -08:00
parent 61d274d6f3
commit 1342b06871
27 changed files with 9153 additions and 236 deletions

Binary file not shown.

After

Width:  |  Height:  |  Size: 142 KiB

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

Binary file not shown.

After

Width:  |  Height:  |  Size: 451 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 142 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 197 KiB

File diff suppressed because one or more lines are too long

Binary file not shown.

After

Width:  |  Height:  |  Size: 154 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 231 KiB

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,781 @@
from __future__ import annotations
import json
import logging
import os
import warnings
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional
from urllib.parse import quote_plus
import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import Engine, create_engine, text
from sqlalchemy.exc import SQLAlchemyError
# Configure logging early so that downstream modules inherit the settings.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
# Pandas throws a lot of noisy warnings when casting timestamps out of Postgres.
# NOTE(review): this silences *all* UserWarnings process-wide, not only pandas' — confirm that is intended.
warnings.filterwarnings("ignore", category=UserWarning)
# Exception hierarchy: callers can catch WellAnalyzerError to handle any
# analyzer failure, or one of the subclasses for a specific failure stage.
class WellAnalyzerError(Exception):
    """Base exception class for all analyzer failures."""


class ConfigError(WellAnalyzerError):
    """Raised when we cannot build a working configuration."""


class DataLoadError(WellAnalyzerError):
    """Raised when data cannot be loaded from the warehouse."""


class AnalysisError(WellAnalyzerError):
    """Raised when a downstream analytic task fails."""
@dataclass
class Config:
    """Runtime configuration for the analyzer."""

    # Live SQLAlchemy engine every query runs through.
    engine: Engine
    # Batch size handed to pandas' chunked reads (see WellAnalyzer._execute_query).
    chunk_size: int = 10_000
    # Local directory for cached artifacts (not referenced in the code visible here).
    cache_dir: Path = Path("./cache")
    # Resolved source table/view names; populated by WellAnalyzer.__init__.
    well_source: str = ""
    inspections_source: str = ""
    violations_source: str = ""
def _load_engine_from_env() -> Engine:
    """Build a SQLAlchemy engine using PG* environment variables.

    Reads PGHOST/PGPORT/PGUSER/PGPASSWORD/PGDATABASE (after loading a local
    .env file, without overriding already-set variables) and constructs a
    postgresql+psycopg2 URL.

    Returns:
        A newly created SQLAlchemy Engine.

    Raises:
        ConfigError: if engine creation fails; the raised message contains a
            credential-redacted URL so the password never leaks into logs.
    """
    load_dotenv(override=False)
    host = os.getenv("PGHOST", "localhost")
    port = os.getenv("PGPORT", "5432")
    user = os.getenv("PGUSER", "postgres")
    password = quote_plus(os.getenv("PGPASSWORD", ""))
    database = os.getenv("PGDATABASE", "texas_data")
    url = f"postgresql+psycopg2://{user}:{password}@{host}:{port}/{database}"
    # Redacted variant for any message that may end up in logs or tracebacks.
    safe_url = f"postgresql+psycopg2://{user}:***@{host}:{port}/{database}"
    logger.info("Connecting to Postgres", extra={"host": host, "database": database})
    try:
        return create_engine(url)
    except SQLAlchemyError as exc:
        # Original code interpolated `url` here, exposing the raw password.
        raise ConfigError(f"Failed to create engine for {safe_url}: {exc}") from exc
class WellAnalyzer:
    """
    Analyze wells, inspections, violations, and environmental-justice indicators held in the
    rebuilt PostGIS warehouse. The class auto-detects the rebuilt tables that now exist
    in the texas-rebuild-postgis project so it works against both fresh rebuilds and future
    refreshes without hand-editing SQL strings.
    """

    # Canonical well identifier column shared by every loaded DataFrame.
    ID_COLUMN = "api_norm"

    # Table/view names probed in order for well data; the first that resolves wins.
    WELL_SOURCE_CANDIDATES = [
        "well_shape_tract",
    ]

    # Target column name -> acceptable source column names, in preference order.
    # Used by _load_wells to build a SELECT that tolerates schema drift.
    WELL_COLUMN_MAP: Dict[str, List[str]] = {
        "census_tract_geoid": ["census_tract_geoid", "geoid", "tract_geoid"],
        "tract_name": ["tract_name", "name"],
        "ruca_category": ["ruca_category"],
        "ruca_code_2020": ["ruca_code_2020"],
        "ruca_primary_description": ["ruca_primary_description"],
        "ruca_secondary_description": ["ruca_secondary_description"],
        "ej_composite_score": ["ej_composite_score"],
        "pct_minority": ["pct_minority"],
        "pct_hispanic": ["pct_hispanic"],
        "poverty_rate": ["poverty_rate"],
        "unemployment_rate": ["unemployment_rate"],
        "less_than_hs_pct": ["less_than_hs_pct"],
        "linguistic_isolation_rate": ["linguistic_isolation_rate"],
        "renter_cost_burden_rate": ["renter_cost_burden_rate"],
        "disability_rate": ["disability_rate"],
        "pct_under5": ["pct_under5"],
        "pct_65plus": ["pct_65plus"],
        "median_household_income": ["median_household_income"],
        "latitude": ["latitude", "lat"],
        "longitude": ["longitude", "lon", "lng"],
        "basin_label": ["basin_label", "basin_name"],
        "play_label": ["play_label", "play_name"],
        "texmex_name": ["texmex_name"],
        "distance_to_texmex_km": ["distance_to_texmex_km"],
        "within_25km_texmex": ["within_25km_texmex"],
        "within_50km_texmex": ["within_50km_texmex"],
    }
def __init__(
    self,
    engine: Optional[Engine] = None,
    *,
    chunk_size: int = 10_000,
    cache_dir: Optional[Path] = None,
    well_source: Optional[str] = None,
) -> None:
    """Resolve source tables, build the Config, and eagerly load all frames.

    Args:
        engine: Existing SQLAlchemy engine; built from PG* env vars when None.
        chunk_size: Batch size for chunked pandas reads.
        cache_dir: Directory for cached artifacts (defaults to ./cache).
        well_source: Explicit well table/view name; auto-detected when None.

    Raises:
        ConfigError: when the well source, inspections, or violations tables
            cannot be found in the database.
    """
    if engine is None:
        engine = _load_engine_from_env()
    self.config = Config(
        engine=engine,
        chunk_size=chunk_size,
        cache_dir=(cache_dir or Path("./cache")),
    )
    self.config.well_source = well_source or self._detect_table(self.WELL_SOURCE_CANDIDATES)
    if not self.config.well_source:
        # Derive the message from WELL_SOURCE_CANDIDATES; the original text
        # hard-coded a stale list of table names that no longer matched it.
        raise ConfigError(
            "Could not find a well table/view. Expected one of: "
            + ", ".join(self.WELL_SOURCE_CANDIDATES)
        )
    self.config.inspections_source = self._detect_table(["inspections"])
    self.config.violations_source = self._detect_table(["violations"])
    if not self.config.inspections_source or not self.config.violations_source:
        raise ConfigError("Both inspections and violations tables must exist in the database.")
    # Frames loaded from the warehouse, keyed by logical dataset name.
    self.data: Dict[str, pd.DataFrame] = {}
    self._initialize_data()
# --------------------------------------------------------------------------------------
# Helper methods for metadata detection and SQL building
# --------------------------------------------------------------------------------------
def _detect_table(self, candidates: List[str]) -> Optional[str]:
for candidate in candidates:
found = self._table_exists(candidate)
if found:
return found
return None
def _table_exists(self, table: str) -> Optional[str]:
    """Resolve *table* via Postgres to_regclass, trying public schema first.

    Returns the first qualified name that exists, or None when no variant does.
    """
    if "." in table:
        variants = [table]
    else:
        # Unqualified names: prefer the public schema, then the bare name.
        variants = [f"public.{table}", table]
    probe = text("SELECT to_regclass(:name) IS NOT NULL AS exists")
    with self.config.engine.begin() as conn:
        for variant in variants:
            if conn.execute(probe, {"name": variant}).scalar():
                return variant
    return None
def _split_table_name(self, qualified: str) -> Dict[str, Optional[str]]:
if "." in qualified:
schema, table = qualified.split(".", 1)
return {"schema": schema, "table": table}
return {"schema": None, "table": qualified}
def _get_columns(self, table: str) -> Dict[str, str]:
    """Fetch *table*'s columns from information_schema.

    Returns a mapping of lowercased column name -> original column name, in
    ordinal order, so lookups are case-insensitive while SELECTs keep the
    warehouse's exact casing.
    """
    pieces = self._split_table_name(table)
    clauses = [
        "SELECT column_name",
        "FROM information_schema.columns",
        "WHERE table_name = :table",
    ]
    params: Dict[str, Any] = {"table": pieces["table"]}
    schema = pieces["schema"]
    if schema:
        clauses.append("AND table_schema = :schema")
        params["schema"] = schema
    clauses.append("ORDER BY ordinal_position")
    with self.config.engine.begin() as conn:
        rows = conn.execute(text(" ".join(clauses)), params).fetchall()
    return {row[0].lower(): row[0] for row in rows}
def _pick_column(self, columns: Dict[str, str], names: List[str]) -> Optional[str]:
for name in names:
if name.lower() in columns:
return columns[name.lower()]
return None
def _execute_query(self, query: str, params: Optional[Dict[str, Any]] = None) -> pd.DataFrame:
    """Run *query* in chunks and concatenate the pieces into one DataFrame.

    Chunked reads keep peak memory bounded for the multi-million-row tables.
    Returns an empty frame when the query yields no rows.

    Raises:
        DataLoadError: wrapping any SQLAlchemy failure.
    """
    try:
        chunks = list(
            pd.read_sql(
                text(query),
                self.config.engine,
                params=params,
                chunksize=self.config.chunk_size,
            )
        )
        if not chunks:
            return pd.DataFrame()
        return pd.concat(chunks, ignore_index=True)
    except SQLAlchemyError as exc:
        logger.error("Query failed: %s", query, exc_info=True)
        raise DataLoadError(f"Failed executing query: {exc}") from exc
# --------------------------------------------------------------------------------------
# Data loading
# --------------------------------------------------------------------------------------
def _initialize_data(self) -> None:
self.data["well_data"] = self._load_wells()
self.data["inspections"] = self._load_inspections()
self.data["violations"] = self._load_violations()
if not self.data["well_data"].empty and (
not self.data["inspections"].empty or not self.data["violations"].empty
):
self._create_performance_metrics()
def _load_wells(self) -> pd.DataFrame:
table = self.config.well_source
columns = self._get_columns(table)
alias = "w"
api_norm_col = self._pick_column(columns, ["api_norm"])
if not api_norm_col:
raise DataLoadError(f"{table} does not expose api_norm")
select_parts = [f'{alias}."{api_norm_col}" AS {self.ID_COLUMN}']
for target, candidates in self.WELL_COLUMN_MAP.items():
column = self._pick_column(columns, candidates)
if column:
select_parts.append(f'{alias}."{column}" AS {target}')
query = f'SELECT {", ".join(select_parts)} FROM {table} AS {alias}'
df = self._execute_query(query)
df[self.ID_COLUMN] = df[self.ID_COLUMN].astype(str).str.strip()
df = df[df[self.ID_COLUMN].notna()]
df = df.drop_duplicates(subset=[self.ID_COLUMN]).reset_index(drop=True)
logger.info("Loaded %s wells from %s", len(df), table)
return df
def _load_inspections(self) -> pd.DataFrame:
table = self.config.inspections_source
columns = self._get_columns(table)
alias = "i"
select_parts = []
base_candidates = [
"id",
"district",
"county",
"inspection_date",
"inspection_type",
"operator_name",
"field_name",
"compliance",
"file_date",
"created_at",
]
for column in base_candidates:
picked = self._pick_column(columns, [column])
if picked:
select_parts.append(f'{alias}."{picked}" AS {column}')
api_norm_col = self._pick_column(columns, ["api_norm"])
if not api_norm_col:
raise DataLoadError(f"{table} does not expose api_norm")
select_parts.append(f'{alias}."{api_norm_col}" AS {self.ID_COLUMN}')
where_clause = f'WHERE {alias}."{api_norm_col}" IS NOT NULL'
query = f'SELECT {", ".join(select_parts)} FROM {table} AS {alias} {where_clause}'
df = self._execute_query(query)
for col in ["inspection_date", "file_date", "created_at"]:
if col in df.columns:
df[col] = pd.to_datetime(df[col], errors="coerce")
df = df[df[self.ID_COLUMN].notna()].copy()
df = df[df[self.ID_COLUMN].notna()].copy()
if "inspection_date" in df.columns:
df = df.sort_values([self.ID_COLUMN, "inspection_date"])
df["days_since_last_inspection"] = (
df.groupby(self.ID_COLUMN)["inspection_date"].diff().dt.days
)
logger.info("Loaded %s inspections from %s", len(df), table)
return df.reset_index(drop=True)
def _load_violations(self) -> pd.DataFrame:
table = self.config.violations_source
columns = self._get_columns(table)
alias = "v"
select_parts = []
row_id_candidates = ["id", "violation_id", "violationid", "objectid", "row_id"]
base_candidates = [
"operator_name",
"p5_operator_no",
"district",
"oil_lease_gas_well_id",
"lease_fac_name",
"well_no",
"drilling_permit_no",
"field_name",
"violated_rule",
"violated_rule_desc",
"major_viol_ind",
"compliant_on_reinsp",
"last_enf_action",
"last_enf_action_date",
"violation_disc_date",
"file_date",
"created_at",
]
for row_id in row_id_candidates:
picked = self._pick_column(columns, [row_id])
if picked:
select_parts.append(f'{alias}."{picked}" AS {row_id}')
break
for column in base_candidates:
picked = self._pick_column(columns, [column])
if picked:
select_parts.append(f'{alias}."{picked}" AS {column}')
api_norm_col = self._pick_column(columns, ["api_norm"])
if not api_norm_col:
raise DataLoadError(f"{table} does not expose api_norm")
select_parts.append(f'{alias}."{api_norm_col}" AS {self.ID_COLUMN}')
where_clause = f'WHERE {alias}."{api_norm_col}" IS NOT NULL'
query = f'SELECT {", ".join(select_parts)} FROM {table} AS {alias} {where_clause}'
df = self._execute_query(query)
for col in ["violation_disc_date", "last_enf_action_date", "file_date", "created_at"]:
if col in df.columns:
df[col] = pd.to_datetime(df[col], errors="coerce")
df = df[df[self.ID_COLUMN].notna()].reset_index(drop=True)
df = df[df[self.ID_COLUMN].notna()].reset_index(drop=True)
if not df.empty:
row_id_col = next(
(c for c in ["id", "violation_id", "violationid", "objectid", "row_id"] if c in df.columns),
None,
)
if row_id_col is None:
df["violation_row_id"] = range(1, len(df) + 1)
else:
df = df.rename(columns={row_id_col: "violation_row_id"})
df["total_violations"] = df.groupby(self.ID_COLUMN)["violation_row_id"].transform("count")
df["violation_number"] = df.groupby(self.ID_COLUMN).cumcount() + 1
logger.info("Loaded %s violations from %s", len(df), table)
return df
def _create_performance_metrics(self) -> None:
    """Aggregate per-well inspection/violation metrics into self.data["performance_metrics"].

    Produces one row per api_norm with total_inspections, compliance_rate,
    avg_days_between_inspections (when source columns exist) joined against
    total_violations, major_violations, reinspection_compliance_rate.
    """
    insp = self.data.get("inspections", pd.DataFrame()).copy()
    viol = self.data.get("violations", pd.DataFrame()).copy()
    if insp.empty and viol.empty:
        return
    if not insp.empty:
        # Prefer a real row id for counting; fall back to the date column.
        count_col = "id" if "id" in insp.columns else "inspection_date"
        insp_metrics = insp.groupby(self.ID_COLUMN).agg(
            total_inspections=(count_col, "count")
        )
        if "compliance" in insp.columns:
            # Percentage of inspections whose compliance flag is "Yes".
            insp_metrics["compliance_rate"] = (
                insp.groupby(self.ID_COLUMN)["compliance"]
                .apply(lambda x: (x == "Yes").mean() * 100)
            )
        if "days_since_last_inspection" in insp.columns:
            insp_metrics["avg_days_between_inspections"] = (
                insp.groupby(self.ID_COLUMN)["days_since_last_inspection"].mean()
            )
    else:
        insp_metrics = pd.DataFrame()
    if not viol.empty:
        count_col = (
            "violation_row_id"
            if "violation_row_id" in viol.columns
            else ("id" if "id" in viol.columns else self.ID_COLUMN)
        )
        viol_metrics = viol.groupby(self.ID_COLUMN).agg(
            total_violations=(count_col, "count"),
        )
        if "major_viol_ind" in viol.columns:
            viol_metrics["major_violations"] = (
                viol.groupby(self.ID_COLUMN)["major_viol_ind"]
                .apply(lambda x: (x == "Y").sum())
            )
        if "compliant_on_reinsp" in viol.columns:
            viol_metrics["reinspection_compliance_rate"] = (
                viol.groupby(self.ID_COLUMN)["compliant_on_reinsp"]
                .apply(lambda x: (x == "Y").mean() * 100)
            )
    else:
        viol_metrics = pd.DataFrame()
    # Outer join keeps wells present in only one of the two frames;
    # fillna(0) zeroes the missing side's counters (including rates).
    metrics = insp_metrics.join(viol_metrics, how="outer").fillna(0)
    metrics = metrics.reset_index()
    self.data["performance_metrics"] = metrics
# --------------------------------------------------------------------------------------
# Analytic helpers
# --------------------------------------------------------------------------------------
def analyze_inspection_patterns(self) -> Dict[str, Any]:
insp_df = self.data.get("inspections", pd.DataFrame())
if insp_df.empty:
return {}
result: Dict[str, Any] = {
"overall_statistics": {
"total_inspections": int(len(insp_df)),
"unique_wells_inspected": int(insp_df[self.ID_COLUMN].nunique()),
}
}
if "compliance" in insp_df.columns:
result["overall_statistics"]["overall_compliance_rate"] = (
(insp_df["compliance"] == "Yes").mean() * 100
)
if "days_since_last_inspection" in insp_df.columns:
result["overall_statistics"]["avg_days_between_inspections"] = insp_df[
"days_since_last_inspection"
].mean()
result["overall_statistics"]["median_days_between_inspections"] = insp_df[
"days_since_last_inspection"
].median()
if "inspection_date" in insp_df.columns:
insp_df["inspection_date"] = pd.to_datetime(insp_df["inspection_date"], errors="coerce")
insp_df["year"] = insp_df["inspection_date"].dt.year
result["temporal_patterns"] = {
"inspections_by_year": insp_df.groupby("year").size().dropna().to_dict()
}
if "compliance" in insp_df.columns:
result["temporal_patterns"]["compliance_by_year"] = (
insp_df.groupby("year")["compliance"]
.apply(lambda x: (x == "Yes").mean() * 100)
.dropna()
.to_dict()
)
if "district" in insp_df.columns:
district_counts = insp_df.groupby("district").size().to_dict()
district_compliance = {}
if "compliance" in insp_df.columns:
district_compliance = (
insp_df.groupby("district")["compliance"]
.apply(lambda x: (x == "Yes").mean() * 100)
.to_dict()
)
result["district_performance"] = {
"inspections_by_district": district_counts,
"compliance_by_district": district_compliance,
}
return result
def analyze_violations(self) -> Dict[str, Any]:
viol_df = self.data.get("violations", pd.DataFrame())
if viol_df.empty:
return {}
result: Dict[str, Any] = {
"overall_statistics": {
"total_violations": int(len(viol_df)),
"unique_wells_with_violations": int(viol_df[self.ID_COLUMN].nunique()),
},
"violation_types": {},
"enforcement_effectiveness": {},
}
if "major_viol_ind" in viol_df.columns:
result["overall_statistics"]["major_violations"] = int(
(viol_df["major_viol_ind"] == "Y").sum()
)
if "compliant_on_reinsp" in viol_df.columns:
result["overall_statistics"]["compliance_on_reinspection_rate"] = (
(viol_df["compliant_on_reinsp"] == "Y").mean() * 100
)
if "violated_rule" in viol_df.columns:
result["violation_types"]["common_violations"] = (
viol_df["violated_rule"].value_counts().head(10).to_dict()
)
if "major_viol_ind" in viol_df.columns:
result["violation_types"]["major_violation_types"] = (
viol_df[viol_df["major_viol_ind"] == "Y"]["violated_rule"]
.value_counts()
.head(5)
.to_dict()
)
if {"violated_rule", "compliant_on_reinsp"} <= set(viol_df.columns):
result["enforcement_effectiveness"]["resolution_rate_by_type"] = (
viol_df.groupby("violated_rule")["compliant_on_reinsp"]
.apply(lambda x: (x == "Y").mean() * 100)
.to_dict()
)
return result
def analyze_regulatory_chain(self) -> Dict[str, Any]:
    """Link violations back to the preceding inspection of the same well.

    Uses merge_asof (backward direction) to pair each violation with the
    latest inspection at or before its discovery date, then summarizes the
    inspection -> violation -> enforcement funnel and its time spans.
    Returns {} whenever required columns or matched rows are missing.
    """
    insp_df = self.data.get("inspections", pd.DataFrame()).copy()
    viol_df = self.data.get("violations", pd.DataFrame()).copy()
    if insp_df.empty or viol_df.empty:
        return {}
    if "inspection_date" not in insp_df.columns or "violation_disc_date" not in viol_df.columns:
        return {}
    insp_df["inspection_date"] = pd.to_datetime(insp_df["inspection_date"], errors="coerce")
    viol_df["violation_disc_date"] = pd.to_datetime(viol_df["violation_disc_date"], errors="coerce")
    # .get() tolerates a missing enforcement-date column (yields NaT series).
    viol_df["last_enf_action_date"] = pd.to_datetime(
        viol_df.get("last_enf_action_date"), errors="coerce"
    )
    insp_df = insp_df.dropna(subset=[self.ID_COLUMN, "inspection_date"])
    viol_df = viol_df.dropna(subset=[self.ID_COLUMN, "violation_disc_date"])
    if insp_df.empty or viol_df.empty:
        return {}
    # merge_asof requires both sides sorted on the "on" keys.
    insp_sorted = (
        insp_df.sort_values(["inspection_date", self.ID_COLUMN])
        .reset_index(drop=True)
    )
    viol_sorted = (
        viol_df.sort_values(["violation_disc_date", self.ID_COLUMN])
        .reset_index(drop=True)
    )
    # For each violation, the most recent inspection of the same well on or
    # before the discovery date; unmatched violations are dropped.
    matched_df = pd.merge_asof(
        viol_sorted,
        insp_sorted,
        left_on="violation_disc_date",
        right_on="inspection_date",
        by=self.ID_COLUMN,
        direction="backward",
        suffixes=("_viol", "_insp"),
    ).dropna(subset=["inspection_date"])
    if matched_df.empty:
        return {}
    total_inspections = len(insp_df)
    # Suffixed "id_insp" exists only when both sides carried an "id" column.
    inspection_id_col = "id_insp" if "id_insp" in matched_df.columns else "inspection_date"
    inspections_with_violations = matched_df[inspection_id_col].nunique()
    violation_rate = (inspections_with_violations / total_inspections) * 100 if total_inspections else 0
    # Rows with all three dates present; needed for span arithmetic below.
    time_spans = matched_df.dropna(
        subset=["inspection_date", "violation_disc_date", "last_enf_action_date"]
    ).copy()
    time_spans["insp_to_viol"] = (
        time_spans["violation_disc_date"] - time_spans["inspection_date"]
    ).dt.days
    time_spans["viol_to_enforce"] = (
        time_spans["last_enf_action_date"] - time_spans["violation_disc_date"]
    ).dt.days
    time_spans["total_span"] = time_spans["insp_to_viol"] + time_spans["viol_to_enforce"]
    # NOTE(review): the "enforcement_rate" lookup assumes matched_df carries a
    # last_enf_action column — confirm the violations source always has it.
    enforcement_patterns = {
        "action_types": viol_df["last_enf_action"].value_counts().to_dict()
        if "last_enf_action" in viol_df.columns
        else {},
        "enforcement_rate": (
            matched_df["last_enf_action"].notna().sum() / len(matched_df) * 100
        ),
    }
    if not time_spans.empty:
        enforcement_patterns["avg_days_to_enforcement"] = time_spans["viol_to_enforce"].mean()
    return {
        "summary": {
            "total_inspections": total_inspections,
            "violation_rate": violation_rate,
            "unique_wells_inspected": int(insp_df[self.ID_COLUMN].nunique()),
        },
        "conversion_funnel": {
            "inspections_with_violations": inspections_with_violations,
            "violations_with_enforcement": int(matched_df["last_enf_action"].notna().sum()),
        },
        "enforcement_patterns": enforcement_patterns,
        "time_spans": time_spans.describe().to_dict() if not time_spans.empty else {},
    }
def analyze_environmental_justice(self) -> Dict[str, Any]:
    """Aggregate well performance to census tracts and relate it to EJ indicators.

    Joins per-well performance metrics (when available), rolls everything up
    to one row per census tract, computes Spearman correlations between
    demographic and performance variables, and compares tracts above vs.
    below the median on vulnerability and poverty. Returns {} when well
    data or tract ids are unavailable.
    """
    well_df = self.data.get("well_data", pd.DataFrame()).copy()
    if well_df.empty or "census_tract_geoid" not in well_df.columns:
        return {}
    metrics = self.data.get("performance_metrics")
    if metrics is not None and not metrics.empty:
        # Attach per-well metrics so they can be averaged per tract.
        well_df = well_df.merge(metrics, on=self.ID_COLUMN, how="left")
    tract_df = well_df.dropna(subset=["census_tract_geoid"]).copy()
    if tract_df.empty:
        return {}
    # api_norm count doubles as the wells-per-tract counter.
    agg_map: Dict[str, str] = {self.ID_COLUMN: "count"}
    mean_cols = [
        "ej_composite_score",
        "pct_minority",
        "pct_hispanic",
        "poverty_rate",
        "median_household_income",
        "avg_days_between_inspections",
        "reinspection_compliance_rate",
        "compliance_rate",
    ]
    for col in mean_cols:
        if col in tract_df.columns:
            agg_map[col] = "mean"
    if "total_inspections" in tract_df.columns:
        agg_map["total_inspections"] = "mean"
    if "total_violations" in tract_df.columns:
        agg_map["total_violations"] = "mean"
    if "major_violations" in tract_df.columns:
        agg_map["major_violations"] = "sum"
    tract_summary = (
        tract_df.groupby("census_tract_geoid")
        .agg(agg_map)
        .rename(columns={self.ID_COLUMN: "wells_in_tract"})
        .reset_index()
    )
    # Per-well means become per-tract averages; rename to reflect that.
    rename_map = {
        "total_inspections": "avg_inspections",
        "total_violations": "avg_violations",
        "compliance_rate": "avg_compliance_rate",
    }
    tract_summary = tract_summary.rename(columns=rename_map)
    demographic_vars = [
        col
        for col in [
            "ej_composite_score",
            "pct_minority",
            "pct_hispanic",
            "poverty_rate",
            "median_household_income",
        ]
        if col in tract_summary.columns
    ]
    performance_vars = [
        col
        for col in [
            "avg_inspections",
            "avg_violations",
            "major_violations",
            "avg_compliance_rate",
            "avg_days_between_inspections",
            "reinspection_compliance_rate",
            "wells_in_tract",
        ]
        if col in tract_summary.columns
    ]
    # Spearman rank correlation: robust to skewed distributions (income, counts).
    correlations: Dict[str, Dict[str, float]] = {dv: {} for dv in demographic_vars}
    for d in demographic_vars:
        for p in performance_vars:
            correlations[d][p] = tract_summary[d].corr(tract_summary[p], method="spearman")

    def split_high_low(column: str) -> Dict[str, Dict[str, float]]:
        # Median split on *column*: compare mean performance of the two halves.
        result: Dict[str, Dict[str, float]] = {}
        if column not in tract_summary.columns:
            return result
        median_value = tract_summary[column].median()
        high = tract_summary[tract_summary[column] > median_value]
        low = tract_summary[tract_summary[column] <= median_value]
        for metric in performance_vars:
            result[metric] = {
                "high": high[metric].mean() if not high.empty else float("nan"),
                "low": low[metric].mean() if not low.empty else float("nan"),
            }
        return result

    return {
        "summary": {
            "total_tracts": int(len(tract_summary)),
            "total_wells": int(tract_summary["wells_in_tract"].sum()),
            "avg_wells_per_tract": tract_summary["wells_in_tract"].mean(),
            "avg_ej_score": tract_summary.get("ej_composite_score", pd.Series(dtype=float)).mean(),
        },
        "correlations": correlations,
        "high_vulnerability_vs_low": split_high_low("ej_composite_score"),
        "high_poverty_vs_low": split_high_low("poverty_rate"),
    }
# --------------------------------------------------------------------------------------
# Public helpers
# --------------------------------------------------------------------------------------
def get_analysis(self) -> Dict[str, Any]:
return {
"inspection_analysis": self.analyze_inspection_patterns(),
"violation_analysis": self.analyze_violations(),
"regulatory_chain": self.analyze_regulatory_chain(),
"environmental_justice": self.analyze_environmental_justice(),
}
@staticmethod
def _format_stat(stat_value: Any) -> str:
if stat_value is None:
return "n/a"
if isinstance(stat_value, (int, float)):
if pd.isna(stat_value):
return "n/a"
return f"{stat_value:,.2f}"
return str(stat_value)
def print_analysis(self) -> None:
analysis = self.get_analysis()
def print_block(title: str, stats: Dict[str, Any]) -> None:
if not stats:
return
print(f"\n{title}")
for stat_key, stat_value in stats.items():
print(f" {stat_key.replace('_', ' ').title()}: {self._format_stat(stat_value)}")
print_block("INSPECTION ANALYSIS", analysis.get("inspection_analysis", {}).get("overall_statistics", {}))
print_block("VIOLATION ANALYSIS", analysis.get("violation_analysis", {}).get("overall_statistics", {}))
print_block("REGULATORY CHAIN", analysis.get("regulatory_chain", {}).get("summary", {}))
print_block("ENVIRONMENTAL JUSTICE", analysis.get("environmental_justice", {}).get("summary", {}))
violation_types = analysis.get("violation_analysis", {}).get("violation_types", {})
if violation_types:
print("\nTop Violations:")
for rule, count in violation_types.get("common_violations", {}).items():
print(f" {rule}: {self._format_stat(count)}")
def get_summary_stats(self) -> Dict[str, Any]:
stats: Dict[str, Any] = {}
wells = self.data.get("well_data", pd.DataFrame())
if not wells.empty:
stats["total_wells"] = len(wells)
stats["unique_census_tracts"] = wells["census_tract_geoid"].nunique(
dropna=True
) if "census_tract_geoid" in wells.columns else None
insp = self.data.get("inspections", pd.DataFrame())
if not insp.empty:
stats["total_inspections"] = len(insp)
viol = self.data.get("violations", pd.DataFrame())
if not viol.empty:
stats["total_violations"] = len(viol)
return stats
def export_analysis(self, path: Path | str) -> None:
output_path = Path(path)
output_path.parent.mkdir(parents=True, exist_ok=True)
analysis = self.get_analysis()
output_path.write_text(json.dumps(analysis, indent=2, default=str), encoding="utf-8")
logger.info("Analysis exported to %s", output_path)
# Script entry point: run the full analysis against the configured warehouse,
# print headline statistics, and dump everything to analysis_output.json.
# Any analyzer-specific failure is logged (with traceback) rather than raised.
if __name__ == "__main__":
    try:
        analyzer = WellAnalyzer()
        print("Summary Stats:")
        for key, value in analyzer.get_summary_stats().items():
            print(f" {key.replace('_', ' ').title()}: {value}")
        analyzer.print_analysis()
        analyzer.export_analysis(Path("analysis_output.json"))
    except WellAnalyzerError as exc:
        logger.error("Well Analyzer failed: %s", exc, exc_info=True)

View File

@@ -0,0 +1,635 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Well Analyzer Notebook Templates\n",
"Use these cells as starting points for future analysis notebooks."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. Imports & Environment"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from pathlib import Path\n",
"import pandas as pd\n",
"\n",
"repo_root = Path('..').resolve()\n",
"if str(repo_root) not in os.sys.path:\n",
" os.sys.path.insert(0, str(repo_root))\n",
"\n",
"from analysis.well_analyzer import WellAnalyzer\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. Instantiate the analyzer"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2025-11-08 20:31:14,124 - INFO - Connecting to Postgres\n",
"2025-11-08 20:32:36,129 - INFO - Loaded 1010431 wells from public.well_enriched_all_plus\n",
"2025-11-08 20:32:55,260 - INFO - Loaded 2151839 inspections from public.inspections\n",
"2025-11-08 20:32:58,951 - INFO - Loaded 242899 violations from public.violations\n"
]
},
{
"data": {
"text/plain": [
"{'total_wells': 1010431,\n",
" 'unique_census_tracts': 2981,\n",
" 'total_inspections': 2151839,\n",
" 'total_violations': 242899}"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"analyzer = WellAnalyzer(chunk_size=50_000)\n",
"summary_stats = analyzer.get_summary_stats()\n",
"summary_stats\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. Summary stats as DataFrame"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style type=\"text/css\">\n",
"</style>\n",
"<table id=\"T_fcd18\">\n",
" <thead>\n",
" <tr>\n",
" <th class=\"blank level0\" >&nbsp;</th>\n",
" <th id=\"T_fcd18_level0_col0\" class=\"col_heading level0 col0\" >value</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th id=\"T_fcd18_level0_row0\" class=\"row_heading level0 row0\" >total_wells</th>\n",
" <td id=\"T_fcd18_row0_col0\" class=\"data row0 col0\" >1,010,431.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_fcd18_level0_row1\" class=\"row_heading level0 row1\" >unique_census_tracts</th>\n",
" <td id=\"T_fcd18_row1_col0\" class=\"data row1 col0\" >2,981.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_fcd18_level0_row2\" class=\"row_heading level0 row2\" >total_inspections</th>\n",
" <td id=\"T_fcd18_row2_col0\" class=\"data row2 col0\" >2,151,839.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_fcd18_level0_row3\" class=\"row_heading level0 row3\" >total_violations</th>\n",
" <td id=\"T_fcd18_row3_col0\" class=\"data row3 col0\" >242,899.00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n"
],
"text/plain": [
"<pandas.io.formats.style.Styler at 0x7f623f6cacf0>"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.DataFrame([summary_stats]).T.rename(columns={0: 'value'}).style.format({'value': '{:,.2f}'})\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 4. Inspection analysis helpers"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.microsoft.datawrangler.viewer.v0+json": {
"columns": [
{
"name": "index",
"rawType": "object",
"type": "string"
},
{
"name": "value",
"rawType": "float64",
"type": "float"
}
],
"ref": "648d5bab-d6a9-4e87-95a4-d81a3087ad63",
"rows": [
[
"total_inspections",
"2151839.0"
],
[
"unique_wells_inspected",
"483352.0"
],
[
"overall_compliance_rate",
"89.15035000295096"
],
[
"avg_days_between_inspections",
"548.291420910082"
],
[
"median_days_between_inspections",
"322.0"
]
],
"shape": {
"columns": 1,
"rows": 5
}
},
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>value</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>total_inspections</th>\n",
" <td>2.151839e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unique_wells_inspected</th>\n",
" <td>4.833520e+05</td>\n",
" </tr>\n",
" <tr>\n",
" <th>overall_compliance_rate</th>\n",
" <td>8.915035e+01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>avg_days_between_inspections</th>\n",
" <td>5.482914e+02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>median_days_between_inspections</th>\n",
" <td>3.220000e+02</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" value\n",
"total_inspections 2.151839e+06\n",
"unique_wells_inspected 4.833520e+05\n",
"overall_compliance_rate 8.915035e+01\n",
"avg_days_between_inspections 5.482914e+02\n",
"median_days_between_inspections 3.220000e+02"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"inspection_analysis = analyzer.analyze_inspection_patterns()\n",
"pd.Series(inspection_analysis['overall_statistics']).to_frame('value')\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 5. Violations slice"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.microsoft.datawrangler.viewer.v0+json": {
"columns": [
{
"name": "index",
"rawType": "int64",
"type": "integer"
},
{
"name": "canonical_api10",
"rawType": "object",
"type": "string"
},
{
"name": "violation_disc_date",
"rawType": "datetime64[ns]",
"type": "datetime"
},
{
"name": "violated_rule",
"rawType": "object",
"type": "string"
},
{
"name": "major_viol_ind",
"rawType": "object",
"type": "string"
}
],
"ref": "7a3969db-0981-48f8-846b-31946d7c0e64",
"rows": [
[
"0",
"4233530876",
"2017-09-19 00:00:00",
"SWR 91(d)(1)",
"N"
],
[
"1",
"4233532284",
"2017-07-26 00:00:00",
"SWR 91(d)(1)",
"N"
],
[
"2",
"4233532284",
"2017-09-13 00:00:00",
"SWR 91(d)(1)",
"N"
],
[
"3",
"4210300169",
"2017-10-25 00:00:00",
"SWR 91(d)(1)",
"N"
],
[
"4",
"4222736906",
"2016-02-02 00:00:00",
"SWR 91(d)(1)",
"N"
]
],
"shape": {
"columns": 4,
"rows": 5
}
},
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>canonical_api10</th>\n",
" <th>violation_disc_date</th>\n",
" <th>violated_rule</th>\n",
" <th>major_viol_ind</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>4233530876</td>\n",
" <td>2017-09-19</td>\n",
" <td>SWR 91(d)(1)</td>\n",
" <td>N</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>4233532284</td>\n",
" <td>2017-07-26</td>\n",
" <td>SWR 91(d)(1)</td>\n",
" <td>N</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>4233532284</td>\n",
" <td>2017-09-13</td>\n",
" <td>SWR 91(d)(1)</td>\n",
" <td>N</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4210300169</td>\n",
" <td>2017-10-25</td>\n",
" <td>SWR 91(d)(1)</td>\n",
" <td>N</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4222736906</td>\n",
" <td>2016-02-02</td>\n",
" <td>SWR 91(d)(1)</td>\n",
" <td>N</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" canonical_api10 violation_disc_date violated_rule major_viol_ind\n",
"0 4233530876 2017-09-19 SWR 91(d)(1) N\n",
"1 4233532284 2017-07-26 SWR 91(d)(1) N\n",
"2 4233532284 2017-09-13 SWR 91(d)(1) N\n",
"3 4210300169 2017-10-25 SWR 91(d)(1) N\n",
"4 4222736906 2016-02-02 SWR 91(d)(1) N"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"violations_df = analyzer.data['violations'][['canonical_api10', 'violation_disc_date', 'violated_rule', 'major_viol_ind']]\n",
"violations_df.head()\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 6. Environmental-justice aggregation"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.microsoft.datawrangler.viewer.v0+json": {
"columns": [
{
"name": "index",
"rawType": "object",
"type": "string"
},
{
"name": "high",
"rawType": "float64",
"type": "float"
},
{
"name": "low",
"rawType": "float64",
"type": "float"
}
],
"ref": "5ab7f2d3-3c34-4a70-83f8-23a6929c2e30",
"rows": [
[
"avg_inspections",
"5.668602168650754",
"5.8424998043529195"
],
[
"avg_violations",
"0.6798208564386784",
"0.7418603711839766"
],
[
"major_violations",
"0.01963439404197698",
"0.02503382949932341"
],
[
"avg_compliance_rate",
"91.80852389969775",
"91.22339687712729"
],
[
"avg_days_between_inspections",
"757.1575143021564",
"732.3780043474643"
],
[
"reinspection_compliance_rate",
"13.868187092556035",
"14.88125749212477"
],
[
"wells_in_tract",
"307.38253215978335",
"374.30311231393773"
]
],
"shape": {
"columns": 2,
"rows": 7
}
},
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>high</th>\n",
" <th>low</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>avg_inspections</th>\n",
" <td>5.668602</td>\n",
" <td>5.842500</td>\n",
" </tr>\n",
" <tr>\n",
" <th>avg_violations</th>\n",
" <td>0.679821</td>\n",
" <td>0.741860</td>\n",
" </tr>\n",
" <tr>\n",
" <th>major_violations</th>\n",
" <td>0.019634</td>\n",
" <td>0.025034</td>\n",
" </tr>\n",
" <tr>\n",
" <th>avg_compliance_rate</th>\n",
" <td>91.808524</td>\n",
" <td>91.223397</td>\n",
" </tr>\n",
" <tr>\n",
" <th>avg_days_between_inspections</th>\n",
" <td>757.157514</td>\n",
" <td>732.378004</td>\n",
" </tr>\n",
" <tr>\n",
" <th>reinspection_compliance_rate</th>\n",
" <td>13.868187</td>\n",
" <td>14.881257</td>\n",
" </tr>\n",
" <tr>\n",
" <th>wells_in_tract</th>\n",
" <td>307.382532</td>\n",
" <td>374.303112</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" high low\n",
"avg_inspections 5.668602 5.842500\n",
"avg_violations 0.679821 0.741860\n",
"major_violations 0.019634 0.025034\n",
"avg_compliance_rate 91.808524 91.223397\n",
"avg_days_between_inspections 757.157514 732.378004\n",
"reinspection_compliance_rate 13.868187 14.881257\n",
"wells_in_tract 307.382532 374.303112"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ej = analyzer.analyze_environmental_justice()\n",
"pd.DataFrame(ej['high_vulnerability_vs_low']).T\n"
]
},
{
"cell_type": "markdown",
"id": "c233c217",
"metadata": {},
"source": [
"## 7. District comparisons\n",
"Group inspections by district (alphanumeric-safe) to see volume + compliance deltas."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b4ada35c",
"metadata": {},
"outputs": [],
"source": [
"insp = analyzer.data['inspections'].copy()\n",
"if 'district' not in insp.columns:\n",
" raise KeyError('district column missing in inspections data')\n",
"\n",
    "insp['district_str'] = insp['district'].fillna('Unknown').astype(str)\n",
"summary = insp.groupby('district_str').agg(\n",
" inspections=('district_str', 'size'),\n",
" unique_wells=('canonical_api10', 'nunique'),\n",
    "    compliance_rate=('compliance', lambda x: (x == 'Yes').mean() * 100)\n",
")\n",
"summary = summary.sort_values('inspections', ascending=False)\n",
"summary.head(15)\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

File diff suppressed because it is too large Load Diff