tables and maps

This commit is contained in:
2025-11-01 15:10:32 -07:00
parent abff7981c2
commit dd40446f8c
12 changed files with 573 additions and 0 deletions

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 246 KiB

View File

@@ -0,0 +1,9 @@
Component,Details
Primary data source,"COGCC Spill/Release Reports (Form 19), statewide Colorado"
Units and variables,"Dates, volumes, locations, facility, county; outcome = report delay (days)"
Temporal coverage,2015–2022 (pre-policy and post-2020 policy)
Geographic coverage,"Statewide; top counties: Weld, Garfield, Las Animas"
Initial sample size,"N = 35,640 spill reports"
Analytical sample size,"N = 33,202 after IQR trimming"
Exclusions,"2,438 observations removed by IQR/outlier rules"
Demographics join,"Census tract RUCA and demographics (ACS), spatial join to spill locations"
1 Component Details
2 Primary data source COGCC Spill/Release Reports (Form 19), statewide Colorado
3 Units and variables Dates, volumes, locations, facility, county; outcome = report delay (days)
4 Temporal coverage 2015–2022 (pre-policy and post-2020 policy)
5 Geographic coverage Statewide; top counties: Weld, Garfield, Las Animas
6 Initial sample size N = 35,640 spill reports
7 Analytical sample size N = 33,202 after IQR trimming
8 Exclusions 2,438 observations removed by IQR/outlier rules
9 Demographics join Census tract RUCA and demographics (ACS), spatial join to spill locations

View File

@@ -0,0 +1,7 @@
Variable,Definition,Type,N,Mean (SD),Median [IQR]
Report Delay (days),Days between discovery and initial report; outcome variable,Continuous,33202.0,0.99 (1.09),"1.00 [0.00, 1.00]"
Spill Type,Historical vs Recent spill classification,Categorical,,,
Policy Period,Before 2020 vs 2020 and After (SB 19-181 implementation),Categorical,,,
Rurality,"Rural, Suburban, Urban (RUCA-based)",Categorical,,,
Median Household Income,Census tract ACS median income (USD),Continuous (optional),33202.0,-5904822.76 (62883297.78),"72717.00 [64573.00, 85561.00]"
Percent Poverty,Census tract percent below poverty line,Continuous (optional),33200.0,10.28 (6.37),"9.57 [5.83, 13.51]"
1 Variable Definition Type N Mean (SD) Median [IQR]
2 Report Delay (days) Days between discovery and initial report; outcome variable Continuous 33202.0 0.99 (1.09) 1.00 [0.00, 1.00]
3 Spill Type Historical vs Recent spill classification Categorical
4 Policy Period Before 2020 vs 2020 and After (SB 19-181 implementation) Categorical
5 Rurality Rural, Suburban, Urban (RUCA-based) Categorical
6 Median Household Income Census tract ACS median income (USD) Continuous (optional) 33202.0 -5904822.76 (62883297.78) 72717.00 [64573.00, 85561.00]
7 Percent Poverty Census tract percent below poverty line Continuous (optional) 33200.0 10.28 (6.37) 9.57 [5.83, 13.51]

View File

@@ -0,0 +1,14 @@
Variable,N,Mean (SD),Median [IQR],Percent
Report Delay (days),33202,0.99 (1.09),"1.00 [0.00, 1.00]",
Spill Type: Recent,20702,,,62.35%
Spill Type: Historical,12500,,,37.65%
Period: 2020 and After,19546,,,58.87%
Period: Before 2020,13656,,,41.13%
Rurality: Urban,15732,,,47.38%
Rurality: Rural,14826,,,44.65%
Rurality: Suburban,2644,,,7.96%
Median Household Income,33202,-5904822.76 (62883297.78),"72717.00 [64573.00, 85561.00]",
Percent Poverty,33200,10.28 (6.37),"9.57 [5.83, 13.51]",
Percent White,33200,83.51 (8.48),"85.92 [78.83, 87.85]",
Percent Hispanic,33200,22.82 (13.06),"18.38 [13.92, 35.42]",
Unemployment Rate,33200,2.66 (1.37),"2.41 [1.54, 3.97]",
1 Variable N Mean (SD) Median [IQR] Percent
2 Report Delay (days) 33202 0.99 (1.09) 1.00 [0.00, 1.00]
3 Spill Type: Recent 20702 62.35%
4 Spill Type: Historical 12500 37.65%
5 Period: 2020 and After 19546 58.87%
6 Period: Before 2020 13656 41.13%
7 Rurality: Urban 15732 47.38%
8 Rurality: Rural 14826 44.65%
9 Rurality: Suburban 2644 7.96%
10 Median Household Income 33202 -5904822.76 (62883297.78) 72717.00 [64573.00, 85561.00]
11 Percent Poverty 33200 10.28 (6.37) 9.57 [5.83, 13.51]
12 Percent White 33200 83.51 (8.48) 85.92 [78.83, 87.85]
13 Percent Hispanic 33200 22.82 (13.06) 18.38 [13.92, 35.42]
14 Unemployment Rate 33200 2.66 (1.37) 2.41 [1.54, 3.97]

View File

@@ -0,0 +1,7 @@
Variable,Definition,Type
Report Delay (days),Days between discovery and initial report; outcome variable,Continuous
Spill Type,Historical vs Recent spill classification,Categorical
Policy Period,Before 2020 vs 2020 and After (SB 19-181 implementation),Categorical
Rurality,"Rural, Suburban, Urban (RUCA-based)",Categorical
Median Household Income,Census tract ACS median income (USD),Continuous (optional)
Percent Poverty,Census tract percent below poverty line,Continuous (optional)
1 Variable Definition Type
2 Report Delay (days) Days between discovery and initial report; outcome variable Continuous
3 Spill Type Historical vs Recent spill classification Categorical
4 Policy Period Before 2020 vs 2020 and After (SB 19-181 implementation) Categorical
5 Rurality Rural, Suburban, Urban (RUCA-based) Categorical
6 Median Household Income Census tract ACS median income (USD) Continuous (optional)
7 Percent Poverty Census tract percent below poverty line Continuous (optional)

View File

@@ -0,0 +1,7 @@
Period,rurality,Delta (days),95% CI,p,Delta (hours),Hours 95% CI
Before 2020,Rural,0.0125075677145938,"(-0.10, 0.12)",0.828,0.3001816251502512,"(-2.42, 2.98)"
Before 2020,Suburban,0.0192862765678301,"(-0.21, 0.25)",0.862,0.4628706376279224,"(-4.97, 5.94)"
Before 2020,Urban,0.1305571823770012,"(0.05, 0.21)",<0.001,3.133372377048029,"(1.27, 5.00)"
2020 and After,Rural,0.0727709345083023,"(0.00, 0.14)",0.049,1.7465024281992552,"(0.01, 3.46)"
2020 and After,Suburban,0.1438421422124148,"(0.02, 0.26)",0.022,3.452211413097955,"(0.46, 6.34)"
2020 and After,Urban,0.2921678186765097,"(0.24, 0.34)",<0.001,7.0120276482362325,"(5.86, 8.22)"
1 Period rurality Delta (days) 95% CI p Delta (hours) Hours 95% CI
2 Before 2020 Rural 0.0125075677145938 (-0.10, 0.12) 0.828 0.3001816251502512 (-2.42, 2.98)
3 Before 2020 Suburban 0.0192862765678301 (-0.21, 0.25) 0.862 0.4628706376279224 (-4.97, 5.94)
4 Before 2020 Urban 0.1305571823770012 (0.05, 0.21) <0.001 3.133372377048029 (1.27, 5.00)
5 2020 and After Rural 0.0727709345083023 (0.00, 0.14) 0.049 1.7465024281992552 (0.01, 3.46)
6 2020 and After Suburban 0.1438421422124148 (0.02, 0.26) 0.022 3.452211413097955 (0.46, 6.34)
7 2020 and After Urban 0.2921678186765097 (0.24, 0.34) <0.001 7.0120276482362325 (5.86, 8.22)

View File

@@ -0,0 +1,13 @@
Spill Type,Period,Rurality,Predicted (days),95% CI
Historical,Before 2020,Rural,0.8881118881118854,"(0.81, 0.97)"
Historical,Before 2020,Suburban,0.7837837837837821,"(0.65, 0.93)"
Historical,Before 2020,Urban,0.8045417680454152,"(0.77, 0.84)"
Historical,2020 and After,Rural,0.6093189964157685,"(0.56, 0.66)"
Historical,2020 and After,Suburban,0.4201183431952656,"(0.36, 0.49)"
Historical,2020 and After,Urban,0.3444994290064702,"(0.33, 0.36)"
Recent,Before 2020,Rural,0.9033877038895848,"(0.87, 0.94)"
Recent,Before 2020,Suburban,0.8009708737864075,"(0.72, 0.89)"
Recent,Before 2020,Urban,0.93542074363992,"(0.89, 0.98)"
Recent,2020 and After,Rural,0.6839481555333978,"(0.66, 0.71)"
Recent,2020 and After,Suburban,0.5621468926553672,"(0.51, 0.62)"
Recent,2020 and After,Urban,0.6367281475541288,"(0.61, 0.67)"
1 Spill Type Period Rurality Predicted (days) 95% CI
2 Historical Before 2020 Rural 0.8881118881118854 (0.81, 0.97)
3 Historical Before 2020 Suburban 0.7837837837837821 (0.65, 0.93)
4 Historical Before 2020 Urban 0.8045417680454152 (0.77, 0.84)
5 Historical 2020 and After Rural 0.6093189964157685 (0.56, 0.66)
6 Historical 2020 and After Suburban 0.4201183431952656 (0.36, 0.49)
7 Historical 2020 and After Urban 0.3444994290064702 (0.33, 0.36)
8 Recent Before 2020 Rural 0.9033877038895848 (0.87, 0.94)
9 Recent Before 2020 Suburban 0.8009708737864075 (0.72, 0.89)
10 Recent Before 2020 Urban 0.93542074363992 (0.89, 0.98)
11 Recent 2020 and After Rural 0.6839481555333978 (0.66, 0.71)
12 Recent 2020 and After Suburban 0.5621468926553672 (0.51, 0.62)
13 Recent 2020 and After Urban 0.6367281475541288 (0.61, 0.67)

View File

@@ -0,0 +1,2 @@
Section,Details
Model Robustness,Saved table5_robustness_poisson_vs_nb.csv comparing Poisson vs NB
1 Section Details
2 Model Robustness Saved table5_robustness_poisson_vs_nb.csv comparing Poisson vs NB

View File

@@ -0,0 +1,7 @@
Period,Rurality,Poisson Δ (days),NB Δ (days),Difference (NB - Poisson),Poisson p,NB p
Before 2020,Rural,0.0125075677145938,-9.826219107892053,-9.838726675606646,0.828,0.0004
Before 2020,Suburban,0.0192862765678301,-7.100280417538358,-7.119566694106188,0.862,0.1072
Before 2020,Urban,0.1305571823770012,-1.6388416472647436,-1.7693988296417449,0.0,0.008
2020 and After,Rural,0.0727709345083023,-13.917287633807826,-13.990058568316128,0.049,0.0
2020 and After,Suburban,0.1438421422124148,-7.230488621220758,-7.374330763433172,0.022,0.0368
2020 and After,Urban,0.2921678186765097,-2.869383763253332,-3.161551581929842,0.0,0.0
1 Period Rurality Poisson Δ (days) NB Δ (days) Difference (NB - Poisson) Poisson p NB p
2 Before 2020 Rural 0.0125075677145938 -9.826219107892053 -9.838726675606646 0.828 0.0004
3 Before 2020 Suburban 0.0192862765678301 -7.100280417538358 -7.119566694106188 0.862 0.1072
4 Before 2020 Urban 0.1305571823770012 -1.6388416472647436 -1.7693988296417449 0.0 0.008
5 2020 and After Rural 0.0727709345083023 -13.917287633807826 -13.990058568316128 0.049 0.0
6 2020 and After Suburban 0.1438421422124148 -7.230488621220758 -7.374330763433172 0.022 0.0368
7 2020 and After Urban 0.2921678186765097 -2.869383763253332 -3.161551581929842 0.0 0.0

View File

@@ -0,0 +1,249 @@
#!/usr/bin/env python3
import pandas as pd
import numpy as np
from pathlib import Path
# Project layout: raw/intermediate inputs live under the analysis directory;
# all generated tables are written to OUT.
ROOT = Path('/home/dadams/Repos/colorado_spills')
ANALYSIS_DIR = ROOT / 'analysis' / 'new analysis Aug 2025'
OUT = ROOT / 'out_tables'
# Create the output directory up front so every build_* function can write freely.
OUT.mkdir(parents=True, exist_ok=True)
# Helpers
def read_csv_safely(path: Path) -> pd.DataFrame:
    """Read *path* as CSV, raising FileNotFoundError naming the full path if it is missing.

    EAFP replaces the original exists()-then-read pattern, which was both
    redundant (pandas raises FileNotFoundError itself) and race-prone.
    """
    try:
        return pd.read_csv(path)
    except FileNotFoundError:
        # Preserve the original contract: the exception message is the full path.
        raise FileNotFoundError(str(path)) from None
# -------------------------
# Table 1: Data and Sample Overview
# -------------------------
def build_table1():
    """Build Table 1: data and sample overview.

    Reads the raw/trimmed/removed sample files (when present) to report
    sample sizes, writes table1_data_overview.csv to OUT, and returns the
    assembled DataFrame.
    """
    rows = []
    # Attempt to load sample sizes and exclusions
    raw_path = ANALYSIS_DIR / 'spills_with_trim_flag.csv'
    trimmed_path = ANALYSIS_DIR / 'spills_trimmed.csv'
    removed_path = ANALYSIS_DIR / 'spills_trimmed_removed.csv'
    n_raw = read_csv_safely(raw_path).shape[0] if raw_path.exists() else None
    n_trimmed = read_csv_safely(trimmed_path).shape[0]
    # Prefer the explicit removed-rows file; otherwise derive the count by difference.
    if removed_path.exists():
        n_removed = read_csv_safely(removed_path).shape[0]
    else:
        n_removed = n_raw - n_trimmed if n_raw is not None else None
    rows.append({'Component': 'Primary data source', 'Details': 'COGCC Spill/Release Reports (Form 19), statewide Colorado'})
    rows.append({'Component': 'Units and variables', 'Details': 'Dates, volumes, locations, facility, county; outcome = report delay (days)'})
    # FIX: restore the en dash that was lost from the original literal ('20152022').
    rows.append({'Component': 'Temporal coverage', 'Details': '2015–2022 (pre-policy and post-2020 policy)'})
    rows.append({'Component': 'Geographic coverage', 'Details': 'Statewide; top counties: Weld, Garfield, Las Animas'})
    # Explicit None checks: a (pathological) zero-row raw file should still be reported.
    if n_raw is not None:
        rows.append({'Component': 'Initial sample size', 'Details': f'N = {n_raw:,} spill reports'})
    rows.append({'Component': 'Analytical sample size', 'Details': f'N = {n_trimmed:,} after IQR trimming'})
    if n_removed is not None:
        rows.append({'Component': 'Exclusions', 'Details': f'{n_removed:,} observations removed by IQR/outlier rules'})
    # Optional joins
    rows.append({'Component': 'Demographics join', 'Details': 'Census tract RUCA and demographics (ACS), spatial join to spill locations'})
    t1 = pd.DataFrame(rows)
    t1.to_csv(OUT / 'table1_data_overview.csv', index=False)
    return t1
# -------------------------
# Table 2: Variable Definitions and Summary Statistics
# -------------------------
def build_table2():
    """Build Table 2: variable definitions plus summary statistics.

    Writes three CSVs to OUT (definitions, summary statistics, and a
    combined definitions+summary view) and returns the combined DataFrame.
    """
    df = read_csv_safely(ANALYSIS_DIR / 'spills_trimmed.csv')
    # Ensure Period exists; derive it from the initial-report year when absent.
    if 'Period' not in df.columns:
        df['Period'] = pd.to_datetime(df['Initial Report Date']).dt.year.map(lambda y: 'Before 2020' if y < 2020 else '2020 and After')
    # Definitions (static metadata rows; Variable strings must match the summary rows below for the merge at the end)
    definitions = [
        {'Variable': 'Report Delay (days)', 'Definition': 'Days between discovery and initial report; outcome variable', 'Type': 'Continuous'},
        {'Variable': 'Spill Type', 'Definition': 'Historical vs Recent spill classification', 'Type': 'Categorical'},
        {'Variable': 'Policy Period', 'Definition': 'Before 2020 vs 2020 and After (SB 19-181 implementation)', 'Type': 'Categorical'},
        {'Variable': 'Rurality', 'Definition': 'Rural, Suburban, Urban (RUCA-based)', 'Type': 'Categorical'},
        {'Variable': 'Median Household Income', 'Definition': 'Census tract ACS median income (USD)', 'Type': 'Continuous (optional)'},
        {'Variable': 'Percent Poverty', 'Definition': 'Census tract percent below poverty line', 'Type': 'Continuous (optional)'}
    ]
    defs_df = pd.DataFrame(definitions)
    # Summary stats
    # Outcome: median [IQR] preferred due to skew; also mean (SD)
    y = df['report_delay_full_days'].dropna()
    outcome_row = {
        'Variable': 'Report Delay (days)',
        'N': len(y),
        'Mean (SD)': f"{y.mean():.2f} ({y.std():.2f})",
        'Median [IQR]': f"{y.median():.2f} [{y.quantile(0.25):.2f}, {y.quantile(0.75):.2f}]"
    }
    # Categorical stats
    def cat_summary(series: pd.Series, label: str):
        # One row per level; percentages are over the full series length (NaN included in denominator).
        counts = series.value_counts(dropna=False)
        rows = []
        for k, v in counts.items():
            pct = 100 * v / len(series)
            rows.append({'Variable': f"{label}: {k}", 'N': int(v), 'Percent': f"{pct:.2f}%"})
        return rows
    cat_rows = []
    # Spill Type and rurality are optional columns; Period is guaranteed by the derivation above.
    cat_rows += cat_summary(df['Spill Type'], 'Spill Type') if 'Spill Type' in df.columns else []
    cat_rows += cat_summary(df['Period'], 'Period')
    cat_rows += cat_summary(df['rurality'], 'Rurality') if 'rurality' in df.columns else []
    # Covariates: continuous demographic columns, summarized only when present.
    demo_rows = []
    for col, label in [
        ('median_household_income', 'Median Household Income'),
        ('percent_poverty', 'Percent Poverty'),
        ('percent_white', 'Percent White'),
        ('percent_hispanic', 'Percent Hispanic'),
        ('unemployment_rate', 'Unemployment Rate'),
    ]:
        if col in df.columns:
            s = df[col].dropna()
            demo_rows.append({'Variable': label, 'N': len(s), 'Mean (SD)': f"{s.mean():.2f} ({s.std():.2f})", 'Median [IQR]': f"{s.median():.2f} [{s.quantile(0.25):.2f}, {s.quantile(0.75):.2f}]"})
    summary_df = pd.DataFrame([outcome_row] + cat_rows + demo_rows)
    # Save
    defs_df.to_csv(OUT / 'table2_variable_definitions.csv', index=False)
    summary_df.to_csv(OUT / 'table2_summary_statistics.csv', index=False)
    # Combined view: left-merge keeps every definition row even when no summary row matches
    # (e.g. 'Spill Type' summary rows are per-level, so the definition row gets NaN stats).
    combined = defs_df.merge(summary_df[['Variable','N','Mean (SD)','Median [IQR]']], on='Variable', how='left')
    combined.to_csv(OUT / 'table2_definitions_and_summary.csv', index=False)
    return combined
# -------------------------
# Table 3: Core Model Results (GLM) — contrasts
# -------------------------
def build_table3():
    """Build Table 3: core Poisson GLM contrasts by period and rurality.

    Reads the combined contrast file, keeps the Poisson rows, formats the
    effect sizes, CIs, and p-values, writes table3_core_glm_contrasts.csv
    to OUT, and returns the formatted DataFrame.
    """
    contrasts = read_csv_safely(ANALYSIS_DIR / 'final_contrast_comparison_combined.csv')
    # The Poisson model is the primary specification.
    poisson = contrasts.loc[contrasts['model'] == 'poisson'].copy()

    def format_p(value):
        # Report very small p-values as '<0.001' per journal convention.
        return '<0.001' if value < 0.001 else f"{value:.3f}"

    poisson['Delta (days)'] = poisson['poisson_median']
    poisson['95% CI'] = [
        f"({lo:.2f}, {hi:.2f})"
        for lo, hi in zip(poisson['poisson_ci_lower'], poisson['poisson_ci_upper'])
    ]
    poisson['p'] = poisson['poisson_p'].apply(format_p)
    poisson['Delta (hours)'] = poisson['poisson_median_hours']
    poisson['Hours 95% CI'] = [
        f"({lo:.2f}, {hi:.2f})"
        for lo, hi in zip(poisson['poisson_ci_lower_hours'], poisson['poisson_ci_upper_hours'])
    ]
    # Fixed display order: pre-policy period first, then Rural < Suburban < Urban.
    poisson['po'] = poisson['Period'].map({'Before 2020': 0, '2020 and After': 1})
    poisson['ro'] = poisson['rurality'].map({'Rural': 0, 'Suburban': 1, 'Urban': 2})
    poisson = poisson.sort_values(['po', 'ro']).drop(columns=['po', 'ro'])
    result = poisson[['Period', 'rurality', 'Delta (days)', '95% CI', 'p',
                      'Delta (hours)', 'Hours 95% CI']]
    result.to_csv(OUT / 'table3_core_glm_contrasts.csv', index=False)
    return result
# -------------------------
# Table 4: Predicted Means or Contrasts (predicted reporting delays)
# -------------------------
def build_table4():
    """Build Table 4: predicted mean reporting delays by group.

    Infers the prediction/CI column names from the bootstrap output file;
    if the expected columns cannot be identified, exports the raw file for
    manual review instead. Returns the resulting DataFrame either way.
    """
    path = ANALYSIS_DIR / 'poisson_predicted_groups_boot_final.csv'
    df = read_csv_safely(path)
    # Expect columns: spill_type, Period, rurality, predicted_mean, ci_low, ci_high (names may vary)
    # Attempt to infer
    col_map = {}
    for c in df.columns:
        lc = c.lower()
        # NOTE(review): when several columns match a role, the last one wins —
        # assumes at most one candidate per role in the source file; confirm.
        if 'pred' in lc and ('mean' in lc or 'delay' in lc): col_map['pred'] = c
        if ('ci_low' in lc or 'lower' in lc): col_map['low'] = c
        if ('ci_high' in lc or 'upper' in lc): col_map['high'] = c
    # Required grouping fields: first matching spelling of each, in a fixed order
    # (spill type, period, rurality) so the positional renames below line up.
    gcols = []
    for name in ['spill_type','Spill Type']:
        if name in df.columns: gcols.append(name); break
    for name in ['Period']:
        if name in df.columns: gcols.append(name); break
    for name in ['rurality','Rurality']:
        if name in df.columns: gcols.append(name); break
    if len(gcols) < 3 or not {'pred','low','high'} <= set(col_map):
        # Fall back to exporting raw for manual review
        df.to_csv(OUT / 'table4_predicted_means_raw.csv', index=False)
        return df
    tidy = df[gcols + [col_map['pred'], col_map['low'], col_map['high']]].copy()
    tidy = tidy.rename(columns={gcols[0]:'Spill Type', gcols[1]:'Period', gcols[2]:'Rurality', col_map['pred']:'Predicted (days)', col_map['low']:'Lower', col_map['high']:'Upper'})
    tidy['95% CI'] = tidy.apply(lambda r: f"({r['Lower']:.2f}, {r['Upper']:.2f})", axis=1)
    out = tidy[['Spill Type','Period','Rurality','Predicted (days)','95% CI']].copy()
    # Sort: spill type alphabetically, then pre-policy first, then Rural < Suburban < Urban.
    order_period = {'Before 2020': 0, '2020 and After': 1}
    order_r = {'Rural': 0, 'Suburban': 1, 'Urban': 2}
    out['po'] = out['Period'].map(order_period)
    out['ro'] = out['Rurality'].map(order_r)
    out = out.sort_values(['Spill Type','po','ro']).drop(columns=['po','ro'])
    out.to_csv(OUT / 'table4_predicted_means.csv', index=False)
    return out
# -------------------------
# Table 5: ITS or Robustness Checks
# -------------------------
def build_table5():
    """Build Table 5: ITS summary and model-robustness exports.

    Best-effort: each section is emitted only when its source CSV exists.
    Writes an overview CSV listing what was produced and returns it.
    """
    rows = []
    # ITS coefficients summary if available
    its_path = ANALYSIS_DIR / 'its_summary_with_boot.csv'
    if its_path.exists():
        its = pd.read_csv(its_path)
        # Expect columns: model/group identifiers + level change, slope change, CIs
        # Try to infer
        candidates = {
            'level': [c for c in its.columns if 'level' in c.lower() and ('change' in c.lower() or 'beta' in c.lower() or 'immediate' in c.lower())],
            'slope': [c for c in its.columns if 'slope' in c.lower() and ('change' in c.lower() or 'beta' in c.lower())],
            'low': [c for c in its.columns if 'low' in c.lower()],
            'high': [c for c in its.columns if 'high' in c.lower()],
        }
        # Build a minimal summary per group if possible
        key_cols = []
        for name in ['spill_type','Spill Type']:
            if name in its.columns: key_cols.append(name); break
        for name in ['rurality','Rurality']:
            if name in its.columns: key_cols.append(name); break
        if key_cols and candidates['level'] and candidates['slope']:
            # NOTE(review): the rename below indexes key_cols[1]; if the source
            # file has only one of the two key columns this raises IndexError —
            # confirm its_summary_with_boot.csv always carries both.
            summary = its[key_cols + [candidates['level'][0], candidates['slope'][0]]].copy()
            summary = summary.rename(columns={key_cols[0]:'Spill Type', key_cols[1]:'Rurality', candidates['level'][0]:'Level Change (days)', candidates['slope'][0]:'Slope Change (days/month)'})
            summary.to_csv(OUT / 'table5_its_summary.csv', index=False)
            rows.append({'Section':'ITS Summary','Details':'Saved table5_its_summary.csv with level and slope changes by group'})
    # Robustness: Poisson vs NB differences from appendix_table2_model_comparison.csv
    cmp_path = ANALYSIS_DIR / 'appendix_table2_model_comparison.csv'
    if cmp_path.exists():
        cmp = pd.read_csv(cmp_path)
        # Expect columns: Period, Rurality, Poisson Δ (days), NB Δ (days), Difference (NB - Poisson), Poisson p, NB p
        keep_cols = [c for c in cmp.columns if 'Period' in c or 'Rurality' in c or 'Difference' in c or 'Poisson' in c or 'NB' in c]
        cmp[keep_cols].to_csv(OUT / 'table5_robustness_poisson_vs_nb.csv', index=False)
        rows.append({'Section':'Model Robustness','Details':'Saved table5_robustness_poisson_vs_nb.csv comparing Poisson vs NB'})
    # Always emit an overview, even when neither source file was found.
    t5 = pd.DataFrame(rows) if rows else pd.DataFrame([{'Section':'Note','Details':'No ITS or robustness sources found'}])
    t5.to_csv(OUT / 'table5_overview.csv', index=False)
    return t5
def main():
    """Build all five manuscript tables and print where the key outputs went."""
    for builder in (build_table1, build_table2, build_table3,
                    build_table4, build_table5):
        builder()
    print('Built tables:')
    for filename in ('table1_data_overview.csv',
                     'table2_definitions_and_summary.csv',
                     'table3_core_glm_contrasts.csv',
                     'table4_predicted_means.csv',
                     'table5_overview.csv'):
        print(' -', OUT / filename)
if __name__ == '__main__':
    main()

258
scripts/make_spill_map.py Normal file
View File

@@ -0,0 +1,258 @@
#!/usr/bin/env python3
"""
Create a production-ready map of Colorado oil & gas spill locations for journal use.
Outputs:
- out_tables/spill_map_points.png (vector-like high-res)
- out_tables/spill_map_hex.pdf (publication PDF)
"""
import os
from pathlib import Path
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import matplotlib.pyplot as plt
from matplotlib_scalebar.scalebar import ScaleBar
import matplotlib.patheffects as pe
from sqlalchemy import create_engine
# Project layout: spill CSVs live under the analysis directory; rendered maps
# go to OUT alongside the generated tables.
ROOT = Path('/home/dadams/Repos/colorado_spills')
ANALYSIS_DIR = ROOT / 'analysis' / 'new analysis Aug 2025'
OUT = ROOT / 'out_tables'
# Create the output directory up front so the map writers can save freely.
OUT.mkdir(parents=True, exist_ok=True)
# PostgreSQL database holding the spills/demographics tables.
DB_NAME = 'colorado_spills'
def get_engine():
    """Build a SQLAlchemy engine from DB_* environment variables.

    Returns None when any credential is missing or engine creation fails,
    letting callers fall back to CSV-based loading.
    """
    credentials = [os.getenv(k) for k in ('DB_USER', 'DB_PASSWORD', 'DB_HOST', 'DB_PORT')]
    if not all(credentials):
        return None
    user, password, host, port = credentials
    try:
        return create_engine(
            f'postgresql+psycopg2://{user}:{password}@{host}:{port}/{DB_NAME}'
        )
    except Exception:
        # Best-effort: any engine-construction failure means "no database".
        return None
def load_spills() -> gpd.GeoDataFrame:
    """Load spills as a GeoDataFrame, preferring PostGIS, else CSV fallback.

    Tries, in order: (1) several geometry-column variants of the PostGIS
    table, (2) a plain SQL read with lat/lon point construction, and
    (3) the trimmed CSV on disk. Raises ValueError only if the CSV
    fallback lacks Latitude/Longitude columns.
    """
    engine = get_engine()
    if engine is not None:
        # Try multiple geometry options from the PostGIS table
        try_statements = [
            ("SELECT *, geom FROM spills_with_demographics_geog", 'geom'),
            ("SELECT *, geometry FROM spills_with_demographics_geog", 'geometry'),
            ("SELECT *, CAST(geog AS geometry) AS geom FROM spills_with_demographics_geog", 'geom'),
            ("SELECT *, ST_SetSRID(CAST(geog AS geometry), 4326) AS geom FROM spills_with_demographics_geog", 'geom'),
        ]
        for sql, geom_col in try_statements:
            try:
                gdf = gpd.read_postgis(sql, engine, geom_col=geom_col)
                # Ensure CRS; assume WGS84 when the table carries none.
                if gdf.crs is None:
                    gdf.set_crs('EPSG:4326', inplace=True)
                return gdf
            except Exception:
                # Best-effort: fall through to the next geometry variant.
                pass
        # Fallback to pandas + lat/lon if present in the DB table
        try:
            df = pd.read_sql_table('spills_with_demographics_geog', engine)
            if {'Latitude', 'Longitude'}.issubset(df.columns):
                return gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df['Longitude'], df['Latitude']), crs='EPSG:4326')
        except Exception:
            pass
    # CSV fallback
    csv = ANALYSIS_DIR / 'spills_trimmed.csv'
    df = pd.read_csv(csv)
    if not {'Latitude', 'Longitude'}.issubset(df.columns):
        raise ValueError('Expected Latitude/Longitude columns in spills_trimmed.csv')
    gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df['Longitude'], df['Latitude']), crs='EPSG:4326')
    return gdf
def try_query_boundary(engine) -> gpd.GeoDataFrame | None:
    """Attempt to find a Colorado boundary polygon from common table names/columns in PostGIS.

    Returns a single-column GeoDataFrame in EPSG:4326, or None if no
    candidate table yields a non-empty result.
    """
    # (table, where-clause) candidates. The geometry expression is not part of
    # the candidate tuple — every expression in alt_geoms is tried for every
    # table (the original carried an unused per-candidate geometry field).
    candidates = [
        ("cb_2018_us_state_500k", "stusps='CO' OR name ILIKE 'Colorado' OR statefp='08'"),
        ("cb_2019_us_state_500k", "stusps='CO' OR name ILIKE 'Colorado' OR statefp='08'"),
        ("tl_2020_us_state", "stusps='CO' OR name ILIKE 'Colorado' OR statefp='08'"),
        ("us_states", "stusps='CO' OR name ILIKE 'Colorado' OR statefp='08'"),
        ("states", "stusps='CO' OR name ILIKE 'Colorado' OR statefp='08'"),
        ("colorado_boundary", "1=1"),
        ("co_boundary", "1=1"),
    ]
    alt_geoms = ["geom", "geometry", "ST_SetSRID(CAST(geog AS geometry), 4326)"]
    for table, where_clause in candidates:
        for gexpr in alt_geoms:
            sql = f"SELECT {gexpr} AS geom FROM {table} WHERE {where_clause}"
            try:
                gdf = gpd.read_postgis(sql, engine, geom_col='geom')
                if not gdf.empty:
                    # Assume WGS84 when the table carries no CRS.
                    if gdf.crs is None:
                        gdf.set_crs('EPSG:4326', inplace=True)
                    return gdf[["geom"]].to_crs(4326)
            except Exception:
                # Best-effort probing: missing tables/columns are expected.
                continue
    return None
def try_query_counties(engine) -> gpd.GeoDataFrame | None:
    """Attempt to fetch Colorado counties from PostGIS for boundary overlays.

    Returns a single-column GeoDataFrame in EPSG:4326, or None if no
    candidate table yields a non-empty result.
    """
    # (table, where-clause) candidates. As in try_query_boundary, the geometry
    # expression is tried separately via alt_geoms (the original carried an
    # unused per-candidate geometry field).
    candidates = [
        ("cb_2018_us_county_500k", "statefp='08' OR stusps='CO'"),
        ("cb_2019_us_county_500k", "statefp='08' OR stusps='CO'"),
        ("tl_2020_us_county", "statefp='08' OR stusps='CO'"),
        ("counties", "statefp='08' OR stusps='CO' OR state='CO' OR state_name ILIKE 'Colorado'"),
        ("co_counties", "1=1"),
        ("colorado_counties", "1=1"),
    ]
    alt_geoms = ["geom", "geometry", "ST_SetSRID(CAST(geog AS geometry), 4326)"]
    for table, where_clause in candidates:
        for gexpr in alt_geoms:
            sql = f"SELECT {gexpr} AS geom FROM {table} WHERE {where_clause}"
            try:
                gdf = gpd.read_postgis(sql, engine, geom_col='geom')
                if not gdf.empty:
                    # Assume WGS84 when the table carries no CRS.
                    if gdf.crs is None:
                        gdf.set_crs('EPSG:4326', inplace=True)
                    return gdf[["geom"]].to_crs(4326)
            except Exception:
                # Best-effort probing: missing tables/columns are expected.
                continue
    return None
def get_colorado_boundary(spills_gdf: gpd.GeoDataFrame) -> tuple[gpd.GeoDataFrame, gpd.GeoDataFrame | None]:
    """Return Colorado boundary and optional counties. Prefer PostGIS; fallback to convex-hull from points."""
    engine = get_engine()
    counties = None
    if engine is not None:
        boundary = try_query_boundary(engine)
        if boundary is not None and not boundary.empty:
            # Counties are optional extras; only fetched once a boundary is found.
            counties = try_query_counties(engine)
            return boundary, counties
    # Fallback: convex hull of points, buffered by 30,000 EPSG:3857 units
    # (nominally ~30 km; Mercator distortion makes this approximate) so edge
    # points are not clipped. Counties stay None on this path.
    boundary_geom = spills_gdf.to_crs(3857).unary_union.convex_hull.buffer(30_000)
    boundary = gpd.GeoDataFrame(geometry=[boundary_geom], crs='EPSG:3857').to_crs(4326)
    return boundary, counties
def add_north_arrow(ax, x=0.95, y=0.1, size=18):
    """Draw a north arrow labelled 'N' at axes-fraction position (x, y)."""
    # White halo keeps the arrow legible over any map background.
    halo = [pe.withStroke(linewidth=3, foreground='white')]
    arrow_style = dict(facecolor='black', width=5, headwidth=15,
                       headlength=18, shrink=0.05, path_effects=halo)
    ax.annotate('N', xy=(x, y + 0.06), xytext=(x, y), xycoords='axes fraction',
                ha='center', va='bottom', fontsize=size, fontweight='bold',
                color='black', path_effects=halo, arrowprops=arrow_style)
def style_axes(ax):
    """Strip ticks, labels, and frame so the axes read as a clean map panel."""
    ax.set_axis_off()
    for edge in ax.spines.values():
        edge.set_visible(False)
def map_points(spills: gpd.GeoDataFrame, boundary: gpd.GeoDataFrame, counties: gpd.GeoDataFrame | None):
    """Render spill locations as points over the state boundary.

    Saves out_tables/spill_map_points.png and returns its path.
    """
    # Project to Web Mercator for plotting (metres, so the scale bar is valid)
    spills_3857 = spills.to_crs(3857)
    boundary_3857 = boundary.to_crs(3857)
    # Figure aesthetics
    fig, ax = plt.subplots(figsize=(7, 9), dpi=300, constrained_layout=True)
    boundary_3857.plot(ax=ax, color='#f5f5f5', edgecolor='#888888', linewidth=0.8)
    if counties is not None and not counties.empty:
        counties_3857 = counties.to_crs(3857)
        counties_3857.boundary.plot(ax=ax, color='#aaaaaa', linewidth=0.3)
    # Plot points with alpha and small size to avoid overplotting
    spills_3857.plot(ax=ax, markersize=1.2, alpha=0.35, color='#2c7fb8')
    # Tighten to boundary
    minx, miny, maxx, maxy = boundary_3857.total_bounds
    ax.set_xlim(minx, maxx)
    ax.set_ylim(miny, maxy)
    # Add scalebar (requires axis in meters)
    ax.add_artist(ScaleBar(1, units='m', location='lower left', box_alpha=0.7))
    add_north_arrow(ax)
    style_axes(ax)
    # Dynamic title from available year fields
    title = 'Oil & Gas Spill Reports in Colorado'
    year_cols = [c for c in spills.columns if c.lower() in ('year', 'yr')]
    if year_cols:
        try:
            yrs = spills[year_cols[0]].dropna().astype(int)
            if not yrs.empty:
                # FIX: restore the en dash between years (was '{min}{max}').
                title = f"Oil & Gas Spill Reports in Colorado, {yrs.min()}–{yrs.max()}"
        except Exception:
            # Non-numeric year column: keep the generic title.
            pass
    ax.set_title(title, fontsize=12, pad=10)
    out_png = OUT / 'spill_map_points.png'
    fig.savefig(out_png, dpi=300)
    plt.close(fig)
    return out_png
def map_hex(spills: gpd.GeoDataFrame, boundary: gpd.GeoDataFrame, counties: gpd.GeoDataFrame | None):
    """Render spill density as a hexbin map over the state boundary.

    Saves out_tables/spill_map_hex.pdf and returns its path.
    """
    # Project to Web Mercator (metres, so the scale bar is valid)
    spills_3857 = spills.to_crs(3857)
    boundary_3857 = boundary.to_crs(3857)
    # Build hexbin density using matplotlib hexbin on data coordinates
    x = spills_3857.geometry.x.values
    y = spills_3857.geometry.y.values
    fig, ax = plt.subplots(figsize=(7, 9), dpi=300, constrained_layout=True)
    boundary_3857.plot(ax=ax, color='#f5f5f5', edgecolor='#888888', linewidth=0.8)
    if counties is not None and not counties.empty:
        counties_3857 = counties.to_crs(3857)
        counties_3857.boundary.plot(ax=ax, color='#aaaaaa', linewidth=0.3)
    # Use hexbin; gridsize tuned to CO scale; mincnt to suppress isolated cells
    hb = ax.hexbin(x, y, gridsize=70, mincnt=3, cmap='viridis', alpha=0.9)
    cb = fig.colorbar(hb, ax=ax, fraction=0.028, pad=0.01)
    cb.set_label('Spill count')
    # Scale bar and north arrow
    ax.add_artist(ScaleBar(1, units='m', location='lower left', box_alpha=0.7))
    add_north_arrow(ax)
    # Tighten to boundary
    minx, miny, maxx, maxy = boundary_3857.total_bounds
    ax.set_xlim(minx, maxx)
    ax.set_ylim(miny, maxy)
    style_axes(ax)
    # Dynamic title from available year fields
    title = 'Spill Density (Hexbin)'
    year_cols = [c for c in spills.columns if c.lower() in ('year', 'yr')]
    if year_cols:
        try:
            yrs = spills[year_cols[0]].dropna().astype(int)
            if not yrs.empty:
                # FIX: restore the en dash between years (was '{min}{max}').
                title = f"Spill Density (Hexbin), {yrs.min()}–{yrs.max()}"
        except Exception:
            # Non-numeric year column: keep the generic title.
            pass
    ax.set_title(title, fontsize=12, pad=10)
    out_pdf = OUT / 'spill_map_hex.pdf'
    fig.savefig(out_pdf)
    plt.close(fig)
    return out_pdf
def main():
    """Load spills, resolve the state boundary, and render both map styles."""
    spills = load_spills()
    boundary, counties = get_colorado_boundary(spills)
    for saved in (map_points(spills, boundary, counties),
                  map_hex(spills, boundary, counties)):
        print(f"Saved: {saved}")
if __name__ == '__main__':
    main()