diff --git a/out_tables/spill_map_hex.pdf b/out_tables/spill_map_hex.pdf new file mode 100644 index 0000000..4f18e0a Binary files /dev/null and b/out_tables/spill_map_hex.pdf differ diff --git a/out_tables/spill_map_points.png b/out_tables/spill_map_points.png new file mode 100644 index 0000000..3d033f2 Binary files /dev/null and b/out_tables/spill_map_points.png differ diff --git a/out_tables/table1_data_overview.csv b/out_tables/table1_data_overview.csv new file mode 100644 index 0000000..75434fd --- /dev/null +++ b/out_tables/table1_data_overview.csv @@ -0,0 +1,9 @@ +Component,Details +Primary data source,"COGCC Spill/Release Reports (Form 19), statewide Colorado" +Units and variables,"Dates, volumes, locations, facility, county; outcome = report delay (days)" +Temporal coverage,2015–2022 (pre-policy and post-2020 policy) +Geographic coverage,"Statewide; top counties: Weld, Garfield, Las Animas" +Initial sample size,"N = 35,640 spill reports" +Analytical sample size,"N = 33,202 after IQR trimming" +Exclusions,"2,438 observations removed by IQR/outlier rules" +Demographics join,"Census tract RUCA and demographics (ACS), spatial join to spill locations" diff --git a/out_tables/table2_definitions_and_summary.csv b/out_tables/table2_definitions_and_summary.csv new file mode 100644 index 0000000..15ad5dd --- /dev/null +++ b/out_tables/table2_definitions_and_summary.csv @@ -0,0 +1,7 @@ +Variable,Definition,Type,N,Mean (SD),Median [IQR] +Report Delay (days),Days between discovery and initial report; outcome variable,Continuous,33202.0,0.99 (1.09),"1.00 [0.00, 1.00]" +Spill Type,Historical vs Recent spill classification,Categorical,,, +Policy Period,Before 2020 vs 2020 and After (SB 19-181 implementation),Categorical,,, +Rurality,"Rural, Suburban, Urban (RUCA-based)",Categorical,,, +Median Household Income,Census tract ACS median income (USD),Continuous (optional),33202.0,-5904822.76 (62883297.78),"72717.00 [64573.00, 85561.00]" +Percent Poverty,Census tract percent 
below poverty line,Continuous (optional),33200.0,10.28 (6.37),"9.57 [5.83, 13.51]" diff --git a/out_tables/table2_summary_statistics.csv b/out_tables/table2_summary_statistics.csv new file mode 100644 index 0000000..ac481f7 --- /dev/null +++ b/out_tables/table2_summary_statistics.csv @@ -0,0 +1,14 @@ +Variable,N,Mean (SD),Median [IQR],Percent +Report Delay (days),33202,0.99 (1.09),"1.00 [0.00, 1.00]", +Spill Type: Recent,20702,,,62.35% +Spill Type: Historical,12500,,,37.65% +Period: 2020 and After,19546,,,58.87% +Period: Before 2020,13656,,,41.13% +Rurality: Urban,15732,,,47.38% +Rurality: Rural,14826,,,44.65% +Rurality: Suburban,2644,,,7.96% +Median Household Income,33202,-5904822.76 (62883297.78),"72717.00 [64573.00, 85561.00]", +Percent Poverty,33200,10.28 (6.37),"9.57 [5.83, 13.51]", +Percent White,33200,83.51 (8.48),"85.92 [78.83, 87.85]", +Percent Hispanic,33200,22.82 (13.06),"18.38 [13.92, 35.42]", +Unemployment Rate,33200,2.66 (1.37),"2.41 [1.54, 3.97]", diff --git a/out_tables/table2_variable_definitions.csv b/out_tables/table2_variable_definitions.csv new file mode 100644 index 0000000..b509747 --- /dev/null +++ b/out_tables/table2_variable_definitions.csv @@ -0,0 +1,7 @@ +Variable,Definition,Type +Report Delay (days),Days between discovery and initial report; outcome variable,Continuous +Spill Type,Historical vs Recent spill classification,Categorical +Policy Period,Before 2020 vs 2020 and After (SB 19-181 implementation),Categorical +Rurality,"Rural, Suburban, Urban (RUCA-based)",Categorical +Median Household Income,Census tract ACS median income (USD),Continuous (optional) +Percent Poverty,Census tract percent below poverty line,Continuous (optional) diff --git a/out_tables/table3_core_glm_contrasts.csv b/out_tables/table3_core_glm_contrasts.csv new file mode 100644 index 0000000..89c3462 --- /dev/null +++ b/out_tables/table3_core_glm_contrasts.csv @@ -0,0 +1,7 @@ +Period,rurality,Delta (days),95% CI,p,Delta (hours),Hours 95% CI +Before 
2020,Rural,0.0125075677145938,"(-0.10, 0.12)",0.828,0.3001816251502512,"(-2.42, 2.98)" +Before 2020,Suburban,0.0192862765678301,"(-0.21, 0.25)",0.862,0.4628706376279224,"(-4.97, 5.94)" +Before 2020,Urban,0.1305571823770012,"(0.05, 0.21)",<0.001,3.133372377048029,"(1.27, 5.00)" +2020 and After,Rural,0.0727709345083023,"(0.00, 0.14)",0.049,1.7465024281992552,"(0.01, 3.46)" +2020 and After,Suburban,0.1438421422124148,"(0.02, 0.26)",0.022,3.452211413097955,"(0.46, 6.34)" +2020 and After,Urban,0.2921678186765097,"(0.24, 0.34)",<0.001,7.0120276482362325,"(5.86, 8.22)" diff --git a/out_tables/table4_predicted_means.csv b/out_tables/table4_predicted_means.csv new file mode 100644 index 0000000..2d4b5df --- /dev/null +++ b/out_tables/table4_predicted_means.csv @@ -0,0 +1,13 @@ +Spill Type,Period,Rurality,Predicted (days),95% CI +Historical,Before 2020,Rural,0.8881118881118854,"(0.81, 0.97)" +Historical,Before 2020,Suburban,0.7837837837837821,"(0.65, 0.93)" +Historical,Before 2020,Urban,0.8045417680454152,"(0.77, 0.84)" +Historical,2020 and After,Rural,0.6093189964157685,"(0.56, 0.66)" +Historical,2020 and After,Suburban,0.4201183431952656,"(0.36, 0.49)" +Historical,2020 and After,Urban,0.3444994290064702,"(0.33, 0.36)" +Recent,Before 2020,Rural,0.9033877038895848,"(0.87, 0.94)" +Recent,Before 2020,Suburban,0.8009708737864075,"(0.72, 0.89)" +Recent,Before 2020,Urban,0.93542074363992,"(0.89, 0.98)" +Recent,2020 and After,Rural,0.6839481555333978,"(0.66, 0.71)" +Recent,2020 and After,Suburban,0.5621468926553672,"(0.51, 0.62)" +Recent,2020 and After,Urban,0.6367281475541288,"(0.61, 0.67)" diff --git a/out_tables/table5_overview.csv b/out_tables/table5_overview.csv new file mode 100644 index 0000000..176aafb --- /dev/null +++ b/out_tables/table5_overview.csv @@ -0,0 +1,2 @@ +Section,Details +Model Robustness,Saved table5_robustness_poisson_vs_nb.csv comparing Poisson vs NB diff --git a/out_tables/table5_robustness_poisson_vs_nb.csv 
b/out_tables/table5_robustness_poisson_vs_nb.csv new file mode 100644 index 0000000..0403868 --- /dev/null +++ b/out_tables/table5_robustness_poisson_vs_nb.csv @@ -0,0 +1,7 @@ +Period,Rurality,Poisson Δ (days),NB Δ (days),Difference (NB - Poisson),Poisson p,NB p +Before 2020,Rural,0.0125075677145938,-9.826219107892053,-9.838726675606646,0.828,0.0004 +Before 2020,Suburban,0.0192862765678301,-7.100280417538358,-7.119566694106188,0.862,0.1072 +Before 2020,Urban,0.1305571823770012,-1.6388416472647436,-1.7693988296417449,0.0,0.008 +2020 and After,Rural,0.0727709345083023,-13.917287633807826,-13.990058568316128,0.049,0.0 +2020 and After,Suburban,0.1438421422124148,-7.230488621220758,-7.374330763433172,0.022,0.0368 +2020 and After,Urban,0.2921678186765097,-2.869383763253332,-3.161551581929842,0.0,0.0 diff --git a/scripts/build_publication_tables.py b/scripts/build_publication_tables.py new file mode 100644 index 0000000..b89d0d9 --- /dev/null +++ b/scripts/build_publication_tables.py @@ -0,0 +1,249 @@ +#!/usr/bin/env python3 +import pandas as pd +import numpy as np +from pathlib import Path + +ROOT = Path('/home/dadams/Repos/colorado_spills') +ANALYSIS_DIR = ROOT / 'analysis' / 'new analysis Aug 2025' +OUT = ROOT / 'out_tables' +OUT.mkdir(parents=True, exist_ok=True) + +# Helpers + +def read_csv_safely(path: Path) -> pd.DataFrame: + if path.exists(): + return pd.read_csv(path) + raise FileNotFoundError(str(path)) + +# ------------------------- +# Table 1: Data and Sample Overview +# ------------------------- + +def build_table1(): + rows = [] + # Attempt to load sample sizes and exclusions + raw_path = ANALYSIS_DIR / 'spills_with_trim_flag.csv' + trimmed_path = ANALYSIS_DIR / 'spills_trimmed.csv' + removed_path = ANALYSIS_DIR / 'spills_trimmed_removed.csv' + + n_raw = read_csv_safely(raw_path).shape[0] if raw_path.exists() else None + n_trimmed = read_csv_safely(trimmed_path).shape[0] + n_removed = read_csv_safely(removed_path).shape[0] if removed_path.exists() else 
(n_raw - n_trimmed if n_raw else None) + + rows.append({'Component': 'Primary data source', 'Details': 'COGCC Spill/Release Reports (Form 19), statewide Colorado'}) + rows.append({'Component': 'Units and variables', 'Details': 'Dates, volumes, locations, facility, county; outcome = report delay (days)'}) + rows.append({'Component': 'Temporal coverage', 'Details': '2015–2022 (pre-policy and post-2020 policy)'}) + rows.append({'Component': 'Geographic coverage', 'Details': 'Statewide; top counties: Weld, Garfield, Las Animas'}) + if n_raw: + rows.append({'Component': 'Initial sample size', 'Details': f'N = {n_raw:,} spill reports'}) + rows.append({'Component': 'Analytical sample size', 'Details': f'N = {n_trimmed:,} after IQR trimming'}) + if n_removed is not None: + rows.append({'Component': 'Exclusions', 'Details': f'{n_removed:,} observations removed by IQR/outlier rules'}) + + # Optional joins + rows.append({'Component': 'Demographics join', 'Details': 'Census tract RUCA and demographics (ACS), spatial join to spill locations'}) + + t1 = pd.DataFrame(rows) + t1.to_csv(OUT / 'table1_data_overview.csv', index=False) + return t1 + +# ------------------------- +# Table 2: Variable Definitions and Summary Statistics +# ------------------------- + +def build_table2(): + df = read_csv_safely(ANALYSIS_DIR / 'spills_trimmed.csv') + # Ensure Period exists + if 'Period' not in df.columns: + df['Period'] = pd.to_datetime(df['Initial Report Date']).dt.year.map(lambda y: 'Before 2020' if y < 2020 else '2020 and After') + + # Definitions + definitions = [ + {'Variable': 'Report Delay (days)', 'Definition': 'Days between discovery and initial report; outcome variable', 'Type': 'Continuous'}, + {'Variable': 'Spill Type', 'Definition': 'Historical vs Recent spill classification', 'Type': 'Categorical'}, + {'Variable': 'Policy Period', 'Definition': 'Before 2020 vs 2020 and After (SB 19-181 implementation)', 'Type': 'Categorical'}, + {'Variable': 'Rurality', 'Definition': 'Rural, 
Suburban, Urban (RUCA-based)', 'Type': 'Categorical'}, + {'Variable': 'Median Household Income', 'Definition': 'Census tract ACS median income (USD)', 'Type': 'Continuous (optional)'}, + {'Variable': 'Percent Poverty', 'Definition': 'Census tract percent below poverty line', 'Type': 'Continuous (optional)'} + ] + defs_df = pd.DataFrame(definitions) + + # Summary stats + # Outcome: median [IQR] preferred due to skew; also mean (SD) + y = df['report_delay_full_days'].dropna() + outcome_row = { + 'Variable': 'Report Delay (days)', + 'N': len(y), + 'Mean (SD)': f"{y.mean():.2f} ({y.std():.2f})", + 'Median [IQR]': f"{y.median():.2f} [{y.quantile(0.25):.2f}, {y.quantile(0.75):.2f}]" + } + + # Categorical stats + def cat_summary(series: pd.Series, label: str): + counts = series.value_counts(dropna=False) + rows = [] + for k, v in counts.items(): + pct = 100 * v / len(series) + rows.append({'Variable': f"{label}: {k}", 'N': int(v), 'Percent': f"{pct:.2f}%"}) + return rows + + cat_rows = [] + cat_rows += cat_summary(df['Spill Type'], 'Spill Type') if 'Spill Type' in df.columns else [] + cat_rows += cat_summary(df['Period'], 'Period') + cat_rows += cat_summary(df['rurality'], 'Rurality') if 'rurality' in df.columns else [] + + # Covariates + demo_rows = [] + for col, label in [ + ('median_household_income', 'Median Household Income'), + ('percent_poverty', 'Percent Poverty'), + ('percent_white', 'Percent White'), + ('percent_hispanic', 'Percent Hispanic'), + ('unemployment_rate', 'Unemployment Rate'), + ]: + if col in df.columns: + s = df[col].dropna() + demo_rows.append({'Variable': label, 'N': len(s), 'Mean (SD)': f"{s.mean():.2f} ({s.std():.2f})", 'Median [IQR]': f"{s.median():.2f} [{s.quantile(0.25):.2f}, {s.quantile(0.75):.2f}]"}) + + summary_df = pd.DataFrame([outcome_row] + cat_rows + demo_rows) + + # Save + defs_df.to_csv(OUT / 'table2_variable_definitions.csv', index=False) + summary_df.to_csv(OUT / 'table2_summary_statistics.csv', index=False) + + # Combined view + 
combined = defs_df.merge(summary_df[['Variable','N','Mean (SD)','Median [IQR]']], on='Variable', how='left') + combined.to_csv(OUT / 'table2_definitions_and_summary.csv', index=False) + return combined + +# ------------------------- +# Table 3: Core Model Results (GLM) — contrasts +# ------------------------- + +def build_table3(): + path = ANALYSIS_DIR / 'final_contrast_comparison_combined.csv' + df = read_csv_safely(path) + # Keep Poisson as primary model + dfp = df[df['model'] == 'poisson'].copy() + + # We have Period and rurality; compute formatted strings + dfp['Delta (days)'] = dfp['poisson_median'] + dfp['95% CI'] = dfp.apply(lambda r: f"({r['poisson_ci_lower']:.2f}, {r['poisson_ci_upper']:.2f})", axis=1) + dfp['p'] = dfp['poisson_p'].apply(lambda x: '<0.001' if x < 0.001 else f"{x:.3f}") + dfp['Delta (hours)'] = dfp['poisson_median_hours'] + dfp['Hours 95% CI'] = dfp.apply(lambda r: f"({r['poisson_ci_lower_hours']:.2f}, {r['poisson_ci_upper_hours']:.2f})", axis=1) + + order_period = {'Before 2020': 0, '2020 and After': 1} + order_r = {'Rural': 0, 'Suburban': 1, 'Urban': 2} + dfp['po'] = dfp['Period'].map(order_period) + dfp['ro'] = dfp['rurality'].map(order_r) + dfp = dfp.sort_values(['po','ro']).drop(columns=['po','ro']) + + keep = ['Period','rurality','Delta (days)','95% CI','p','Delta (hours)','Hours 95% CI'] + out = dfp[keep] + out.to_csv(OUT / 'table3_core_glm_contrasts.csv', index=False) + return out + +# ------------------------- +# Table 4: Predicted Means or Contrasts (predicted reporting delays) +# ------------------------- + +def build_table4(): + path = ANALYSIS_DIR / 'poisson_predicted_groups_boot_final.csv' + df = read_csv_safely(path) + # Expect columns: spill_type, Period, rurality, predicted_mean, ci_low, ci_high (names may vary) + # Attempt to infer + col_map = {} + for c in df.columns: + lc = c.lower() + if 'pred' in lc and ('mean' in lc or 'delay' in lc): col_map['pred'] = c + if ('ci_low' in lc or 'lower' in lc): col_map['low'] = c + if 
('ci_high' in lc or 'upper' in lc): col_map['high'] = c + # Required grouping fields + gcols = [] + for name in ['spill_type','Spill Type']: + if name in df.columns: gcols.append(name); break + for name in ['Period']: + if name in df.columns: gcols.append(name); break + for name in ['rurality','Rurality']: + if name in df.columns: gcols.append(name); break + + if len(gcols) < 3 or not {'pred','low','high'} <= set(col_map): + # Fall back to exporting raw for manual review + df.to_csv(OUT / 'table4_predicted_means_raw.csv', index=False) + return df + + tidy = df[gcols + [col_map['pred'], col_map['low'], col_map['high']]].copy() + tidy = tidy.rename(columns={gcols[0]:'Spill Type', gcols[1]:'Period', gcols[2]:'Rurality', col_map['pred']:'Predicted (days)', col_map['low']:'Lower', col_map['high']:'Upper'}) + tidy['95% CI'] = tidy.apply(lambda r: f"({r['Lower']:.2f}, {r['Upper']:.2f})", axis=1) + out = tidy[['Spill Type','Period','Rurality','Predicted (days)','95% CI']].copy() + + # Sort + order_period = {'Before 2020': 0, '2020 and After': 1} + order_r = {'Rural': 0, 'Suburban': 1, 'Urban': 2} + out['po'] = out['Period'].map(order_period) + out['ro'] = out['Rurality'].map(order_r) + out = out.sort_values(['Spill Type','po','ro']).drop(columns=['po','ro']) + + out.to_csv(OUT / 'table4_predicted_means.csv', index=False) + return out + +# ------------------------- +# Table 5: ITS or Robustness Checks +# ------------------------- + +def build_table5(): + rows = [] + # ITS coefficients summary if available + its_path = ANALYSIS_DIR / 'its_summary_with_boot.csv' + if its_path.exists(): + its = pd.read_csv(its_path) + # Expect columns: model/group identifiers + level change, slope change, CIs + # Try to infer + candidates = { + 'level': [c for c in its.columns if 'level' in c.lower() and ('change' in c.lower() or 'beta' in c.lower() or 'immediate' in c.lower())], + 'slope': [c for c in its.columns if 'slope' in c.lower() and ('change' in c.lower() or 'beta' in c.lower())], + 
'low': [c for c in its.columns if 'low' in c.lower()], + 'high': [c for c in its.columns if 'high' in c.lower()], + } + # Build a minimal summary per group if possible + key_cols = [] + for name in ['spill_type','Spill Type']: + if name in its.columns: key_cols.append(name); break + for name in ['rurality','Rurality']: + if name in its.columns: key_cols.append(name); break + if key_cols and candidates['level'] and candidates['slope']: + summary = its[key_cols + [candidates['level'][0], candidates['slope'][0]]].copy() + summary = summary.rename(columns={key_cols[0]:'Spill Type', key_cols[1]:'Rurality', candidates['level'][0]:'Level Change (days)', candidates['slope'][0]:'Slope Change (days/month)'}) + summary.to_csv(OUT / 'table5_its_summary.csv', index=False) + rows.append({'Section':'ITS Summary','Details':'Saved table5_its_summary.csv with level and slope changes by group'}) + + # Robustness: Poisson vs NB differences from appendix_table2_model_comparison.csv + cmp_path = ANALYSIS_DIR / 'appendix_table2_model_comparison.csv' + if cmp_path.exists(): + cmp = pd.read_csv(cmp_path) + # Expect columns: Period, Rurality, Poisson Δ (days), NB Δ (days), Difference (NB - Poisson), Poisson p, NB p + keep_cols = [c for c in cmp.columns if 'Period' in c or 'Rurality' in c or 'Difference' in c or 'Poisson' in c or 'NB' in c] + cmp[keep_cols].to_csv(OUT / 'table5_robustness_poisson_vs_nb.csv', index=False) + rows.append({'Section':'Model Robustness','Details':'Saved table5_robustness_poisson_vs_nb.csv comparing Poisson vs NB'}) + + t5 = pd.DataFrame(rows) if rows else pd.DataFrame([{'Section':'Note','Details':'No ITS or robustness sources found'}]) + t5.to_csv(OUT / 'table5_overview.csv', index=False) + return t5 + + +def main(): + t1 = build_table1() + t2 = build_table2() + t3 = build_table3() + t4 = build_table4() + t5 = build_table5() + print('Built tables:') + print(' -', OUT / 'table1_data_overview.csv') + print(' -', OUT / 'table2_definitions_and_summary.csv') + print(' 
#!/usr/bin/env python3
"""
Create a production-ready map of Colorado oil & gas spill locations for journal use.
Outputs:
  - out_tables/spill_map_points.png (vector-like high-res)
  - out_tables/spill_map_hex.pdf (publication PDF)
"""
import os
from pathlib import Path
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import matplotlib.pyplot as plt
from matplotlib_scalebar.scalebar import ScaleBar
import matplotlib.patheffects as pe
from sqlalchemy import create_engine

ROOT = Path('/home/dadams/Repos/colorado_spills')
ANALYSIS_DIR = ROOT / 'analysis' / 'new analysis Aug 2025'
OUT = ROOT / 'out_tables'
OUT.mkdir(parents=True, exist_ok=True)

DB_NAME = 'colorado_spills'


def get_engine():
    """Build a SQLAlchemy engine from DB_* env vars; return None if any is unset."""
    user = os.getenv('DB_USER')
    password = os.getenv('DB_PASSWORD')
    host = os.getenv('DB_HOST')
    port = os.getenv('DB_PORT')
    if not all([user, password, host, port]):
        return None
    url = f'postgresql+psycopg2://{user}:{password}@{host}:{port}/{DB_NAME}'
    try:
        return create_engine(url)
    except Exception:
        return None


def load_spills() -> gpd.GeoDataFrame:
    """Load spills as a GeoDataFrame, preferring PostGIS, else CSV fallback."""
    engine = get_engine()
    if engine is not None:
        # Try multiple geometry options from the PostGIS table.
        # BUG FIX: previously the first two probes used "SELECT *, geom"
        # which duplicated the geometry column in the result set.
        try_statements = [
            ("SELECT * FROM spills_with_demographics_geog", 'geom'),
            ("SELECT * FROM spills_with_demographics_geog", 'geometry'),
            ("SELECT *, CAST(geog AS geometry) AS geom FROM spills_with_demographics_geog", 'geom'),
            ("SELECT *, ST_SetSRID(CAST(geog AS geometry), 4326) AS geom FROM spills_with_demographics_geog", 'geom'),
        ]
        for sql, geom_col in try_statements:
            try:
                gdf = gpd.read_postgis(sql, engine, geom_col=geom_col)
                # Ensure CRS
                if gdf.crs is None:
                    gdf.set_crs('EPSG:4326', inplace=True)
                return gdf
            except Exception:
                pass
        # Fallback to pandas + lat/lon if present in the DB table
        try:
            df = pd.read_sql_table('spills_with_demographics_geog', engine)
            if {'Latitude', 'Longitude'}.issubset(df.columns):
                return gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df['Longitude'], df['Latitude']), crs='EPSG:4326')
        except Exception:
            pass

    # CSV fallback
    csv = ANALYSIS_DIR / 'spills_trimmed.csv'
    df = pd.read_csv(csv)
    if not {'Latitude', 'Longitude'}.issubset(df.columns):
        raise ValueError('Expected Latitude/Longitude columns in spills_trimmed.csv')
    gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df['Longitude'], df['Latitude']), crs='EPSG:4326')
    return gdf


def _probe_postgis_layer(engine, candidates) -> gpd.GeoDataFrame | None:
    """Try (table, where) candidates with several geometry expressions.

    Returns the first non-empty layer as a single-'geom'-column GeoDataFrame
    in EPSG:4326, or None if nothing matches.
    """
    alt_geoms = ["geom", "geometry", "ST_SetSRID(CAST(geog AS geometry), 4326)"]
    for table, where_clause in candidates:
        for gexpr in alt_geoms:
            sql = f"SELECT {gexpr} AS geom FROM {table} WHERE {where_clause}"
            try:
                gdf = gpd.read_postgis(sql, engine, geom_col='geom')
            except Exception:
                continue
            if gdf.empty:
                continue
            if gdf.crs is None:
                gdf.set_crs('EPSG:4326', inplace=True)
            return gdf[["geom"]].to_crs(4326)
    return None


def try_query_boundary(engine) -> gpd.GeoDataFrame | None:
    """Attempt to find a Colorado boundary polygon from common table names/columns in PostGIS."""
    # NOTE: the unused per-candidate geometry name from the earlier version was
    # removed; geometry expressions are probed uniformly in _probe_postgis_layer.
    state_where = "stusps='CO' OR name ILIKE 'Colorado' OR statefp='08'"
    candidates = [
        ("cb_2018_us_state_500k", state_where),
        ("cb_2019_us_state_500k", state_where),
        ("tl_2020_us_state", state_where),
        ("us_states", state_where),
        ("states", state_where),
        ("colorado_boundary", "1=1"),
        ("co_boundary", "1=1"),
    ]
    return _probe_postgis_layer(engine, candidates)


def try_query_counties(engine) -> gpd.GeoDataFrame | None:
    """Attempt to fetch Colorado counties from PostGIS for boundary overlays."""
    county_where = "statefp='08' OR stusps='CO'"
    candidates = [
        ("cb_2018_us_county_500k", county_where),
        ("cb_2019_us_county_500k", county_where),
        ("tl_2020_us_county", county_where),
        ("counties", "statefp='08' OR stusps='CO' OR state='CO' OR state_name ILIKE 'Colorado'"),
        ("co_counties", "1=1"),
        ("colorado_counties", "1=1"),
    ]
    return _probe_postgis_layer(engine, candidates)


def get_colorado_boundary(spills_gdf: gpd.GeoDataFrame) -> tuple[gpd.GeoDataFrame, gpd.GeoDataFrame | None]:
    """Return Colorado boundary and optional counties. Prefer PostGIS; fallback to convex-hull from points."""
    engine = get_engine()
    counties = None
    if engine is not None:
        boundary = try_query_boundary(engine)
        if boundary is not None and not boundary.empty:
            counties = try_query_counties(engine)
            return boundary, counties
    # Fallback: buffered convex hull of the point cloud (30 km in Web Mercator)
    boundary_geom = spills_gdf.to_crs(3857).unary_union.convex_hull.buffer(30_000)
    boundary = gpd.GeoDataFrame(geometry=[boundary_geom], crs='EPSG:3857').to_crs(4326)
    return boundary, counties


def add_north_arrow(ax, x=0.95, y=0.1, size=18):
    """Draw a white-outlined 'N' north arrow in axes-fraction coordinates."""
    ax.annotate('N', xy=(x, y + 0.06), xytext=(x, y), xycoords='axes fraction',
                ha='center', va='bottom', fontsize=size,
                fontweight='bold', color='black',
                path_effects=[pe.withStroke(linewidth=3, foreground='white')],
                arrowprops=dict(facecolor='black', width=5, headwidth=15, headlength=18, shrink=0.05,
                                path_effects=[pe.withStroke(linewidth=3, foreground='white')]))


def style_axes(ax):
    """Hide axis decorations for a clean map frame."""
    ax.set_axis_off()
    for spine in ax.spines.values():
        spine.set_visible(False)


def _year_range_title(spills: gpd.GeoDataFrame, base: str) -> str:
    """Return *base*, suffixed with ', min–max' years if a year/yr column exists.

    Extracted to remove the duplicated title logic in map_points and map_hex.
    """
    year_cols = [c for c in spills.columns if c.lower() in ('year', 'yr')]
    if year_cols:
        try:
            yrs = spills[year_cols[0]].dropna().astype(int)
            if not yrs.empty:
                return f"{base}, {yrs.min()}–{yrs.max()}"
        except Exception:
            pass
    return base


def map_points(spills: gpd.GeoDataFrame, boundary: gpd.GeoDataFrame, counties: gpd.GeoDataFrame | None):
    """Render the point map of spill locations and save it as a high-res PNG."""
    # Project to Web Mercator for plotting
    spills_3857 = spills.to_crs(3857)
    boundary_3857 = boundary.to_crs(3857)

    # Figure aesthetics
    fig, ax = plt.subplots(figsize=(7, 9), dpi=300, constrained_layout=True)
    boundary_3857.plot(ax=ax, color='#f5f5f5', edgecolor='#888888', linewidth=0.8)
    if counties is not None and not counties.empty:
        counties_3857 = counties.to_crs(3857)
        counties_3857.boundary.plot(ax=ax, color='#aaaaaa', linewidth=0.3)

    # Plot points with alpha and small size to avoid overplotting
    spills_3857.plot(ax=ax, markersize=1.2, alpha=0.35, color='#2c7fb8')

    # Tighten to boundary
    minx, miny, maxx, maxy = boundary_3857.total_bounds
    ax.set_xlim(minx, maxx)
    ax.set_ylim(miny, maxy)

    # Add scalebar (requires axis in meters; Web Mercator satisfies this only
    # approximately away from the equator — TODO confirm acceptable for print)
    ax.add_artist(ScaleBar(1, units='m', location='lower left', box_alpha=0.7))

    add_north_arrow(ax)
    style_axes(ax)
    ax.set_title(_year_range_title(spills, 'Oil & Gas Spill Reports in Colorado'), fontsize=12, pad=10)

    out_png = OUT / 'spill_map_points.png'
    fig.savefig(out_png, dpi=300)
    plt.close(fig)
    return out_png


def map_hex(spills: gpd.GeoDataFrame, boundary: gpd.GeoDataFrame, counties: gpd.GeoDataFrame | None):
    """Render a hexbin density map of spill locations and save it as a PDF."""
    # Project to Web Mercator
    spills_3857 = spills.to_crs(3857)
    boundary_3857 = boundary.to_crs(3857)

    # Build hexbin density using matplotlib hexbin on data coordinates
    x = spills_3857.geometry.x.values
    y = spills_3857.geometry.y.values

    fig, ax = plt.subplots(figsize=(7, 9), dpi=300, constrained_layout=True)
    boundary_3857.plot(ax=ax, color='#f5f5f5', edgecolor='#888888', linewidth=0.8)
    if counties is not None and not counties.empty:
        counties_3857 = counties.to_crs(3857)
        counties_3857.boundary.plot(ax=ax, color='#aaaaaa', linewidth=0.3)

    # Use hexbin; gridsize tuned to CO scale; mincnt to suppress isolated cells
    hb = ax.hexbin(x, y, gridsize=70, mincnt=3, cmap='viridis', alpha=0.9)
    cb = fig.colorbar(hb, ax=ax, fraction=0.028, pad=0.01)
    cb.set_label('Spill count')

    # Scale bar and north arrow
    ax.add_artist(ScaleBar(1, units='m', location='lower left', box_alpha=0.7))
    add_north_arrow(ax)

    # Tighten to boundary
    minx, miny, maxx, maxy = boundary_3857.total_bounds
    ax.set_xlim(minx, maxx)
    ax.set_ylim(miny, maxy)
    style_axes(ax)
    ax.set_title(_year_range_title(spills, 'Spill Density (Hexbin)'), fontsize=12, pad=10)

    out_pdf = OUT / 'spill_map_hex.pdf'
    fig.savefig(out_pdf)
    plt.close(fig)
    return out_pdf


def main():
    """Load spills, resolve boundaries, and produce both map outputs."""
    spills = load_spills()
    boundary, counties = get_colorado_boundary(spills)
    p = map_points(spills, boundary, counties)
    h = map_hex(spills, boundary, counties)
    print(f"Saved: {p}")
    print(f"Saved: {h}")

if __name__ == '__main__':
    main()