tables and maps

This commit is contained in:
2025-11-01 15:10:32 -07:00
parent abff7981c2
commit dd40446f8c
12 changed files with 573 additions and 0 deletions

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 246 KiB

View File

@@ -0,0 +1,9 @@
Component,Details
Primary data source,"COGCC Spill/Release Reports (Form 19), statewide Colorado"
Units and variables,"Dates, volumes, locations, facility, county; outcome = report delay (days)"
Temporal coverage,2015–2022 (pre-policy and post-2020 policy)
Geographic coverage,"Statewide; top counties: Weld, Garfield, Las Animas"
Initial sample size,"N = 35,640 spill reports"
Analytical sample size,"N = 33,202 after IQR trimming"
Exclusions,"2,438 observations removed by IQR/outlier rules"
Demographics join,"Census tract RUCA and demographics (ACS), spatial join to spill locations"
1 Component Details
2 Primary data source COGCC Spill/Release Reports (Form 19), statewide Colorado
3 Units and variables Dates, volumes, locations, facility, county; outcome = report delay (days)
4 Temporal coverage 2015–2022 (pre-policy and post-2020 policy)
5 Geographic coverage Statewide; top counties: Weld, Garfield, Las Animas
6 Initial sample size N = 35,640 spill reports
7 Analytical sample size N = 33,202 after IQR trimming
8 Exclusions 2,438 observations removed by IQR/outlier rules
9 Demographics join Census tract RUCA and demographics (ACS), spatial join to spill locations

View File

@@ -0,0 +1,7 @@
Variable,Definition,Type,N,Mean (SD),Median [IQR]
Report Delay (days),Days between discovery and initial report; outcome variable,Continuous,33202.0,0.99 (1.09),"1.00 [0.00, 1.00]"
Spill Type,Historical vs Recent spill classification,Categorical,,,
Policy Period,Before 2020 vs 2020 and After (SB 19-181 implementation),Categorical,,,
Rurality,"Rural, Suburban, Urban (RUCA-based)",Categorical,,,
Median Household Income,Census tract ACS median income (USD),Continuous (optional),33202.0,-5904822.76 (62883297.78),"72717.00 [64573.00, 85561.00]"
Percent Poverty,Census tract percent below poverty line,Continuous (optional),33200.0,10.28 (6.37),"9.57 [5.83, 13.51]"
1 Variable Definition Type N Mean (SD) Median [IQR]
2 Report Delay (days) Days between discovery and initial report; outcome variable Continuous 33202.0 0.99 (1.09) 1.00 [0.00, 1.00]
3 Spill Type Historical vs Recent spill classification Categorical
4 Policy Period Before 2020 vs 2020 and After (SB 19-181 implementation) Categorical
5 Rurality Rural, Suburban, Urban (RUCA-based) Categorical
6 Median Household Income Census tract ACS median income (USD) Continuous (optional) 33202.0 -5904822.76 (62883297.78) 72717.00 [64573.00, 85561.00]
7 Percent Poverty Census tract percent below poverty line Continuous (optional) 33200.0 10.28 (6.37) 9.57 [5.83, 13.51]

View File

@@ -0,0 +1,14 @@
Variable,N,Mean (SD),Median [IQR],Percent
Report Delay (days),33202,0.99 (1.09),"1.00 [0.00, 1.00]",
Spill Type: Recent,20702,,,62.35%
Spill Type: Historical,12500,,,37.65%
Period: 2020 and After,19546,,,58.87%
Period: Before 2020,13656,,,41.13%
Rurality: Urban,15732,,,47.38%
Rurality: Rural,14826,,,44.65%
Rurality: Suburban,2644,,,7.96%
Median Household Income,33202,-5904822.76 (62883297.78),"72717.00 [64573.00, 85561.00]",
Percent Poverty,33200,10.28 (6.37),"9.57 [5.83, 13.51]",
Percent White,33200,83.51 (8.48),"85.92 [78.83, 87.85]",
Percent Hispanic,33200,22.82 (13.06),"18.38 [13.92, 35.42]",
Unemployment Rate,33200,2.66 (1.37),"2.41 [1.54, 3.97]",
1 Variable N Mean (SD) Median [IQR] Percent
2 Report Delay (days) 33202 0.99 (1.09) 1.00 [0.00, 1.00]
3 Spill Type: Recent 20702 62.35%
4 Spill Type: Historical 12500 37.65%
5 Period: 2020 and After 19546 58.87%
6 Period: Before 2020 13656 41.13%
7 Rurality: Urban 15732 47.38%
8 Rurality: Rural 14826 44.65%
9 Rurality: Suburban 2644 7.96%
10 Median Household Income 33202 -5904822.76 (62883297.78) 72717.00 [64573.00, 85561.00]
11 Percent Poverty 33200 10.28 (6.37) 9.57 [5.83, 13.51]
12 Percent White 33200 83.51 (8.48) 85.92 [78.83, 87.85]
13 Percent Hispanic 33200 22.82 (13.06) 18.38 [13.92, 35.42]
14 Unemployment Rate 33200 2.66 (1.37) 2.41 [1.54, 3.97]

View File

@@ -0,0 +1,7 @@
Variable,Definition,Type
Report Delay (days),Days between discovery and initial report; outcome variable,Continuous
Spill Type,Historical vs Recent spill classification,Categorical
Policy Period,Before 2020 vs 2020 and After (SB 19-181 implementation),Categorical
Rurality,"Rural, Suburban, Urban (RUCA-based)",Categorical
Median Household Income,Census tract ACS median income (USD),Continuous (optional)
Percent Poverty,Census tract percent below poverty line,Continuous (optional)
1 Variable Definition Type
2 Report Delay (days) Days between discovery and initial report; outcome variable Continuous
3 Spill Type Historical vs Recent spill classification Categorical
4 Policy Period Before 2020 vs 2020 and After (SB 19-181 implementation) Categorical
5 Rurality Rural, Suburban, Urban (RUCA-based) Categorical
6 Median Household Income Census tract ACS median income (USD) Continuous (optional)
7 Percent Poverty Census tract percent below poverty line Continuous (optional)

View File

@@ -0,0 +1,7 @@
Period,rurality,Delta (days),95% CI,p,Delta (hours),Hours 95% CI
Before 2020,Rural,0.0125075677145938,"(-0.10, 0.12)",0.828,0.3001816251502512,"(-2.42, 2.98)"
Before 2020,Suburban,0.0192862765678301,"(-0.21, 0.25)",0.862,0.4628706376279224,"(-4.97, 5.94)"
Before 2020,Urban,0.1305571823770012,"(0.05, 0.21)",<0.001,3.133372377048029,"(1.27, 5.00)"
2020 and After,Rural,0.0727709345083023,"(0.00, 0.14)",0.049,1.7465024281992552,"(0.01, 3.46)"
2020 and After,Suburban,0.1438421422124148,"(0.02, 0.26)",0.022,3.452211413097955,"(0.46, 6.34)"
2020 and After,Urban,0.2921678186765097,"(0.24, 0.34)",<0.001,7.0120276482362325,"(5.86, 8.22)"
1 Period rurality Delta (days) 95% CI p Delta (hours) Hours 95% CI
2 Before 2020 Rural 0.0125075677145938 (-0.10, 0.12) 0.828 0.3001816251502512 (-2.42, 2.98)
3 Before 2020 Suburban 0.0192862765678301 (-0.21, 0.25) 0.862 0.4628706376279224 (-4.97, 5.94)
4 Before 2020 Urban 0.1305571823770012 (0.05, 0.21) <0.001 3.133372377048029 (1.27, 5.00)
5 2020 and After Rural 0.0727709345083023 (0.00, 0.14) 0.049 1.7465024281992552 (0.01, 3.46)
6 2020 and After Suburban 0.1438421422124148 (0.02, 0.26) 0.022 3.452211413097955 (0.46, 6.34)
7 2020 and After Urban 0.2921678186765097 (0.24, 0.34) <0.001 7.0120276482362325 (5.86, 8.22)

View File

@@ -0,0 +1,13 @@
Spill Type,Period,Rurality,Predicted (days),95% CI
Historical,Before 2020,Rural,0.8881118881118854,"(0.81, 0.97)"
Historical,Before 2020,Suburban,0.7837837837837821,"(0.65, 0.93)"
Historical,Before 2020,Urban,0.8045417680454152,"(0.77, 0.84)"
Historical,2020 and After,Rural,0.6093189964157685,"(0.56, 0.66)"
Historical,2020 and After,Suburban,0.4201183431952656,"(0.36, 0.49)"
Historical,2020 and After,Urban,0.3444994290064702,"(0.33, 0.36)"
Recent,Before 2020,Rural,0.9033877038895848,"(0.87, 0.94)"
Recent,Before 2020,Suburban,0.8009708737864075,"(0.72, 0.89)"
Recent,Before 2020,Urban,0.93542074363992,"(0.89, 0.98)"
Recent,2020 and After,Rural,0.6839481555333978,"(0.66, 0.71)"
Recent,2020 and After,Suburban,0.5621468926553672,"(0.51, 0.62)"
Recent,2020 and After,Urban,0.6367281475541288,"(0.61, 0.67)"
1 Spill Type Period Rurality Predicted (days) 95% CI
2 Historical Before 2020 Rural 0.8881118881118854 (0.81, 0.97)
3 Historical Before 2020 Suburban 0.7837837837837821 (0.65, 0.93)
4 Historical Before 2020 Urban 0.8045417680454152 (0.77, 0.84)
5 Historical 2020 and After Rural 0.6093189964157685 (0.56, 0.66)
6 Historical 2020 and After Suburban 0.4201183431952656 (0.36, 0.49)
7 Historical 2020 and After Urban 0.3444994290064702 (0.33, 0.36)
8 Recent Before 2020 Rural 0.9033877038895848 (0.87, 0.94)
9 Recent Before 2020 Suburban 0.8009708737864075 (0.72, 0.89)
10 Recent Before 2020 Urban 0.93542074363992 (0.89, 0.98)
11 Recent 2020 and After Rural 0.6839481555333978 (0.66, 0.71)
12 Recent 2020 and After Suburban 0.5621468926553672 (0.51, 0.62)
13 Recent 2020 and After Urban 0.6367281475541288 (0.61, 0.67)

View File

@@ -0,0 +1,2 @@
Section,Details
Model Robustness,Saved table5_robustness_poisson_vs_nb.csv comparing Poisson vs NB
1 Section Details
2 Model Robustness Saved table5_robustness_poisson_vs_nb.csv comparing Poisson vs NB

View File

@@ -0,0 +1,7 @@
Period,Rurality,Poisson Δ (days),NB Δ (days),Difference (NB - Poisson),Poisson p,NB p
Before 2020,Rural,0.0125075677145938,-9.826219107892053,-9.838726675606646,0.828,0.0004
Before 2020,Suburban,0.0192862765678301,-7.100280417538358,-7.119566694106188,0.862,0.1072
Before 2020,Urban,0.1305571823770012,-1.6388416472647436,-1.7693988296417449,0.0,0.008
2020 and After,Rural,0.0727709345083023,-13.917287633807826,-13.990058568316128,0.049,0.0
2020 and After,Suburban,0.1438421422124148,-7.230488621220758,-7.374330763433172,0.022,0.0368
2020 and After,Urban,0.2921678186765097,-2.869383763253332,-3.161551581929842,0.0,0.0
1 Period Rurality Poisson Δ (days) NB Δ (days) Difference (NB - Poisson) Poisson p NB p
2 Before 2020 Rural 0.0125075677145938 -9.826219107892053 -9.838726675606646 0.828 0.0004
3 Before 2020 Suburban 0.0192862765678301 -7.100280417538358 -7.119566694106188 0.862 0.1072
4 Before 2020 Urban 0.1305571823770012 -1.6388416472647436 -1.7693988296417449 0.0 0.008
5 2020 and After Rural 0.0727709345083023 -13.917287633807826 -13.990058568316128 0.049 0.0
6 2020 and After Suburban 0.1438421422124148 -7.230488621220758 -7.374330763433172 0.022 0.0368
7 2020 and After Urban 0.2921678186765097 -2.869383763253332 -3.161551581929842 0.0 0.0

View File

@@ -0,0 +1,249 @@
#!/usr/bin/env python3
import pandas as pd
import numpy as np
from pathlib import Path
# Project layout: raw/intermediate inputs live under the analysis directory;
# all generated tables are written to OUT.
ROOT = Path('/home/dadams/Repos/colorado_spills')
ANALYSIS_DIR = ROOT / 'analysis' / 'new analysis Aug 2025'
OUT = ROOT / 'out_tables'
# Create the output directory up front so every build_* function can write freely.
OUT.mkdir(parents=True, exist_ok=True)
# Helpers
def read_csv_safely(path: Path) -> pd.DataFrame:
    """Read *path* as CSV, raising FileNotFoundError naming the full path if it is missing.

    EAFP replaces the original exists()-then-read pattern, which was both
    redundant (pandas raises FileNotFoundError itself) and race-prone.
    """
    try:
        return pd.read_csv(path)
    except FileNotFoundError:
        # Preserve the original contract: the exception message is the full path.
        raise FileNotFoundError(str(path)) from None
# -------------------------
# Table 1: Data and Sample Overview
# -------------------------
def build_table1():
    """Build Table 1: data and sample overview.

    Reads the raw/trimmed/removed sample files (when present) to report
    sample sizes, writes table1_data_overview.csv to OUT, and returns the
    assembled DataFrame.
    """
    rows = []
    # Attempt to load sample sizes and exclusions
    raw_path = ANALYSIS_DIR / 'spills_with_trim_flag.csv'
    trimmed_path = ANALYSIS_DIR / 'spills_trimmed.csv'
    removed_path = ANALYSIS_DIR / 'spills_trimmed_removed.csv'
    n_raw = read_csv_safely(raw_path).shape[0] if raw_path.exists() else None
    n_trimmed = read_csv_safely(trimmed_path).shape[0]
    # Prefer the explicit removed-rows file; otherwise derive the count by difference.
    if removed_path.exists():
        n_removed = read_csv_safely(removed_path).shape[0]
    else:
        n_removed = n_raw - n_trimmed if n_raw is not None else None
    rows.append({'Component': 'Primary data source', 'Details': 'COGCC Spill/Release Reports (Form 19), statewide Colorado'})
    rows.append({'Component': 'Units and variables', 'Details': 'Dates, volumes, locations, facility, county; outcome = report delay (days)'})
    # FIX: restore the en dash that was lost from the original literal ('20152022').
    rows.append({'Component': 'Temporal coverage', 'Details': '2015–2022 (pre-policy and post-2020 policy)'})
    rows.append({'Component': 'Geographic coverage', 'Details': 'Statewide; top counties: Weld, Garfield, Las Animas'})
    # Explicit None checks: a (pathological) zero-row raw file should still be reported.
    if n_raw is not None:
        rows.append({'Component': 'Initial sample size', 'Details': f'N = {n_raw:,} spill reports'})
    rows.append({'Component': 'Analytical sample size', 'Details': f'N = {n_trimmed:,} after IQR trimming'})
    if n_removed is not None:
        rows.append({'Component': 'Exclusions', 'Details': f'{n_removed:,} observations removed by IQR/outlier rules'})
    # Optional joins
    rows.append({'Component': 'Demographics join', 'Details': 'Census tract RUCA and demographics (ACS), spatial join to spill locations'})
    t1 = pd.DataFrame(rows)
    t1.to_csv(OUT / 'table1_data_overview.csv', index=False)
    return t1
# -------------------------
# Table 2: Variable Definitions and Summary Statistics
# -------------------------
def build_table2():
    """Build Table 2: variable definitions plus summary statistics.

    Writes three CSVs to OUT (definitions, summary statistics, and a
    combined definitions+summary view) and returns the combined DataFrame.
    """
    df = read_csv_safely(ANALYSIS_DIR / 'spills_trimmed.csv')
    # Ensure Period exists; derive it from the initial-report year when absent.
    if 'Period' not in df.columns:
        df['Period'] = pd.to_datetime(df['Initial Report Date']).dt.year.map(lambda y: 'Before 2020' if y < 2020 else '2020 and After')
    # Definitions (static metadata rows; Variable strings must match the summary rows below for the merge at the end)
    definitions = [
        {'Variable': 'Report Delay (days)', 'Definition': 'Days between discovery and initial report; outcome variable', 'Type': 'Continuous'},
        {'Variable': 'Spill Type', 'Definition': 'Historical vs Recent spill classification', 'Type': 'Categorical'},
        {'Variable': 'Policy Period', 'Definition': 'Before 2020 vs 2020 and After (SB 19-181 implementation)', 'Type': 'Categorical'},
        {'Variable': 'Rurality', 'Definition': 'Rural, Suburban, Urban (RUCA-based)', 'Type': 'Categorical'},
        {'Variable': 'Median Household Income', 'Definition': 'Census tract ACS median income (USD)', 'Type': 'Continuous (optional)'},
        {'Variable': 'Percent Poverty', 'Definition': 'Census tract percent below poverty line', 'Type': 'Continuous (optional)'}
    ]
    defs_df = pd.DataFrame(definitions)
    # Summary stats
    # Outcome: median [IQR] preferred due to skew; also mean (SD)
    y = df['report_delay_full_days'].dropna()
    outcome_row = {
        'Variable': 'Report Delay (days)',
        'N': len(y),
        'Mean (SD)': f"{y.mean():.2f} ({y.std():.2f})",
        'Median [IQR]': f"{y.median():.2f} [{y.quantile(0.25):.2f}, {y.quantile(0.75):.2f}]"
    }
    # Categorical stats
    def cat_summary(series: pd.Series, label: str):
        # One row per level; percentages are over the full series length (NaN included in denominator).
        counts = series.value_counts(dropna=False)
        rows = []
        for k, v in counts.items():
            pct = 100 * v / len(series)
            rows.append({'Variable': f"{label}: {k}", 'N': int(v), 'Percent': f"{pct:.2f}%"})
        return rows
    cat_rows = []
    # Spill Type and rurality are optional columns; Period is guaranteed by the derivation above.
    cat_rows += cat_summary(df['Spill Type'], 'Spill Type') if 'Spill Type' in df.columns else []
    cat_rows += cat_summary(df['Period'], 'Period')
    cat_rows += cat_summary(df['rurality'], 'Rurality') if 'rurality' in df.columns else []
    # Covariates: continuous demographic columns, summarized only when present.
    demo_rows = []
    for col, label in [
        ('median_household_income', 'Median Household Income'),
        ('percent_poverty', 'Percent Poverty'),
        ('percent_white', 'Percent White'),
        ('percent_hispanic', 'Percent Hispanic'),
        ('unemployment_rate', 'Unemployment Rate'),
    ]:
        if col in df.columns:
            s = df[col].dropna()
            demo_rows.append({'Variable': label, 'N': len(s), 'Mean (SD)': f"{s.mean():.2f} ({s.std():.2f})", 'Median [IQR]': f"{s.median():.2f} [{s.quantile(0.25):.2f}, {s.quantile(0.75):.2f}]"})
    summary_df = pd.DataFrame([outcome_row] + cat_rows + demo_rows)
    # Save
    defs_df.to_csv(OUT / 'table2_variable_definitions.csv', index=False)
    summary_df.to_csv(OUT / 'table2_summary_statistics.csv', index=False)
    # Combined view: left-merge keeps every definition row even when no summary row matches
    # (e.g. 'Spill Type' summary rows are per-level, so the definition row gets NaN stats).
    combined = defs_df.merge(summary_df[['Variable','N','Mean (SD)','Median [IQR]']], on='Variable', how='left')
    combined.to_csv(OUT / 'table2_definitions_and_summary.csv', index=False)
    return combined
# -------------------------
# Table 3: Core Model Results (GLM) — contrasts
# -------------------------
def build_table3():
    """Build Table 3: core Poisson GLM contrasts by period and rurality.

    Reads the combined contrast file, keeps the Poisson rows, formats the
    effect sizes, CIs, and p-values, writes table3_core_glm_contrasts.csv
    to OUT, and returns the formatted DataFrame.
    """
    contrasts = read_csv_safely(ANALYSIS_DIR / 'final_contrast_comparison_combined.csv')
    # The Poisson model is the primary specification.
    poisson = contrasts.loc[contrasts['model'] == 'poisson'].copy()

    def format_p(value):
        # Report very small p-values as '<0.001' per journal convention.
        return '<0.001' if value < 0.001 else f"{value:.3f}"

    poisson['Delta (days)'] = poisson['poisson_median']
    poisson['95% CI'] = [
        f"({lo:.2f}, {hi:.2f})"
        for lo, hi in zip(poisson['poisson_ci_lower'], poisson['poisson_ci_upper'])
    ]
    poisson['p'] = poisson['poisson_p'].apply(format_p)
    poisson['Delta (hours)'] = poisson['poisson_median_hours']
    poisson['Hours 95% CI'] = [
        f"({lo:.2f}, {hi:.2f})"
        for lo, hi in zip(poisson['poisson_ci_lower_hours'], poisson['poisson_ci_upper_hours'])
    ]
    # Fixed display order: pre-policy period first, then Rural < Suburban < Urban.
    poisson['po'] = poisson['Period'].map({'Before 2020': 0, '2020 and After': 1})
    poisson['ro'] = poisson['rurality'].map({'Rural': 0, 'Suburban': 1, 'Urban': 2})
    poisson = poisson.sort_values(['po', 'ro']).drop(columns=['po', 'ro'])
    result = poisson[['Period', 'rurality', 'Delta (days)', '95% CI', 'p',
                      'Delta (hours)', 'Hours 95% CI']]
    result.to_csv(OUT / 'table3_core_glm_contrasts.csv', index=False)
    return result
# -------------------------
# Table 4: Predicted Means or Contrasts (predicted reporting delays)
# -------------------------
def build_table4():
    """Build Table 4: predicted mean reporting delays by group.

    Infers the prediction/CI column names from the bootstrap output file;
    if the expected columns cannot be identified, exports the raw file for
    manual review instead. Returns the resulting DataFrame either way.
    """
    path = ANALYSIS_DIR / 'poisson_predicted_groups_boot_final.csv'
    df = read_csv_safely(path)
    # Expect columns: spill_type, Period, rurality, predicted_mean, ci_low, ci_high (names may vary)
    # Attempt to infer
    col_map = {}
    for c in df.columns:
        lc = c.lower()
        # NOTE(review): when several columns match a role, the last one wins —
        # assumes at most one candidate per role in the source file; confirm.
        if 'pred' in lc and ('mean' in lc or 'delay' in lc): col_map['pred'] = c
        if ('ci_low' in lc or 'lower' in lc): col_map['low'] = c
        if ('ci_high' in lc or 'upper' in lc): col_map['high'] = c
    # Required grouping fields: first matching spelling of each, in a fixed order
    # (spill type, period, rurality) so the positional renames below line up.
    gcols = []
    for name in ['spill_type','Spill Type']:
        if name in df.columns: gcols.append(name); break
    for name in ['Period']:
        if name in df.columns: gcols.append(name); break
    for name in ['rurality','Rurality']:
        if name in df.columns: gcols.append(name); break
    if len(gcols) < 3 or not {'pred','low','high'} <= set(col_map):
        # Fall back to exporting raw for manual review
        df.to_csv(OUT / 'table4_predicted_means_raw.csv', index=False)
        return df
    tidy = df[gcols + [col_map['pred'], col_map['low'], col_map['high']]].copy()
    tidy = tidy.rename(columns={gcols[0]:'Spill Type', gcols[1]:'Period', gcols[2]:'Rurality', col_map['pred']:'Predicted (days)', col_map['low']:'Lower', col_map['high']:'Upper'})
    tidy['95% CI'] = tidy.apply(lambda r: f"({r['Lower']:.2f}, {r['Upper']:.2f})", axis=1)
    out = tidy[['Spill Type','Period','Rurality','Predicted (days)','95% CI']].copy()
    # Sort: spill type alphabetically, then pre-policy first, then Rural < Suburban < Urban.
    order_period = {'Before 2020': 0, '2020 and After': 1}
    order_r = {'Rural': 0, 'Suburban': 1, 'Urban': 2}
    out['po'] = out['Period'].map(order_period)
    out['ro'] = out['Rurality'].map(order_r)
    out = out.sort_values(['Spill Type','po','ro']).drop(columns=['po','ro'])
    out.to_csv(OUT / 'table4_predicted_means.csv', index=False)
    return out
# -------------------------
# Table 5: ITS or Robustness Checks
# -------------------------
def build_table5():
    """Build Table 5: ITS summary and model-robustness exports.

    Best-effort: each section is emitted only when its source CSV exists.
    Writes an overview CSV listing what was produced and returns it.
    """
    rows = []
    # ITS coefficients summary if available
    its_path = ANALYSIS_DIR / 'its_summary_with_boot.csv'
    if its_path.exists():
        its = pd.read_csv(its_path)
        # Expect columns: model/group identifiers + level change, slope change, CIs
        # Try to infer
        candidates = {
            'level': [c for c in its.columns if 'level' in c.lower() and ('change' in c.lower() or 'beta' in c.lower() or 'immediate' in c.lower())],
            'slope': [c for c in its.columns if 'slope' in c.lower() and ('change' in c.lower() or 'beta' in c.lower())],
            'low': [c for c in its.columns if 'low' in c.lower()],
            'high': [c for c in its.columns if 'high' in c.lower()],
        }
        # Build a minimal summary per group if possible
        key_cols = []
        for name in ['spill_type','Spill Type']:
            if name in its.columns: key_cols.append(name); break
        for name in ['rurality','Rurality']:
            if name in its.columns: key_cols.append(name); break
        if key_cols and candidates['level'] and candidates['slope']:
            # NOTE(review): the rename below indexes key_cols[1]; if the source
            # file has only one of the two key columns this raises IndexError —
            # confirm its_summary_with_boot.csv always carries both.
            summary = its[key_cols + [candidates['level'][0], candidates['slope'][0]]].copy()
            summary = summary.rename(columns={key_cols[0]:'Spill Type', key_cols[1]:'Rurality', candidates['level'][0]:'Level Change (days)', candidates['slope'][0]:'Slope Change (days/month)'})
            summary.to_csv(OUT / 'table5_its_summary.csv', index=False)
            rows.append({'Section':'ITS Summary','Details':'Saved table5_its_summary.csv with level and slope changes by group'})
    # Robustness: Poisson vs NB differences from appendix_table2_model_comparison.csv
    cmp_path = ANALYSIS_DIR / 'appendix_table2_model_comparison.csv'
    if cmp_path.exists():
        cmp = pd.read_csv(cmp_path)
        # Expect columns: Period, Rurality, Poisson Δ (days), NB Δ (days), Difference (NB - Poisson), Poisson p, NB p
        keep_cols = [c for c in cmp.columns if 'Period' in c or 'Rurality' in c or 'Difference' in c or 'Poisson' in c or 'NB' in c]
        cmp[keep_cols].to_csv(OUT / 'table5_robustness_poisson_vs_nb.csv', index=False)
        rows.append({'Section':'Model Robustness','Details':'Saved table5_robustness_poisson_vs_nb.csv comparing Poisson vs NB'})
    # Always emit an overview, even when neither source file was found.
    t5 = pd.DataFrame(rows) if rows else pd.DataFrame([{'Section':'Note','Details':'No ITS or robustness sources found'}])
    t5.to_csv(OUT / 'table5_overview.csv', index=False)
    return t5
def main():
    """Build all five manuscript tables and print where the key outputs went."""
    for builder in (build_table1, build_table2, build_table3,
                    build_table4, build_table5):
        builder()
    print('Built tables:')
    for filename in ('table1_data_overview.csv',
                     'table2_definitions_and_summary.csv',
                     'table3_core_glm_contrasts.csv',
                     'table4_predicted_means.csv',
                     'table5_overview.csv'):
        print(' -', OUT / filename)
if __name__ == '__main__':
    main()

258
scripts/make_spill_map.py Normal file
View File

@@ -0,0 +1,258 @@
#!/usr/bin/env python3
"""
Create a production-ready map of Colorado oil & gas spill locations for journal use.
Outputs:
- out_tables/spill_map_points.png (vector-like high-res)
- out_tables/spill_map_hex.pdf (publication PDF)
"""
import os
from pathlib import Path
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import matplotlib.pyplot as plt
from matplotlib_scalebar.scalebar import ScaleBar
import matplotlib.patheffects as pe
from sqlalchemy import create_engine
# Project layout: spill CSVs live under the analysis directory; rendered maps
# go to OUT alongside the generated tables.
ROOT = Path('/home/dadams/Repos/colorado_spills')
ANALYSIS_DIR = ROOT / 'analysis' / 'new analysis Aug 2025'
OUT = ROOT / 'out_tables'
# Create the output directory up front so the map writers can save freely.
OUT.mkdir(parents=True, exist_ok=True)
# PostgreSQL database holding the spills/demographics tables.
DB_NAME = 'colorado_spills'
def get_engine():
    """Build a SQLAlchemy engine from DB_* environment variables.

    Returns None when any credential is missing or engine creation fails,
    letting callers fall back to CSV-based loading.
    """
    credentials = [os.getenv(k) for k in ('DB_USER', 'DB_PASSWORD', 'DB_HOST', 'DB_PORT')]
    if not all(credentials):
        return None
    user, password, host, port = credentials
    try:
        return create_engine(
            f'postgresql+psycopg2://{user}:{password}@{host}:{port}/{DB_NAME}'
        )
    except Exception:
        # Best-effort: any engine-construction failure means "no database".
        return None
def load_spills() -> gpd.GeoDataFrame:
    """Load spills as a GeoDataFrame, preferring PostGIS, else CSV fallback.

    Tries, in order: (1) several geometry-column variants of the PostGIS
    table, (2) a plain SQL read with lat/lon point construction, and
    (3) the trimmed CSV on disk. Raises ValueError only if the CSV
    fallback lacks Latitude/Longitude columns.
    """
    engine = get_engine()
    if engine is not None:
        # Try multiple geometry options from the PostGIS table
        try_statements = [
            ("SELECT *, geom FROM spills_with_demographics_geog", 'geom'),
            ("SELECT *, geometry FROM spills_with_demographics_geog", 'geometry'),
            ("SELECT *, CAST(geog AS geometry) AS geom FROM spills_with_demographics_geog", 'geom'),
            ("SELECT *, ST_SetSRID(CAST(geog AS geometry), 4326) AS geom FROM spills_with_demographics_geog", 'geom'),
        ]
        for sql, geom_col in try_statements:
            try:
                gdf = gpd.read_postgis(sql, engine, geom_col=geom_col)
                # Ensure CRS; assume WGS84 when the table carries none.
                if gdf.crs is None:
                    gdf.set_crs('EPSG:4326', inplace=True)
                return gdf
            except Exception:
                # Best-effort: fall through to the next geometry variant.
                pass
        # Fallback to pandas + lat/lon if present in the DB table
        try:
            df = pd.read_sql_table('spills_with_demographics_geog', engine)
            if {'Latitude', 'Longitude'}.issubset(df.columns):
                return gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df['Longitude'], df['Latitude']), crs='EPSG:4326')
        except Exception:
            pass
    # CSV fallback
    csv = ANALYSIS_DIR / 'spills_trimmed.csv'
    df = pd.read_csv(csv)
    if not {'Latitude', 'Longitude'}.issubset(df.columns):
        raise ValueError('Expected Latitude/Longitude columns in spills_trimmed.csv')
    gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df['Longitude'], df['Latitude']), crs='EPSG:4326')
    return gdf
def try_query_boundary(engine) -> gpd.GeoDataFrame | None:
    """Attempt to find a Colorado boundary polygon from common table names/columns in PostGIS.

    Returns a single-column GeoDataFrame in EPSG:4326, or None if no
    candidate table yields a non-empty result.
    """
    # (table, where-clause) candidates. The geometry expression is not part of
    # the candidate tuple — every expression in alt_geoms is tried for every
    # table (the original carried an unused per-candidate geometry field).
    candidates = [
        ("cb_2018_us_state_500k", "stusps='CO' OR name ILIKE 'Colorado' OR statefp='08'"),
        ("cb_2019_us_state_500k", "stusps='CO' OR name ILIKE 'Colorado' OR statefp='08'"),
        ("tl_2020_us_state", "stusps='CO' OR name ILIKE 'Colorado' OR statefp='08'"),
        ("us_states", "stusps='CO' OR name ILIKE 'Colorado' OR statefp='08'"),
        ("states", "stusps='CO' OR name ILIKE 'Colorado' OR statefp='08'"),
        ("colorado_boundary", "1=1"),
        ("co_boundary", "1=1"),
    ]
    alt_geoms = ["geom", "geometry", "ST_SetSRID(CAST(geog AS geometry), 4326)"]
    for table, where_clause in candidates:
        for gexpr in alt_geoms:
            sql = f"SELECT {gexpr} AS geom FROM {table} WHERE {where_clause}"
            try:
                gdf = gpd.read_postgis(sql, engine, geom_col='geom')
                if not gdf.empty:
                    # Assume WGS84 when the table carries no CRS.
                    if gdf.crs is None:
                        gdf.set_crs('EPSG:4326', inplace=True)
                    return gdf[["geom"]].to_crs(4326)
            except Exception:
                # Best-effort probing: missing tables/columns are expected.
                continue
    return None
def try_query_counties(engine) -> gpd.GeoDataFrame | None:
    """Attempt to fetch Colorado counties from PostGIS for boundary overlays.

    Returns a single-column GeoDataFrame in EPSG:4326, or None if no
    candidate table yields a non-empty result.
    """
    # (table, where-clause) candidates. As in try_query_boundary, the geometry
    # expression is tried separately via alt_geoms (the original carried an
    # unused per-candidate geometry field).
    candidates = [
        ("cb_2018_us_county_500k", "statefp='08' OR stusps='CO'"),
        ("cb_2019_us_county_500k", "statefp='08' OR stusps='CO'"),
        ("tl_2020_us_county", "statefp='08' OR stusps='CO'"),
        ("counties", "statefp='08' OR stusps='CO' OR state='CO' OR state_name ILIKE 'Colorado'"),
        ("co_counties", "1=1"),
        ("colorado_counties", "1=1"),
    ]
    alt_geoms = ["geom", "geometry", "ST_SetSRID(CAST(geog AS geometry), 4326)"]
    for table, where_clause in candidates:
        for gexpr in alt_geoms:
            sql = f"SELECT {gexpr} AS geom FROM {table} WHERE {where_clause}"
            try:
                gdf = gpd.read_postgis(sql, engine, geom_col='geom')
                if not gdf.empty:
                    # Assume WGS84 when the table carries no CRS.
                    if gdf.crs is None:
                        gdf.set_crs('EPSG:4326', inplace=True)
                    return gdf[["geom"]].to_crs(4326)
            except Exception:
                # Best-effort probing: missing tables/columns are expected.
                continue
    return None
def get_colorado_boundary(spills_gdf: gpd.GeoDataFrame) -> tuple[gpd.GeoDataFrame, gpd.GeoDataFrame | None]:
    """Return Colorado boundary and optional counties. Prefer PostGIS; fallback to convex-hull from points."""
    engine = get_engine()
    counties = None
    if engine is not None:
        boundary = try_query_boundary(engine)
        if boundary is not None and not boundary.empty:
            # Counties are optional extras; only fetched once a boundary is found.
            counties = try_query_counties(engine)
            return boundary, counties
    # Fallback: convex hull of points, buffered by 30,000 EPSG:3857 units
    # (nominally ~30 km; Mercator distortion makes this approximate) so edge
    # points are not clipped. Counties stay None on this path.
    boundary_geom = spills_gdf.to_crs(3857).unary_union.convex_hull.buffer(30_000)
    boundary = gpd.GeoDataFrame(geometry=[boundary_geom], crs='EPSG:3857').to_crs(4326)
    return boundary, counties
def add_north_arrow(ax, x=0.95, y=0.1, size=18):
    """Draw a north arrow labelled 'N' at axes-fraction position (x, y)."""
    # White halo keeps the arrow legible over any map background.
    halo = [pe.withStroke(linewidth=3, foreground='white')]
    arrow_style = dict(facecolor='black', width=5, headwidth=15,
                       headlength=18, shrink=0.05, path_effects=halo)
    ax.annotate('N', xy=(x, y + 0.06), xytext=(x, y), xycoords='axes fraction',
                ha='center', va='bottom', fontsize=size, fontweight='bold',
                color='black', path_effects=halo, arrowprops=arrow_style)
def style_axes(ax):
    """Strip ticks, labels, and frame so the axes read as a clean map panel."""
    ax.set_axis_off()
    for edge in ax.spines.values():
        edge.set_visible(False)
def map_points(spills: gpd.GeoDataFrame, boundary: gpd.GeoDataFrame, counties: gpd.GeoDataFrame | None):
    """Render spill locations as points over the state boundary.

    Saves out_tables/spill_map_points.png and returns its path.
    """
    # Project to Web Mercator for plotting (metres, so the scale bar is valid)
    spills_3857 = spills.to_crs(3857)
    boundary_3857 = boundary.to_crs(3857)
    # Figure aesthetics
    fig, ax = plt.subplots(figsize=(7, 9), dpi=300, constrained_layout=True)
    boundary_3857.plot(ax=ax, color='#f5f5f5', edgecolor='#888888', linewidth=0.8)
    if counties is not None and not counties.empty:
        counties_3857 = counties.to_crs(3857)
        counties_3857.boundary.plot(ax=ax, color='#aaaaaa', linewidth=0.3)
    # Plot points with alpha and small size to avoid overplotting
    spills_3857.plot(ax=ax, markersize=1.2, alpha=0.35, color='#2c7fb8')
    # Tighten to boundary
    minx, miny, maxx, maxy = boundary_3857.total_bounds
    ax.set_xlim(minx, maxx)
    ax.set_ylim(miny, maxy)
    # Add scalebar (requires axis in meters)
    ax.add_artist(ScaleBar(1, units='m', location='lower left', box_alpha=0.7))
    add_north_arrow(ax)
    style_axes(ax)
    # Dynamic title from available year fields
    title = 'Oil & Gas Spill Reports in Colorado'
    year_cols = [c for c in spills.columns if c.lower() in ('year', 'yr')]
    if year_cols:
        try:
            yrs = spills[year_cols[0]].dropna().astype(int)
            if not yrs.empty:
                # FIX: restore the en dash between years (was '{min}{max}').
                title = f"Oil & Gas Spill Reports in Colorado, {yrs.min()}–{yrs.max()}"
        except Exception:
            # Non-numeric year column: keep the generic title.
            pass
    ax.set_title(title, fontsize=12, pad=10)
    out_png = OUT / 'spill_map_points.png'
    fig.savefig(out_png, dpi=300)
    plt.close(fig)
    return out_png
def map_hex(spills: gpd.GeoDataFrame, boundary: gpd.GeoDataFrame, counties: gpd.GeoDataFrame | None):
    """Render spill density as a hexbin map over the state boundary.

    Saves out_tables/spill_map_hex.pdf and returns its path.
    """
    # Project to Web Mercator (metres, so the scale bar is valid)
    spills_3857 = spills.to_crs(3857)
    boundary_3857 = boundary.to_crs(3857)
    # Build hexbin density using matplotlib hexbin on data coordinates
    x = spills_3857.geometry.x.values
    y = spills_3857.geometry.y.values
    fig, ax = plt.subplots(figsize=(7, 9), dpi=300, constrained_layout=True)
    boundary_3857.plot(ax=ax, color='#f5f5f5', edgecolor='#888888', linewidth=0.8)
    if counties is not None and not counties.empty:
        counties_3857 = counties.to_crs(3857)
        counties_3857.boundary.plot(ax=ax, color='#aaaaaa', linewidth=0.3)
    # Use hexbin; gridsize tuned to CO scale; mincnt to suppress isolated cells
    hb = ax.hexbin(x, y, gridsize=70, mincnt=3, cmap='viridis', alpha=0.9)
    cb = fig.colorbar(hb, ax=ax, fraction=0.028, pad=0.01)
    cb.set_label('Spill count')
    # Scale bar and north arrow
    ax.add_artist(ScaleBar(1, units='m', location='lower left', box_alpha=0.7))
    add_north_arrow(ax)
    # Tighten to boundary
    minx, miny, maxx, maxy = boundary_3857.total_bounds
    ax.set_xlim(minx, maxx)
    ax.set_ylim(miny, maxy)
    style_axes(ax)
    # Dynamic title from available year fields
    title = 'Spill Density (Hexbin)'
    year_cols = [c for c in spills.columns if c.lower() in ('year', 'yr')]
    if year_cols:
        try:
            yrs = spills[year_cols[0]].dropna().astype(int)
            if not yrs.empty:
                # FIX: restore the en dash between years (was '{min}{max}').
                title = f"Spill Density (Hexbin), {yrs.min()}–{yrs.max()}"
        except Exception:
            # Non-numeric year column: keep the generic title.
            pass
    ax.set_title(title, fontsize=12, pad=10)
    out_pdf = OUT / 'spill_map_hex.pdf'
    fig.savefig(out_pdf)
    plt.close(fig)
    return out_pdf
def main():
    """Load spills, resolve the state boundary, and render both map styles."""
    spills = load_spills()
    boundary, counties = get_colorado_boundary(spills)
    for saved in (map_points(spills, boundary, counties),
                  map_hex(spills, boundary, counties)):
        print(f"Saved: {saved}")
if __name__ == '__main__':
    main()