tables and maps

This commit is contained in:
2025-11-01 15:10:32 -07:00
parent abff7981c2
commit dd40446f8c
12 changed files with 573 additions and 0 deletions

View File

@@ -0,0 +1,249 @@
#!/usr/bin/env python3
import pandas as pd
import numpy as np
from pathlib import Path
# Project layout: repo root, the August-2025 analysis inputs, and the
# directory all generated tables are written to (created here if absent).
ROOT = Path('/home/dadams/Repos/colorado_spills')
ANALYSIS_DIR = ROOT / 'analysis' / 'new analysis Aug 2025'
OUT = ROOT / 'out_tables'
OUT.mkdir(parents=True, exist_ok=True)
# Helpers
def read_csv_safely(path: Path) -> pd.DataFrame:
    """Read *path* as a CSV, raising ``FileNotFoundError(str(path))`` if missing.

    EAFP: pandas already raises FileNotFoundError for a missing file, so we
    delegate to the read instead of racing a separate exists() check; the
    exception is re-raised with the plain path string to keep the original
    error-message contract.
    """
    try:
        return pd.read_csv(path)
    except FileNotFoundError:
        raise FileNotFoundError(str(path)) from None
# -------------------------
# Table 1: Data and Sample Overview
# -------------------------
def build_table1():
    """Build Table 1 (data and sample overview) and write it to out_tables.

    Sample sizes are pulled from the trimmed/raw CSVs when present; the
    descriptive rows are hard-coded study metadata. Returns the table as a
    DataFrame and writes ``table1_data_overview.csv``.
    """
    rows = []
    # Attempt to load sample sizes and exclusions.
    raw_path = ANALYSIS_DIR / 'spills_with_trim_flag.csv'
    trimmed_path = ANALYSIS_DIR / 'spills_trimmed.csv'
    removed_path = ANALYSIS_DIR / 'spills_trimmed_removed.csv'
    n_raw = read_csv_safely(raw_path).shape[0] if raw_path.exists() else None
    n_trimmed = read_csv_safely(trimmed_path).shape[0]
    if removed_path.exists():
        n_removed = read_csv_safely(removed_path).shape[0]
    else:
        # BUG FIX: use `is not None` — the old truthiness test treated an
        # (unlikely) zero-row raw file the same as a missing one.
        n_removed = n_raw - n_trimmed if n_raw is not None else None
    rows.append({'Component': 'Primary data source', 'Details': 'COGCC Spill/Release Reports (Form 19), statewide Colorado'})
    rows.append({'Component': 'Units and variables', 'Details': 'Dates, volumes, locations, facility, county; outcome = report delay (days)'})
    # NOTE: restores the en dash apparently lost to an encoding pass ("20152022").
    rows.append({'Component': 'Temporal coverage', 'Details': '2015–2022 (pre-policy and post-2020 policy)'})
    rows.append({'Component': 'Geographic coverage', 'Details': 'Statewide; top counties: Weld, Garfield, Las Animas'})
    if n_raw is not None:
        rows.append({'Component': 'Initial sample size', 'Details': f'N = {n_raw:,} spill reports'})
    rows.append({'Component': 'Analytical sample size', 'Details': f'N = {n_trimmed:,} after IQR trimming'})
    if n_removed is not None:
        rows.append({'Component': 'Exclusions', 'Details': f'{n_removed:,} observations removed by IQR/outlier rules'})
    # Optional joins (documented regardless of whether the join ran).
    rows.append({'Component': 'Demographics join', 'Details': 'Census tract RUCA and demographics (ACS), spatial join to spill locations'})
    t1 = pd.DataFrame(rows)
    t1.to_csv(OUT / 'table1_data_overview.csv', index=False)
    return t1
# -------------------------
# Table 2: Variable Definitions and Summary Statistics
# -------------------------
def build_table2():
    """Build Table 2: variable definitions plus summary statistics.

    Writes three CSVs to out_tables (definitions, summary statistics, and a
    merged view of the two) and returns the merged DataFrame.
    """
    df = read_csv_safely(ANALYSIS_DIR / 'spills_trimmed.csv')
    # Ensure Period exists; derive it from the initial report year if missing.
    if 'Period' not in df.columns:
        df['Period'] = pd.to_datetime(df['Initial Report Date']).dt.year.map(lambda y: 'Before 2020' if y < 2020 else '2020 and After')
    # Static variable definitions (documentation rows, not computed from data).
    definitions = [
        {'Variable': 'Report Delay (days)', 'Definition': 'Days between discovery and initial report; outcome variable', 'Type': 'Continuous'},
        {'Variable': 'Spill Type', 'Definition': 'Historical vs Recent spill classification', 'Type': 'Categorical'},
        {'Variable': 'Policy Period', 'Definition': 'Before 2020 vs 2020 and After (SB 19-181 implementation)', 'Type': 'Categorical'},
        {'Variable': 'Rurality', 'Definition': 'Rural, Suburban, Urban (RUCA-based)', 'Type': 'Categorical'},
        {'Variable': 'Median Household Income', 'Definition': 'Census tract ACS median income (USD)', 'Type': 'Continuous (optional)'},
        {'Variable': 'Percent Poverty', 'Definition': 'Census tract percent below poverty line', 'Type': 'Continuous (optional)'}
    ]
    defs_df = pd.DataFrame(definitions)
    # Summary stats.
    # Outcome: median [IQR] preferred due to skew; mean (SD) also reported.
    y = df['report_delay_full_days'].dropna()
    outcome_row = {
        'Variable': 'Report Delay (days)',
        'N': len(y),
        'Mean (SD)': f"{y.mean():.2f} ({y.std():.2f})",
        'Median [IQR]': f"{y.median():.2f} [{y.quantile(0.25):.2f}, {y.quantile(0.75):.2f}]"
    }
    # Categorical stats: one row per level with count and percent.
    def cat_summary(series: pd.Series, label: str):
        # dropna=False keeps a NaN level; percentages use the full series
        # length as denominator, so NaN rows are counted.
        counts = series.value_counts(dropna=False)
        rows = []
        for k, v in counts.items():
            pct = 100 * v / len(series)
            rows.append({'Variable': f"{label}: {k}", 'N': int(v), 'Percent': f"{pct:.2f}%"})
        return rows
    cat_rows = []
    # Spill Type and rurality are optional columns; Period is guaranteed above.
    cat_rows += cat_summary(df['Spill Type'], 'Spill Type') if 'Spill Type' in df.columns else []
    cat_rows += cat_summary(df['Period'], 'Period')
    cat_rows += cat_summary(df['rurality'], 'Rurality') if 'rurality' in df.columns else []
    # Continuous covariates from the demographics join (present only if joined).
    demo_rows = []
    for col, label in [
        ('median_household_income', 'Median Household Income'),
        ('percent_poverty', 'Percent Poverty'),
        ('percent_white', 'Percent White'),
        ('percent_hispanic', 'Percent Hispanic'),
        ('unemployment_rate', 'Unemployment Rate'),
    ]:
        if col in df.columns:
            s = df[col].dropna()
            demo_rows.append({'Variable': label, 'N': len(s), 'Mean (SD)': f"{s.mean():.2f} ({s.std():.2f})", 'Median [IQR]': f"{s.median():.2f} [{s.quantile(0.25):.2f}, {s.quantile(0.75):.2f}]"})
    summary_df = pd.DataFrame([outcome_row] + cat_rows + demo_rows)
    # Save the two component tables.
    defs_df.to_csv(OUT / 'table2_variable_definitions.csv', index=False)
    summary_df.to_csv(OUT / 'table2_summary_statistics.csv', index=False)
    # Combined view: definitions left-joined with the computed summary columns
    # (only rows whose 'Variable' labels match gain statistics).
    combined = defs_df.merge(summary_df[['Variable','N','Mean (SD)','Median [IQR]']], on='Variable', how='left')
    combined.to_csv(OUT / 'table2_definitions_and_summary.csv', index=False)
    return combined
# -------------------------
# Table 3: Core Model Results (GLM) — contrasts
# -------------------------
def build_table3():
    """Export Table 3: core Poisson GLM contrasts, formatted for publication."""
    combined = read_csv_safely(ANALYSIS_DIR / 'final_contrast_comparison_combined.csv')
    # Poisson is the primary model; drop the others.
    poisson = combined[combined['model'] == 'poisson'].copy()
    # Display columns: point estimates, CI strings, and formatted p-values
    # in both days and hours.
    poisson['Delta (days)'] = poisson['poisson_median']
    poisson['95% CI'] = poisson.apply(
        lambda r: f"({r['poisson_ci_lower']:.2f}, {r['poisson_ci_upper']:.2f})", axis=1)
    poisson['p'] = poisson['poisson_p'].apply(
        lambda x: '<0.001' if x < 0.001 else f"{x:.3f}")
    poisson['Delta (hours)'] = poisson['poisson_median_hours']
    poisson['Hours 95% CI'] = poisson.apply(
        lambda r: f"({r['poisson_ci_lower_hours']:.2f}, {r['poisson_ci_upper_hours']:.2f})", axis=1)
    # Order rows Before->After, Rural->Suburban->Urban via temporary sort keys.
    poisson['po'] = poisson['Period'].map({'Before 2020': 0, '2020 and After': 1})
    poisson['ro'] = poisson['rurality'].map({'Rural': 0, 'Suburban': 1, 'Urban': 2})
    poisson = poisson.sort_values(['po', 'ro']).drop(columns=['po', 'ro'])
    out = poisson[['Period', 'rurality', 'Delta (days)', '95% CI', 'p',
                   'Delta (hours)', 'Hours 95% CI']]
    out.to_csv(OUT / 'table3_core_glm_contrasts.csv', index=False)
    return out
# -------------------------
# Table 4: Predicted Means or Contrasts (predicted reporting delays)
# -------------------------
def build_table4():
    """Build Table 4: bootstrapped predicted mean reporting delays by group.

    Column names in the source CSV vary between pipeline runs, so the
    prediction/CI columns are inferred by substring matching. If inference
    fails, the raw file is exported for manual review and returned unchanged.
    """
    path = ANALYSIS_DIR / 'poisson_predicted_groups_boot_final.csv'
    df = read_csv_safely(path)
    # Expect columns: spill_type, Period, rurality, predicted_mean, ci_low, ci_high (names may vary)
    # Attempt to infer; NOTE: the last matching column wins for each key.
    col_map = {}
    for c in df.columns:
        lc = c.lower()
        if 'pred' in lc and ('mean' in lc or 'delay' in lc): col_map['pred'] = c
        if ('ci_low' in lc or 'lower' in lc): col_map['low'] = c
        if ('ci_high' in lc or 'upper' in lc): col_map['high'] = c
    # Required grouping fields — accept either snake_case or title-case names.
    gcols = []
    for name in ['spill_type','Spill Type']:
        if name in df.columns: gcols.append(name); break
    for name in ['Period']:
        if name in df.columns: gcols.append(name); break
    for name in ['rurality','Rurality']:
        if name in df.columns: gcols.append(name); break
    if len(gcols) < 3 or not {'pred','low','high'} <= set(col_map):
        # Fall back to exporting raw for manual review
        df.to_csv(OUT / 'table4_predicted_means_raw.csv', index=False)
        return df
    # Tidy frame: standardized column names plus a formatted CI string.
    tidy = df[gcols + [col_map['pred'], col_map['low'], col_map['high']]].copy()
    tidy = tidy.rename(columns={gcols[0]:'Spill Type', gcols[1]:'Period', gcols[2]:'Rurality', col_map['pred']:'Predicted (days)', col_map['low']:'Lower', col_map['high']:'Upper'})
    tidy['95% CI'] = tidy.apply(lambda r: f"({r['Lower']:.2f}, {r['Upper']:.2f})", axis=1)
    out = tidy[['Spill Type','Period','Rurality','Predicted (days)','95% CI']].copy()
    # Sort: within Spill Type, Before 2020 -> 2020 and After, then
    # Rural -> Suburban -> Urban, via temporary sort-key columns.
    order_period = {'Before 2020': 0, '2020 and After': 1}
    order_r = {'Rural': 0, 'Suburban': 1, 'Urban': 2}
    out['po'] = out['Period'].map(order_period)
    out['ro'] = out['Rurality'].map(order_r)
    out = out.sort_values(['Spill Type','po','ro']).drop(columns=['po','ro'])
    out.to_csv(OUT / 'table4_predicted_means.csv', index=False)
    return out
# -------------------------
# Table 5: ITS or Robustness Checks
# -------------------------
def build_table5():
    """Build Table 5: ITS results and model-robustness checks.

    Exports whichever of the ITS bootstrap summary and Poisson-vs-NB
    comparison source files exist, then writes an overview table describing
    what was saved. Returns the overview DataFrame.
    """
    rows = []
    # ITS coefficients summary, if the bootstrap output exists.
    its_path = ANALYSIS_DIR / 'its_summary_with_boot.csv'
    if its_path.exists():
        its = pd.read_csv(its_path)
        # Column names vary between runs; infer level/slope/CI columns by
        # substring matching instead of hard-coding them.
        candidates = {
            'level': [c for c in its.columns if 'level' in c.lower() and ('change' in c.lower() or 'beta' in c.lower() or 'immediate' in c.lower())],
            'slope': [c for c in its.columns if 'slope' in c.lower() and ('change' in c.lower() or 'beta' in c.lower())],
            'low': [c for c in its.columns if 'low' in c.lower()],
            'high': [c for c in its.columns if 'high' in c.lower()],
        }
        # Grouping keys: first matching spill-type column, then rurality.
        key_cols = []
        for name in ['spill_type', 'Spill Type']:
            if name in its.columns:
                key_cols.append(name)
                break
        for name in ['rurality', 'Rurality']:
            if name in its.columns:
                key_cols.append(name)
                break
        # BUG FIX: require BOTH key columns — the rename below addresses
        # key_cols[1], which raised IndexError when only one was found.
        if len(key_cols) == 2 and candidates['level'] and candidates['slope']:
            summary = its[key_cols + [candidates['level'][0], candidates['slope'][0]]].copy()
            summary = summary.rename(columns={key_cols[0]: 'Spill Type', key_cols[1]: 'Rurality', candidates['level'][0]: 'Level Change (days)', candidates['slope'][0]: 'Slope Change (days/month)'})
            summary.to_csv(OUT / 'table5_its_summary.csv', index=False)
            rows.append({'Section': 'ITS Summary', 'Details': 'Saved table5_its_summary.csv with level and slope changes by group'})
    # Robustness: Poisson vs NB comparison, if the appendix table exists.
    cmp_path = ANALYSIS_DIR / 'appendix_table2_model_comparison.csv'
    if cmp_path.exists():
        cmp = pd.read_csv(cmp_path)
        # Keep identifier columns and anything Poisson/NB/Difference related.
        keep_cols = [c for c in cmp.columns if 'Period' in c or 'Rurality' in c or 'Difference' in c or 'Poisson' in c or 'NB' in c]
        cmp[keep_cols].to_csv(OUT / 'table5_robustness_poisson_vs_nb.csv', index=False)
        rows.append({'Section': 'Model Robustness', 'Details': 'Saved table5_robustness_poisson_vs_nb.csv comparing Poisson vs NB'})
    # Overview row(s); placeholder note when neither source was found.
    t5 = pd.DataFrame(rows) if rows else pd.DataFrame([{'Section': 'Note', 'Details': 'No ITS or robustness sources found'}])
    t5.to_csv(OUT / 'table5_overview.csv', index=False)
    return t5
def main():
    """Build all five tables and report where they were written."""
    build_table1()
    build_table2()
    build_table3()
    build_table4()
    build_table5()
    # One representative output file per table.
    outputs = (
        'table1_data_overview.csv',
        'table2_definitions_and_summary.csv',
        'table3_core_glm_contrasts.csv',
        'table4_predicted_means.csv',
        'table5_overview.csv',
    )
    print('Built tables:')
    for name in outputs:
        print(' -', OUT / name)
if __name__ == '__main__':
    main()

258
scripts/make_spill_map.py Normal file
View File

@@ -0,0 +1,258 @@
#!/usr/bin/env python3
"""
Create a production-ready map of Colorado oil & gas spill locations for journal use.
Outputs:
- out_tables/spill_map_points.png (vector-like high-res)
- out_tables/spill_map_hex.pdf (publication PDF)
"""
import os
from pathlib import Path
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import matplotlib.pyplot as plt
from matplotlib_scalebar.scalebar import ScaleBar
import matplotlib.patheffects as pe
from sqlalchemy import create_engine
# Project layout: repo root, analysis inputs, and the output directory
# (created on import if absent).
ROOT = Path('/home/dadams/Repos/colorado_spills')
ANALYSIS_DIR = ROOT / 'analysis' / 'new analysis Aug 2025'
OUT = ROOT / 'out_tables'
OUT.mkdir(parents=True, exist_ok=True)
DB_NAME = 'colorado_spills'  # PostGIS database holding the spills tables
def get_engine():
    """Create a SQLAlchemy engine for the spills database from env config.

    Reads DB_USER/DB_PASSWORD/DB_HOST/DB_PORT. Returns None when any is
    missing or engine creation fails, so callers can fall back to CSV data.
    """
    from urllib.parse import quote_plus  # local import: only needed here
    user = os.getenv('DB_USER')
    password = os.getenv('DB_PASSWORD')
    host = os.getenv('DB_HOST')
    port = os.getenv('DB_PORT')
    if not all([user, password, host, port]):
        return None
    # BUG FIX: URL-encode credentials so passwords containing '@', ':' or
    # '/' do not corrupt the connection URL.
    url = f'postgresql+psycopg2://{quote_plus(user)}:{quote_plus(password)}@{host}:{port}/{DB_NAME}'
    try:
        return create_engine(url)
    except Exception:
        return None
def load_spills() -> gpd.GeoDataFrame:
    """Load spills as a GeoDataFrame, preferring PostGIS, else CSV fallback.

    Order of attempts:
      1. PostGIS reads of spills_with_demographics_geog with several
         plausible geometry column expressions;
      2. plain table read + point construction from Latitude/Longitude;
      3. the trimmed CSV with Latitude/Longitude columns.
    Raises ValueError if the CSV fallback lacks coordinate columns.
    """
    engine = get_engine()
    if engine is not None:
        # Try multiple geometry options from the PostGIS table; the schema
        # may expose geom, geometry, or a geography column needing a cast.
        try_statements = [
            ("SELECT *, geom FROM spills_with_demographics_geog", 'geom'),
            ("SELECT *, geometry FROM spills_with_demographics_geog", 'geometry'),
            ("SELECT *, CAST(geog AS geometry) AS geom FROM spills_with_demographics_geog", 'geom'),
            ("SELECT *, ST_SetSRID(CAST(geog AS geometry), 4326) AS geom FROM spills_with_demographics_geog", 'geom'),
        ]
        for sql, geom_col in try_statements:
            try:
                gdf = gpd.read_postgis(sql, engine, geom_col=geom_col)
                # Ensure CRS: assume WGS84 when the table carries none.
                if gdf.crs is None:
                    gdf.set_crs('EPSG:4326', inplace=True)
                return gdf
            except Exception:
                # Any failure (missing column, bad cast) -> try next variant.
                pass
        # Fallback to pandas + lat/lon if present in the DB table.
        try:
            df = pd.read_sql_table('spills_with_demographics_geog', engine)
            if {'Latitude', 'Longitude'}.issubset(df.columns):
                return gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df['Longitude'], df['Latitude']), crs='EPSG:4326')
        except Exception:
            pass
    # CSV fallback: analysis-stage trimmed spills with coordinates.
    csv = ANALYSIS_DIR / 'spills_trimmed.csv'
    df = pd.read_csv(csv)
    if not {'Latitude', 'Longitude'}.issubset(df.columns):
        raise ValueError('Expected Latitude/Longitude columns in spills_trimmed.csv')
    gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df['Longitude'], df['Latitude']), crs='EPSG:4326')
    return gdf
def try_query_boundary(engine) -> gpd.GeoDataFrame | None:
    """Attempt to find a Colorado boundary polygon in PostGIS.

    Probes common state-table names, trying each with several geometry
    expressions, and returns the first non-empty match as an EPSG:4326
    GeoDataFrame (geometry column 'geom'), or None when nothing matches.
    """
    # Shared filter for generic multi-state tables.
    state_filter = "stusps='CO' OR name ILIKE 'Colorado' OR statefp='08'"
    # FIX: the previous per-table geometry field was dead code (the loop
    # iterated alt_geoms instead), so candidates are now (table, where) pairs.
    candidates = [
        ("cb_2018_us_state_500k", state_filter),
        ("cb_2019_us_state_500k", state_filter),
        ("tl_2020_us_state", state_filter),
        ("us_states", state_filter),
        ("states", state_filter),
        ("colorado_boundary", "1=1"),
        ("co_boundary", "1=1"),
    ]
    alt_geoms = ["geom", "geometry", "ST_SetSRID(CAST(geog AS geometry), 4326)"]
    for table, where_clause in candidates:
        for gexpr in alt_geoms:
            sql = f"SELECT {gexpr} AS geom FROM {table} WHERE {where_clause}"
            try:
                gdf = gpd.read_postgis(sql, engine, geom_col='geom')
                if not gdf.empty:
                    if gdf.crs is None:
                        gdf.set_crs('EPSG:4326', inplace=True)
                    return gdf[["geom"]].to_crs(4326)
            except Exception:
                # Missing table/column or bad cast: try the next combination.
                continue
    return None
def try_query_counties(engine) -> gpd.GeoDataFrame | None:
    """Attempt to fetch Colorado counties from PostGIS for boundary overlays.

    Same probing strategy as try_query_boundary: returns the first non-empty
    match as EPSG:4326 (geometry column 'geom'), or None.
    """
    # FIX: the previous per-table geometry field was dead code (the loop
    # iterated alt_geoms instead), so candidates are now (table, where) pairs.
    candidates = [
        ("cb_2018_us_county_500k", "statefp='08' OR stusps='CO'"),
        ("cb_2019_us_county_500k", "statefp='08' OR stusps='CO'"),
        ("tl_2020_us_county", "statefp='08' OR stusps='CO'"),
        ("counties", "statefp='08' OR stusps='CO' OR state='CO' OR state_name ILIKE 'Colorado'"),
        ("co_counties", "1=1"),
        ("colorado_counties", "1=1"),
    ]
    alt_geoms = ["geom", "geometry", "ST_SetSRID(CAST(geog AS geometry), 4326)"]
    for table, where_clause in candidates:
        for gexpr in alt_geoms:
            sql = f"SELECT {gexpr} AS geom FROM {table} WHERE {where_clause}"
            try:
                gdf = gpd.read_postgis(sql, engine, geom_col='geom')
                if not gdf.empty:
                    if gdf.crs is None:
                        gdf.set_crs('EPSG:4326', inplace=True)
                    return gdf[["geom"]].to_crs(4326)
            except Exception:
                # Missing table/column or bad cast: try the next combination.
                continue
    return None
def get_colorado_boundary(spills_gdf: gpd.GeoDataFrame) -> tuple[gpd.GeoDataFrame, gpd.GeoDataFrame | None]:
    """Return Colorado boundary and optional counties. Prefer PostGIS; fallback to convex-hull from points."""
    engine = get_engine()
    if engine is not None:
        state = try_query_boundary(engine)
        if state is not None and not state.empty:
            # Counties are only worth fetching when a real boundary exists.
            return state, try_query_counties(engine)
    # No database or no boundary found: approximate the state outline with a
    # buffered convex hull of the points (30 km buffer in metric EPSG:3857).
    # NOTE(review): unary_union is deprecated in newer GeoPandas in favor of
    # union_all(); kept as-is — confirm the pinned geopandas version first.
    hull = spills_gdf.to_crs(3857).unary_union.convex_hull.buffer(30_000)
    fallback = gpd.GeoDataFrame(geometry=[hull], crs='EPSG:3857').to_crs(4326)
    return fallback, None
def add_north_arrow(ax, x=0.95, y=0.1, size=18):
    """Draw a bold 'N' with an upward arrow, positioned in axes-fraction coordinates."""
    # White halo keeps the glyph and arrow legible over any basemap color.
    halo = [pe.withStroke(linewidth=3, foreground='white')]
    arrow = dict(facecolor='black', width=5, headwidth=15, headlength=18,
                 shrink=0.05, path_effects=halo)
    ax.annotate('N', xy=(x, y + 0.06), xytext=(x, y), xycoords='axes fraction',
                ha='center', va='bottom', fontsize=size, fontweight='bold',
                color='black', path_effects=halo, arrowprops=arrow)
def style_axes(ax):
    """Strip ticks, labels, and the frame so only the map content shows."""
    ax.set_axis_off()
    # Explicitly hide each spine as well (belt-and-braces with set_axis_off).
    for side in list(ax.spines):
        ax.spines[side].set_visible(False)
def map_points(spills: gpd.GeoDataFrame, boundary: gpd.GeoDataFrame, counties: gpd.GeoDataFrame | None):
    """Render the point map of spill locations and save it as a high-res PNG.

    Parameters: spills (point GeoDataFrame, any CRS), boundary (state
    polygon(s)), counties (optional overlay polygons). Returns the PNG path.
    """
    # Project to Web Mercator so axis units are meters (needed by ScaleBar).
    spills_3857 = spills.to_crs(3857)
    boundary_3857 = boundary.to_crs(3857)
    # Figure aesthetics
    fig, ax = plt.subplots(figsize=(7, 9), dpi=300, constrained_layout=True)
    boundary_3857.plot(ax=ax, color='#f5f5f5', edgecolor='#888888', linewidth=0.8)
    if counties is not None and not counties.empty:
        counties_3857 = counties.to_crs(3857)
        counties_3857.boundary.plot(ax=ax, color='#aaaaaa', linewidth=0.3)
    # Plot points small and translucent to limit overplotting in dense areas.
    spills_3857.plot(ax=ax, markersize=1.2, alpha=0.35, color='#2c7fb8')
    # Tighten the view to the state boundary.
    minx, miny, maxx, maxy = boundary_3857.total_bounds
    ax.set_xlim(minx, maxx)
    ax.set_ylim(miny, maxy)
    # Scale bar assumes 1 axis unit == 1 m (true in EPSG:3857 up to
    # Mercator distortion at Colorado's latitude).
    ax.add_artist(ScaleBar(1, units='m', location='lower left', box_alpha=0.7))
    add_north_arrow(ax)
    style_axes(ax)
    # Dynamic title: append the year range when a year column is available.
    title = 'Oil & Gas Spill Reports in Colorado'
    year_cols = [c for c in spills.columns if c.lower() in ('year', 'yr')]
    if year_cols:
        try:
            yrs = spills[year_cols[0]].dropna().astype(int)
            if not yrs.empty:
                # FIX: restore the en dash between years (was "20152022").
                title = f"Oil & Gas Spill Reports in Colorado, {yrs.min()}–{yrs.max()}"
        except Exception:
            # Non-numeric year column: keep the generic title.
            pass
    ax.set_title(title, fontsize=12, pad=10)
    out_png = OUT / 'spill_map_points.png'
    fig.savefig(out_png, dpi=300)
    plt.close(fig)
    return out_png
def map_hex(spills: gpd.GeoDataFrame, boundary: gpd.GeoDataFrame, counties: gpd.GeoDataFrame | None):
    """Render a hexbin density map of spills and save it as a publication PDF.

    Same layout conventions as map_points. Returns the output PDF path.
    """
    # Project to Web Mercator so hexbin cells and the scale bar use meters.
    spills_3857 = spills.to_crs(3857)
    boundary_3857 = boundary.to_crs(3857)
    # Hexbin is computed from raw point coordinates in data units.
    x = spills_3857.geometry.x.values
    y = spills_3857.geometry.y.values
    fig, ax = plt.subplots(figsize=(7, 9), dpi=300, constrained_layout=True)
    boundary_3857.plot(ax=ax, color='#f5f5f5', edgecolor='#888888', linewidth=0.8)
    if counties is not None and not counties.empty:
        counties_3857 = counties.to_crs(3857)
        counties_3857.boundary.plot(ax=ax, color='#aaaaaa', linewidth=0.3)
    # gridsize tuned to Colorado's extent; mincnt suppresses isolated cells.
    hb = ax.hexbin(x, y, gridsize=70, mincnt=3, cmap='viridis', alpha=0.9)
    cb = fig.colorbar(hb, ax=ax, fraction=0.028, pad=0.01)
    cb.set_label('Spill count')
    # Scale bar (1 axis unit == 1 m in EPSG:3857) and north arrow.
    ax.add_artist(ScaleBar(1, units='m', location='lower left', box_alpha=0.7))
    add_north_arrow(ax)
    # Tighten the view to the state boundary.
    minx, miny, maxx, maxy = boundary_3857.total_bounds
    ax.set_xlim(minx, maxx)
    ax.set_ylim(miny, maxy)
    style_axes(ax)
    # Dynamic title: append the year range when a year column is available.
    title = 'Spill Density (Hexbin)'
    year_cols = [c for c in spills.columns if c.lower() in ('year', 'yr')]
    if year_cols:
        try:
            yrs = spills[year_cols[0]].dropna().astype(int)
            if not yrs.empty:
                # FIX: restore the en dash between years (was "20152022").
                title = f"Spill Density (Hexbin), {yrs.min()}–{yrs.max()}"
        except Exception:
            # Non-numeric year column: keep the generic title.
            pass
    ax.set_title(title, fontsize=12, pad=10)
    out_pdf = OUT / 'spill_map_hex.pdf'
    fig.savefig(out_pdf)
    plt.close(fig)
    return out_pdf
def main():
    """Load spills, resolve boundaries, and render both map products."""
    spills = load_spills()
    boundary, counties = get_colorado_boundary(spills)
    for saved in (map_points(spills, boundary, counties),
                  map_hex(spills, boundary, counties)):
        print(f"Saved: {saved}")
if __name__ == '__main__':
    main()