diff --git a/analysis/new analysis Aug 2025/analayis11_2020_nooutliers.ipynb b/analysis/new analysis Aug 2025/analayis11_2020_nooutliers.ipynb
index fbbe9f2..2e07d76 100644
--- a/analysis/new analysis Aug 2025/analayis11_2020_nooutliers.ipynb
+++ b/analysis/new analysis Aug 2025/analayis11_2020_nooutliers.ipynb
@@ -10874,7 +10874,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.13.7"
+   "version": "3.13.11"
   }
  },
  "nbformat": 4,
diff --git a/data_setup/get_demographics.ipynb b/data_setup/get_demographics.ipynb
index b79f6dd..8e35e28 100644
--- a/data_setup/get_demographics.ipynb
+++ b/data_setup/get_demographics.ipynb
@@ -1077,7 +1077,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.13.7"
+   "version": "3.13.11"
   }
  },
  "nbformat": 4,
diff --git a/out_tables/spill_map_hex.pdf b/out_tables/spill_map_hex.pdf
index ea62a43..5e75b8d 100644
Binary files a/out_tables/spill_map_hex.pdf and b/out_tables/spill_map_hex.pdf differ
diff --git a/out_tables/spill_map_hex.png b/out_tables/spill_map_hex.png
index c9a40b8..aa3dfe4 100644
Binary files a/out_tables/spill_map_hex.png and b/out_tables/spill_map_hex.png differ
diff --git a/out_tables/spill_map_points.png b/out_tables/spill_map_points.png
index 6df22d0..fa7165e 100644
Binary files a/out_tables/spill_map_points.png and b/out_tables/spill_map_points.png differ
diff --git a/scripts/make_spill_map.py b/scripts/make_spill_map.py
index 8efef73..2c41254 100644
--- a/scripts/make_spill_map.py
+++ b/scripts/make_spill_map.py
@@ -22,6 +22,38 @@
 OUT = ROOT / 'out_tables'
 OUT.mkdir(parents=True, exist_ok=True)
 DB_NAME = 'colorado_spills'
+SPILLS_TABLE_PRIMARY = 'spills_with_ruca'
+SPILLS_TABLE_FALLBACK = 'spills_with_demographics_geog'
+
+START_DATE = pd.Timestamp('2015-01-01')
+END_DATE = pd.Timestamp('2024-12-31')
+
+
+def filter_spills_by_date(df: pd.DataFrame) -> pd.DataFrame:
+    """Filter rows to [START_DATE, END_DATE] using 'Initial Report Date' and 'Date of Discovery'.
+
+    Uses the earliest non-null of the two columns (when both exist).
+    If neither column exists, returns df unchanged.
+    """
+    report_col = 'Initial Report Date'
+    discovery_col = 'Date of Discovery'
+    if report_col not in df.columns and discovery_col not in df.columns:
+        return df
+
+    report_dt = pd.to_datetime(df[report_col], errors='coerce') if report_col in df.columns else pd.NaT
+    disc_dt = pd.to_datetime(df[discovery_col], errors='coerce') if discovery_col in df.columns else pd.NaT
+
+    # earliest non-null
+    if isinstance(report_dt, pd.Series) and isinstance(disc_dt, pd.Series):
+        use_dt = report_dt.where(report_dt.notna(), disc_dt)
+        use_dt = use_dt.where(disc_dt.isna() | (use_dt <= disc_dt), disc_dt)
+    elif isinstance(report_dt, pd.Series):
+        use_dt = report_dt
+    else:
+        use_dt = disc_dt
+
+    mask = (use_dt >= START_DATE) & (use_dt <= END_DATE)
+    return df.loc[mask].copy()
 
 
 def get_engine():
@@ -42,33 +74,38 @@ def load_spills() -> gpd.GeoDataFrame:
     """Load spills as a GeoDataFrame, preferring PostGIS, else CSV fallback."""
     engine = get_engine()
     if engine is not None:
-        # Try multiple geometry options from the PostGIS table
-        try_statements = [
-            ("SELECT *, geom FROM spills_with_demographics_geog", 'geom'),
-            ("SELECT *, geometry FROM spills_with_demographics_geog", 'geometry'),
-            ("SELECT *, CAST(geog AS geometry) AS geom FROM spills_with_demographics_geog", 'geom'),
-            ("SELECT *, ST_SetSRID(CAST(geog AS geometry), 4326) AS geom FROM spills_with_demographics_geog", 'geom'),
-        ]
-        for sql, geom_col in try_statements:
+        # Try multiple geometry options from the PostGIS table (prefer spills_with_ruca)
+        for table in (SPILLS_TABLE_PRIMARY, SPILLS_TABLE_FALLBACK):
+            try_statements = [
+                (f"SELECT *, geom FROM {table}", 'geom'),
+                (f"SELECT *, geometry FROM {table}", 'geometry'),
+                (f"SELECT *, CAST(geog AS geometry) AS geom FROM {table}", 'geom'),
+                (f"SELECT *, ST_SetSRID(CAST(geog AS geometry), 4326) AS geom FROM {table}", 'geom'),
+            ]
+            for sql, geom_col in try_statements:
+                try:
+                    gdf = gpd.read_postgis(sql, engine, geom_col=geom_col)
+                    # Ensure CRS
+                    if gdf.crs is None:
+                        gdf.set_crs('EPSG:4326', inplace=True)
+                    gdf = gdf.loc[filter_spills_by_date(gdf).index]
+                    return gdf
+                except Exception:
+                    pass
+
+            # Fallback to pandas + lat/lon if present in the DB table
             try:
-                gdf = gpd.read_postgis(sql, engine, geom_col=geom_col)
-                # Ensure CRS
-                if gdf.crs is None:
-                    gdf.set_crs('EPSG:4326', inplace=True)
-                return gdf
+                df = pd.read_sql_table(table, engine)
+                df = filter_spills_by_date(df)
+                if {'Latitude', 'Longitude'}.issubset(df.columns):
+                    return gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df['Longitude'], df['Latitude']), crs='EPSG:4326')
             except Exception:
                 pass
-        # Fallback to pandas + lat/lon if present in the DB table
-        try:
-            df = pd.read_sql_table('spills_with_demographics_geog', engine)
-            if {'Latitude', 'Longitude'}.issubset(df.columns):
-                return gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df['Longitude'], df['Latitude']), crs='EPSG:4326')
-        except Exception:
-            pass
 
     # CSV fallback
     csv = ANALYSIS_DIR / 'spills_trimmed.csv'
     df = pd.read_csv(csv)
+    df = filter_spills_by_date(df)
     if not {'Latitude', 'Longitude'}.issubset(df.columns):
         raise ValueError('Expected Latitude/Longitude columns in spills_trimmed.csv')
     gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df['Longitude'], df['Latitude']), crs='EPSG:4326')