updated maps for redux
This commit is contained in:
@@ -10874,7 +10874,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.13.7"
|
||||
"version": "3.13.11"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
@@ -1077,7 +1077,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.13.7"
|
||||
"version": "3.13.11"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
Binary file not shown.
Binary file not shown.
|
Before Width: | Height: | Size: 1.2 MiB After Width: | Height: | Size: 1.2 MiB |
Binary file not shown.
|
Before Width: | Height: | Size: 476 KiB After Width: | Height: | Size: 476 KiB |
@@ -22,6 +22,38 @@ OUT = ROOT / 'out_tables'
|
||||
OUT.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Database / table configuration for loading spill records from PostGIS.
DB_NAME = 'colorado_spills'
SPILLS_TABLE_PRIMARY = 'spills_with_ruca'  # preferred table (includes RUCA codes)
SPILLS_TABLE_FALLBACK = 'spills_with_demographics_geog'  # used when the primary table is unavailable

# Inclusive analysis window; spills outside this range are filtered out.
START_DATE = pd.Timestamp('2015-01-01')
END_DATE = pd.Timestamp('2024-12-31')
def filter_spills_by_date(df: pd.DataFrame,
                          start: 'pd.Timestamp | None' = None,
                          end: 'pd.Timestamp | None' = None) -> pd.DataFrame:
    """Filter rows to the inclusive [start, end] date window.

    Each row's date is the earliest non-null of 'Initial Report Date' and
    'Date of Discovery' (when both columns exist). Rows where neither value
    parses to a date are dropped. If neither column exists, *df* is returned
    unchanged.

    Args:
        df: Spill records, possibly containing the two date columns.
        start: Window start; defaults to the module-level START_DATE.
        end: Window end; defaults to the module-level END_DATE.

    Returns:
        A copy of the filtered frame (or *df* itself when no date column
        is present).
    """
    report_col = 'Initial Report Date'
    discovery_col = 'Date of Discovery'
    if report_col not in df.columns and discovery_col not in df.columns:
        return df

    # Resolve defaults lazily so callers passing an explicit window never
    # touch the module-level constants.
    start = START_DATE if start is None else start
    end = END_DATE if end is None else end

    # Unparseable values become NaT rather than raising.
    report_dt = pd.to_datetime(df[report_col], errors='coerce') if report_col in df.columns else None
    disc_dt = pd.to_datetime(df[discovery_col], errors='coerce') if discovery_col in df.columns else None

    if report_dt is not None and disc_dt is not None:
        # Earliest non-null of the two: row-wise min skips NaT by default,
        # so a row with one missing date falls back to the other.
        use_dt = pd.concat([report_dt, disc_dt], axis=1).min(axis=1)
    elif report_dt is not None:
        use_dt = report_dt
    else:
        use_dt = disc_dt

    # Rows whose resolved date is NaT compare False on both sides and drop out.
    mask = (use_dt >= start) & (use_dt <= end)
    return df.loc[mask].copy()
|
||||
|
||||
|
||||
def get_engine():
|
||||
@@ -42,33 +74,38 @@ def load_spills() -> gpd.GeoDataFrame:
|
||||
"""Load spills as a GeoDataFrame, preferring PostGIS, else CSV fallback."""
|
||||
engine = get_engine()
|
||||
if engine is not None:
|
||||
# Try multiple geometry options from the PostGIS table
|
||||
try_statements = [
|
||||
("SELECT *, geom FROM spills_with_demographics_geog", 'geom'),
|
||||
("SELECT *, geometry FROM spills_with_demographics_geog", 'geometry'),
|
||||
("SELECT *, CAST(geog AS geometry) AS geom FROM spills_with_demographics_geog", 'geom'),
|
||||
("SELECT *, ST_SetSRID(CAST(geog AS geometry), 4326) AS geom FROM spills_with_demographics_geog", 'geom'),
|
||||
]
|
||||
for sql, geom_col in try_statements:
|
||||
# Try multiple geometry options from the PostGIS table (prefer spills_with_ruca)
|
||||
for table in (SPILLS_TABLE_PRIMARY, SPILLS_TABLE_FALLBACK):
|
||||
try_statements = [
|
||||
(f"SELECT *, geom FROM {table}", 'geom'),
|
||||
(f"SELECT *, geometry FROM {table}", 'geometry'),
|
||||
(f"SELECT *, CAST(geog AS geometry) AS geom FROM {table}", 'geom'),
|
||||
(f"SELECT *, ST_SetSRID(CAST(geog AS geometry), 4326) AS geom FROM {table}", 'geom'),
|
||||
]
|
||||
for sql, geom_col in try_statements:
|
||||
try:
|
||||
gdf = gpd.read_postgis(sql, engine, geom_col=geom_col)
|
||||
# Ensure CRS
|
||||
if gdf.crs is None:
|
||||
gdf.set_crs('EPSG:4326', inplace=True)
|
||||
gdf = gdf.loc[filter_spills_by_date(gdf).index]
|
||||
return gdf
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Fallback to pandas + lat/lon if present in the DB table
|
||||
try:
|
||||
gdf = gpd.read_postgis(sql, engine, geom_col=geom_col)
|
||||
# Ensure CRS
|
||||
if gdf.crs is None:
|
||||
gdf.set_crs('EPSG:4326', inplace=True)
|
||||
return gdf
|
||||
df = pd.read_sql_table(table, engine)
|
||||
df = filter_spills_by_date(df)
|
||||
if {'Latitude', 'Longitude'}.issubset(df.columns):
|
||||
return gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df['Longitude'], df['Latitude']), crs='EPSG:4326')
|
||||
except Exception:
|
||||
pass
|
||||
# Fallback to pandas + lat/lon if present in the DB table
|
||||
try:
|
||||
df = pd.read_sql_table('spills_with_demographics_geog', engine)
|
||||
if {'Latitude', 'Longitude'}.issubset(df.columns):
|
||||
return gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df['Longitude'], df['Latitude']), crs='EPSG:4326')
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# CSV fallback
|
||||
csv = ANALYSIS_DIR / 'spills_trimmed.csv'
|
||||
df = pd.read_csv(csv)
|
||||
df = filter_spills_by_date(df)
|
||||
if not {'Latitude', 'Longitude'}.issubset(df.columns):
|
||||
raise ValueError('Expected Latitude/Longitude columns in spills_trimmed.csv')
|
||||
gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df['Longitude'], df['Latitude']), crs='EPSG:4326')
|
||||
|
||||
Reference in New Issue
Block a user