texas-district-analysis/rebuild/well_shape.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "64ba36a8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "RealDictRow({'version': 'PostgreSQL 17.6 on x86_64-pc-linux-gnu, compiled by gcc (GCC) 15.2.1 20250813, 64-bit'})\n"
     ]
    }
   ],
   "source": [
    "# Cell 0: connect to Postgres\n",
    "# Requires: pip install psycopg2-binary\n",
    "import os\n",
    "import psycopg2\n",
    "from psycopg2.extras import RealDictCursor\n",
    "\n",
    "# Configure via environment variables (safer) or edit defaults below\n",
    "PGHOST = os.getenv(\"PGHOST\", \"localhost\")\n",
    "PGPORT = os.getenv(\"PGPORT\", \"5432\")\n",
    "PGUSER = os.getenv(\"PGUSER\", \"postgres\")\n",
    "PGPASSWORD = os.getenv(\"PGPASSWORD\", \"\")\n",
    "# Default DB name set to \"postgres\"; will fall back if PGDATABASE not set in env\n",
    "PGDATABASE = os.getenv(\"PGDATABASE\", \"texas_data\")\n",
    "\n",
    "if PGDATABASE is None or PGDATABASE == \"\":\n",
    "    print(\"[warn] PGDATABASE not set; using fallback 'postgres'\")\n",
    "    PGDATABASE = \"postgres\"\n",
    "\n",
    "def get_conn():\n",
    "    \"\"\"Return a new psycopg2 connection using configured environment variables.\"\"\"\n",
    "    return psycopg2.connect(\n",
    "        host=PGHOST,\n",
    "        port=PGPORT,\n",
    "        user=PGUSER,\n",
    "        password=PGPASSWORD,\n",
    "        dbname=PGDATABASE,\n",
    "    )\n",
    "\n",
    "# Quick connection test (only runs if executed as a script, not typical in notebook)\n",
    "if __name__ == \"__main__\":\n",
    "    try:\n",
    "        conn = get_conn()\n",
    "        with conn.cursor(cursor_factory=RealDictCursor) as cur:\n",
    "            cur.execute(\"SELECT version() AS version;\")\n",
    "            print(cur.fetchone())\n",
    "        conn.close()\n",
    "    except Exception as e:\n",
    "        print(\"Postgres connection failed:\", e)\n",
    "        raise"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "43cbda02",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Note: you may need to restart the kernel to use updated packages.\n"
     ]
    }
   ],
   "source": [
    "# Install geospatial dependencies\n",
    "%pip install --quiet geopandas shapely fiona pyproj geoalchemy2 sqlalchemy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "66396497",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Found 254 zip files\n",
      "Unified attribute column count (sample-based): 16\n",
      "Observed geometry types in sample: ['Point']\n",
      "Unified attribute column count (sample-based): 16\n",
      "Observed geometry types in sample: ['Point']\n",
      "Final unified attribute column count: 16\n",
      "Columns:\n",
      "['bottom_id', 'surface_id', 'symnum', 'apinum', 'reliab', 'api10', 'api', 'long27', 'lat27', 'long83', 'lat83', 'out_fips', 'cwellnum', 'radioact', 'wellid', 'stcode']\n",
      "Final unified attribute column count: 16\n",
      "Columns:\n",
      "['bottom_id', 'surface_id', 'symnum', 'apinum', 'reliab', 'api10', 'api', 'long27', 'lat27', 'long83', 'lat83', 'out_fips', 'cwellnum', 'radioact', 'wellid', 'stcode']\n"
     ]
    }
   ],
   "source": [
    "# Cell 2: scan shapefile zips and build unified schema\n",
    "import zipfile\n",
    "from pathlib import Path\n",
    "import geopandas as gpd\n",
    "import pandas as pd\n",
    "import re\n",
    "\n",
    "shp_zip_dir = Path('/home/dadams/data/rrc_shapefiles/shapefile_zips')\n",
    "if not shp_zip_dir.exists():\n",
    "    raise FileNotFoundError(f\"Directory {shp_zip_dir} not found\")\n",
    "\n",
    "zip_files = sorted(shp_zip_dir.glob('*.zip'))\n",
    "print(f\"Found {len(zip_files)} zip files\")\n",
    "if not zip_files:\n",
    "    raise SystemExit(\"No shapefile zip archives found\")\n",
    "\n",
    "def sanitize(name: str) -> str:\n",
    "    s = name.strip().lower()\n",
    "    s = re.sub(r\"\\s+\", \"_\", s)\n",
    "    s = re.sub(r\"[^a-z0-9_]+\", \"_\", s)\n",
    "    s = re.sub(r\"_+\", \"_\", s).strip('_')\n",
    "    if not s or s[0].isdigit():\n",
    "        s = f\"col_{s}\" if s else \"col\"\n",
    "    return s\n",
    "\n",
    "all_columns = []\n",
    "geom_types = set()\n",
    "\n",
    "sample_limit = 10  # read a few to get representative schema quickly\n",
    "for i, zf_path in enumerate(zip_files[:sample_limit]):\n",
    "    with zipfile.ZipFile(zf_path) as zf:\n",
    "        # find .shp file inside zip\n",
    "        shp_members = [m for m in zf.namelist() if m.lower().endswith('.shp')]\n",
    "        if not shp_members:\n",
    "            print(f\"[warn] No .shp in {zf_path.name}, skipping\")\n",
    "            continue\n",
    "        # Extract to temp directory\n",
    "        import tempfile, os\n",
    "        with tempfile.TemporaryDirectory() as tmpdir:\n",
    "            zf.extractall(tmpdir)\n",
    "            shp_path = Path(tmpdir) / shp_members[0]\n",
    "            try:\n",
    "                gdf = gpd.read_file(shp_path)\n",
    "            except Exception as e:\n",
    "                print(f\"[warn] Failed to read {zf_path.name}: {e}\")\n",
    "                continue\n",
    "            # record columns\n",
    "            cols = [sanitize(c) for c in gdf.columns if c.lower() != 'geometry']\n",
    "            for c in cols:\n",
    "                if c not in all_columns:\n",
    "                    all_columns.append(c)\n",
    "            # record geometry types\n",
    "            geom_types.update(gdf.geometry.geom_type.unique())\n",
    "\n",
    "print(f\"Unified attribute column count (sample-based): {len(all_columns)}\")\n",
    "print(f\"Observed geometry types in sample: {sorted(geom_types)}\")\n",
    "\n",
    "# We'll finalize column list after scanning all archives for robustness\n",
    "for zf_path in zip_files:\n",
    "    with zipfile.ZipFile(zf_path) as zf:\n",
    "        shp_members = [m for m in zf.namelist() if m.lower().endswith('.shp')]\n",
    "        if not shp_members:\n",
    "            continue\n",
    "        import tempfile\n",
    "        with tempfile.TemporaryDirectory() as tmpdir:\n",
    "            zf.extractall(tmpdir)\n",
    "            shp_path = Path(tmpdir) / shp_members[0]\n",
    "            try:\n",
    "                gdf = gpd.read_file(shp_path)\n",
    "            except Exception:\n",
    "                continue\n",
    "            cols = [sanitize(c) for c in gdf.columns if c.lower() != 'geometry']\n",
    "            for c in cols:\n",
    "                if c not in all_columns:\n",
    "                    all_columns.append(c)\n",
    "\n",
    "print(f\"Final unified attribute column count: {len(all_columns)}\")\n",
    "print(\"Columns:\")\n",
    "print(all_columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "06ab9489",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Created table well_shapes\n",
      "Inserted 1390194 rows into well_shapes\n",
      "Inserted 1390194 rows into well_shapes\n",
      "Added GIST index on well_shapes.geom\n",
      "Added GIST index on well_shapes.geom\n"
     ]
    }
   ],
   "source": [
    "# Cell 3: create PostGIS table well_shapes and ingest all shapefiles\n",
    "from sqlalchemy import create_engine, text\n",
    "from geoalchemy2 import Geometry\n",
    "import zipfile\n",
    "import tempfile\n",
    "\n",
    "# Build SQLAlchemy engine from PG* env\n",
    "pg_url = f\"postgresql+psycopg2://{PGUSER}:{PGPASSWORD}@{PGHOST}:{PGPORT}/{PGDATABASE}\"\n",
    "engine = create_engine(pg_url)\n",
    "\n",
    "# Ensure postgis is enabled\n",
    "with engine.begin() as conn:\n",
    "    conn.execute(text(\"CREATE EXTENSION IF NOT EXISTS postgis\"))\n",
    "\n",
    "# Create table with unified text columns + geometry\n",
    "attr_cols_sql = \", \".join(f'\"{c}\" TEXT' for c in all_columns)\n",
    "create_sql = f\"\"\"\n",
    "DROP TABLE IF EXISTS well_shapes;\n",
    "CREATE TABLE well_shapes (\n",
    "    id BIGSERIAL PRIMARY KEY,\n",
    "    {attr_cols_sql},\n",
    "    geom geometry(GEOMETRY, 4326)\n",
    ");\n",
    "\"\"\"\n",
    "with engine.begin() as conn:\n",
    "    conn.execute(text(create_sql))\n",
    "print(\"Created table well_shapes\")\n",
    "\n",
    "# Ingest all shapefiles: read, normalize columns, ensure WGS84 (EPSG:4326), and bulk insert via GeoPandas\n",
    "zip_files = sorted(Path('/home/dadams/data/rrc_shapefiles/shapefile_zips').glob('*.zip'))\n",
    "rows_total = 0\n",
    "for zf_path in zip_files:\n",
    "    with zipfile.ZipFile(zf_path) as zf:\n",
    "        shp_members = [m for m in zf.namelist() if m.lower().endswith('.shp')]\n",
    "        if not shp_members:\n",
    "            print(f\"[warn] No .shp in {zf_path.name}, skipping\")\n",
    "            continue\n",
    "        with tempfile.TemporaryDirectory() as tmpdir:\n",
    "            zf.extractall(tmpdir)\n",
    "            shp_path = Path(tmpdir) / shp_members[0]\n",
    "            try:\n",
    "                gdf = gpd.read_file(shp_path)\n",
    "            except Exception as e:\n",
    "                print(f\"[warn] Failed to read {zf_path.name}: {e}\")\n",
    "                continue\n",
    "            if gdf.empty:\n",
    "                continue\n",
    "            # Normalize columns to match well_shapes schema\n",
    "            rename_map = {c: sanitize(c) for c in gdf.columns if c.lower() != 'geometry'}\n",
    "            gdf = gdf.rename(columns=rename_map)\n",
    "            # Keep only known columns; add missing as None\n",
    "            for c in all_columns:\n",
    "                if c not in gdf.columns:\n",
    "                    gdf[c] = None\n",
    "            keep_cols = all_columns + ['geometry']\n",
    "            gdf = gdf[[c for c in keep_cols if c in gdf.columns]]\n",
    "            # Reproject to EPSG:4326\n",
    "            try:\n",
    "                if gdf.crs is None:\n",
    "                    # Trust .prj if present; otherwise, assume EPSG:4326 as a fallback\n",
    "                    gdf = gdf.set_crs(4326, allow_override=True)\n",
    "                if gdf.crs.to_epsg() != 4326:\n",
    "                    gdf = gdf.to_crs(4326)\n",
    "            except Exception as e:\n",
    "                print(f\"[warn] CRS handling issue in {zf_path.name}: {e}; forcing 4326\")\n",
    "                gdf = gdf.set_crs(4326, allow_override=True)\n",
    "            # Geometry column must be named 'geom' to match table\n",
    "            gdf = gdf.rename_geometry('geom') if hasattr(gdf, 'rename_geometry') else gdf.set_geometry('geometry')\n",
    "            if gdf.geometry.name != 'geom':\n",
    "                gdf = gdf.set_geometry('geom')\n",
    "            # Cast attribute cols to string for safety\n",
    "            for c in all_columns:\n",
    "                if c in gdf.columns:\n",
    "                    gdf[c] = gdf[c].astype(str).where(gdf[c].notna(), None)\n",
    "            # Write in chunks using GeoPandas to_postgis (handles WKB/WKT correctly)\n",
    "            chunksize = 5000\n",
    "            for start in range(0, len(gdf), chunksize):\n",
    "                chunk = gdf.iloc[start:start+chunksize].copy()\n",
    "                try:\n",
    "                    chunk.to_postgis(\n",
    "                        'well_shapes',\n",
    "                        engine,\n",
    "                        if_exists='append',\n",
    "                        index=False,\n",
    "                    )\n",
    "                except Exception as e:\n",
    "                    print(f\"[warn] to_postgis failed for {zf_path.name} chunk starting {start}: {e}\")\n",
    "                    continue\n",
    "                rows_total += len(chunk)\n",
    "print(f\"Inserted {rows_total} rows into well_shapes\")\n",
    "\n",
    "# Add spatial index\n",
    "with engine.begin() as conn:\n",
    "    conn.execute(text(\"CREATE INDEX IF NOT EXISTS idx_well_shapes_geom ON well_shapes USING GIST (geom)\"))\n",
    "print(\"Added GIST index on well_shapes.geom\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "eac28d18",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "well_shapes row count: 1390194\n",
      "Geometry types:\n",
      "  ST_Point: 1390194\n",
      "   id bottom_id surface_id symnum      apinum reliab     api10       api  \\\n",
      "0   1   1314180    1314180      3  4200132761     55  00132761  00132761   \n",
      "1   2    441598     441598      7  4200101708     30  00101708  00101708   \n",
      "2   3    622581     622581      7  4200100126     15  00100126  00100126   \n",
      "3   4    443035     443035      7  4200131523     15  00131523  00131523   \n",
      "4   5   1060307    1060307     22  4200132335     40  00132335  00132335   \n",
      "\n",
      "         long27        lat27        long83        lat83 out_fips cwellnum  \\\n",
      "0  -95.77868611  31.98602778  -95.77893973  31.98619311        N        1   \n",
      "1  -95.50418172  31.63615535  -95.50442643  31.63633064        N        1   \n",
      "2  -95.52108432  32.06352291  -95.52133065  32.06368669        N      661   \n",
      "3  -95.62782436  31.94839591  -95.62807349  31.94856254        N        1   \n",
      "4  -95.99003131  31.96996832  -95.99028153  31.97013568        N       12   \n",
      "\n",
      "  radioact wellid stcode                                               geom  \n",
      "0     None  32761   None  0101000020E61000008DAC0326DAF157C07956CF2677FC...  \n",
      "1     None  01708   None  0101000020E6100000879ACC8548E057C009AB9490E6A2...  \n",
      "2     None  00126   None  0101000020E6100000BECB397B5DE157C0A07CADE22608...  \n",
      "3     None  31523   None  0101000020E6100000B53A295B32E857C0D591A2FED4F2...  \n",
      "4     None  32335   None  0101000020E61000001B9D99ED60FF57C084A4DFAD5AF8...  \n"
     ]
    }
   ],
   "source": [
    "# Cell 4: verification queries\n",
    "import pandas as pd\n",
    "from sqlalchemy import text\n",
    "\n",
    "with engine.begin() as conn:\n",
    "    total = conn.execute(text(\"SELECT COUNT(*) FROM well_shapes\")).scalar()\n",
    "    types = conn.execute(text(\"SELECT ST_GeometryType(geom), COUNT(*) FROM well_shapes WHERE geom IS NOT NULL GROUP BY 1 ORDER BY 2 DESC\")).fetchall()\n",
    "print(f\"well_shapes row count: {total}\")\n",
    "print(\"Geometry types:\")\n",
    "for t, c in types:\n",
    "    print(f\"  {t}: {c}\")\n",
    "\n",
    "# Peek a few rows\n",
    "with engine.begin() as conn:\n",
    "    df = pd.read_sql(text(\"SELECT * FROM well_shapes LIMIT 5\"), conn)\n",
    "print(df.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "285e1530",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Latitude/Longitude columns ensured and populated (where empty).\n"
     ]
    }
   ],
   "source": [
    "# Cell 5: ensure latitude/longitude numeric columns from geometry (no API normalization here)\n",
    "from sqlalchemy import text\n",
    "\n",
    "with engine.begin() as conn:\n",
    "    # Add latitude/longitude columns (double precision) if not exist\n",
    "    conn.execute(text(\"ALTER TABLE well_shapes ADD COLUMN IF NOT EXISTS latitude DOUBLE PRECISION\"))\n",
    "    conn.execute(text(\"ALTER TABLE well_shapes ADD COLUMN IF NOT EXISTS longitude DOUBLE PRECISION\"))\n",
    "    # Populate lat/long from geometry (assuming Point geometries EPSG:4326)\n",
    "    conn.execute(text(\"\"\"\n",
    "        UPDATE well_shapes\n",
    "        SET longitude = ST_X(geom),\n",
    "            latitude  = ST_Y(geom)\n",
    "        WHERE geom IS NOT NULL\n",
    "          AND (longitude IS NULL OR latitude IS NULL)\n",
    "    \"\"\"))\n",
    "    # Indexes for latitude/longitude (optional for range queries)\n",
    "    conn.execute(text(\"CREATE INDEX IF NOT EXISTS idx_well_shapes_latitude ON well_shapes (latitude)\"))\n",
    "    conn.execute(text(\"CREATE INDEX IF NOT EXISTS idx_well_shapes_longitude ON well_shapes (longitude)\"))\n",
    "\n",
    "print(\"Latitude/Longitude columns ensured and populated (where empty).\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "fbe0947c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'has_api': True, 'has_api10': True, 'has_apinum': True}\n",
      "API Derivation Preview Stats:\n",
      "{'ten_digit': 852653, 'eight_digit': 852653, 'api_anomaly': 363149, 'api10_anomaly': 1390193, 'apinum_anomaly': 537542}\n",
      "Clean samples (first 5):\n",
      "        api     api10      apinum api_digits_raw api10_digits_raw  \\\n",
      "0  05700422  05700422  4205700422       05700422         05700422   \n",
      "1  05700427  05700427  4205700427       05700427         05700427   \n",
      "2  05730811  05730811  4205730811       05730811         05730811   \n",
      "3  05731052  05731052  4205731052       05731052         05731052   \n",
      "4  05730924  05730924  4205730924       05730924         05730924   \n",
      "\n",
      "  apinum_digits_raw api10_clean api8_clean  \n",
      "0        4205700422  4205700422   05700422  \n",
      "1        4205700427  4205700427   05700427  \n",
      "2        4205730811  4205730811   05730811  \n",
      "3        4205731052  4205731052   05731052  \n",
      "4        4205730924  4205730924   05730924  \n",
      "Anomalous samples (first 5):\n",
      "   api api10 apinum api_digits_raw api10_digits_raw apinum_digits_raw\n",
      "0  249   249  42249            249              249             42249\n",
      "1  371   371  42371            371              371             42371\n",
      "2  371   371  42371            371              371             42371\n",
      "3  249   249  42249            249              249             42249\n",
      "4  249   249  42249            249              249             42249\n",
      "API Derivation Preview Stats:\n",
      "{'ten_digit': 852653, 'eight_digit': 852653, 'api_anomaly': 363149, 'api10_anomaly': 1390193, 'apinum_anomaly': 537542}\n",
      "Clean samples (first 5):\n",
      "        api     api10      apinum api_digits_raw api10_digits_raw  \\\n",
      "0  05700422  05700422  4205700422       05700422         05700422   \n",
      "1  05700427  05700427  4205700427       05700427         05700427   \n",
      "2  05730811  05730811  4205730811       05730811         05730811   \n",
      "3  05731052  05731052  4205731052       05731052         05731052   \n",
      "4  05730924  05730924  4205730924       05730924         05730924   \n",
      "\n",
      "  apinum_digits_raw api10_clean api8_clean  \n",
      "0        4205700422  4205700422   05700422  \n",
      "1        4205700427  4205700427   05700427  \n",
      "2        4205730811  4205730811   05730811  \n",
      "3        4205731052  4205731052   05731052  \n",
      "4        4205730924  4205730924   05730924  \n",
      "Anomalous samples (first 5):\n",
      "   api api10 apinum api_digits_raw api10_digits_raw apinum_digits_raw\n",
      "0  249   249  42249            249              249             42249\n",
      "1  371   371  42371            371              371             42371\n",
      "2  371   371  42371            371              371             42371\n",
      "3  249   249  42249            249              249             42249\n",
      "4  249   249  42249            249              249             42249\n"
     ]
    }
   ],
   "source": [
    "# Cell 6: API preview (derive from apinum/api10/api; read-only audit)\n",
    "from sqlalchemy import text\n",
    "import pandas as pd\n",
    "\n",
    "# Determine which source columns exist in well_shapes\n",
    "with engine.begin() as conn:\n",
    "    cols = [r[0] for r in conn.execute(text(\n",
    "        \"\"\"\n",
    "        SELECT column_name\n",
    "        FROM information_schema.columns\n",
    "        WHERE table_schema='public' AND table_name='well_shapes'\n",
    "        \"\"\"\n",
    "    )).fetchall()]\n",
    "\n",
    "has_api = 'api' in cols\n",
    "has_api10 = 'api10' in cols\n",
    "has_apinum = 'apinum' in cols\n",
    "print({'has_api': has_api, 'has_api10': has_api10, 'has_apinum': has_apinum})\n",
    "\n",
    "with engine.begin() as conn:\n",
    "    # Working columns\n",
    "    conn.execute(text(\"ALTER TABLE well_shapes ADD COLUMN IF NOT EXISTS api_digits_raw TEXT\"))\n",
    "    if has_api10:\n",
    "        conn.execute(text(\"ALTER TABLE well_shapes ADD COLUMN IF NOT EXISTS api10_digits_raw TEXT\"))\n",
    "    if has_apinum:\n",
    "        conn.execute(text(\"ALTER TABLE well_shapes ADD COLUMN IF NOT EXISTS apinum_digits_raw TEXT\"))\n",
    "    conn.execute(text(\"ALTER TABLE well_shapes ADD COLUMN IF NOT EXISTS api10_clean TEXT\"))\n",
    "    conn.execute(text(\"ALTER TABLE well_shapes ADD COLUMN IF NOT EXISTS api8_clean TEXT\"))\n",
    "\n",
    "    # Populate digits-only raw columns\n",
    "    if has_api:\n",
    "        conn.execute(text(\n",
    "            \"\"\"\n",
    "            UPDATE well_shapes\n",
    "            SET api_digits_raw = regexp_replace(api, '[^0-9]', '', 'g')\n",
    "            WHERE api IS NOT NULL AND api <> '' AND api_digits_raw IS NULL\n",
    "            \"\"\"\n",
    "        ))\n",
    "    if has_api10:\n",
    "        conn.execute(text(\n",
    "            \"\"\"\n",
    "            UPDATE well_shapes\n",
    "            SET api10_digits_raw = regexp_replace(api10, '[^0-9]', '', 'g')\n",
    "            WHERE api10 IS NOT NULL AND api10 <> '' AND api10_digits_raw IS NULL\n",
    "            \"\"\"\n",
    "        ))\n",
    "    if has_apinum:\n",
    "        conn.execute(text(\n",
    "            \"\"\"\n",
    "            UPDATE well_shapes\n",
    "            SET apinum_digits_raw = regexp_replace(apinum, '[^0-9]', '', 'g')\n",
    "            WHERE apinum IS NOT NULL AND apinum <> '' AND apinum_digits_raw IS NULL\n",
    "            \"\"\"\n",
    "        ))\n",
    "\n",
    "    # Build COALESCE expression for 10-digit API using available sources (precedence: api10, apinum, api)\n",
    "    parts = []\n",
    "    if has_api10:\n",
    "        parts.append(\"CASE WHEN length(api10_digits_raw)=10 THEN api10_digits_raw END\")\n",
    "    if has_apinum:\n",
    "        parts.append(\"CASE WHEN length(apinum_digits_raw)=10 THEN apinum_digits_raw END\")\n",
    "    if has_api:\n",
    "        parts.append(\"CASE WHEN length(api_digits_raw)=10 THEN api_digits_raw END\")\n",
    "    if not parts:\n",
    "        parts.append(\"NULL\")\n",
    "    expr = \"COALESCE(\" + \", \".join(parts) + \")\"\n",
    "\n",
    "    # Compute api10_clean and api8_clean (for TX, api8 = county(3)+unique(5) from 10-digit)\n",
    "    conn.execute(text(f\"\"\"\n",
    "        UPDATE well_shapes\n",
    "        SET api10_clean = {expr}\n",
    "        WHERE api10_clean IS NULL\n",
    "    \"\"\"))\n",
    "    conn.execute(text(\n",
    "        \"\"\"\n",
    "        UPDATE well_shapes\n",
    "        SET api8_clean = CASE WHEN api10_clean IS NOT NULL AND length(api10_clean)=10\n",
    "                              THEN substr(api10_clean, 3, 8)\n",
    "                              ELSE NULL END\n",
    "        WHERE api8_clean IS NULL\n",
    "        \"\"\"\n",
    "    ))\n",
    "\n",
    "# Preview stats\n",
    "with engine.begin() as conn:\n",
    "    stats = conn.execute(text(\n",
    "        \"\"\"\n",
    "        SELECT\n",
    "          COUNT(*) FILTER (WHERE api10_clean IS NOT NULL) AS ten_digit,\n",
    "          COUNT(*) FILTER (WHERE api8_clean IS NOT NULL)  AS eight_digit,\n",
    "          COUNT(*) FILTER (WHERE api_digits_raw IS NOT NULL AND length(api_digits_raw) NOT IN (8,10)) AS api_anomaly,\n",
    "          {api10_anom} AS api10_anomaly,\n",
    "          {apinum_anom} AS apinum_anomaly\n",
    "        FROM well_shapes\n",
    "        \"\"\"\n",
    "        .format(\n",
    "            api10_anom=(\"COUNT(*) FILTER (WHERE api10_digits_raw IS NOT NULL AND length(api10_digits_raw)<>10)\"\n",
    "                         if has_api10 else \"0\"),\n",
    "            apinum_anom=(\"COUNT(*) FILTER (WHERE apinum_digits_raw IS NOT NULL AND length(apinum_digits_raw)<>10)\"\n",
    "                         if has_apinum else \"0\")\n",
    "        )\n",
    "    )).fetchone()\n",
    "\n",
    "print(\"API Derivation Preview Stats:\")\n",
    "print({\n",
    "    'ten_digit': stats[0],\n",
    "    'eight_digit': stats[1],\n",
    "    'api_anomaly': stats[2],\n",
    "    'api10_anomaly': stats[3],\n",
    "    'apinum_anomaly': stats[4]\n",
    "})\n",
    "\n",
    "# Show samples\n",
    "with engine.begin() as conn:\n",
    "    sample = pd.read_sql(text(\n",
    "        \"\"\"\n",
    "        SELECT api, {api10}, {apinum}, api_digits_raw, {api10_raw}, {apinum_raw}, api10_clean, api8_clean\n",
    "        FROM well_shapes\n",
    "        WHERE api8_clean IS NOT NULL\n",
    "        LIMIT 5\n",
    "        \"\"\"\n",
    "        .format(\n",
    "            api10=('api10' if has_api10 else \"NULL AS api10\"),\n",
    "            apinum=('apinum' if has_apinum else \"NULL AS apinum\"),\n",
    "            api10_raw=('api10_digits_raw' if has_api10 else \"NULL AS api10_digits_raw\"),\n",
    "            apinum_raw=('apinum_digits_raw' if has_apinum else \"NULL AS apinum_digits_raw\")\n",
    "        )\n",
    "    ), conn)\n",
    "print(\"Clean samples (first 5):\")\n",
    "print(sample)\n",
    "\n",
    "with engine.begin() as conn:\n",
    "    anomalous = pd.read_sql(text(\n",
    "        \"\"\"\n",
    "        SELECT api, {api10}, {apinum}, api_digits_raw, {api10_raw}, {apinum_raw}\n",
    "        FROM well_shapes\n",
    "        WHERE (api_digits_raw IS NOT NULL AND length(api_digits_raw) NOT IN (8,10))\n",
    "           OR ({api10_raw_is} AND length(api10_digits_raw)<>10)\n",
    "           OR ({apinum_raw_is} AND length(apinum_digits_raw)<>10)\n",
    "        LIMIT 5\n",
    "        \"\"\"\n",
    "        .format(\n",
    "            api10=('api10' if has_api10 else \"NULL AS api10\"),\n",
    "            apinum=('apinum' if has_apinum else \"NULL AS apinum\"),\n",
    "            api10_raw=('api10_digits_raw' if has_api10 else \"NULL AS api10_digits_raw\"),\n",
    "            apinum_raw=('apinum_digits_raw' if has_apinum else \"NULL AS apinum_digits_raw\"),\n",
    "            api10_raw_is=('TRUE' if has_api10 else 'FALSE'),\n",
    "            apinum_raw_is=('TRUE' if has_apinum else 'FALSE')\n",
    "        )\n",
    "    ), conn)\n",
    "print(\"Anomalous samples (first 5):\")\n",
    "print(anomalous)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "363a6606",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'api8_total': 1027045, 'api8_distinct': 1010430, 'api8_duplicate_groups': 11463, 'api10_total': 852653, 'api10_distinct': 852539, 'api10_duplicate_groups': 58}\n",
      "Skipped UNIQUE on api_number (duplicates or empty)\n",
      "Skipped UNIQUE on api10_number (duplicates or empty)\n"
     ]
    }
   ],
   "source": [
    "# Cell 7: Apply API numbers (10-digit and 8-digit) with precedence apinum/api10/api\n",
    "from sqlalchemy import text\n",
    "\n",
    "with engine.begin() as conn:\n",
    "    # Ensure destination columns\n",
    "    conn.execute(text(\"ALTER TABLE well_shapes ADD COLUMN IF NOT EXISTS api_number TEXT\"))\n",
    "    conn.execute(text(\"ALTER TABLE well_shapes ADD COLUMN IF NOT EXISTS api10_number TEXT\"))\n",
    "\n",
    "    # Assign api10_number from api10_clean when length=10\n",
    "    conn.execute(text(\n",
    "        \"\"\"\n",
    "        UPDATE well_shapes\n",
    "        SET api10_number = api10_clean\n",
    "        WHERE api10_clean IS NOT NULL AND length(api10_clean)=10\n",
    "        \"\"\"\n",
    "    ))\n",
    "\n",
    "    # Assign api_number as the 8-digit from api8_clean\n",
    "    conn.execute(text(\n",
    "        \"\"\"\n",
    "        UPDATE well_shapes\n",
    "        SET api_number = api8_clean\n",
    "        WHERE api8_clean IS NOT NULL AND length(api8_clean)=8\n",
    "        \"\"\"\n",
    "    ))\n",
    "\n",
    "    # Defensive cleanup: null anything not exactly 8/10\n",
    "    conn.execute(text(\"\"\"\n",
    "        UPDATE well_shapes SET api_number = NULL WHERE api_number IS NOT NULL AND length(api_number)<>8;\n",
    "    \"\"\"))\n",
    "    conn.execute(text(\"\"\"\n",
    "        UPDATE well_shapes SET api10_number = NULL WHERE api10_number IS NOT NULL AND length(api10_number)<>10;\n",
    "    \"\"\"))\n",
    "\n",
    "    # Duplicate detection for api_number (8-digit)\n",
    "    dup8 = conn.execute(text(\n",
    "        \"\"\"\n",
    "        SELECT COUNT(*) FROM (\n",
    "          SELECT api_number, COUNT(*) c\n",
    "          FROM well_shapes\n",
    "          WHERE api_number IS NOT NULL\n",
    "          GROUP BY api_number\n",
    "          HAVING COUNT(*)>1\n",
    "        ) d\n",
    "        \"\"\"\n",
    "    )).scalar()\n",
    "    total8 = conn.execute(text(\"SELECT COUNT(*) FROM well_shapes WHERE api_number IS NOT NULL\")).scalar()\n",
    "    distinct8 = conn.execute(text(\"SELECT COUNT(DISTINCT api_number) FROM well_shapes WHERE api_number IS NOT NULL\")).scalar()\n",
    "\n",
    "    # Duplicate detection for api10_number (10-digit)\n",
    "    dup10 = conn.execute(text(\n",
    "        \"\"\"\n",
    "        SELECT COUNT(*) FROM (\n",
    "          SELECT api10_number, COUNT(*) c\n",
    "          FROM well_shapes\n",
    "          WHERE api10_number IS NOT NULL\n",
    "          GROUP BY api10_number\n",
    "          HAVING COUNT(*)>1\n",
    "        ) d\n",
    "        \"\"\"\n",
    "    )).scalar()\n",
    "    total10 = conn.execute(text(\"SELECT COUNT(*) FROM well_shapes WHERE api10_number IS NOT NULL\")).scalar()\n",
    "    distinct10 = conn.execute(text(\"SELECT COUNT(DISTINCT api10_number) FROM well_shapes WHERE api10_number IS NOT NULL\")).scalar()\n",
    "\n",
    "print({\n",
    "    'api8_total': total8,\n",
    "    'api8_distinct': distinct8,\n",
    "    'api8_duplicate_groups': dup8,\n",
    "    'api10_total': total10,\n",
    "    'api10_distinct': distinct10,\n",
    "    'api10_duplicate_groups': dup10,\n",
    "})\n",
    "\n",
    "# Optional: enforce unique constraints if no duplicates\n",
    "if dup8 == 0 and total8 > 0:\n",
    "    with engine.begin() as conn:\n",
    "        conn.execute(text(\"ALTER TABLE well_shapes DROP CONSTRAINT IF EXISTS well_shapes_api_number_unique\"))\n",
    "        conn.execute(text(\"ALTER TABLE well_shapes ADD CONSTRAINT well_shapes_api_number_unique UNIQUE (api_number)\"))\n",
    "    print(\"Applied UNIQUE constraint on api_number\")\n",
    "else:\n",
    "    print(\"Skipped UNIQUE on api_number (duplicates or empty)\")\n",
    "\n",
    "if dup10 == 0 and total10 > 0:\n",
    "    with engine.begin() as conn:\n",
    "        conn.execute(text(\"ALTER TABLE well_shapes DROP CONSTRAINT IF EXISTS well_shapes_api10_number_unique\"))\n",
    "        conn.execute(text(\"ALTER TABLE well_shapes ADD CONSTRAINT well_shapes_api10_number_unique UNIQUE (api10_number)\"))\n",
    "    print(\"Applied UNIQUE constraint on api10_number\")\n",
    "else:\n",
    "    print(\"Skipped UNIQUE on api10_number (duplicates or empty)\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "d0c3734b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Using tract layer: ACS_2021_5YR_TRACT_48_TEXAS\n",
      "Using GEOID column: GEOID\n",
      "Loaded 6896 tract geometries into census_tracts_2021\n",
      "Spatial index on tract geometries ensured.\n",
      "Loaded 6896 tract geometries into census_tracts_2021\n",
      "Spatial index on tract geometries ensured.\n"
     ]
    }
   ],
   "source": [
    "# Cell 8: Load Texas Census tract geometries from local geodatabase into PostGIS (robust)\n",
    "from sqlalchemy import text\n",
    "import geopandas as gpd\n",
    "import fiona\n",
    "from pathlib import Path\n",
    "\n",
    "# Resolve geodatabase path\n",
    "gdb_path = Path('data/ACS_2021_5YR_TRACT_48_TEXAS.gdb')\n",
    "if not gdb_path.exists():\n",
    "    alt = Path('/home/dadams/Repos/texas-rebuild-postgis/data/ACS_2021_5YR_TRACT_48_TEXAS.gdb')\n",
    "    if alt.exists():\n",
    "        gdb_path = alt\n",
    "    else:\n",
    "        raise FileNotFoundError(f\"Geodatabase not found: {gdb_path}\")\n",
    "\n",
    "# Discover candidate layers with geometry\n",
    "layers = fiona.listlayers(gdb_path)\n",
    "# Prefer layers that look like tract geography and have polygon geometries\n",
    "candidate_layers = [l for l in layers if 'TRACT' in l.upper() and 'BLOCK' not in l.upper()]\n",
    "if not candidate_layers:\n",
    "    candidate_layers = layers  # fallback: try any layer\n",
    "\n",
    "tract_gdf = None\n",
    "selected_layer = None\n",
    "for lyr in candidate_layers:\n",
    "    try:\n",
    "        gdf_try = gpd.read_file(gdb_path, layer=lyr)\n",
    "        # Ensure we actually have a GeoDataFrame with a geometry column\n",
    "        if hasattr(gdf_try, 'geometry') and gdf_try.geometry is not None:\n",
    "            if not gdf_try.empty:\n",
    "                tract_gdf = gdf_try\n",
    "                selected_layer = lyr\n",
    "                break\n",
    "    except Exception:\n",
    "        continue\n",
    "\n",
    "if tract_gdf is None:\n",
    "    raise SystemExit(f\"Failed to read a tract layer with geometry from: {layers}\")\n",
    "\n",
    "print(f\"Using tract layer: {selected_layer}\")\n",
    "\n",
    "# Ensure CRS is WGS84\n",
    "try:\n",
    "    if tract_gdf.crs is None:\n",
    "        print('[warn] Tract CRS missing; assuming EPSG:4326')\n",
    "        tract_gdf = tract_gdf.set_crs(4326, allow_override=True)\n",
    "    if tract_gdf.crs.to_epsg() != 4326:\n",
    "        tract_gdf = tract_gdf.to_crs(4326)\n",
    "except Exception as e:\n",
    "    print(f\"[warn] Issue normalizing CRS for tracts: {e}; forcing 4326\")\n",
    "    tract_gdf = tract_gdf.set_crs(4326, allow_override=True)\n",
    "\n",
    "# Identify GEOID column\n",
    "geoid_col = None\n",
    "for cand in ['GEOID', 'GEOID10', 'GEOID20', 'geoid', 'GEOID_TRACT', 'TRACTCE']:\n",
    "    if cand in tract_gdf.columns:\n",
    "        geoid_col = cand\n",
    "        break\n",
    "if geoid_col is None:\n",
    "    geo_candidates = [c for c in tract_gdf.columns if 'GEOID' in c.upper()]\n",
    "    if geo_candidates:\n",
    "        geoid_col = geo_candidates[0]\n",
    "if geoid_col is None:\n",
    "    raise SystemExit(f\"No GEOID-like column found in tract layer columns: {list(tract_gdf.columns)}\")\n",
    "print(f\"Using GEOID column: {geoid_col}\")\n",
    "\n",
    "# Keep only needed columns and standardize names\n",
    "tract_gdf = tract_gdf[[geoid_col, tract_gdf.geometry.name]].rename(columns={geoid_col: 'geoid'})\n",
    "# Ensure geometry column name 'geom' to match our SQL later\n",
    "if tract_gdf.geometry.name != 'geom' and hasattr(tract_gdf, 'rename_geometry'):\n",
    "    tract_gdf = tract_gdf.rename_geometry('geom')\n",
    "if tract_gdf.geometry.name != 'geom':\n",
    "    tract_gdf = tract_gdf.set_geometry('geom')\n",
    "\n",
    "from sqlalchemy import create_engine\n",
    "pg_url = f\"postgresql+psycopg2://{PGUSER}:{PGPASSWORD}@{PGHOST}:{PGPORT}/{PGDATABASE}\"\n",
    "engine = create_engine(pg_url)\n",
    "\n",
    "# Load into PostGIS (replace if exists)\n",
    "tract_table = 'census_tracts_2021'\n",
    "tract_gdf.to_postgis(tract_table, engine, if_exists='replace', index=False)\n",
    "print(f\"Loaded {len(tract_gdf)} tract geometries into {tract_table}\")\n",
    "\n",
    "# Add spatial index on the geom column\n",
    "with engine.begin() as conn:\n",
    "    conn.execute(text(f\"CREATE INDEX IF NOT EXISTS idx_{tract_table}_geom ON {tract_table} USING GIST (geom)\"))\n",
    "print(\"Spatial index on tract geometries ensured.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "342491aa",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Assigned census_tract_geoid via spatial join for 1388601 wells\n",
      "Ensured index on census_tract_geoid\n",
      "Ensured index on census_tract_geoid\n"
     ]
    }
   ],
   "source": [
    "# Cell 9: Spatially join wells to Census tracts in PostGIS (preferred, offline)\n",
    "from sqlalchemy import text\n",
    "\n",
    "with engine.begin() as conn:\n",
    "    conn.execute(text(\"ALTER TABLE well_shapes ADD COLUMN IF NOT EXISTS census_tract_geoid TEXT\"))\n",
    "    # Use ST_Contains (tract polygon contains point). Requires both in EPSG:4326 (already enforced earlier).\n",
    "    updated = conn.execute(text(\"\"\"\n",
    "        WITH to_update AS (\n",
    "            SELECT w.id, t.geoid\n",
    "            FROM well_shapes w\n",
    "            JOIN census_tracts_2021 t\n",
    "              ON ST_Contains(t.geom, w.geom)\n",
    "            WHERE w.geom IS NOT NULL\n",
    "              AND w.census_tract_geoid IS NULL\n",
    "        )\n",
    "        UPDATE well_shapes w\n",
    "        SET census_tract_geoid = u.geoid\n",
    "        FROM to_update u\n",
    "        WHERE w.id = u.id\n",
    "        RETURNING 1\n",
    "    \"\"\")).rowcount\n",
    "print(f\"Assigned census_tract_geoid via spatial join for {updated or 0} wells\")\n",
    "\n",
    "# Index for faster joins/filters on tract\n",
    "with engine.begin() as conn:\n",
    "    conn.execute(text(\"CREATE INDEX IF NOT EXISTS idx_well_shapes_census_tract_geoid ON well_shapes (census_tract_geoid)\"))\n",
    "print(\"Ensured index on census_tract_geoid\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "583d5852",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Census API key detected; attempting fallback for unresolved wells.\n",
      "Fallback batch size: 200\n",
      "API obtained GEOIDs for 0 wells\n",
      "API obtained GEOIDs for 0 wells\n"
     ]
    }
   ],
   "source": [
    "# Cell 10: OPTIONAL fallback using Census Geocoder API for missing tracts (requires CENSUS_API_KEY)\n",
    "import os, time, requests, pandas as pd\n",
    "from sqlalchemy import text\n",
    "\n",
    "API_KEY = os.getenv('CENSUS_API_KEY')\n",
    "if API_KEY is None:\n",
    "    print('[info] CENSUS_API_KEY not set; skipping API fallback.')\n",
    "else:\n",
    "    print('Census API key detected; attempting fallback for unresolved wells.')\n",
    "    # Fetch a batch of wells still lacking census_tract_geoid (limit to avoid quota issues)\n",
    "    batch_limit = 200  # adjust as needed; respect API rate limits\n",
    "    with engine.begin() as conn:\n",
    "        rows = conn.execute(text(\"\"\"\n",
    "            SELECT id, latitude, longitude\n",
    "            FROM well_shapes\n",
    "            WHERE census_tract_geoid IS NULL\n",
    "              AND latitude IS NOT NULL AND longitude IS NOT NULL\n",
    "            LIMIT :lim\n",
    "        \"\"\"), {'lim': batch_limit}).fetchall()\n",
    "    print(f\"Fallback batch size: {len(rows)}\")\n",
    "    if rows:\n",
    "        updates = []\n",
    "        for r in rows:\n",
    "            wid, lat, lon = r\n",
    "            # Census Geocoder API: https://geocoding.geo.census.gov/ (Federal API, not all endpoints require key; some do for bulk)\n",
    "            # Using /geocoder/geographies/coordinates endpoint\n",
    "            url = ('https://geocoding.geo.census.gov/geocoder/geographies/coordinates'\n",
    "                   f'?x={lon}&y={lat}&benchmark=Public_AR_Current&vintage=ACS2021_Current&format=json&layers=Tracts&key={API_KEY}')\n",
    "            try:\n",
    "                resp = requests.get(url, timeout=5)\n",
    "                if resp.status_code == 200:\n",
    "                    data = resp.json()\n",
    "                    tract_info = data.get('result', {}).get('geographies', {}).get('Census Tracts', [])\n",
    "                    if tract_info:\n",
    "                        geoid = tract_info[0].get('GEOID')\n",
    "                        if geoid:\n",
    "                            updates.append((wid, geoid))\n",
    "                else:\n",
    "                    print(f\"[warn] Non-200 response for well id {wid}: {resp.status_code}\")\n",
    "            except Exception as e:\n",
    "                print(f\"[warn] Request failed for well id {wid}: {e}\")\n",
    "            time.sleep(0.15)  # gentle rate limiting\n",
    "        print(f\"API obtained GEOIDs for {len(updates)} wells\")\n",
    "        if updates:\n",
    "            # Bulk update\n",
    "            with engine.begin() as conn:\n",
    "                for wid, geoid in updates:\n",
    "                    conn.execute(text(\"UPDATE well_shapes SET census_tract_geoid=:g WHERE id=:i\"), {'g': geoid, 'i': wid})\n",
    "            print(\"Applied API fallback updates.\")\n",
    "    else:\n",
    "        print('No wells need fallback.')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "72084705",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Length distribution for api (length -> count):\n",
      "   length    count\n",
      "0       8  1027045\n",
      "1       3   363147\n",
      "2       7        2\n",
      "\n",
      "Length distribution for api10 (length -> count):\n",
      "   length   count\n",
      "0       8  845289\n",
      "1       3  362399\n",
      "2      10  182502\n",
      "3       9       2\n",
      "4       7       2\n",
      "\n",
      "Length distribution for apinum (length -> count):\n",
      "   length   count\n",
      "0      10  845288\n",
      "1       5  362399\n",
      "2      12  182503\n",
      "3       9       2\n",
      "4      11       2\n",
      "Focused length counts:\n",
      "{'apinum_len10': 845288, 'api10_len10': 182502, 'api_len10': 0, 'api_len3': 363147, 'api10_len3': 362399, 'apinum_len5': 362399}\n",
      "Short API samples (showing related fields):\n",
      "       id  api api10 apinum api_digits_raw api10_digits_raw apinum_digits_raw\n",
      "0  643559  249   249  42249            249              249             42249\n",
      "1  932957  371   371  42371            371              371             42371\n",
      "2  933051  371   371  42371            371              371             42371\n",
      "3  643600  249   249  42249            249              249             42249\n",
      "4  643603  249   249  42249            249              249             42249\n",
      "5  643605  249   249  42249            249              249             42249\n",
      "6  643638  249   249  42249            249              249             42249\n",
      "7  933718  371   371  42371            371              371             42371\n",
      "8  933745  371   371  42371            371              371             42371\n",
      "9  933963  371   371  42371            371              371             42371\n",
      "Derived vs source (first 10):\n",
      "       id        apinum       api10       api api10_clean api8_clean  \\\n",
      "0  163778  4205730992D1  05730992D1  05730992        None       None   \n",
      "1  164009  4205731591D1  05731591D1  05731591        None       None   \n",
      "2  163965  4270330280D1  70330280D1  70330280        None       None   \n",
      "3  164598  4270330331D1  70330331D1  70330331        None       None   \n",
      "4  291223  4210933080H1  10933080H1  10933080        None       None   \n",
      "5  292357  4210933084H1  10933084H1  10933084        None       None   \n",
      "6  292494  4210933532H1  10933532H1  10933532        None       None   \n",
      "7  292570  4210933079H1  10933079H1  10933079        None       None   \n",
      "8  292908  4210933527H1  10933527H1  10933527        None       None   \n",
      "9  293901  4210933085H1  10933085H1  10933085        None       None   \n",
      "\n",
      "  api_number api10_number  \n",
      "0   05730992         None  \n",
      "1   05731591         None  \n",
      "2   70330280         None  \n",
      "3   70330331         None  \n",
      "4   10933080         None  \n",
      "5   10933084         None  \n",
      "6   10933532         None  \n",
      "7   10933079         None  \n",
      "8   10933527         None  \n",
      "9   10933085         None  \n",
      "Coverage summary:\n",
      "{'total_wells': 1390194, 'with_tract_geoid': 1388666, 'without_tract_geoid': 1528, 'coverage_pct': 99.8901}\n",
      "Focused length counts:\n",
      "{'apinum_len10': 845288, 'api10_len10': 182502, 'api_len10': 0, 'api_len3': 363147, 'api10_len3': 362399, 'apinum_len5': 362399}\n",
      "Short API samples (showing related fields):\n",
      "       id  api api10 apinum api_digits_raw api10_digits_raw apinum_digits_raw\n",
      "0  643559  249   249  42249            249              249             42249\n",
      "1  932957  371   371  42371            371              371             42371\n",
      "2  933051  371   371  42371            371              371             42371\n",
      "3  643600  249   249  42249            249              249             42249\n",
      "4  643603  249   249  42249            249              249             42249\n",
      "5  643605  249   249  42249            249              249             42249\n",
      "6  643638  249   249  42249            249              249             42249\n",
      "7  933718  371   371  42371            371              371             42371\n",
      "8  933745  371   371  42371            371              371             42371\n",
      "9  933963  371   371  42371            371              371             42371\n",
      "Derived vs source (first 10):\n",
      "       id        apinum       api10       api api10_clean api8_clean  \\\n",
      "0  163778  4205730992D1  05730992D1  05730992        None       None   \n",
      "1  164009  4205731591D1  05731591D1  05731591        None       None   \n",
      "2  163965  4270330280D1  70330280D1  70330280        None       None   \n",
      "3  164598  4270330331D1  70330331D1  70330331        None       None   \n",
      "4  291223  4210933080H1  10933080H1  10933080        None       None   \n",
      "5  292357  4210933084H1  10933084H1  10933084        None       None   \n",
      "6  292494  4210933532H1  10933532H1  10933532        None       None   \n",
      "7  292570  4210933079H1  10933079H1  10933079        None       None   \n",
      "8  292908  4210933527H1  10933527H1  10933527        None       None   \n",
      "9  293901  4210933085H1  10933085H1  10933085        None       None   \n",
      "\n",
      "  api_number api10_number  \n",
      "0   05730992         None  \n",
      "1   05731591         None  \n",
      "2   70330280         None  \n",
      "3   70330331         None  \n",
      "4   10933080         None  \n",
      "5   10933084         None  \n",
      "6   10933532         None  \n",
      "7   10933079         None  \n",
      "8   10933527         None  \n",
      "9   10933085         None  \n",
      "Coverage summary:\n",
      "{'total_wells': 1390194, 'with_tract_geoid': 1388666, 'without_tract_geoid': 1528, 'coverage_pct': 99.8901}\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "from sqlalchemy import text\n",
    "\n",
    "# Cell 11: API diagnostics & coverage summary (post derivation)\n",
    "\n",
    "# Length distributions for each raw column (separate queries to avoid GROUP BY mistakes)\n",
    "with engine.begin() as conn:\n",
    "    len_api = pd.read_sql(text(\"\"\"\n",
    "        SELECT length(api) AS length, COUNT(*) AS count\n",
    "        FROM well_shapes\n",
    "        GROUP BY length\n",
    "        ORDER BY count DESC NULLS LAST\n",
    "    \"\"\"), conn)\n",
    "\n",
    "    len_api10 = pd.read_sql(text(\"\"\"\n",
    "        SELECT length(api10) AS length, COUNT(*) AS count\n",
    "        FROM well_shapes\n",
    "        GROUP BY length\n",
    "        ORDER BY count DESC NULLS LAST\n",
    "    \"\"\"), conn)\n",
    "\n",
    "    len_apinum = pd.read_sql(text(\"\"\"\n",
    "        SELECT length(apinum) AS length, COUNT(*) AS count\n",
    "        FROM well_shapes\n",
    "        GROUP BY length\n",
    "        ORDER BY count DESC NULLS LAST\n",
    "    \"\"\"), conn)\n",
    "\n",
    "print(\"Length distribution for api (length -> count):\")\n",
    "print(len_api.head(20))\n",
    "print(\"\\nLength distribution for api10 (length -> count):\")\n",
    "print(len_api10.head(20))\n",
    "print(\"\\nLength distribution for apinum (length -> count):\")\n",
    "print(len_apinum.head(20))\n",
    "\n",
    "# Focused counts for lengths of interest\n",
    "with engine.begin() as conn:\n",
    "    focused = conn.execute(text(\"\"\"\n",
    "        SELECT\n",
    "          COUNT(*) FILTER (WHERE length(apinum)=10) AS apinum_len10,\n",
    "          COUNT(*) FILTER (WHERE length(api10)=10)  AS api10_len10,\n",
    "          COUNT(*) FILTER (WHERE length(api)=10)    AS api_len10,\n",
    "          COUNT(*) FILTER (WHERE length(api)=3)     AS api_len3,\n",
    "          COUNT(*) FILTER (WHERE length(api10)=3)   AS api10_len3,\n",
    "          COUNT(*) FILTER (WHERE length(apinum)=5)  AS apinum_len5\n",
    "        FROM well_shapes\n",
    "    \"\"\")).fetchone()\n",
    "\n",
    "print(\"Focused length counts:\")\n",
    "print({\n",
    "    'apinum_len10': focused[0],\n",
    "    'api10_len10': focused[1],\n",
    "    'api_len10': focused[2],\n",
    "    'api_len3': focused[3],\n",
    "    'api10_len3': focused[4],\n",
    "    'apinum_len5': focused[5]\n",
    "})\n",
    "\n",
    "# Sample of short api values with related fields for context\n",
    "with engine.begin() as conn:\n",
    "    short_sample = pd.read_sql(text(\"\"\"\n",
    "        SELECT id, api, api10, apinum, api_digits_raw, api10_digits_raw, apinum_digits_raw\n",
    "        FROM well_shapes\n",
    "        WHERE length(api)=3 OR length(api10)=3\n",
    "        LIMIT 10\n",
    "    \"\"\"), conn)\n",
    "print(\"Short API samples (showing related fields):\")\n",
    "print(short_sample)\n",
    "\n",
    "# Derived vs source comparison (show examples where either derived 8- or 10-digit was produced)\n",
    "with engine.begin() as conn:\n",
    "    derived_sample = pd.read_sql(text(\"\"\"\n",
    "        SELECT id, apinum, api10, api, api10_clean, api8_clean, api_number, api10_number\n",
    "        FROM well_shapes\n",
    "        WHERE api_number IS NOT NULL OR api10_number IS NOT NULL\n",
    "        LIMIT 10\n",
    "    \"\"\"), conn)\n",
    "print(\"Derived vs source (first 10):\")\n",
    "print(derived_sample)\n",
    "\n",
    "# Coverage summary (tract assignment)\n",
    "with engine.begin() as conn:\n",
    "    total = conn.execute(text(\"SELECT COUNT(*) FROM well_shapes\")).scalar()\n",
    "    tract_assigned = conn.execute(text(\"SELECT COUNT(*) FROM well_shapes WHERE census_tract_geoid IS NOT NULL\")).scalar()\n",
    "\n",
    "coverage = {\n",
    "    'total_wells': total,\n",
    "    'with_tract_geoid': tract_assigned,\n",
    "    'without_tract_geoid': (total - tract_assigned) if total is not None else None,\n",
    "    'coverage_pct': round(tract_assigned * 100.0 / total, 4) if total else None\n",
    "}\n",
    "print(\"Coverage summary:\")\n",
    "print(coverage)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c1ae22ec",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "ca7047f5",
   "metadata": {},
   "source": [
    "## Deduplication Plan\n",
    "\n",
    "We will remove duplicate well records based on a canonical API key:\n",
    "\n",
    "Key selection precedence:\n",
    "1. api10_number (full 10-digit API) if present.\n",
    "2. api_number (8-digit derived) if 10-digit is NULL.\n",
    "\n",
    "For each duplicate group (same dedup_key), keep the single \"best\" row determined by a quality score:\n",
    "- +10 if geom is not NULL.\n",
    "- +5 if both latitude and longitude are not NULL.\n",
    "- +4 if apinum_digits_raw present and length=10.\n",
    "- +3 if api10_clean present.\n",
    "- +2 if census_tract_geoid present.\n",
    "- +1 for every other non-null attribute column (excluding working/derived columns and id/geom).\n",
    "\n",
    "Process outline:\n",
    "1. Audit duplicates (counts and top example groups).\n",
    "2. (Optional) Create a full backup table `well_shapes_backup_dedup` before deletion.\n",
    "3. Use a window function (ROW_NUMBER over dedup_key ordered by quality_score DESC, id ASC) to delete rows with rn>1.\n",
    "4. Re-check that duplicates are gone; add UNIQUE constraints and indexes on api10_number/api_number.\n",
    "\n",
    "Execution safety:\n",
    "- Controlled by a flag EXECUTE_DELETE (default False) so the cell can be run safely for preview first.\n",
    "- Backup table preserves all original rows for easy restore if needed.\n",
    "\n",
    "Assumptions:\n",
    "- api_number derived from api10 so consistency between both keys in groups.\n",
    "- There is no authoritative timestamp column; we fall back to id ordering for deterministic tie-break.\n",
    "\n",
    "You can adjust weights if needed before running deletion."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "068460b9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'keys_present': 1027045, 'keys_null': 363149, 'duplicate_groups': 9401}\n",
      "Top duplicate groups (by count):\n",
      "    dedup_key  cnt\n",
      "0  4239531364   15\n",
      "1    12733586   14\n",
      "2  4239530892   13\n",
      "3  4212733751   11\n",
      "4    50732832   11\n",
      "5    12738366   10\n",
      "6  4239530437   10\n",
      "7    02131620    9\n",
      "8    04131132    9\n",
      "9    04131897    9\n",
      "Top duplicate groups (by count):\n",
      "    dedup_key  cnt\n",
      "0  4239531364   15\n",
      "1    12733586   14\n",
      "2  4239530892   13\n",
      "3  4212733751   11\n",
      "4    50732832   11\n",
      "5    12738366   10\n",
      "6  4239530437   10\n",
      "7    02131620    9\n",
      "8    04131132    9\n",
      "9    04131897    9\n",
      "Sample rows for example duplicate key:\n",
      "       id       api       api10        apinum api_number api10_number  \\\n",
      "0  983150  39531364  39531364HD  4239531364HD   39531364   4239531364   \n",
      "1  983190  39531364  39531364HF  4239531364HF   39531364   4239531364   \n",
      "2  983193  39531364  39531364HO  4239531364HO   39531364   4239531364   \n",
      "3  983367  39531364  39531364HA  4239531364HA   39531364   4239531364   \n",
      "4  983378  39531364  39531364HL  4239531364HL   39531364   4239531364   \n",
      "\n",
      "    latitude  longitude census_tract_geoid  has_geom  \n",
      "0  30.803677 -96.473793        48395960400      True  \n",
      "1  30.797427 -96.466177        48395960400      True  \n",
      "2  30.795806 -96.468429        48395960400      True  \n",
      "3  30.814532 -96.478007        48395960400      True  \n",
      "4  30.810300 -96.480577        48395960400      True  \n",
      "Sample rows for example duplicate key:\n",
      "       id       api       api10        apinum api_number api10_number  \\\n",
      "0  983150  39531364  39531364HD  4239531364HD   39531364   4239531364   \n",
      "1  983190  39531364  39531364HF  4239531364HF   39531364   4239531364   \n",
      "2  983193  39531364  39531364HO  4239531364HO   39531364   4239531364   \n",
      "3  983367  39531364  39531364HA  4239531364HA   39531364   4239531364   \n",
      "4  983378  39531364  39531364HL  4239531364HL   39531364   4239531364   \n",
      "\n",
      "    latitude  longitude census_tract_geoid  has_geom  \n",
      "0  30.803677 -96.473793        48395960400      True  \n",
      "1  30.797427 -96.466177        48395960400      True  \n",
      "2  30.795806 -96.468429        48395960400      True  \n",
      "3  30.814532 -96.478007        48395960400      True  \n",
      "4  30.810300 -96.480577        48395960400      True  \n"
     ]
    }
   ],
   "source": [
    "# Cell 12: Duplicate audit (preview only)\n",
    "from sqlalchemy import text\n",
    "import pandas as pd\n",
    "\n",
    "# Helper: show top duplicate keys and sample rows\n",
    "with engine.begin() as conn:\n",
    "    # Count duplicate groups for both keys\n",
    "    dup_counts = conn.execute(text(\n",
    "        \"\"\"\n",
    "        WITH k AS (\n",
    "          SELECT COALESCE(api10_number, api_number) AS dedup_key\n",
    "          FROM well_shapes\n",
    "        )\n",
    "        SELECT\n",
    "          COUNT(*) FILTER (WHERE dedup_key IS NOT NULL)                      AS keys_present,\n",
    "          COUNT(*) FILTER (WHERE dedup_key IS NULL)                          AS keys_null,\n",
    "          COUNT(*) FILTER (WHERE dedup_key IS NOT NULL) -\n",
    "          COUNT(DISTINCT dedup_key)                                          AS duplicate_groups\n",
    "        FROM k\n",
    "        \"\"\"\n",
    "    )).fetchone()\n",
    "\n",
    "print({\n",
    "    'keys_present': dup_counts[0],\n",
    "    'keys_null': dup_counts[1],\n",
    "    'duplicate_groups': dup_counts[2]\n",
    "})\n",
    "\n",
    "with engine.begin() as conn:\n",
    "    top_dups = pd.read_sql(text(\n",
    "        \"\"\"\n",
    "        SELECT COALESCE(api10_number, api_number) AS dedup_key, COUNT(*) AS cnt\n",
    "        FROM well_shapes\n",
    "        WHERE COALESCE(api10_number, api_number) IS NOT NULL\n",
    "        GROUP BY 1\n",
    "        HAVING COUNT(*) > 1\n",
    "        ORDER BY cnt DESC, dedup_key\n",
    "        LIMIT 10\n",
    "        \"\"\"\n",
    "    ), conn)\n",
    "\n",
    "print(\"Top duplicate groups (by count):\")\n",
    "print(top_dups)\n",
    "\n",
    "# Show sample rows for one duplicate key (if any)\n",
    "if not top_dups.empty:\n",
    "    example_key = top_dups.iloc[0]['dedup_key']\n",
    "    with engine.begin() as conn:\n",
    "        sample_rows = pd.read_sql(text(\n",
    "            \"\"\"\n",
    "            SELECT id, api, api10, apinum, api_number, api10_number, latitude, longitude,\n",
    "                   census_tract_geoid, geom IS NOT NULL AS has_geom\n",
    "            FROM well_shapes\n",
    "            WHERE COALESCE(api10_number, api_number) = :k\n",
    "            ORDER BY id\n",
    "            LIMIT 5\n",
    "            \"\"\"\n",
    "        ), conn, params={'k': example_key})\n",
    "    print(\"Sample rows for example duplicate key:\")\n",
    "    print(sample_rows)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "277b2c9a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Backup table well_shapes_backup_dedup created (full copy).\n",
      "{'would_delete': 9401, 'would_keep': 1017644, 'key_null': 363149}\n",
      "{'would_delete': 9401, 'would_keep': 1017644, 'key_null': 363149}\n",
      "Deleted 9401 duplicate rows.\n",
      "Deleted 9401 duplicate rows.\n"
     ]
    }
   ],
   "source": [
    "# Cell 13: Deduplicate in-place with quality score (guarded by EXECUTE_DELETE)\n",
    "from sqlalchemy import text\n",
    "\n",
    "EXECUTE_DELETE = True  # set True to perform deletion\n",
    "\n",
    "# Build a dynamic quality score: higher is better\n",
    "# We approximate \"more complete\" by counting non-null attributes and adding weights for geom/latlon/api10/apinum/tract\n",
    "\n",
    "# Identify attribute columns to count as \"other\" (exclude id, geom, and working/derived)\n",
    "with engine.begin() as conn:\n",
    "    cols = [r[0] for r in conn.execute(text(\n",
    "        \"\"\"\n",
    "        SELECT column_name\n",
    "        FROM information_schema.columns\n",
    "        WHERE table_schema='public' AND table_name='well_shapes'\n",
    "        \"\"\"\n",
    "    )).fetchall()]\n",
    "\n",
    "exclude_cols = set(['id','geom','latitude','longitude','census_tract_geoid',\n",
    "                    'api','api10','apinum',\n",
    "                    'api_digits_raw','api10_digits_raw','apinum_digits_raw',\n",
    "                    'api10_clean','api8_clean','api_number','api10_number'])\n",
    "other_cols = [c for c in cols if c not in exclude_cols]\n",
    "\n",
    "# Construct expression to count non-null other columns\n",
    "other_nonnull_expr = \" + \".join([f\"(CASE WHEN {c} IS NOT NULL THEN 1 ELSE 0 END)\" for c in other_cols]) or \"0\"\n",
    "\n",
    "score_expr = f\"\"\"\n",
    "  (CASE WHEN geom IS NOT NULL THEN 10 ELSE 0 END)\n",
    "+ (CASE WHEN latitude IS NOT NULL AND longitude IS NOT NULL THEN 5 ELSE 0 END)\n",
    "+ (CASE WHEN apinum_digits_raw IS NOT NULL AND length(apinum_digits_raw)=10 THEN 4 ELSE 0 END)\n",
    "+ (CASE WHEN api10_clean IS NOT NULL THEN 3 ELSE 0 END)\n",
    "+ (CASE WHEN census_tract_geoid IS NOT NULL THEN 2 ELSE 0 END)\n",
    "+ ({other_nonnull_expr})\n",
    "\"\"\"\n",
    "\n",
    "# Create backup table (idempotent replace)\n",
    "with engine.begin() as conn:\n",
    "    conn.execute(text(\"DROP TABLE IF EXISTS well_shapes_backup_dedup\"))\n",
    "    conn.execute(text(\"CREATE TABLE well_shapes_backup_dedup AS TABLE well_shapes\"))\n",
    "\n",
    "print(\"Backup table well_shapes_backup_dedup created (full copy).\")\n",
    "\n",
    "# Compute row_number per dedup_key; preview counts\n",
    "with engine.begin() as conn:\n",
    "    preview = conn.execute(text(f\"\"\"\n",
    "        WITH ranked AS (\n",
    "          SELECT id,\n",
    "                 COALESCE(api10_number, api_number) AS dedup_key,\n",
    "                 {score_expr} AS quality_score,\n",
    "                 ROW_NUMBER() OVER (\n",
    "                   PARTITION BY COALESCE(api10_number, api_number)\n",
    "                   ORDER BY {score_expr} DESC, id ASC\n",
    "                 ) AS rn\n",
    "          FROM well_shapes\n",
    "        )\n",
    "        SELECT\n",
    "          COUNT(*) FILTER (WHERE dedup_key IS NOT NULL AND rn>1) AS would_delete,\n",
    "          COUNT(*) FILTER (WHERE dedup_key IS NOT NULL AND rn=1) AS would_keep,\n",
    "          COUNT(*) FILTER (WHERE dedup_key IS NULL)               AS key_null\n",
    "        FROM ranked\n",
    "    \"\"\")).fetchone()\n",
    "\n",
    "print({\n",
    "    'would_delete': preview[0],\n",
    "    'would_keep': preview[1],\n",
    "    'key_null': preview[2]\n",
    "})\n",
    "\n",
    "if EXECUTE_DELETE:\n",
    "    with engine.begin() as conn:\n",
    "        deleted = conn.execute(text(f\"\"\"\n",
    "            WITH ranked AS (\n",
    "              SELECT id,\n",
    "                     COALESCE(api10_number, api_number) AS dedup_key,\n",
    "                     {score_expr} AS quality_score,\n",
    "                     ROW_NUMBER() OVER (\n",
    "                       PARTITION BY COALESCE(api10_number, api_number)\n",
    "                       ORDER BY {score_expr} DESC, id ASC\n",
    "                     ) AS rn\n",
    "              FROM well_shapes\n",
    "            )\n",
    "            DELETE FROM well_shapes w\n",
    "            USING ranked r\n",
    "            WHERE w.id = r.id AND r.dedup_key IS NOT NULL AND r.rn > 1\n",
    "            RETURNING 1\n",
    "        \"\"\")).rowcount\n",
    "    print(f\"Deleted {deleted or 0} duplicate rows.\")\n",
    "else:\n",
    "    print(\"EXECUTE_DELETE is False: no rows deleted. Set to True to perform deduplication.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "717433fb",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'duplicate_groups_after': 0}\n",
      "Indexes ensured; constraints applied only if no duplicates remained; ANALYZE complete.\n",
      "Indexes ensured; constraints applied only if no duplicates remained; ANALYZE complete.\n"
     ]
    }
   ],
   "source": [
    "# Cell 14: Post-dedup verification and indexing\n",
    "from sqlalchemy import text\n",
    "import pandas as pd\n",
    "\n",
    "with engine.begin() as conn:\n",
    "    # Re-check duplicates\n",
    "    dup_check = conn.execute(text(\n",
    "        \"\"\"\n",
    "        WITH k AS (\n",
    "          SELECT COALESCE(api10_number, api_number) AS dedup_key\n",
    "          FROM well_shapes\n",
    "        )\n",
    "        SELECT\n",
    "          COUNT(*) FILTER (WHERE dedup_key IS NOT NULL) - COUNT(DISTINCT dedup_key) AS duplicate_groups\n",
    "        FROM k\n",
    "        \"\"\"\n",
    "    )).scalar()\n",
    "\n",
    "print({'duplicate_groups_after': dup_check})\n",
    "\n",
    "# Ensure indexes and optional constraints\n",
    "with engine.begin() as conn:\n",
    "    conn.execute(text(\"CREATE INDEX IF NOT EXISTS idx_well_shapes_api10_number ON well_shapes (api10_number)\"))\n",
    "    conn.execute(text(\"CREATE INDEX IF NOT EXISTS idx_well_shapes_api_number ON well_shapes (api_number)\"))\n",
    "    conn.execute(text(\"CREATE INDEX IF NOT EXISTS idx_well_shapes_census_tract_geoid ON well_shapes (census_tract_geoid)\"))\n",
    "    # Only add unique constraints if no duplicates remain\n",
    "    if dup_check == 0:\n",
    "        conn.execute(text(\"ALTER TABLE well_shapes DROP CONSTRAINT IF EXISTS well_shapes_api10_number_unique\"))\n",
    "        conn.execute(text(\"ALTER TABLE well_shapes ADD CONSTRAINT well_shapes_api10_number_unique UNIQUE (api10_number)\"))\n",
    "        conn.execute(text(\"ALTER TABLE well_shapes DROP CONSTRAINT IF EXISTS well_shapes_api_number_unique\"))\n",
    "        conn.execute(text(\"ALTER TABLE well_shapes ADD CONSTRAINT well_shapes_api_number_unique UNIQUE (api_number)\"))\n",
    "    # Analyze for planner stats\n",
    "    conn.execute(text(\"ANALYZE well_shapes\"))\n",
    "\n",
    "print(\"Indexes ensured; constraints applied only if no duplicates remained; ANALYZE complete.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c0fb03ae",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cell 15 (optional): Restore original rows from backup\n",
    "from sqlalchemy import text\n",
    "\n",
    "CONFIRM_RESTORE = False  # set True to restore from backup table\n",
    "\n",
    "if CONFIRM_RESTORE:\n",
    "    with engine.begin() as conn:\n",
    "        # Sanity check backup exists and has rows\n",
    "        backup_rows = conn.execute(text(\"SELECT COUNT(*) FROM well_shapes_backup_dedup\")).scalar()\n",
    "        if not backup_rows:\n",
    "            raise SystemExit(\"Backup table is empty or missing; aborting restore.\")\n",
    "        # Replace current with backup\n",
    "        conn.execute(text(\"TRUNCATE TABLE well_shapes\"))\n",
    "        conn.execute(text(\"INSERT INTO well_shapes SELECT * FROM well_shapes_backup_dedup\"))\n",
    "        conn.execute(text(\"ANALYZE well_shapes\"))\n",
    "    print(f\"Restored {backup_rows} rows from well_shapes_backup_dedup.\")\n",
    "else:\n",
    "    print(\"CONFIRM_RESTORE is False: not restoring.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "77c01c81",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'rows_with_key': 1017644, 'distinct_keys': 1010430, 'duplicate_groups': 7214}\n",
      "Top duplicate groups after first pass (canonical):\n",
      "          key  cnt\n",
      "0  4200100766    2\n",
      "1  4200131333    2\n",
      "2  4200131738    2\n",
      "3  4200132166    2\n",
      "4  4200132207    2\n",
      "5  4200132236    2\n",
      "6  4200132318    2\n",
      "7  4200132407    2\n",
      "8  4200132416    2\n",
      "9  4200132501    2\n",
      "Top duplicate groups after first pass (canonical):\n",
      "          key  cnt\n",
      "0  4200100766    2\n",
      "1  4200131333    2\n",
      "2  4200131738    2\n",
      "3  4200132166    2\n",
      "4  4200132207    2\n",
      "5  4200132236    2\n",
      "6  4200132318    2\n",
      "7  4200132407    2\n",
      "8  4200132416    2\n",
      "9  4200132501    2\n",
      "Sample rows for top remaining duplicate group:\n",
      "     id       api       api10        apinum api_number api10_number  \\\n",
      "0   155  00100766  00100766DW  4200100766DW   00100766   4200100766   \n",
      "1  2859  00100766  00100766D1  4200100766D1   00100766         None   \n",
      "\n",
      "  canonical_api10  latitude  longitude census_tract_geoid  has_geom  \n",
      "0      4200100766  31.89849 -95.926702        48001951100      True  \n",
      "1      4200100766  31.89721 -95.926583        48001951100      True  \n",
      "Sample rows for top remaining duplicate group:\n",
      "     id       api       api10        apinum api_number api10_number  \\\n",
      "0   155  00100766  00100766DW  4200100766DW   00100766   4200100766   \n",
      "1  2859  00100766  00100766D1  4200100766D1   00100766         None   \n",
      "\n",
      "  canonical_api10  latitude  longitude census_tract_geoid  has_geom  \n",
      "0      4200100766  31.89849 -95.926702        48001951100      True  \n",
      "1      4200100766  31.89721 -95.926583        48001951100      True  \n"
     ]
    }
   ],
   "source": [
    "# Cell 16: Second-pass canonical key & audit\n",
    "from sqlalchemy import text\n",
    "import pandas as pd\n",
    "\n",
    "with engine.begin() as conn:\n",
    "    # Add a unified 10-digit canonical key if not exists\n",
    "    conn.execute(text(\"ALTER TABLE well_shapes ADD COLUMN IF NOT EXISTS canonical_api10 TEXT\"))\n",
    "    # Rule: prefer api10_number; if missing but api_number (8-digit) present, prepend '42' (Texas state code) to get a synthetic 10-digit; else NULL\n",
    "    conn.execute(text(\"\"\"\n",
    "        UPDATE well_shapes\n",
    "        SET canonical_api10 = CASE\n",
    "          WHEN api10_number IS NOT NULL AND length(api10_number)=10 THEN api10_number\n",
    "          WHEN api10_number IS NULL AND api_number IS NOT NULL AND length(api_number)=8 THEN '42' || api_number\n",
    "          ELSE canonical_api10 END\n",
    "        WHERE canonical_api10 IS NULL\n",
    "    \"\"\"))\n",
    "\n",
    "# Audit duplicates on canonical_api10\n",
    "with engine.begin() as conn:\n",
    "    dup_info = conn.execute(text(\"\"\"\n",
    "        SELECT COUNT(*) FILTER (WHERE canonical_api10 IS NOT NULL) AS key_rows,\n",
    "               COUNT(DISTINCT canonical_api10) AS distinct_keys,\n",
    "               (COUNT(*) FILTER (WHERE canonical_api10 IS NOT NULL) - COUNT(DISTINCT canonical_api10)) AS duplicate_groups\n",
    "        FROM well_shapes\n",
    "    \"\"\")).fetchone()\n",
    "\n",
    "print({\n",
    "    'rows_with_key': dup_info[0],\n",
    "    'distinct_keys': dup_info[1],\n",
    "    'duplicate_groups': dup_info[2]\n",
    "})\n",
    "\n",
    "with engine.begin() as conn:\n",
    "    top_dups2 = pd.read_sql(text(\"\"\"\n",
    "        SELECT canonical_api10 AS key, COUNT(*) AS cnt\n",
    "        FROM well_shapes\n",
    "        WHERE canonical_api10 IS NOT NULL\n",
    "        GROUP BY 1\n",
    "        HAVING COUNT(*)>1\n",
    "        ORDER BY cnt DESC, key\n",
    "        LIMIT 10\n",
    "    \"\"\"), conn)\n",
    "print(\"Top duplicate groups after first pass (canonical):\")\n",
    "print(top_dups2)\n",
    "\n",
    "if not top_dups2.empty:\n",
    "    sample_key = top_dups2.iloc[0]['key']\n",
    "    with engine.begin() as conn:\n",
    "        sample2 = pd.read_sql(text(\"\"\"\n",
    "            SELECT id, api, api10, apinum, api_number, api10_number, canonical_api10,\n",
    "                   latitude, longitude, census_tract_geoid, geom IS NOT NULL AS has_geom\n",
    "            FROM well_shapes\n",
    "            WHERE canonical_api10 = :k\n",
    "            ORDER BY id\n",
    "            LIMIT 10\n",
    "        \"\"\"), conn, params={'k': sample_key})\n",
    "    print(\"Sample rows for top remaining duplicate group:\")\n",
    "    print(sample2)\n",
    "else:\n",
    "    print(\"No remaining duplicate groups on canonical_api10.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "94717e26",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'would_delete': 7214, 'would_keep': 1010430}\n",
      "Deleted 7214 additional duplicate rows.\n",
      "Deleted 7214 additional duplicate rows.\n"
     ]
    }
   ],
   "source": [
    "# Cell 17: Second-pass dedup using canonical_api10 (guarded)\n",
    "from sqlalchemy import text\n",
    "\n",
    "EXECUTE_DELETE2 = True  # set True to perform second-pass deletion\n",
    "\n",
    "# Reuse the prior quality score but base partition on canonical_api10\n",
    "with engine.begin() as conn:\n",
    "    cols = [r[0] for r in conn.execute(text(\n",
    "        \"\"\"\n",
    "        SELECT column_name\n",
    "        FROM information_schema.columns\n",
    "        WHERE table_schema='public' AND table_name='well_shapes'\n",
    "        \"\"\"\n",
    "    )).fetchall()]\n",
    "\n",
    "exclude_cols = set(['id','geom','latitude','longitude','census_tract_geoid',\n",
    "                    'api','api10','apinum',\n",
    "                    'api_digits_raw','api10_digits_raw','apinum_digits_raw',\n",
    "                    'api10_clean','api8_clean','api_number','api10_number','canonical_api10'])\n",
    "other_cols = [c for c in cols if c not in exclude_cols]\n",
    "other_nonnull_expr = \" + \".join([f\"(CASE WHEN {c} IS NOT NULL THEN 1 ELSE 0 END)\" for c in other_cols]) or \"0\"\n",
    "\n",
    "score_expr = f\"\"\"\n",
    "  (CASE WHEN geom IS NOT NULL THEN 10 ELSE 0 END)\n",
    "+ (CASE WHEN latitude IS NOT NULL AND longitude IS NOT NULL THEN 5 ELSE 0 END)\n",
    "+ (CASE WHEN apinum_digits_raw IS NOT NULL AND length(apinum_digits_raw)=10 THEN 4 ELSE 0 END)\n",
    "+ (CASE WHEN api10_clean IS NOT NULL THEN 3 ELSE 0 END)\n",
    "+ (CASE WHEN census_tract_geoid IS NOT NULL THEN 2 ELSE 0 END)\n",
    "+ ({other_nonnull_expr})\n",
    "\"\"\"\n",
    "\n",
    "# Preview what would be deleted\n",
    "with engine.begin() as conn:\n",
    "    preview2 = conn.execute(text(f\"\"\"\n",
    "        WITH ranked AS (\n",
    "          SELECT id,\n",
    "                 canonical_api10 AS dedup_key,\n",
    "                 {score_expr} AS quality_score,\n",
    "                 ROW_NUMBER() OVER (\n",
    "                   PARTITION BY canonical_api10\n",
    "                   ORDER BY {score_expr} DESC, id ASC\n",
    "                 ) AS rn\n",
    "          FROM well_shapes\n",
    "          WHERE canonical_api10 IS NOT NULL\n",
    "        )\n",
    "        SELECT COUNT(*) FILTER (WHERE rn>1) AS would_delete,\n",
    "               COUNT(*) FILTER (WHERE rn=1) AS would_keep\n",
    "        FROM ranked\n",
    "    \"\"\")).fetchone()\n",
    "\n",
    "print({'would_delete': preview2[0], 'would_keep': preview2[1]})\n",
    "\n",
    "if EXECUTE_DELETE2:\n",
    "    with engine.begin() as conn:\n",
    "        deleted2 = conn.execute(text(f\"\"\"\n",
    "            WITH ranked AS (\n",
    "              SELECT id,\n",
    "                     canonical_api10 AS dedup_key,\n",
    "                     {score_expr} AS quality_score,\n",
    "                     ROW_NUMBER() OVER (\n",
    "                       PARTITION BY canonical_api10\n",
    "                       ORDER BY {score_expr} DESC, id ASC\n",
    "                     ) AS rn\n",
    "              FROM well_shapes\n",
    "              WHERE canonical_api10 IS NOT NULL\n",
    "            )\n",
    "            DELETE FROM well_shapes w\n",
    "            USING ranked r\n",
    "            WHERE w.id = r.id AND r.rn > 1\n",
    "            RETURNING 1\n",
    "        \"\"\")).rowcount\n",
    "    print(f\"Deleted {deleted2 or 0} additional duplicate rows.\")\n",
    "else:\n",
    "    print(\"EXECUTE_DELETE2 is False: not deleting in second pass.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "b8f76de7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'canonical_duplicate_groups_after': 0}\n",
      "Applied UNIQUE constraint on canonical_api10\n",
      "Applied UNIQUE constraint on canonical_api10\n",
      "Final ANALYZE complete.\n",
      "Final ANALYZE complete.\n"
     ]
    }
   ],
   "source": [
    "# Cell 18: Final verification & constraints (after second pass)\n",
    "from sqlalchemy import text\n",
    "\n",
    "with engine.begin() as conn:\n",
    "    final_dup = conn.execute(text(\"\"\"\n",
    "        SELECT COUNT(*) FILTER (WHERE canonical_api10 IS NOT NULL) - COUNT(DISTINCT canonical_api10) AS duplicate_groups\n",
    "        FROM well_shapes\n",
    "    \"\"\")).scalar()\n",
    "\n",
    "print({'canonical_duplicate_groups_after': final_dup})\n",
    "\n",
    "with engine.begin() as conn:\n",
    "    if final_dup == 0:\n",
    "        conn.execute(text(\"ALTER TABLE well_shapes DROP CONSTRAINT IF EXISTS well_shapes_canonical_api10_unique\"))\n",
    "        conn.execute(text(\"ALTER TABLE well_shapes ADD CONSTRAINT well_shapes_canonical_api10_unique UNIQUE (canonical_api10)\"))\n",
    "        print(\"Applied UNIQUE constraint on canonical_api10\")\n",
    "    else:\n",
    "        print(\"Skipped UNIQUE constraint on canonical_api10 (still duplicates present)\")\n",
    "    conn.execute(text(\"ANALYZE well_shapes\"))\n",
    "\n",
    "print(\"Final ANALYZE complete.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fbe051ab",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "8b7d1ee8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Created table well_shape_tract with selected columns.\n",
      "Indexes ensured on well_shape_tract; UNIQUE applied on canonical_api10 if safe; ANALYZE done.\n"
     ]
    }
   ],
   "source": [
    "# Cell 19: Materialize to well_shape_tract (cleaned projection)\n",
    "from sqlalchemy import text\n",
    "\n",
    "# Choose columns to keep: identifiers, geometry, lat/lon, tract, and a few useful sources\n",
    "keep_cols = [\n",
    "    'id',\n",
    "    'canonical_api10', 'api10_number', 'api_number',\n",
    "    'apinum', 'api10', 'api',\n",
    "    'census_tract_geoid', 'latitude', 'longitude',\n",
    "    'geom'\n",
    "]\n",
    "\n",
    "# Build SELECT with only available columns\n",
    "with engine.begin() as conn:\n",
    "    cols = [r[0] for r in conn.execute(text(\n",
    "        \"\"\"\n",
    "        SELECT column_name\n",
    "        FROM information_schema.columns\n",
    "        WHERE table_schema='public' AND table_name='well_shapes'\n",
    "        \"\"\"\n",
    "    )).fetchall()]\n",
    "\n",
    "available = [c for c in keep_cols if c in cols]\n",
    "missing = [c for c in keep_cols if c not in cols]\n",
    "if missing:\n",
    "    print({ 'skipping_missing_columns': missing })\n",
    "\n",
    "select_list = \", \".join(available)\n",
    "\n",
    "create_sql = f\"\"\"\n",
    "DROP TABLE IF EXISTS well_shape_tract;\n",
    "CREATE TABLE well_shape_tract AS\n",
    "SELECT {select_list}\n",
    "FROM well_shapes;\n",
    "\"\"\"\n",
    "\n",
    "with engine.begin() as conn:\n",
    "    conn.execute(text(create_sql))\n",
    "\n",
    "print(\"Created table well_shape_tract with selected columns.\")\n",
    "\n",
    "# Add types/constraints/indexes\n",
    "with engine.begin() as conn:\n",
    "    # Ensure geometry type and SRID declared (copy may keep it, but enforce explicitly)\n",
    "    conn.execute(text(\"ALTER TABLE well_shape_tract ALTER COLUMN geom TYPE geometry(Point,4326) USING ST_SetSRID(ST_Force2D(geom),4326)\"))\n",
    "    conn.execute(text(\"CREATE INDEX IF NOT EXISTS idx_wst_geom ON well_shape_tract USING GIST (geom)\"))\n",
    "    conn.execute(text(\"CREATE INDEX IF NOT EXISTS idx_wst_canonical_api10 ON well_shape_tract (canonical_api10)\"))\n",
    "    conn.execute(text(\"CREATE INDEX IF NOT EXISTS idx_wst_api10_number ON well_shape_tract (api10_number)\"))\n",
    "    conn.execute(text(\"CREATE INDEX IF NOT EXISTS idx_wst_api_number ON well_shape_tract (api_number)\"))\n",
    "    conn.execute(text(\"CREATE INDEX IF NOT EXISTS idx_wst_tract ON well_shape_tract (census_tract_geoid)\"))\n",
    "    # Only add unique if source is clean\n",
    "    dup = conn.execute(text(\"\"\"\n",
    "        SELECT COUNT(*) FILTER (WHERE canonical_api10 IS NOT NULL) - COUNT(DISTINCT canonical_api10)\n",
    "        FROM well_shape_tract\n",
    "    \"\"\")).scalar()\n",
    "    if dup == 0:\n",
    "        conn.execute(text(\"ALTER TABLE well_shape_tract DROP CONSTRAINT IF EXISTS wst_canonical_api10_unique\"))\n",
    "        conn.execute(text(\"ALTER TABLE well_shape_tract ADD CONSTRAINT wst_canonical_api10_unique UNIQUE (canonical_api10)\"))\n",
    "\n",
    "    conn.execute(text(\"ANALYZE well_shape_tract\"))\n",
    "\n",
    "print(\"Indexes ensured on well_shape_tract; UNIQUE applied on canonical_api10 if safe; ANALYZE done.\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}