# Cell 1: Imports & database connection setup
import os
import pandas as pd
import geopandas as gpd
from sqlalchemy import create_engine, text
from pathlib import Path

# Connection parameters come from the environment so no credentials are hardcoded.
PGHOST = os.getenv("PGHOST", "localhost")
PGPORT = os.getenv("PGPORT", "5432")
PGUSER = os.getenv("PGUSER", "postgres")
PGPASSWORD = os.getenv("PGPASSWORD", "")
PGDATABASE = os.getenv("PGDATABASE", "texas_data")  # default; we may switch if target DB differs

pg_url = f"postgresql+psycopg2://{PGUSER}:{PGPASSWORD}@{PGHOST}:{PGPORT}/{PGDATABASE}"
engine = create_engine(pg_url)
print(f"Connected to {PGDATABASE} at {PGHOST}:{PGPORT} as {PGUSER}")

# Candidate well tables, listed ONCE in priority order (highest preference first).
# The original kept two identical literal lists (candidates + preferred_order),
# which could silently drift apart; a single list serves both purposes.
candidate_well_tables = ["well_shape_tract", "well_shapes", "wells", "rcc_well_api_raw"]

# Detect which candidate tables actually exist in the public schema.
with engine.begin() as conn:
    rows = conn.execute(text("""
        SELECT table_name
        FROM information_schema.tables
        WHERE table_schema='public'
    """)).fetchall()
    present = {r[0] for r in rows}

# Preserve priority order while filtering to tables that exist.
existing_well_tables = [t for t in candidate_well_tables if t in present]
print({"well_tables_found": existing_well_tables})

# Select preferred well source: highest-priority existing table wins.
well_table = existing_well_tables[0] if existing_well_tables else None
if well_table is None:
    raise RuntimeError("No well table found among candidates; ensure wells are loaded first.")
print({"using_well_table": well_table})
# Cell 2: Helper to read and standardize a shapefile layer to EPSG:4326
from typing import Optional

def load_layer(path: Path, layer_name: str, name_field_candidates=None) -> gpd.GeoDataFrame:
    """Read a shapefile and standardize it to ['feature_name', 'geometry'] in EPSG:4326.

    Parameters
    ----------
    path : Path
        Path to the .shp file; must exist.
    layer_name : str
        Label used only for diagnostic prints.
    name_field_candidates : list[str] | None
        Ordered attribute names to try as the feature name; defaults to
        common basin/play/region field spellings.

    Returns
    -------
    gpd.GeoDataFrame
        Columns: 'feature_name' (stripped str, '' mapped to None) and
        'geometry' (non-null, EPSG:4326).

    Raises
    ------
    FileNotFoundError
        If `path` does not exist.
    """
    if name_field_candidates is None:
        name_field_candidates = ["name", "Name", "Basin", "BASIN", "Play", "PLAY", "Region", "REGION", "TITLE"]
    if not path.exists():
        raise FileNotFoundError(f"Path does not exist: {path}")
    gdf = gpd.read_file(path)
    # Reproject if needed; a missing CRS is assumed to already be 4326.
    if gdf.crs is None:
        print(f"[warn] {layer_name} has no CRS; assuming EPSG:4326")
        gdf = gdf.set_crs(4326)
    if gdf.crs.to_epsg() != 4326:
        # to_epsg() may return None for CRSes without an EPSG code; either
        # way we reproject to 4326.
        gdf = gdf.to_crs(4326)
    # Pick the first candidate name field that exists.
    chosen = None
    for cand in name_field_candidates:
        if cand in gdf.columns:
            chosen = cand
            break
    if chosen is None:
        # No recognizable name field: synthesize one from the index.
        gdf["feature_name"] = gdf.index.astype(str)
        chosen = "feature_name"
    # BUGFIX: the original used rename(), which creates DUPLICATE
    # 'feature_name' columns when the layer already has a 'feature_name'
    # column in addition to the chosen candidate; plain assignment is safe.
    gdf["feature_name"] = gdf[chosen]
    gdf = gdf[["feature_name", "geometry"]].copy()
    gdf["feature_name"] = gdf["feature_name"].astype(str).str.strip()
    gdf = gdf[gdf["geometry"].notna()]
    gdf["feature_name"] = gdf["feature_name"].replace({"": None})
    print({"layer": layer_name, "features": len(gdf)})
    return gdf
# Cell 3: Load basin, shale play, and texmex regions (robust discovery + diagnostics)
from pprint import pprint

base_dir = Path("data")
basin_dir = base_dir / "oil_gas_basin_shape"
play_dir = base_dir / "shale_play_shape"
texmex_dir = base_dir / "texmex_shape"  # now pointing to data/texmex_shape

# Direct shapefile paths. The absolute defaults are machine-specific, so allow
# overriding via environment variables instead of editing the notebook
# (backward-compatible: with no env vars set, behavior is unchanged).
DIRECT_BASIN_SHP = Path(os.getenv("BASIN_SHP", "/home/dadams/Repos/texas-rebuild-postgis/data/oil_gas_basin_shape/Tx_ShaleBasins_11_USEIA.shp"))
DIRECT_PLAY_SHP = Path(os.getenv("PLAY_SHP", "/home/dadams/Repos/texas-rebuild-postgis/data/shale_play_shape/Tx_ShalePlays_11_USEIA.shp"))
DIRECT_TEXMEX_SHP = Path(os.getenv("TEXMEX_SHP", "/home/dadams/Repos/texas-rebuild-postgis/data/texmex_shape/tl_2023_us_internationalboundary.shp"))

def list_all_shapefiles(root: Path, max_results=50):
    """List up to `max_results` shapefiles under `root` (recursive)."""
    if not root.exists():
        return []
    files = list(root.rglob("*.shp"))
    return files[:max_results]

def find_first_shp(path_candidate: Path, hint: str = "layer") -> Path | None:
    """Resolve a shapefile: accept a direct .shp path, otherwise search the
    candidate directory, then base_dir for files matching `hint`.
    Prints suggestions and returns None instead of hard-failing."""
    if path_candidate and path_candidate.is_file() and path_candidate.suffix.lower() == ".shp":
        return path_candidate
    candidates = []
    if path_candidate and path_candidate.exists() and path_candidate.is_dir():
        candidates.extend(path_candidate.rglob("*.shp"))
    if base_dir.exists():
        candidates.extend([p for p in base_dir.rglob(f"*{hint}*.shp")])
    # De-duplicate while preserving discovery order (first match wins).
    seen = set()
    deduped = []
    for c in candidates:
        if c not in seen:
            deduped.append(c)
            seen.add(c)
    if deduped:
        return deduped[0]
    print(f"[warn] No shapefile found for hint '{hint}'. Listing available shapefiles under {base_dir}:")
    for p in list_all_shapefiles(base_dir):
        print(f"  - {p}")
    return None

basin_shp = find_first_shp(DIRECT_BASIN_SHP if DIRECT_BASIN_SHP.exists() else basin_dir, hint="basin")
play_shp = find_first_shp(DIRECT_PLAY_SHP if DIRECT_PLAY_SHP.exists() else play_dir, hint="play")
texmex_shp = find_first_shp(DIRECT_TEXMEX_SHP if DIRECT_TEXMEX_SHP.exists() else texmex_dir, hint="texmex")

print({"basin_shp": str(basin_shp) if basin_shp else None,
       "play_shp": str(play_shp) if play_shp else None,
       "texmex_shp": str(texmex_shp) if texmex_shp else None})

def _empty_layer() -> gpd.GeoDataFrame:
    """Schema-compatible empty layer used when a shapefile is missing."""
    return gpd.GeoDataFrame(columns=["feature_name", "geometry"], geometry="geometry", crs=4326)

basins_gdf = load_layer(basin_shp, "basins") if basin_shp else _empty_layer()
plays_gdf = load_layer(play_shp, "shale_plays") if play_shp else _empty_layer()
texmex_gdf = load_layer(texmex_shp, "texmex") if texmex_shp else _empty_layer()

print({"basins_rows": len(basins_gdf), "plays_rows": len(plays_gdf), "texmex_rows": len(texmex_gdf)})
# Cell 4: Persist shape layers to PostGIS with GIST indexes
# Each spec: (GeoDataFrame, target table name, name to give the feature column).
layer_specs = [
    (basins_gdf, "oil_gas_basins", "basin_name"),
    (plays_gdf, "shale_plays", "play_name"),
    (texmex_gdf, "texmex_regions", "texmex_name"),
]

with engine.begin() as conn:
    conn.execute(text("CREATE EXTENSION IF NOT EXISTS postgis"))
    for gdf, table, new_name in layer_specs:
        # ROBUSTNESS: Cell 3 falls back to an empty GeoDataFrame when a
        # shapefile is missing; writing that would create a useless table.
        if len(gdf) == 0:
            print(f"[warn] {table}: source layer empty; skipping write")
            continue
        df = gdf.rename(columns={"feature_name": new_name})
        # Write (replace each time); default geometry column name varies by geopandas version (often 'geom')
        df.to_postgis(table, conn, if_exists="replace", index=False)
        # Normalize geometry column name to 'geometry'.
        # Try geometry_columns first (PostGIS catalog view).
        geom_col = conn.execute(
            text("""
                SELECT f_geometry_column
                FROM geometry_columns
                WHERE f_table_schema='public' AND f_table_name=:t
                LIMIT 1
            """), {"t": table}
        ).scalar()
        if geom_col is None:
            # Fallback: inspect columns for 'geom' or 'geometry'
            cols = [r[0] for r in conn.execute(text(
                "SELECT column_name FROM information_schema.columns WHERE table_schema='public' AND table_name=:t"
            ), {"t": table})]
            if "geometry" in cols:
                geom_col = "geometry"
            elif "geom" in cols:
                geom_col = "geom"
        if geom_col and geom_col != "geometry":
            conn.execute(text(f"ALTER TABLE {table} RENAME COLUMN {geom_col} TO geometry"))
        # Create GIST index on the normalized column (table names are
        # notebook-internal constants, so f-string SQL is acceptable here).
        conn.execute(text(f"CREATE INDEX IF NOT EXISTS idx_{table}_geom ON {table} USING GIST (geometry)"))
        # Basic stats
        count = conn.execute(text(f"SELECT COUNT(*) FROM {table}")).scalar()
        print({"table": table, "rows": int(count)})
# Cell 5: Materialize well_geo_features by spatially joining wells to basins, plays, and texmex
# Contract:
#  - Input: well_table with geom in EPSG:4326
#  - Outputs: well_geo_features table with columns:
#      id, canonical_api10 (if present), api10_number/api_number (if present), geom,
#      basin_name, play_name, texmex_name (nullable)
#  - Indexes: btree on canonical_api10 (if exists) and GIST on geom
# NOTE(review): a well intersecting more than one basin/play/texmex polygon is
# emitted once per match by these LEFT JOINs, so well_geo_features may hold
# more rows than well_table — confirm downstream consumers expect this.

# Probe well_table columns to pick identifiers and geometry column
with engine.begin() as conn:
    cols = conn.execute(text("""
        SELECT column_name
        FROM information_schema.columns
        WHERE table_schema='public' AND table_name=:t
    """), {"t": well_table}).fetchall()
    cols = {r[0] for r in cols}

id_cols = [c for c in ["canonical_api10", "api10_number", "api_number"] if c in cols]
geom_col = "geom" if "geom" in cols else ("geometry" if "geometry" in cols else None)
if geom_col is None:
    raise RuntimeError(f"No geometry column found in {well_table} (expected geom or geometry)")

# Identifier projection: every known id column, or a NULL placeholder so the
# output schema always carries canonical_api10.
id_select = ', '.join([f'w.{c}' for c in id_cols]) if id_cols else 'NULL::text AS canonical_api10'

# Build SQL for left-joins using ST_Intersects.
select_core = f"""
SELECT
  w.id,
  {id_select} ,
  w.{geom_col} AS geom,
  b.basin_name,
  p.play_name,
  t.texmex_name
FROM {well_table} w
LEFT JOIN oil_gas_basins b
  ON ST_Intersects(w.{geom_col}, b.geometry)
LEFT JOIN shale_plays p
  ON ST_Intersects(w.{geom_col}, p.geometry)
LEFT JOIN texmex_regions t
  ON ST_Intersects(w.{geom_col}, t.geometry)
"""

# Build into a staging table, then swap it in so the target is replaced atomically.
materialize_sql = f"""
DROP TABLE IF EXISTS _well_geo_features_stage;
CREATE TABLE _well_geo_features_stage AS
{select_core};

-- Replace target
DROP TABLE IF EXISTS well_geo_features;
ALTER TABLE _well_geo_features_stage RENAME TO well_geo_features;
"""

with engine.begin() as conn:
    conn.execute(text("CREATE EXTENSION IF NOT EXISTS postgis"))
    conn.execute(text(materialize_sql))
    # Indexes
    if "canonical_api10" in id_cols:
        conn.execute(text("CREATE INDEX IF NOT EXISTS idx_wgf_api10 ON well_geo_features (canonical_api10)"))
    conn.execute(text("CREATE INDEX IF NOT EXISTS idx_wgf_geom ON well_geo_features USING GIST (geom)"))
    conn.execute(text("ANALYZE well_geo_features"))
    # Row count in the same transaction — the original opened a second,
    # redundant connection just for this query.
    cnt = conn.execute(text("SELECT COUNT(*) FROM well_geo_features")).scalar()
print({"well_geo_features_rows": int(cnt)})
"object", "type": "unknown" } ], "ref": "3301df45-2297-4eaa-8fd0-58c1152387f4", "rows": [ [ "0", "892679", "4236101293", null, "36101293", "0101000020E6100000F9E7D515206E57C052658EAEA3123E40", null, null, null ], [ "1", "892696", "4236130846", null, "36130846", "0101000020E610000031106C9FA26E57C01AA62FF3010D3E40", "8", null, null ], [ "2", "892697", "4236130889", null, "36130889", "0101000020E61000006EAF7E272C6F57C027656F143A0B3E40", "8", null, null ], [ "3", "892698", "4236130612", null, "36130612", "0101000020E6100000FE4F591F686F57C0EF5986F5270B3E40", "8", null, null ], [ "4", "892699", "4236130951", null, "36130951", "0101000020E6100000A9056F842A6F57C0EB553C5CF4093E40", "8", null, null ] ], "shape": { "columns": 8, "rows": 5 } }, "text/html": [ "
| \n", " | id | \n", "canonical_api10 | \n", "api10_number | \n", "api_number | \n", "geom | \n", "basin_name | \n", "play_name | \n", "texmex_name | \n", "
|---|---|---|---|---|---|---|---|---|
| 0 | \n", "892679 | \n", "4236101293 | \n", "None | \n", "36101293 | \n", "0101000020E6100000F9E7D515206E57C052658EAEA312... | \n", "None | \n", "None | \n", "None | \n", "
| 1 | \n", "892696 | \n", "4236130846 | \n", "None | \n", "36130846 | \n", "0101000020E610000031106C9FA26E57C01AA62FF3010D... | \n", "8 | \n", "None | \n", "None | \n", "
| 2 | \n", "892697 | \n", "4236130889 | \n", "None | \n", "36130889 | \n", "0101000020E61000006EAF7E272C6F57C027656F143A0B... | \n", "8 | \n", "None | \n", "None | \n", "
| 3 | \n", "892698 | \n", "4236130612 | \n", "None | \n", "36130612 | \n", "0101000020E6100000FE4F591F686F57C0EF5986F5270B... | \n", "8 | \n", "None | \n", "None | \n", "
| 4 | \n", "892699 | \n", "4236130951 | \n", "None | \n", "36130951 | \n", "0101000020E6100000A9056F842A6F57C0EB553C5CF409... | \n", "8 | \n", "None | \n", "None | \n", "