# Cell 1: Imports & database connection setup
import os
from pathlib import Path
from urllib.parse import quote_plus

import pandas as pd
import geopandas as gpd
from sqlalchemy import create_engine, text

# Connection settings come from the environment so no credentials are hardcoded.
PGHOST = os.getenv("PGHOST", "localhost")
PGPORT = os.getenv("PGPORT", "5432")
PGUSER = os.getenv("PGUSER", "postgres")
PGPASSWORD = os.getenv("PGPASSWORD", "")
PGDATABASE = os.getenv("PGDATABASE", "texas_data")  # default; we may switch if target DB differs

# quote_plus guards against '@', ':', '/' etc. in the user/password breaking the DSN.
pg_url = (
    f"postgresql+psycopg2://{quote_plus(PGUSER)}:{quote_plus(PGPASSWORD)}"
    f"@{PGHOST}:{PGPORT}/{PGDATABASE}"
)
engine = create_engine(pg_url)
print(f"Connected to {PGDATABASE} at {PGHOST}:{PGPORT} as {PGUSER}")

# Detect candidate well tables. One list doubles as the priority order, so
# detection and selection can never drift apart.
candidate_well_tables = ["well_shape_tract", "well_shapes", "wells", "rcc_well_api_raw"]
with engine.begin() as conn:
    rows = conn.execute(text("""
        SELECT table_name
        FROM information_schema.tables
        WHERE table_schema='public'
    """)).fetchall()
present = {r[0] for r in rows}
existing_well_tables = [t for t in candidate_well_tables if t in present]

print({"well_tables_found": existing_well_tables})
# First hit in priority order wins.
well_table = existing_well_tables[0] if existing_well_tables else None
if well_table is None:
    raise RuntimeError("No well table found among candidates; ensure wells are loaded first.")
print({"using_well_table": well_table})


# Cell 2: Helper to read and standardize a shapefile layer to EPSG:4326
from typing import Optional

def load_layer(path: Path, layer_name: str, name_field_candidates=None) -> gpd.GeoDataFrame:
    """Read a shapefile, reproject to EPSG:4326, and standardize it to a
    two-column GeoDataFrame: ``feature_name`` (str or None) + ``geometry``.

    Parameters
    ----------
    path : Path
        Path to an existing ``.shp`` file.
    layer_name : str
        Label used only in diagnostics printed to stdout.
    name_field_candidates : list[str] | None
        Ordered attribute columns to try as the feature name; first match wins.
        When none match, a synthetic index-based name is generated.

    Raises
    ------
    FileNotFoundError
        If ``path`` does not exist.
    """
    if name_field_candidates is None:
        name_field_candidates = ["name", "Name", "Basin", "BASIN", "Play", "PLAY", "Region", "REGION", "TITLE"]
    if not path.exists():
        raise FileNotFoundError(f"Path does not exist: {path}")
    gdf = gpd.read_file(path)
    # Reproject if needed. A missing CRS is assumed to already be WGS84.
    if gdf.crs is None:
        print(f"[warn] {layer_name} has no CRS; assuming EPSG:4326")
        gdf = gdf.set_crs(4326)
    if gdf.crs.to_epsg() != 4326:
        gdf = gdf.to_crs(4326)
    # Pick a name field
    chosen = None
    for cand in name_field_candidates:
        if cand in gdf.columns:
            chosen = cand
            break
    if chosen is None:
        # Create a synthetic name
        gdf["feature_name"] = gdf.index.astype(str)
        chosen = "feature_name"
    gdf = gdf.rename(columns={chosen: "feature_name"})
    gdf = gdf[["feature_name", "geometry"]].copy()
    gdf["feature_name"] = gdf["feature_name"].astype(str).str.strip()
    gdf = gdf[gdf["geometry"].notna()]
    # Blank names become NULLs rather than empty strings in the database.
    gdf["feature_name"] = gdf["feature_name"].replace({"": None})
    print({"layer": layer_name, "features": len(gdf)})
    return gdf
# Cell 3: Load basin, shale play, and texmex regions (robust discovery + diagnostics)
from pprint import pprint

base_dir = Path("data")
basin_dir = base_dir / "oil_gas_basin_shape"
play_dir = base_dir / "shale_play_shape"
texmex_dir = base_dir / "texmex_shape"  # now pointing to data/texmex_shape

# Direct shapefile paths. Environment-overridable so the notebook is not tied
# to one machine's home directory; defaults are relative to base_dir. If a
# path does not exist, discovery below falls back to a directory search.
DIRECT_BASIN_SHP = Path(os.getenv("BASIN_SHP", str(basin_dir / "Tx_ShaleBasins_11_USEIA.shp")))
DIRECT_PLAY_SHP = Path(os.getenv("PLAY_SHP", str(play_dir / "Tx_ShalePlays_11_USEIA.shp")))
DIRECT_TEXMEX_SHP = Path(os.getenv("TEXMEX_SHP", str(texmex_dir / "tl_2023_us_internationalboundary.shp")))

# Utility: list shapefiles under a directory (recursive), sorted for determinism.
def list_all_shapefiles(root: Path, max_results: int = 50) -> list[Path]:
    if not root.exists():
        return []
    return sorted(root.rglob("*.shp"))[:max_results]

# Discover a shapefile by directory or accept a direct file path.
# If missing, print suggestions instead of hard failing.
def find_first_shp(path_candidate: Path, hint: str = "layer") -> Path | None:
    """Resolve a shapefile from a direct .shp path or a directory; fall back
    to searching base_dir for filenames containing ``hint``. Returns None
    (after printing available shapefiles) when nothing matches."""
    if path_candidate and path_candidate.is_file() and path_candidate.suffix.lower() == ".shp":
        return path_candidate
    candidates: list[Path] = []
    if path_candidate and path_candidate.exists() and path_candidate.is_dir():
        # sorted → the same file is picked on every run/filesystem
        candidates.extend(sorted(path_candidate.rglob("*.shp")))
    if base_dir.exists():
        candidates.extend(sorted(base_dir.rglob(f"*{hint}*.shp")))
    seen: set[Path] = set()
    deduped = []
    for c in candidates:
        if c not in seen:
            deduped.append(c)
            seen.add(c)
    if deduped:
        return deduped[0]
    print(f"[warn] No shapefile found for hint '{hint}'. Listing available shapefiles under {base_dir}:")
    for p in list_all_shapefiles(base_dir):
        print(f" - {p}")
    return None

basin_shp = find_first_shp(DIRECT_BASIN_SHP if DIRECT_BASIN_SHP.exists() else basin_dir, hint="basin")
play_shp = find_first_shp(DIRECT_PLAY_SHP if DIRECT_PLAY_SHP.exists() else play_dir, hint="play")
texmex_shp = find_first_shp(DIRECT_TEXMEX_SHP if DIRECT_TEXMEX_SHP.exists() else texmex_dir, hint="texmex")

print({"basin_shp": str(basin_shp) if basin_shp else None,
       "play_shp": str(play_shp) if play_shp else None,
       "texmex_shp": str(texmex_shp) if texmex_shp else None})

def _empty_layer() -> gpd.GeoDataFrame:
    """Schema-compatible empty fallback when a layer's shapefile is missing."""
    return gpd.GeoDataFrame(columns=["feature_name", "geometry"], geometry="geometry", crs=4326)

basins_gdf = load_layer(basin_shp, "basins") if basin_shp else _empty_layer()
plays_gdf = load_layer(play_shp, "shale_plays") if play_shp else _empty_layer()
texmex_gdf = load_layer(texmex_shp, "texmex") if texmex_shp else _empty_layer()

print({"basins_rows": len(basins_gdf), "plays_rows": len(plays_gdf), "texmex_rows": len(texmex_gdf)})
# Cell 4: Persist shape layers to PostGIS with GIST indexes
layer_specs = [
    (basins_gdf, "oil_gas_basins", "basin_name"),
    (plays_gdf, "shale_plays", "play_name"),
    (texmex_gdf, "texmex_regions", "texmex_name"),
]

def _find_geometry_column(conn, table_name):
    """Return the geometry column name of a public table, or None.

    Checks PostGIS's geometry_columns view first, then falls back to
    inspecting information_schema for a 'geometry' or 'geom' column.
    """
    found = conn.execute(
        text("""
            SELECT f_geometry_column
            FROM geometry_columns
            WHERE f_table_schema='public' AND f_table_name=:t
            LIMIT 1
        """), {"t": table_name}
    ).scalar()
    if found is not None:
        return found
    column_names = [r[0] for r in conn.execute(text(
        "SELECT column_name FROM information_schema.columns WHERE table_schema='public' AND table_name=:t"
    ), {"t": table_name})]
    for candidate in ("geometry", "geom"):
        if candidate in column_names:
            return candidate
    return None

with engine.begin() as conn:
    conn.execute(text("CREATE EXTENSION IF NOT EXISTS postgis"))
    for layer_gdf, table, renamed_col in layer_specs:
        out_df = layer_gdf.rename(columns={"feature_name": renamed_col})
        # Replace the table wholesale on every run; the geometry column name
        # to_postgis writes varies by geopandas version (often 'geom').
        out_df.to_postgis(table, conn, if_exists="replace", index=False)
        # Normalize the geometry column name to 'geometry'.
        geom_col = _find_geometry_column(conn, table)
        if geom_col and geom_col != "geometry":
            conn.execute(text(f"ALTER TABLE {table} RENAME COLUMN {geom_col} TO geometry"))
        # Spatial index on the normalized column.
        conn.execute(text(f"CREATE INDEX IF NOT EXISTS idx_{table}_geom ON {table} USING GIST (geometry)"))
        # Basic stats
        count = conn.execute(text(f"SELECT COUNT(*) FROM {table}")).scalar()
        print({"table": table, "rows": int(count)})
# Cell 5: Materialize well_geo_features by spatially joining wells to basins, plays, and texmex
# Contract:
# - Input: well_table with geom in EPSG:4326
# - Outputs: well_geo_features table with columns:
#     id, canonical_api10 (if present), api10_number/api_number (if present), geom,
#     basin_name, play_name, texmex_name (nullable)
# - Indexes: btree on canonical_api10 (if exists) and GIST on geom

# Probe well_table columns to pick identifiers and geometry column
with engine.begin() as conn:
    cols = conn.execute(text("""
        SELECT column_name
        FROM information_schema.columns
        WHERE table_schema='public' AND table_name=:t
    """), {"t": well_table}).fetchall()
    cols = {r[0] for r in cols}

# Keep whichever well-identifier columns actually exist, in preference order.
id_cols = [c for c in ["canonical_api10", "api10_number", "api_number"] if c in cols]
geom_col = "geom" if "geom" in cols else ("geometry" if "geometry" in cols else None)
if geom_col is None:
    raise RuntimeError(f"No geometry column found in {well_table} (expected geom or geometry)")

# Build left-join SQL using ST_Intersects. If no identifier column exists, a
# NULL canonical_api10 placeholder keeps the output schema stable.
# NOTE(review): no nearest-neighbor fallback is implemented, and a well that
# intersects more than one basin/play/texmex polygon will produce duplicate
# rows — confirm the source polygons are non-overlapping.
select_core = f"""
SELECT
  w.id,
  {', '.join([f'w.{c}' for c in id_cols]) if id_cols else 'NULL::text AS canonical_api10'} ,
  w.{geom_col} AS geom,
  b.basin_name,
  p.play_name,
  t.texmex_name
FROM {well_table} w
LEFT JOIN oil_gas_basins b
  ON ST_Intersects(w.{geom_col}, b.geometry)
LEFT JOIN shale_plays p
  ON ST_Intersects(w.{geom_col}, p.geometry)
LEFT JOIN texmex_regions t
  ON ST_Intersects(w.{geom_col}, t.geometry)
"""

# Build into a staging table first, then swap it into place, so a failed
# build never leaves well_geo_features missing or half-written.
materialize_sql = f"""
DROP TABLE IF EXISTS _well_geo_features_stage;
CREATE TABLE _well_geo_features_stage AS
{select_core};

-- Replace target
DROP TABLE IF EXISTS well_geo_features;
ALTER TABLE _well_geo_features_stage RENAME TO well_geo_features;
"""

with engine.begin() as conn:
    conn.execute(text("CREATE EXTENSION IF NOT EXISTS postgis"))
    conn.execute(text(materialize_sql))
    # Indexes
    if "canonical_api10" in id_cols:
        conn.execute(text("CREATE INDEX IF NOT EXISTS idx_wgf_api10 ON well_geo_features (canonical_api10)"))
    conn.execute(text("CREATE INDEX IF NOT EXISTS idx_wgf_geom ON well_geo_features USING GIST (geom)"))
    # Refresh planner statistics for the freshly built table.
    conn.execute(text("ANALYZE well_geo_features"))

# Row count and sample
with engine.begin() as conn:
    cnt = conn.execute(text("SELECT COUNT(*) FROM well_geo_features")).scalar()
print({"well_geo_features_rows": int(cnt)})
# Cell 6: Diagnostics & sample output
# Confirm the derived tables exist, then pull a small sample for eyeballing.
TARGET_TABLES = {"oil_gas_basins", "shale_plays", "texmex_regions", "well_geo_features"}

with engine.begin() as conn:
    rows = conn.execute(text("""
        SELECT table_name
        FROM information_schema.tables
        WHERE table_schema='public' AND table_type='BASE TABLE'
    """)).fetchall()
table_list = sorted(r[0] for r in rows)

print("Tables present (subset):")
for name in table_list:
    if name in TARGET_TABLES:
        print(f" - {name}")

with engine.begin() as conn:
    sample = pd.read_sql(text("SELECT * FROM well_geo_features LIMIT 5"), conn)
print("Sample well_geo_features:")
sample  # last expression -> rich DataFrame display