Initial commit

commit 7b8890ed80
2026-01-30 10:57:55 -08:00
73 changed files with 14439530 additions and 0 deletions

rebuild/rrc_api_data.ipynb (new file, 319 lines)

@@ -0,0 +1,319 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "83f1549f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"RealDictRow({'version': 'PostgreSQL 17.6 on x86_64-pc-linux-gnu, compiled by gcc (GCC) 15.2.1 20250813, 64-bit'})\n"
]
}
],
"source": [
"# Cell 0: connect to Postgres\n",
"# Requires: pip install psycopg2-binary\n",
"import os\n",
"import psycopg2\n",
"from psycopg2.extras import RealDictCursor\n",
"\n",
"# Configure via environment variables (safer) or edit defaults below\n",
"PGHOST = os.getenv(\"PGHOST\", \"localhost\")\n",
"PGPORT = os.getenv(\"PGPORT\", \"5432\")\n",
"PGUSER = os.getenv(\"PGUSER\", \"postgres\")\n",
"PGPASSWORD = os.getenv(\"PGPASSWORD\", \"\")\n",
"# Default DB name set to \"postgres\"; will fall back if PGDATABASE not set in env\n",
"PGDATABASE = os.getenv(\"PGDATABASE\", \"texas_data\")\n",
"\n",
"if PGDATABASE is None or PGDATABASE == \"\":\n",
" print(\"[warn] PGDATABASE not set; using fallback 'postgres'\")\n",
" PGDATABASE = \"postgres\"\n",
"\n",
"def get_conn():\n",
" \"\"\"Return a new psycopg2 connection using configured environment variables.\"\"\"\n",
" return psycopg2.connect(\n",
" host=PGHOST,\n",
" port=PGPORT,\n",
" user=PGUSER,\n",
" password=PGPASSWORD,\n",
" dbname=PGDATABASE,\n",
" )\n",
"\n",
"# Quick connection test (only runs if executed as a script, not typical in notebook)\n",
"if __name__ == \"__main__\":\n",
" try:\n",
" conn = get_conn()\n",
" with conn.cursor(cursor_factory=RealDictCursor) as cur:\n",
" cur.execute(\"SELECT version() AS version;\")\n",
" print(cur.fetchone())\n",
" conn.close()\n",
" except Exception as e:\n",
" print(\"Postgres connection failed:\", e)\n",
" raise"
]
},
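{
"cell_type": "markdown",
"id": "1f2e3d4c",
"metadata": {},
"source": [
"A small sketch (not part of the original flow) of a `run_query` helper built on `get_conn()`: it opens a connection, runs one query through `RealDictCursor`, and always closes the connection. The helper name and signature are assumptions for illustration."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2a3b4c5d",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: hypothetical one-shot query helper wrapping get_conn() (not original code)\n",
"def run_query(sql, params=None):\n",
"    \"\"\"Run a single query and return all rows as dicts; always closes the connection.\"\"\"\n",
"    conn = get_conn()\n",
"    try:\n",
"        with conn.cursor(cursor_factory=RealDictCursor) as cur:\n",
"            cur.execute(sql, params)\n",
"            return cur.fetchall()\n",
"    finally:\n",
"        conn.close()\n",
"\n",
"# Example usage: run_query(\"SELECT current_database() AS db;\")"
]
},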
{
"cell_type": "code",
"execution_count": 2,
"id": "92cb7516",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"%pip install dbfread --quiet"
]
},
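{
"cell_type": "markdown",
"id": "3c4d5e6f",
"metadata": {},
"source": [
"Before the bulk load below, it can help to inspect one file's field layout. This is an illustrative sketch of the `dbfread` API; picking the first file from the ingest directory is an assumption."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4d5e6f7a",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: inspect the schema of a single DBF file before the bulk ingest\n",
"from pathlib import Path\n",
"from dbfread import DBF\n",
"\n",
"dbf_dir = Path('/home/dadams/data/rrc_api/all_dbf_files')\n",
"sample_files = sorted(dbf_dir.glob('*.dbf'))\n",
"if sample_files:\n",
"    table = DBF(sample_files[0], ignore_missing_memofile=True, encoding='latin1')\n",
"    # Each field carries a one-character DBF type code (C, N, D, ...) and a byte width\n",
"    for field in table.fields:\n",
"        print(field.name, field.type, field.length)\n",
"else:\n",
"    print('No DBF files found in', dbf_dir)"
]
},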
{
"cell_type": "code",
"execution_count": 3,
"id": "ba980c42",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found 254 DBF files\n",
"Unified column count: 19\n",
"Created table rrc_well_api_raw with TEXT columns.\n",
"Wrote 1547869 rows to temp CSV /tmp/tmp9776133n\n",
"Loaded 1547869 rows into rrc_well_api_raw\n",
"Added surrogate primary key column id\n",
"DBF ingestion complete.\n"
]
}
],
"source": [
"# Bulk ingest all .dbf files from /home/dadams/data/rrc_api/all_dbf_files/ into Postgres\n",
"import os\n",
"import re\n",
"from pathlib import Path\n",
"from dbfread import DBF\n",
"import psycopg2\n",
"import tempfile\n",
"import csv\n",
"\n",
"# Reuse connection function defined earlier\n",
"conn = get_conn()\n",
"base_dir = Path('/home/dadams/data/rrc_api/all_dbf_files')\n",
"if not base_dir.exists():\n",
" raise FileNotFoundError(f\"Directory {base_dir} not found\")\n",
"\n",
"# Collect DBF files\n",
"files = sorted(base_dir.glob('*.dbf'))\n",
"print(f\"Found {len(files)} DBF files\")\n",
"if not files:\n",
" raise SystemExit(\"No DBF files to process\")\n",
"\n",
"# Helper to make Postgres-safe, lowercase column names\n",
"def sanitize(name: str) -> str:\n",
" s = name.strip().lower()\n",
" s = re.sub(r\"\\s+\", \"_\", s)\n",
" s = re.sub(r\"[^a-z0-9_]+\", \"_\", s)\n",
" s = re.sub(r\"_+\", \"_\", s).strip(\"_\")\n",
" if not s or s[0].isdigit():\n",
" s = f\"col_{s}\" if s else \"col\"\n",
" return s\n",
"\n",
"# Preferred encoding for DBF char fields. latin1 is a safe single-byte fallback.\n",
"DBF_ENCODING = 'latin1'\n",
"DBF_DECODE_ERRORS = 'replace' # or 'ignore' to drop undecodable bytes\n",
"\n",
"# First pass: determine unified set of columns (lowercased + sanitized)\n",
"all_columns = []\n",
"for f in files:\n",
" try:\n",
" table = DBF(\n",
" f,\n",
" ignore_missing_memofile=True,\n",
" encoding=DBF_ENCODING,\n",
" char_decode_errors=DBF_DECODE_ERRORS,\n",
" ) # lazy\n",
" cols = [sanitize(n) for n in table.field_names]\n",
" except Exception as e:\n",
" print(f\"[warn] Skipping {f.name} due to read error: {e}\")\n",
" continue\n",
" for c in cols:\n",
" if c not in all_columns:\n",
" all_columns.append(c)\n",
"print(f\"Unified column count: {len(all_columns)}\")\n",
"\n",
"# Create table with TEXT columns for safety; refine types later if needed\n",
"columns_sql = ', '.join(f'\"{c}\" TEXT' for c in all_columns)\n",
"\n",
"target_table = 'rrc_well_api_raw'\n",
"with conn.cursor() as cur:\n",
" cur.execute(f'DROP TABLE IF EXISTS \"{target_table}\"')\n",
" cur.execute(f'CREATE TABLE \"{target_table}\" ({columns_sql})')\n",
"conn.commit()\n",
"print(f\"Created table {target_table} with TEXT columns.\")\n",
"\n",
"# Prepare a temp CSV for COPY (faster than individual inserts)\n",
"rows_written = 0\n",
"with tempfile.NamedTemporaryFile(mode='w+', newline='', delete=False) as tmp:\n",
" writer = csv.writer(tmp)\n",
" writer.writerow(all_columns) # header for clarity (we'll skip in COPY)\n",
" for f in files:\n",
" try:\n",
" table = DBF(\n",
" f,\n",
" ignore_missing_memofile=True,\n",
" encoding=DBF_ENCODING,\n",
" char_decode_errors=DBF_DECODE_ERRORS,\n",
" ) # lazy\n",
" # map DBF original names (mostly uppercase) to sanitized lowercase\n",
" name_map = {sanitize(n): n for n in table.field_names}\n",
" except Exception as e:\n",
" print(f\"[warn] Skipping {f.name} due to reload error: {e}\")\n",
" continue\n",
" for record in table:\n",
" row = []\n",
" for col in all_columns:\n",
" src = name_map.get(col)\n",
" val = record.get(src) if src else None\n",
" # Ensure scalar string for CSV; replace newlines to keep one record per line\n",
" sval = '' if val is None else str(val).replace('\\r', ' ').replace('\\n', ' ')\n",
" row.append(sval)\n",
" writer.writerow(row)\n",
" rows_written += 1\n",
" tmp_path = Path(tmp.name)\n",
"print(f\"Wrote {rows_written} rows to temp CSV {tmp_path}\")\n",
"\n",
"# COPY into Postgres (skip header row)\n",
"with conn.cursor() as cur, tmp_path.open('r') as fh:\n",
" next(fh) # skip header\n",
" columns_list = \", \".join(f'\"{c}\"' for c in all_columns)\n",
" cur.copy_expert(f'COPY \"{target_table}\" ({columns_list}) FROM STDIN WITH (FORMAT csv)', fh)\n",
"conn.commit()\n",
"print(f\"Loaded {rows_written} rows into {target_table}\")\n",
"\n",
"# Optional: add a surrogate primary key\n",
"with conn.cursor() as cur:\n",
" try:\n",
" cur.execute(f'ALTER TABLE \"{target_table}\" ADD COLUMN id BIGSERIAL PRIMARY KEY')\n",
" conn.commit()\n",
" print(\"Added surrogate primary key column id\")\n",
" except Exception as e:\n",
" print(\"[warn] Could not add surrogate primary key:\", e)\n",
"\n",
"conn.close()\n",
"print(\"DBF ingestion complete.\")\n",
"\n",
"# Next steps:\n",
"# - Add indexes on frequently queried columns, e.g., api numbers.\n",
"# - Cast to appropriate types after reviewing well-api-manual.pdf.\n",
"# - Consider partitioning if table grows large."
]
},
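{
"cell_type": "markdown",
"id": "5e6f7a8b",
"metadata": {},
"source": [
"A sketch of the first two \"next steps\" above, under stated assumptions: `apinum` is the lookup key seen in the preview cell, and `plug_date` is `YYYYMMDD` text with `'0'`/`'00000000'` meaning unknown. The index name and the view name `rrc_well_api` are illustrative; verify the date semantics against well-api-manual.pdf before relying on the cast."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6f7a8b9c",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: index the API number and expose a typed view (assumed column semantics)\n",
"conn = get_conn()\n",
"with conn.cursor() as cur:\n",
"    # Speed up lookups by API number\n",
"    cur.execute('CREATE INDEX IF NOT EXISTS idx_rrc_well_api_raw_apinum ON rrc_well_api_raw (apinum)')\n",
"    # Cast plug_date to DATE only when it looks like a real YYYYMMDD value\n",
"    cur.execute(\"\"\"\n",
"        CREATE OR REPLACE VIEW rrc_well_api AS\n",
"        SELECT *,\n",
"               CASE WHEN plug_date ~ '^[0-9]{8}$' AND plug_date <> '00000000'\n",
"                    THEN to_date(plug_date, 'YYYYMMDD')\n",
"               END AS plug_date_parsed\n",
"        FROM rrc_well_api_raw;\n",
"    \"\"\")\n",
"conn.commit()\n",
"conn.close()"
]
},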
{
"cell_type": "code",
"execution_count": 4,
"id": "4fbfdf4b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Tables in the database:\n",
"geography_columns\n",
"geometry_columns\n",
"inspections\n",
"rrc_well_api_raw\n",
"spatial_ref_sys\n",
"violations\n"
]
}
],
"source": [
"# show tables in the database to verify\n",
"conn = get_conn()\n",
"with conn.cursor() as cur:\n",
" cur.execute(\"\"\"\n",
" SELECT table_name\n",
" FROM information_schema.tables\n",
" WHERE table_schema = 'public'\n",
" ORDER BY table_name;\n",
" \"\"\")\n",
" tables = cur.fetchall()\n",
" print(\"Tables in the database:\")\n",
" for table in tables:\n",
" print(table[0])\n",
"conn.close() \n",
"\n"
]
},
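{
"cell_type": "markdown",
"id": "7a8b9c0d",
"metadata": {},
"source": [
"An assumed follow-up check: confirm via `information_schema.columns` that the raw table got the expected TEXT columns plus the `bigint` id from the surrogate-key step."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8b9c0d1e",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: list column names and types for rrc_well_api_raw\n",
"conn = get_conn()\n",
"with conn.cursor() as cur:\n",
"    cur.execute(\"\"\"\n",
"        SELECT column_name, data_type\n",
"        FROM information_schema.columns\n",
"        WHERE table_schema = 'public' AND table_name = 'rrc_well_api_raw'\n",
"        ORDER BY ordinal_position;\n",
"    \"\"\")\n",
"    for name, dtype in cur.fetchall():\n",
"        print(name, dtype)\n",
"conn.close()"
]
},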
{
"cell_type": "code",
"execution_count": 5,
"id": "d339d904",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Column names: ['abstract', 'apinum', 'block', 'completion', 'field_name', 'lease_name', 'gas_rrcid', 'oil_gas_co', 'on_off_sch', 'operator', 'permit_num', 'plug_date', 'refer_to_a', 'section', 'survey', 'total_dept', 'wellid', 'quadnum', 'objectid_1', 'id']\n",
"(None, '00131513', None, '0', 'WILDCAT', 'ISSACS, J. W.', '000000', None, None, 'HUDSON RESOURCES CORP.', '215813', '19840218', '00000000', None, None, '12920', '1', '3295122', '345803', 1)\n",
"('22', '00132855', 'X', '20241109', 'GIRLIE CALDWELL (GOODLAND LM)', 'EVERYTHING UNIT', '16104', 'O', 'N', 'GEOSOUTHERN OPERATING II, LLC', '900796', '0', '00000000', None, 'FERGUSON, J', '8883', ' 1H', '3295122', '1373345', 2)\n",
"('23', '00131559', None, '0', 'R.A.M. (PETTIT 12300) WILDCAT', 'PARROTT EL AL', '000000', None, None, 'MOSBACHER PRODUCTION CO.', '240164', '19840806', '00000000', None, 'JOSEPH FERGUSON', '13197', 'ST1', '3295122', '933695', 3)\n",
"('23', '00131559', None, '0', 'R.A.M. (PETTIT 12300) WILDCAT', 'PARROTT EL AL', '000000', None, None, 'MOSBACHER PRODUCTION CO.', '248530', '19841016', '00000000', None, 'JOSEPH FERGUSON', '13197', ' 1', '3295122', '933695', 4)\n",
"(None, '00131512', None, '0', 'WILDCAT', 'SAUNDERS \"A\"', '000000', None, None, 'TXO PRODUCTION CORP.', '214214', '19840105', '00000000', None, None, '12730', '1', '3295122', '345809', 5)\n"
]
}
],
"source": [
"# show header for rrc_well_api_raw to verify\n",
"conn = get_conn()\n",
"with conn.cursor() as cur:\n",
" cur.execute('SELECT * FROM rrc_well_api_raw LIMIT 5;')\n",
" rows = cur.fetchall()\n",
" colnames = [desc[0] for desc in cur.description]\n",
" print(\"Column names:\", colnames)\n",
" for row in rows:\n",
" print(row)\n",
"conn.close() \n"
]
},
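{
"cell_type": "markdown",
"id": "9c0d1e2f",
"metadata": {},
"source": [
"A small, assumed sanity check to close the loop: the total row count should match the 1547869 the loader reported, and the distinct `apinum` count hints at how many wells repeat across the 254 source files."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0d1e2f3a",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: row-count sanity checks on the loaded table (read-only)\n",
"conn = get_conn()\n",
"with conn.cursor() as cur:\n",
"    cur.execute('SELECT count(*) FROM rrc_well_api_raw;')\n",
"    print('total rows:', cur.fetchone()[0])\n",
"    cur.execute('SELECT count(DISTINCT apinum) FROM rrc_well_api_raw;')\n",
"    print('distinct apinum values:', cur.fetchone()[0])\n",
"conn.close()"
]
},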
{
"cell_type": "code",
"execution_count": null,
"id": "340311f5",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}