texas-district-analysis/rebuild/rrc_api_data.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "83f1549f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "RealDictRow({'version': 'PostgreSQL 17.6 on x86_64-pc-linux-gnu, compiled by gcc (GCC) 15.2.1 20250813, 64-bit'})\n"
     ]
    }
   ],
   "source": [
    "# Cell 0: connect to Postgres\n",
    "# Requires: pip install psycopg2-binary\n",
    "import os\n",
    "import psycopg2\n",
    "from psycopg2.extras import RealDictCursor\n",
    "\n",
    "# Connection settings: each value is read from the environment, and the second\n",
    "# argument to os.getenv is the default used when the variable is unset.\n",
    "PGHOST = os.getenv(\"PGHOST\", \"localhost\")\n",
    "PGPORT = os.getenv(\"PGPORT\", \"5432\")\n",
    "PGUSER = os.getenv(\"PGUSER\", \"postgres\")\n",
    "PGPASSWORD = os.getenv(\"PGPASSWORD\", \"\")\n",
    "# The database name defaults to \"texas_data\" when PGDATABASE is unset.\n",
    "PGDATABASE = os.getenv(\"PGDATABASE\", \"texas_data\")\n",
    "\n",
    "# os.getenv never returns None when a default is supplied, so the only case left\n",
    "# to guard against is PGDATABASE being exported as an empty string.\n",
    "if not PGDATABASE:\n",
    "    print(\"[warn] PGDATABASE is empty; using fallback 'postgres'\")\n",
    "    PGDATABASE = \"postgres\"\n",
    "\n",
    "def get_conn():\n",
    "    \"\"\"Open a fresh psycopg2 connection from the module-level PG* settings.\n",
    "\n",
    "    Every call creates a new connection; closing it is left to the caller.\n",
    "    \"\"\"\n",
    "    settings = {\n",
    "        \"host\": PGHOST,\n",
    "        \"port\": PGPORT,\n",
    "        \"user\": PGUSER,\n",
    "        \"password\": PGPASSWORD,\n",
    "        \"dbname\": PGDATABASE,\n",
    "    }\n",
    "    return psycopg2.connect(**settings)\n",
    "\n",
    "# Quick connection test. A notebook kernel runs cells with __name__ set to\n",
    "# \"__main__\", so this check executes every time the cell is run.\n",
    "if __name__ == \"__main__\":\n",
    "    try:\n",
    "        conn = get_conn()\n",
    "        try:\n",
    "            with conn.cursor(cursor_factory=RealDictCursor) as cur:\n",
    "                cur.execute(\"SELECT version() AS version;\")\n",
    "                print(cur.fetchone())\n",
    "        finally:\n",
    "            # Close even when the query fails so the connection is not leaked\n",
    "            conn.close()\n",
    "    except Exception as e:\n",
    "        # Report the failure, then re-raise so the cell still stops with the error\n",
    "        print(\"Postgres connection failed:\", e)\n",
    "        raise"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "92cb7516",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Note: you may need to restart the kernel to use updated packages.\n"
     ]
    }
   ],
   "source": [
    "# Install dbfread with %pip so it lands in the running kernel's environment\n",
    "%pip install dbfread --quiet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "ba980c42",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Found 254 DBF files\n",
      "Unified column count: 19\n",
      "Created table rrc_well_api_raw with TEXT columns.\n",
      "Wrote 1547869 rows to temp CSV /tmp/tmp9776133n\n",
      "Loaded 1547869 rows into rrc_well_api_raw\n",
      "Added surrogate primary key column id\n",
      "DBF ingestion complete.\n"
     ]
    }
   ],
   "source": [
    "# Bulk ingest all .dbf files from /home/dadams/data/rrc_api/all_dbf_files/ into Postgres\n",
    "import os\n",
    "import re\n",
    "from pathlib import Path\n",
    "from dbfread import DBF\n",
    "import psycopg2\n",
    "import tempfile\n",
    "import csv\n",
    "\n",
    "# Source directory: set RRC_DBF_DIR to override; otherwise the original location is used\n",
    "base_dir = Path(os.getenv('RRC_DBF_DIR', '/home/dadams/data/rrc_api/all_dbf_files'))\n",
    "if not base_dir.exists():\n",
    "    raise FileNotFoundError(f\"Directory {base_dir} not found\")\n",
    "\n",
    "# Collect DBF files (sorted so the processing order is stable between runs)\n",
    "files = sorted(base_dir.glob('*.dbf'))\n",
    "print(f\"Found {len(files)} DBF files\")\n",
    "if not files:\n",
    "    raise SystemExit(\"No DBF files to process\")\n",
    "\n",
    "# Connect only after the input checks pass, so an early exit above cannot leave\n",
    "# an open connection behind. get_conn() is defined in the first cell.\n",
    "conn = get_conn()\n",
    "\n",
    "# Helper to make Postgres-safe, lowercase column names\n",
    "def sanitize(name: str) -> str:\n",
    "    \"\"\"Turn a raw field name into a lowercase identifier made of [a-z0-9_] only.\n",
    "\n",
    "    Whitespace and every other disallowed character become underscores, runs of\n",
    "    underscores collapse to one, and leading/trailing underscores are dropped.\n",
    "    An empty result becomes \"col\"; a result starting with a digit gets a\n",
    "    \"col_\" prefix.\n",
    "    \"\"\"\n",
    "    ident = name.strip().lower()\n",
    "    # Applied in order: whitespace runs, other disallowed characters, repeated underscores\n",
    "    for pattern in (r\"\\s+\", r\"[^a-z0-9_]+\", r\"_+\"):\n",
    "        ident = re.sub(pattern, \"_\", ident)\n",
    "    ident = ident.strip(\"_\")\n",
    "    if not ident:\n",
    "        return \"col\"\n",
    "    return f\"col_{ident}\" if ident[0].isdigit() else ident\n",
    "\n",
    "# Decoding policy for DBF character fields; latin1 is used as a safe single-byte fallback.\n",
    "DBF_ENCODING = 'latin1'\n",
    "DBF_DECODE_ERRORS = 'replace'  # 'ignore' would drop undecodable bytes instead\n",
    "\n",
    "# First pass: build the ordered union of sanitized (lowercase) column names across all files\n",
    "all_columns = []\n",
    "for f in files:\n",
    "    try:\n",
    "        # Only the field names are needed here; records are not iterated in this pass\n",
    "        field_names = DBF(\n",
    "            f,\n",
    "            ignore_missing_memofile=True,\n",
    "            encoding=DBF_ENCODING,\n",
    "            char_decode_errors=DBF_DECODE_ERRORS,\n",
    "        ).field_names\n",
    "        cols = list(map(sanitize, field_names))\n",
    "    except Exception as e:\n",
    "        # An unreadable file is reported and left out rather than aborting the whole run\n",
    "        print(f\"[warn] Skipping {f.name} due to read error: {e}\")\n",
    "        continue\n",
    "    # Append unseen names only, keeping first-seen order\n",
    "    for c in cols:\n",
    "        if c in all_columns:\n",
    "            continue\n",
    "        all_columns.append(c)\n",
    "print(f\"Unified column count: {len(all_columns)}\")\n",
    "\n",
    "# Recreate the target table from scratch; every column is TEXT for safety and\n",
    "# types can be refined later if needed\n",
    "target_table = 'rrc_well_api_raw'\n",
    "columns_sql = ', '.join([f'\"{c}\" TEXT' for c in all_columns])\n",
    "ddl_statements = (\n",
    "    f'DROP TABLE IF EXISTS \"{target_table}\"',\n",
    "    f'CREATE TABLE \"{target_table}\" ({columns_sql})',\n",
    ")\n",
    "with conn.cursor() as cur:\n",
    "    for statement in ddl_statements:\n",
    "        cur.execute(statement)\n",
    "conn.commit()\n",
    "print(f\"Created table {target_table} with TEXT columns.\")\n",
    "\n",
    "# Stage every record in a temp CSV, then bulk-load it with COPY (faster than individual inserts)\n",
    "rows_written = 0\n",
    "tmp_path = None\n",
    "try:\n",
    "    # The encoding is pinned to UTF-8 so the write here and the re-read below agree\n",
    "    # regardless of the locale default. delete=False keeps the file on disk after\n",
    "    # this handle closes, so it can be reopened for COPY.\n",
    "    with tempfile.NamedTemporaryFile(\n",
    "        mode='w+', newline='', encoding='utf-8', delete=False\n",
    "    ) as tmp:\n",
    "        tmp_path = Path(tmp.name)\n",
    "        writer = csv.writer(tmp)\n",
    "        writer.writerow(all_columns)  # header for clarity (skipped before COPY)\n",
    "        for f in files:\n",
    "            try:\n",
    "                table = DBF(\n",
    "                    f,\n",
    "                    ignore_missing_memofile=True,\n",
    "                    encoding=DBF_ENCODING,\n",
    "                    char_decode_errors=DBF_DECODE_ERRORS,\n",
    "                )\n",
    "                # Map each sanitized lowercase name back to this file's original field name\n",
    "                name_map = {sanitize(n): n for n in table.field_names}\n",
    "            except Exception as e:\n",
    "                print(f\"[warn] Skipping {f.name} due to reload error: {e}\")\n",
    "                continue\n",
    "            for record in table:\n",
    "                row = []\n",
    "                for col in all_columns:\n",
    "                    # Columns this file does not have are written as empty values\n",
    "                    src = name_map.get(col)\n",
    "                    val = record.get(src) if src else None\n",
    "                    # Ensure scalar string for CSV; replace newlines to keep one record per line\n",
    "                    sval = '' if val is None else str(val).replace('\\r', ' ').replace('\\n', ' ')\n",
    "                    row.append(sval)\n",
    "                writer.writerow(row)\n",
    "                rows_written += 1\n",
    "    print(f\"Wrote {rows_written} rows to temp CSV {tmp_path}\")\n",
    "\n",
    "    # COPY into Postgres (skip header row)\n",
    "    with conn.cursor() as cur, tmp_path.open('r', encoding='utf-8') as fh:\n",
    "        next(fh)  # skip header\n",
    "        columns_list = \", \".join(f'\"{c}\"' for c in all_columns)\n",
    "        cur.copy_expert(f'COPY \"{target_table}\" ({columns_list}) FROM STDIN WITH (FORMAT csv)', fh)\n",
    "    conn.commit()\n",
    "finally:\n",
    "    # Nothing else removes a delete=False temp file, so clean it up on success and on failure\n",
    "    if tmp_path is not None:\n",
    "        tmp_path.unlink(missing_ok=True)\n",
    "print(f\"Loaded {rows_written} rows into {target_table}\")\n",
    "\n",
    "# Optional: add a surrogate primary key (best effort; a failure is only reported)\n",
    "add_pk_sql = f'ALTER TABLE \"{target_table}\" ADD COLUMN id BIGSERIAL PRIMARY KEY'\n",
    "with conn.cursor() as cur:\n",
    "    try:\n",
    "        cur.execute(add_pk_sql)\n",
    "        conn.commit()\n",
    "        print(\"Added surrogate primary key column id\")\n",
    "    except Exception as e:\n",
    "        print(\"[warn] Could not add surrogate primary key:\", e)\n",
    "\n",
    "# The ingestion connection is no longer needed past this point\n",
    "conn.close()\n",
    "print(\"DBF ingestion complete.\")\n",
    "\n",
    "# Next steps:\n",
    "# - Add indexes on frequently queried columns, e.g., api numbers.\n",
    "# - Cast to appropriate types after reviewing well-api-manual.pdf.\n",
    "# - Consider partitioning if table grows large."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "4fbfdf4b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Tables in the database:\n",
      "geography_columns\n",
      "geometry_columns\n",
      "inspections\n",
      "rrc_well_api_raw\n",
      "spatial_ref_sys\n",
      "violations\n"
     ]
    }
   ],
   "source": [
    "# List the tables in the public schema to verify the load\n",
    "tables_sql = \"\"\"\n",
    "        SELECT table_name\n",
    "        FROM information_schema.tables\n",
    "        WHERE table_schema = 'public'\n",
    "        ORDER BY table_name;\n",
    "    \"\"\"\n",
    "conn = get_conn()\n",
    "try:\n",
    "    with conn.cursor() as cur:\n",
    "        cur.execute(tables_sql)\n",
    "        tables = cur.fetchall()\n",
    "finally:\n",
    "    # Always release the connection, even if the query raises\n",
    "    conn.close()\n",
    "\n",
    "print(\"Tables in the database:\")\n",
    "for table in tables:\n",
    "    print(table[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "d339d904",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Column names: ['abstract', 'apinum', 'block', 'completion', 'field_name', 'lease_name', 'gas_rrcid', 'oil_gas_co', 'on_off_sch', 'operator', 'permit_num', 'plug_date', 'refer_to_a', 'section', 'survey', 'total_dept', 'wellid', 'quadnum', 'objectid_1', 'id']\n",
      "(None, '00131513', None, '0', 'WILDCAT', 'ISSACS, J. W.', '000000', None, None, 'HUDSON RESOURCES CORP.', '215813', '19840218', '00000000', None, None, '12920', '1', '3295122', '345803', 1)\n",
      "('22', '00132855', 'X', '20241109', 'GIRLIE CALDWELL (GOODLAND LM)', 'EVERYTHING UNIT', '16104', 'O', 'N', 'GEOSOUTHERN OPERATING II, LLC', '900796', '0', '00000000', None, 'FERGUSON, J', '8883', '   1H', '3295122', '1373345', 2)\n",
      "('23', '00131559', None, '0', 'R.A.M. (PETTIT 12300) WILDCAT', 'PARROTT EL AL', '000000', None, None, 'MOSBACHER PRODUCTION CO.', '240164', '19840806', '00000000', None, 'JOSEPH FERGUSON', '13197', 'ST1', '3295122', '933695', 3)\n",
      "('23', '00131559', None, '0', 'R.A.M. (PETTIT 12300) WILDCAT', 'PARROTT EL AL', '000000', None, None, 'MOSBACHER PRODUCTION CO.', '248530', '19841016', '00000000', None, 'JOSEPH FERGUSON', '13197', '   1', '3295122', '933695', 4)\n",
      "(None, '00131512', None, '0', 'WILDCAT', 'SAUNDERS \"A\"', '000000', None, None, 'TXO PRODUCTION CORP.', '214214', '19840105', '00000000', None, None, '12730', '1', '3295122', '345809', 5)\n"
     ]
    }
   ],
   "source": [
    "# Preview the column names and the first 5 rows of rrc_well_api_raw to verify the load\n",
    "conn = get_conn()\n",
    "try:\n",
    "    with conn.cursor() as cur:\n",
    "        cur.execute('SELECT * FROM rrc_well_api_raw LIMIT 5;')\n",
    "        rows = cur.fetchall()\n",
    "        # cursor.description holds one entry per result column; its first item is the name\n",
    "        colnames = [desc[0] for desc in cur.description]\n",
    "finally:\n",
    "    # Always release the connection, even if the query raises\n",
    "    conn.close()\n",
    "\n",
    "print(\"Column names:\", colnames)\n",
    "for row in rows:\n",
    "    print(row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "340311f5",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}