320 lines
12 KiB
Plaintext
320 lines
12 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"id": "83f1549f",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"RealDictRow({'version': 'PostgreSQL 17.6 on x86_64-pc-linux-gnu, compiled by gcc (GCC) 15.2.1 20250813, 64-bit'})\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Cell 0: connect to Postgres\n",
|
|
"# Requires: pip install psycopg2-binary\n",
|
|
"import os\n",
|
|
"import psycopg2\n",
|
|
"from psycopg2.extras import RealDictCursor\n",
|
|
"\n",
|
|
"# Configure via environment variables (safer) or edit defaults below\n",
"PGHOST = os.getenv(\"PGHOST\", \"localhost\")\n",
"PGPORT = os.getenv(\"PGPORT\", \"5432\")\n",
"PGUSER = os.getenv(\"PGUSER\", \"postgres\")\n",
"PGPASSWORD = os.getenv(\"PGPASSWORD\", \"\")\n",
"# Database name defaults to \"texas_data\" when PGDATABASE is unset\n",
"PGDATABASE = os.getenv(\"PGDATABASE\", \"texas_data\")\n",
"\n",
"# os.getenv() is given a default, so it never returns None here; the only\n",
"# case left to guard is PGDATABASE being exported as an empty string.\n",
"if not PGDATABASE:\n",
"    print(\"[warn] PGDATABASE is empty; using fallback 'postgres'\")\n",
"    PGDATABASE = \"postgres\"\n",
|
|
"\n",
|
|
"def get_conn():\n",
"    \"\"\"Open and return a fresh psycopg2 connection built from the PG* settings.\"\"\"\n",
"    conn_params = {\n",
"        \"host\": PGHOST,\n",
"        \"port\": PGPORT,\n",
"        \"user\": PGUSER,\n",
"        \"password\": PGPASSWORD,\n",
"        \"dbname\": PGDATABASE,\n",
"    }\n",
"    return psycopg2.connect(**conn_params)\n",
|
|
"\n",
|
|
"# Quick connection test. Inside a notebook kernel __name__ is \"__main__\",\n",
"# so this check runs every time the cell is executed.\n",
"if __name__ == \"__main__\":\n",
"    try:\n",
"        conn = get_conn()\n",
"        try:\n",
"            with conn.cursor(cursor_factory=RealDictCursor) as cur:\n",
"                cur.execute(\"SELECT version() AS version;\")\n",
"                print(cur.fetchone())\n",
"        finally:\n",
"            # Release the connection even when the query itself fails.\n",
"            conn.close()\n",
"    except Exception as e:\n",
"        print(\"Postgres connection failed:\", e)\n",
"        raise"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"id": "92cb7516",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Note: you may need to restart the kernel to use updated packages.\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# dbfread provides the DBF reader imported by the ingestion cell\n",
"%pip install dbfread --quiet"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"id": "ba980c42",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Found 254 DBF files\n",
|
|
"Unified column count: 19\n",
|
|
"Created table rrc_well_api_raw with TEXT columns.\n",
|
|
"Wrote 1547869 rows to temp CSV /tmp/tmp9776133n\n",
|
|
"Loaded 1547869 rows into rrc_well_api_raw\n",
|
|
"Added surrogate primary key column id\n",
|
|
"DBF ingestion complete.\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Bulk ingest all .dbf files from /home/dadams/data/rrc_api/all_dbf_files/ into Postgres\n",
|
|
"import os\n",
|
|
"import re\n",
|
|
"from pathlib import Path\n",
|
|
"from dbfread import DBF\n",
|
|
"import psycopg2\n",
|
|
"import tempfile\n",
|
|
"import csv\n",
|
|
"\n",
|
|
"# Validate the input directory before opening a database connection, so a\n",
"# missing directory or an empty file list does not leave a connection open.\n",
"# RRC_DBF_DIR overrides the default location without editing the notebook.\n",
"base_dir = Path(os.getenv('RRC_DBF_DIR', '/home/dadams/data/rrc_api/all_dbf_files'))\n",
"if not base_dir.exists():\n",
"    raise FileNotFoundError(f\"Directory {base_dir} not found\")\n",
"\n",
"# Collect DBF files\n",
"files = sorted(base_dir.glob('*.dbf'))\n",
"print(f\"Found {len(files)} DBF files\")\n",
"if not files:\n",
"    raise SystemExit(\"No DBF files to process\")\n",
"\n",
"# Reuse connection function defined earlier\n",
"conn = get_conn()\n",
|
|
"\n",
|
|
"# Helper to make Postgres-safe, lowercase column names\n",
|
|
"def sanitize(name: str) -> str:\n",
"    \"\"\"Turn a DBF field name into a lowercase identifier of [a-z0-9_].\n",
"\n",
"    Whitespace and any other disallowed characters become underscores, runs\n",
"    of underscores are collapsed, and leading/trailing underscores are\n",
"    dropped. An empty result becomes \"col\"; a result starting with a digit\n",
"    is prefixed with \"col_\".\n",
"    \"\"\"\n",
"    cleaned = name.strip().lower()\n",
"    # Same three substitutions, applied in order: whitespace, then any other\n",
"    # disallowed character, then collapse repeated underscores.\n",
"    for pattern in (r\"\\s+\", r\"[^a-z0-9_]+\", r\"_+\"):\n",
"        cleaned = re.sub(pattern, \"_\", cleaned)\n",
"    cleaned = cleaned.strip(\"_\")\n",
"    if not cleaned:\n",
"        return \"col\"\n",
"    if cleaned[0].isdigit():\n",
"        return f\"col_{cleaned}\"\n",
"    return cleaned\n",
|
|
"\n",
|
|
"# Preferred encoding for DBF char fields. latin1 is a safe single-byte fallback.\n",
"DBF_ENCODING = 'latin1'\n",
"DBF_DECODE_ERRORS = 'replace'  # or 'ignore' to drop undecodable bytes\n",
"# The intermediate CSV is written and read back with one explicit encoding so\n",
"# the result does not depend on the platform's locale default.\n",
"CSV_ENCODING = 'utf-8'\n",
"\n",
"target_table = 'rrc_well_api_raw'\n",
"tmp_path = None  # set once the temp CSV exists, so cleanup knows what to remove\n",
"\n",
"try:\n",
"    # First pass: determine unified set of columns (lowercased + sanitized)\n",
"    all_columns = []\n",
"    for f in files:\n",
"        try:\n",
"            table = DBF(\n",
"                f,\n",
"                ignore_missing_memofile=True,\n",
"                encoding=DBF_ENCODING,\n",
"                char_decode_errors=DBF_DECODE_ERRORS,\n",
"            )  # lazy\n",
"            cols = [sanitize(n) for n in table.field_names]\n",
"        except Exception as e:\n",
"            print(f\"[warn] Skipping {f.name} due to read error: {e}\")\n",
"            continue\n",
"        for c in cols:\n",
"            if c not in all_columns:\n",
"                all_columns.append(c)\n",
"    print(f\"Unified column count: {len(all_columns)}\")\n",
"    if not all_columns:\n",
"        # Without columns the COPY column list below would be empty and invalid.\n",
"        raise RuntimeError(\"No readable DBF columns found; nothing to load\")\n",
"\n",
"    # Create table with TEXT columns for safety; refine types later if needed\n",
"    columns_sql = ', '.join(f'\"{c}\" TEXT' for c in all_columns)\n",
"    with conn.cursor() as cur:\n",
"        cur.execute(f'DROP TABLE IF EXISTS \"{target_table}\"')\n",
"        cur.execute(f'CREATE TABLE \"{target_table}\" ({columns_sql})')\n",
"    conn.commit()\n",
"    print(f\"Created table {target_table} with TEXT columns.\")\n",
"\n",
"    # Prepare a temp CSV for COPY (faster than individual inserts)\n",
"    rows_written = 0\n",
"    with tempfile.NamedTemporaryFile(\n",
"        mode='w+', newline='', encoding=CSV_ENCODING, delete=False\n",
"    ) as tmp:\n",
"        # Record the path first so the finally block can delete the file even\n",
"        # if writing fails part-way through.\n",
"        tmp_path = Path(tmp.name)\n",
"        writer = csv.writer(tmp)\n",
"        writer.writerow(all_columns)  # header for clarity (we'll skip in COPY)\n",
"        for f in files:\n",
"            try:\n",
"                table = DBF(\n",
"                    f,\n",
"                    ignore_missing_memofile=True,\n",
"                    encoding=DBF_ENCODING,\n",
"                    char_decode_errors=DBF_DECODE_ERRORS,\n",
"                )  # lazy\n",
"                # map sanitized lowercase names back to the DBF's original names\n",
"                name_map = {sanitize(n): n for n in table.field_names}\n",
"            except Exception as e:\n",
"                print(f\"[warn] Skipping {f.name} due to reload error: {e}\")\n",
"                continue\n",
"            for record in table:\n",
"                row = []\n",
"                for col in all_columns:\n",
"                    src = name_map.get(col)\n",
"                    val = record.get(src) if src else None\n",
"                    # Ensure scalar string for CSV; replace newlines to keep one record per line.\n",
"                    # Missing values are written as empty fields, which COPY's csv\n",
"                    # format loads as NULL.\n",
"                    sval = '' if val is None else str(val).replace('\\r', ' ').replace('\\n', ' ')\n",
"                    row.append(sval)\n",
"                writer.writerow(row)\n",
"                rows_written += 1\n",
"    print(f\"Wrote {rows_written} rows to temp CSV {tmp_path}\")\n",
"\n",
"    # COPY into Postgres (skip header row)\n",
"    with conn.cursor() as cur, tmp_path.open('r', encoding=CSV_ENCODING) as fh:\n",
"        next(fh)  # skip header\n",
"        columns_list = \", \".join(f'\"{c}\"' for c in all_columns)\n",
"        cur.copy_expert(f'COPY \"{target_table}\" ({columns_list}) FROM STDIN WITH (FORMAT csv)', fh)\n",
"    conn.commit()\n",
"    print(f\"Loaded {rows_written} rows into {target_table}\")\n",
"\n",
"    # Optional: add a surrogate primary key\n",
"    with conn.cursor() as cur:\n",
"        try:\n",
"            cur.execute(f'ALTER TABLE \"{target_table}\" ADD COLUMN id BIGSERIAL PRIMARY KEY')\n",
"            conn.commit()\n",
"            print(\"Added surrogate primary key column id\")\n",
"        except Exception as e:\n",
"            # Clear the failed transaction; the loaded rows are already committed.\n",
"            conn.rollback()\n",
"            print(\"[warn] Could not add surrogate primary key:\", e)\n",
"finally:\n",
"    # Runs on success and on failure: remove the temp CSV and release the connection.\n",
"    if tmp_path is not None:\n",
"        tmp_path.unlink(missing_ok=True)\n",
"    conn.close()\n",
"\n",
"print(\"DBF ingestion complete.\")\n",
"\n",
"# Next steps:\n",
"# - Add indexes on frequently queried columns, e.g., api numbers.\n",
"# - Cast to appropriate types after reviewing well-api-manual.pdf.\n",
"# - Consider partitioning if table grows large."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"id": "4fbfdf4b",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Tables in the database:\n",
|
|
"geography_columns\n",
|
|
"geometry_columns\n",
|
|
"inspections\n",
|
|
"rrc_well_api_raw\n",
|
|
"spatial_ref_sys\n",
|
|
"violations\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# show tables in the database to verify\n",
"conn = get_conn()\n",
"try:\n",
"    with conn.cursor() as cur:\n",
"        cur.execute(\"\"\"\n",
"            SELECT table_name\n",
"            FROM information_schema.tables\n",
"            WHERE table_schema = 'public'\n",
"            ORDER BY table_name;\n",
"        \"\"\")\n",
"        table_names = [name for (name,) in cur.fetchall()]\n",
"finally:\n",
"    # Close the connection even if the query raises.\n",
"    conn.close()\n",
"\n",
"print(\"Tables in the database:\")\n",
"for name in table_names:\n",
"    print(name)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"id": "d339d904",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Column names: ['abstract', 'apinum', 'block', 'completion', 'field_name', 'lease_name', 'gas_rrcid', 'oil_gas_co', 'on_off_sch', 'operator', 'permit_num', 'plug_date', 'refer_to_a', 'section', 'survey', 'total_dept', 'wellid', 'quadnum', 'objectid_1', 'id']\n",
|
|
"(None, '00131513', None, '0', 'WILDCAT', 'ISSACS, J. W.', '000000', None, None, 'HUDSON RESOURCES CORP.', '215813', '19840218', '00000000', None, None, '12920', '1', '3295122', '345803', 1)\n",
|
|
"('22', '00132855', 'X', '20241109', 'GIRLIE CALDWELL (GOODLAND LM)', 'EVERYTHING UNIT', '16104', 'O', 'N', 'GEOSOUTHERN OPERATING II, LLC', '900796', '0', '00000000', None, 'FERGUSON, J', '8883', ' 1H', '3295122', '1373345', 2)\n",
|
|
"('23', '00131559', None, '0', 'R.A.M. (PETTIT 12300) WILDCAT', 'PARROTT EL AL', '000000', None, None, 'MOSBACHER PRODUCTION CO.', '240164', '19840806', '00000000', None, 'JOSEPH FERGUSON', '13197', 'ST1', '3295122', '933695', 3)\n",
|
|
"('23', '00131559', None, '0', 'R.A.M. (PETTIT 12300) WILDCAT', 'PARROTT EL AL', '000000', None, None, 'MOSBACHER PRODUCTION CO.', '248530', '19841016', '00000000', None, 'JOSEPH FERGUSON', '13197', ' 1', '3295122', '933695', 4)\n",
|
|
"(None, '00131512', None, '0', 'WILDCAT', 'SAUNDERS \"A\"', '000000', None, None, 'TXO PRODUCTION CORP.', '214214', '19840105', '00000000', None, None, '12730', '1', '3295122', '345809', 5)\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# show header for rrc_well_api_raw to verify\n",
"conn = get_conn()\n",
"try:\n",
"    with conn.cursor() as cur:\n",
"        cur.execute('SELECT * FROM rrc_well_api_raw LIMIT 5;')\n",
"        rows = cur.fetchall()\n",
"        # cursor.description holds one entry per result column; item 0 is its name\n",
"        colnames = [desc[0] for desc in cur.description]\n",
"finally:\n",
"    # Close the connection even if the query raises.\n",
"    conn.close()\n",
"\n",
"print(\"Column names:\", colnames)\n",
"for row in rows:\n",
"    print(row)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "340311f5",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": ".venv",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.13.7"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|