Files
colorado_spills/archive/spill_discription.ipynb
2025-04-19 06:48:25 -07:00

348 lines
20 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from sqlalchemy import create_engine\n",
"import geopandas as gpd\n",
"\n",
"import os\n",
"\n",
"from sqlalchemy.engine import URL\n",
"\n",
"# Database connection details from zshrc environment variables\n",
"db_name = 'colorado_spills'\n",
"user = os.getenv('DB_USER')\n",
"password = os.getenv('DB_PASSWORD')\n",
"host = os.getenv('DB_HOST')\n",
"port = os.getenv('DB_PORT')\n",
"\n",
"# Fail early with a clear message instead of building a URL that contains\n",
"# the literal text 'None' for an unset variable\n",
"missing_vars = [name for name, value in (('DB_USER', user), ('DB_HOST', host)) if not value]\n",
"if missing_vars:\n",
"    raise RuntimeError(f'Missing required environment variables: {missing_vars}')\n",
"\n",
"# Build the URL from its parts so special characters in the credentials\n",
"# (such as '@', ':' or '/') are escaped instead of corrupting the URL;\n",
"# an unset DB_PORT falls back to the driver default\n",
"db_url = URL.create(\n",
"    'postgresql',\n",
"    username=user,\n",
"    password=password,\n",
"    host=host,\n",
"    port=int(port) if port else None,\n",
"    database=db_name,\n",
")\n",
"\n",
"# Create an engine to connect to the PostgreSQL database\n",
"engine = create_engine(db_url)\n",
"\n",
"# Read the spills_with_demographics data from the database\n",
"df = pd.read_sql_table('spills_with_demographics', engine)\n",
"\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 Crews working on 6/11/14 in area of former pro...\n",
"1 Historical release discovered during removal o...\n",
"2 Historical release discovered during removal o...\n",
"3 The night operator noticed a high level alarm ...\n",
"4 On May 24, 2014, in anticipation of potential ...\n",
"Name: Spill Description, dtype: object"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the raw spill descriptions (first five rows)\n",
"df['Spill Description'].head(5)\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 crews working on 61114 in area of former produ...\n",
"1 historical release discovered during removal o...\n",
"2 historical release discovered during removal o...\n",
"3 the night operator noticed a high level alarm ...\n",
"4 on may 24 2014 in anticipation of potential fl...\n",
"Name: Cleaned_Description, dtype: object"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import re\n",
"import pandas as pd\n",
"\n",
"# Text preprocessing function\n",
"def preprocess_text(text):\n",
"    \"\"\"Normalize a spill description for keyword analysis.\n",
"\n",
"    Lowercases the text, drops apostrophes inside words, turns every other\n",
"    punctuation character into a space and collapses runs of whitespace.\n",
"    Punctuation becomes a space rather than being deleted so that separated\n",
"    tokens stay separated: '6/11/14' becomes '6 11 14', not '61114'.\n",
"\n",
"    Args:\n",
"        text: Raw description. Non-string values (None, NaN) are accepted.\n",
"\n",
"    Returns:\n",
"        The cleaned string, or '' when text is not a string.\n",
"    \"\"\"\n",
"    if not isinstance(text, str):\n",
"        return ''  # Missing value: nothing to clean\n",
"    text = text.lower()  # Convert to lowercase\n",
"    text = re.sub(r\"(?<=\\w)'(?=\\w)\", '', text)  # operator's -> operators\n",
"    text = re.sub(r'[^\\w\\s]', ' ', text)  # Replace punctuation with a space\n",
"    # Collapse whitespace last so the spaces introduced above are merged too\n",
"    return re.sub(r'\\s+', ' ', text).strip()\n",
"\n",
"# Build the Cleaned_Description column by normalizing every Spill Description\n",
"df['Cleaned_Description'] = df['Spill Description'].map(preprocess_text)\n",
"\n",
"# Preview the first five cleaned descriptions\n",
"df['Cleaned_Description'].head(5)\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"water 0.051375\n",
"release 0.047366\n",
"soil 0.038160\n",
"location 0.034915\n",
"tank 0.032697\n",
"produced 0.032328\n",
"discovered 0.031280\n",
"line 0.029943\n",
"activities 0.028456\n",
"impacted 0.026087\n",
"dtype: float64"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"\n",
"import numpy as np\n",
"\n",
"# Initialize TF-IDF Vectorizer: English stop words are removed, as are terms\n",
"# found in more than 90% of the descriptions or in fewer than 2 of them\n",
"tfidf_vectorizer = TfidfVectorizer(max_df=0.9, min_df=2, stop_words='english')\n",
"\n",
"# Fit and transform the cleaned descriptions (the result is a sparse matrix)\n",
"tfidf_matrix = tfidf_vectorizer.fit_transform(df['Cleaned_Description'])\n",
"\n",
"# Get the feature names (i.e., the words)\n",
"feature_names = tfidf_vectorizer.get_feature_names_out()\n",
"\n",
"# Average TF-IDF score per word, taken directly on the sparse matrix;\n",
"# .toarray() would allocate a dense documents x vocabulary array just for this\n",
"mean_tfidf = pd.Series(np.asarray(tfidf_matrix.mean(axis=0)).ravel(), index=feature_names)\n",
"\n",
"# Display the top 10 words with the highest average TF-IDF score\n",
"top_keywords = mean_tfidf.sort_values(ascending=False).head(10)\n",
"top_keywords\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting en-core-web-sm==3.7.1\n",
" Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.8/12.8 MB\u001b[0m \u001b[31m83.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m0:01\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: spacy<3.8.0,>=3.7.2 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from en-core-web-sm==3.7.1) (3.7.5)\n",
"Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.0.12)\n",
"Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.0.5)\n",
"Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.0.10)\n",
"Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.0.8)\n",
"Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.0.9)\n",
"Requirement already satisfied: thinc<8.3.0,>=8.2.2 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (8.2.5)\n",
"Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.1.2)\n",
"Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.4.8)\n",
"Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.0.10)\n",
"Requirement already satisfied: weasel<0.5.0,>=0.1.0 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.4.1)\n",
"Requirement already satisfied: typer<1.0.0,>=0.3.0 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.12.3)\n",
"Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (4.66.4)\n",
"Requirement already satisfied: requests<3.0.0,>=2.13.0 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.32.3)\n",
"Requirement already satisfied: pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.8.2)\n",
"Requirement already satisfied: jinja2 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.1.4)\n",
"Requirement already satisfied: setuptools in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (70.0.0)\n",
"Requirement already satisfied: packaging>=20.0 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (24.1)\n",
"Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.4.0)\n",
"Requirement already satisfied: numpy>=1.19.0 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.26.4)\n",
"Requirement already satisfied: language-data>=1.2 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from langcodes<4.0.0,>=3.2.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.2.0)\n",
"Requirement already satisfied: annotated-types>=0.4.0 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.7.0)\n",
"Requirement already satisfied: pydantic-core==2.20.1 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.20.1)\n",
"Requirement already satisfied: typing-extensions>=4.6.1 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (4.12.2)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.3.2)\n",
"Requirement already satisfied: idna<4,>=2.5 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.7)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.2.1)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2024.7.4)\n",
"Requirement already satisfied: blis<0.8.0,>=0.7.8 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from thinc<8.3.0,>=8.2.2->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.7.10)\n",
"Requirement already satisfied: confection<1.0.0,>=0.0.1 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from thinc<8.3.0,>=8.2.2->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.1.4)\n",
"Requirement already satisfied: click>=8.0.0 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from typer<1.0.0,>=0.3.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (8.1.7)\n",
"Requirement already satisfied: shellingham>=1.3.0 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from typer<1.0.0,>=0.3.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.5.4)\n",
"Requirement already satisfied: rich>=10.11.0 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from typer<1.0.0,>=0.3.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (13.7.1)\n",
"Requirement already satisfied: cloudpathlib<1.0.0,>=0.7.0 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from weasel<0.5.0,>=0.1.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.18.1)\n",
"Requirement already satisfied: smart-open<8.0.0,>=5.2.1 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from weasel<0.5.0,>=0.1.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (7.0.4)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from jinja2->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.1.5)\n",
"Requirement already satisfied: marisa-trie>=0.7.7 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from language-data>=1.2->langcodes<4.0.0,>=3.2.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.1.0)\n",
"Requirement already satisfied: markdown-it-py>=2.2.0 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from rich>=10.11.0->typer<1.0.0,>=0.3.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.0.0)\n",
"Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from rich>=10.11.0->typer<1.0.0,>=0.3.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.18.0)\n",
"Requirement already satisfied: wrapt in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from smart-open<8.0.0,>=5.2.1->weasel<0.5.0,>=0.1.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.16.0)\n",
"Requirement already satisfied: mdurl~=0.1 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from markdown-it-py>=2.2.0->rich>=10.11.0->typer<1.0.0,>=0.3.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.1.2)\n",
"Installing collected packages: en-core-web-sm\n",
"Successfully installed en-core-web-sm-3.7.1\n",
"\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n",
"You can now load the package via spacy.load('en_core_web_sm')\n"
]
}
],
"source": [
"import sys\n",
"\n",
"import spacy.util\n",
"\n",
"# Install the model with the kernel's own interpreter (a bare python may\n",
"# resolve to a different environment on PATH), and skip the download when the\n",
"# package is already installed so a full re-run does not reinstall it\n",
"if not spacy.util.is_package('en_core_web_sm'):\n",
"    !\"{sys.executable}\" -m spacy download en_core_web_sm\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Cleaned_Description</th>\n",
" <th>Entities</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>crews working on 61114 in area of former produ...</td>\n",
" <td>[(61114, CARDINAL), (61114, CARDINAL)]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>historical release discovered during removal o...</td>\n",
" <td>[]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>historical release discovered during removal o...</td>\n",
" <td>[]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>the night operator noticed a high level alarm ...</td>\n",
" <td>[]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>on may 24 2014 in anticipation of potential fl...</td>\n",
" <td>[(may 24 2014, DATE), (m365636736, PERSON), (8...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Cleaned_Description \\\n",
"0 crews working on 61114 in area of former produ... \n",
"1 historical release discovered during removal o... \n",
"2 historical release discovered during removal o... \n",
"3 the night operator noticed a high level alarm ... \n",
"4 on may 24 2014 in anticipation of potential fl... \n",
"\n",
" Entities \n",
"0 [(61114, CARDINAL), (61114, CARDINAL)] \n",
"1 [] \n",
"2 [] \n",
"3 [] \n",
"4 [(may 24 2014, DATE), (m365636736, PERSON), (8... "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import spacy\n",
"\n",
"# Load the pre-trained en_core_web_sm pipeline used below for entity extraction\n",
"nlp = spacy.load('en_core_web_sm')\n",
"\n",
"# Named-entity extraction helper\n",
"def extract_entities(text):\n",
"    \"\"\"Return the entities the loaded pipeline finds in text as (text, label) tuples.\"\"\"\n",
"    return [(span.text, span.label_) for span in nlp(text).ents]\n",
"\n",
"# Run NER on the original descriptions, not on Cleaned_Description: lowercased,\n",
"# punctuation-free text produced spurious entities (the date 6/11/14 was tagged\n",
"# as CARDINAL 61114). Nulls are replaced with '' so the pipeline always gets a string\n",
"df['Entities'] = df['Spill Description'].fillna('').apply(extract_entities)\n",
"\n",
"# Display the entities next to the text they were extracted from\n",
"df[['Spill Description', 'Entities']].head()\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['Document #', 'Report', 'Operator', 'Operator #', 'Tracking #', 'Initial Report Date', 'Date of Discovery', 'Spill Type', 'Qtr Qtr', 'Section', 'Township', 'range', 'meridian', 'Latitude', 'Longitude', 'Municipality', 'county', 'Facility Type', 'Facility ID', 'API County Code', 'API Sequence Number', 'Spilled outside of berms', 'More than five barrels spilled', 'Oil Spill Volume', 'Condensate Spill Volume', 'Flow Back Spill Volume', 'Produced Water Spill Volume', 'E&P Waste Spill Volume', 'Other Waste', 'Drilling Fluid Spill Volume', 'Current Land Use', 'Other Land Use', 'Weather Conditions', 'Surface Owner', 'Surface Owner Other', 'Waters of the State', 'Residence / Occupied Structure', 'livestock', 'Public Byway', 'Surface Water Supply Area', 'Spill Description', 'Supplemental Report Date', 'Oil BBLs Spilled', 'Oil BBLs Recovered', 'Oil Unknown', 'Condensate BBLs Spilled', 'Condensate BBLs Recovered', 'Condensate Unknown', 'Produced Water BBLs Spilled', 'Produced Water BBLs Recovered', 'Produced Water Unknown', 'Drilling Fluid BBLs Spilled', 'Drilling Fluid BBLs Recovered', 'Drilling Fluid Unknown', 'Flow Back Fluid BBLs Spilled', 'Flow Back Fluid BBLs Recovered', 'Flow Back Fluid Unkown', 'Other E&P Waste BBLS Spilled', 'Other E&P Waste BBLS Recovered', 'Other E&P Waste Unknown', 'Other E&P Waste', 'Spill Contained within Berm', 'Emergency Pit Constructed', 'soil', 'groundwater', 'Surface Water', 'Dry Drainage Feature', 'Surface Area Length', 'Surface Area Width', 'Depth of Impact in Feet', 'Depth of Impact in Inches', 'Area Depth Determined', 'Geology Description', 'Depth to Groundwater', 'Water wells in area', 'Water Wells', 'Water Wells None', 'Surface Water Near', 'Surface Water None', 'Wetlands', 'Wetlands None', 'Springs', 'Springs None', 'Livestock Near', 'Livestock None', 'Occupied Buildings', 'Occupied Buildings None', 'Additional Spill Details', 'Supplemental Report Date CA', 'Human Error', 'Equipment Failure', 'Historical Unkown', 'Other', 'Other 
Description', 'Root Cause', 'Preventative Measures', 'Soil Excavated', 'Offsite Disposal', 'Onsite Treatment', 'Other Disposition', 'Other Disposition Description', 'Ground Water Removed', 'Surface Water Removed', 'Corrective Actions Completed', 'Approved Form 27', 'Form 27 Project Number', 'GEOID', 'TRACT_NAME', 'total_population', 'white_population', 'hispanic_population', 'median_household_income', 'poverty_population', 'unemployed_population', 'percent_white', 'percent_hispanic', 'percent_poverty', 'unemployment_rate', 'Cleaned_Description', 'Entities']\n"
]
}
],
"source": [
"# List every column name in the enriched DataFrame\n",
"print(list(df.columns))\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "funkyfunk",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 2
}