# %%
# All notebook imports live in this first cell so a fresh kernel can
# "Restart & Run All" without depending on imports scattered further down.
import os
import re

import geopandas as gpd
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.engine import URL

# %%
# Database connection details come from environment variables.
# NOTE(review): the original comment says these are set in zshrc; a kernel that
# was not launched from such a shell will not see them, hence the check below.
db_name = 'colorado_spills'
user = os.getenv('DB_USER')
password = os.getenv('DB_PASSWORD')
host = os.getenv('DB_HOST')
port = os.getenv('DB_PORT')

# Fail fast with a readable message instead of letting an unset variable be
# formatted into the connection URL as the literal text "None".
_required_env = {
    'DB_USER': user,
    'DB_PASSWORD': password,
    'DB_HOST': host,
    'DB_PORT': port,
}
_missing_env = [name for name, value in _required_env.items() if not value]
if _missing_env:
    raise RuntimeError(
        "Missing database environment variable(s): " + ", ".join(_missing_env)
    )

# Build the URL from its components rather than an f-string: URL.create escapes
# the user name and password, so characters such as '@', '/' or ':' in the
# password cannot corrupt the connection string.
engine = create_engine(
    URL.create(
        drivername='postgresql',
        username=user,
        password=password,
        host=host,
        port=int(port),
        database=db_name,
    )
)

# Read the spills_with_demographics table from the database into a DataFrame.
df = pd.read_sql_table('spills_with_demographics', engine)

# %%
# Display the first few rows of the Spill Description column
df['Spill Description'].head()
# %%
def preprocess_text(text):
    """Normalize one spill description before vectorization.

    The text has punctuation removed, runs of whitespace collapsed to a single
    space, leading/trailing whitespace trimmed, and is converted to lowercase.

    Punctuation is stripped *before* whitespace is collapsed: deleting a
    punctuation mark that sits between two spaces (e.g. ``"a - b"``) would
    otherwise leave a doubled space in the result.

    Parameters
    ----------
    text : str or missing value
        Raw description. Scalar missing values (``None``, ``NaN``, ``pd.NA``)
        are accepted; any other non-string value is converted with ``str()``.

    Returns
    -------
    str
        The cleaned text, or ``''`` for a missing value, so every row of the
        resulting column is a string.
    """
    if not isinstance(text, str):
        # A missing description would make re.sub raise TypeError.
        if pd.api.types.is_scalar(text) and pd.isna(text):
            return ''
        text = str(text)

    text = re.sub(r'[^\w\s]', '', text)        # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()   # Collapse and trim whitespace
    return text.lower()                        # Convert to lowercase

# %%
# Apply the preprocessing to the Spill Description column
df['Cleaned_Description'] = df['Spill Description'].apply(preprocess_text)

# Display the cleaned text for the first few rows
df['Cleaned_Description'].head()

# %%
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_df=0.9, min_df=2, stop_words='english')

# Fit and transform the cleaned descriptions
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Cleaned_Description'])

# Get the feature names (i.e., the words)
feature_names = tfidf_vectorizer.get_feature_names_out()

# Create a DataFrame with TF-IDF scores.
# NOTE: toarray() materializes the full dense documents-by-terms matrix.
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

# Display the top 10 words with the highest average TF-IDF score
top_keywords = tfidf_df.mean().sort_values(ascending=False).head(10)
top_keywords
\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.8/12.8 MB\u001b[0m \u001b[31m83.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m0:01\u001b[0m\n", "\u001b[?25hRequirement already satisfied: spacy<3.8.0,>=3.7.2 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from en-core-web-sm==3.7.1) (3.7.5)\n", "Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.0.12)\n", "Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.0.5)\n", "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.0.10)\n", "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.0.8)\n", "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.0.9)\n", "Requirement already satisfied: thinc<8.3.0,>=8.2.2 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (8.2.5)\n", "Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.1.2)\n", "Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.4.8)\n", "Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from 
spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.0.10)\n", "Requirement already satisfied: weasel<0.5.0,>=0.1.0 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.4.1)\n", "Requirement already satisfied: typer<1.0.0,>=0.3.0 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.12.3)\n", "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (4.66.4)\n", "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.32.3)\n", "Requirement already satisfied: pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.8.2)\n", "Requirement already satisfied: jinja2 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.1.4)\n", "Requirement already satisfied: setuptools in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (70.0.0)\n", "Requirement already satisfied: packaging>=20.0 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (24.1)\n", "Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.4.0)\n", "Requirement already satisfied: numpy>=1.19.0 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.26.4)\n", "Requirement already satisfied: language-data>=1.2 in 
/home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from langcodes<4.0.0,>=3.2.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.2.0)\n", "Requirement already satisfied: annotated-types>=0.4.0 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.7.0)\n", "Requirement already satisfied: pydantic-core==2.20.1 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.20.1)\n", "Requirement already satisfied: typing-extensions>=4.6.1 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (4.12.2)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.3.2)\n", "Requirement already satisfied: idna<4,>=2.5 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.7)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.2.1)\n", "Requirement already satisfied: certifi>=2017.4.17 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2024.7.4)\n", "Requirement already satisfied: blis<0.8.0,>=0.7.8 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from thinc<8.3.0,>=8.2.2->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.7.10)\n", "Requirement already satisfied: confection<1.0.0,>=0.0.1 in 
/home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from thinc<8.3.0,>=8.2.2->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.1.4)\n", "Requirement already satisfied: click>=8.0.0 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from typer<1.0.0,>=0.3.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (8.1.7)\n", "Requirement already satisfied: shellingham>=1.3.0 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from typer<1.0.0,>=0.3.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.5.4)\n", "Requirement already satisfied: rich>=10.11.0 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from typer<1.0.0,>=0.3.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (13.7.1)\n", "Requirement already satisfied: cloudpathlib<1.0.0,>=0.7.0 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from weasel<0.5.0,>=0.1.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.18.1)\n", "Requirement already satisfied: smart-open<8.0.0,>=5.2.1 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from weasel<0.5.0,>=0.1.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (7.0.4)\n", "Requirement already satisfied: MarkupSafe>=2.0 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from jinja2->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.1.5)\n", "Requirement already satisfied: marisa-trie>=0.7.7 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from language-data>=1.2->langcodes<4.0.0,>=3.2.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.1.0)\n", "Requirement already satisfied: markdown-it-py>=2.2.0 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from rich>=10.11.0->typer<1.0.0,>=0.3.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.0.0)\n", "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /home/dadams/miniconda3/envs/funkyfunk/lib/python3.10/site-packages (from 
# %%
import importlib.util
import subprocess
import sys

# Name of the spaCy pipeline package this notebook needs.
SPACY_MODEL = "en_core_web_sm"

# Download the model only when it is not already importable, so re-running the
# notebook does not hit the network (or flood the output with pip logs) again.
if importlib.util.find_spec(SPACY_MODEL) is None:
    # sys.executable is the interpreter running this kernel; a bare `python`
    # resolves through PATH and may install into a different environment.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", SPACY_MODEL],
        check=True,  # raise CalledProcessError if the download fails
    )
    # Let the import system notice the freshly installed package.
    importlib.invalidate_caches()
| \n", " | Cleaned_Description | \n", "Entities | \n", "
|---|---|---|
| 0 | \n", "crews working on 61114 in area of former produ... | \n", "[(61114, CARDINAL), (61114, CARDINAL)] | \n", "
| 1 | \n", "historical release discovered during removal o... | \n", "[] | \n", "
| 2 | \n", "historical release discovered during removal o... | \n", "[] | \n", "
| 3 | \n", "the night operator noticed a high level alarm ... | \n", "[] | \n", "
| 4 | \n", "on may 24 2014 in anticipation of potential fl... | \n", "[(may 24 2014, DATE), (m365636736, PERSON), (8... | \n", "