Files
california-equity-git/analysis/database_setup.ipynb
2025-01-26 19:24:23 -08:00

327 lines
24 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting pandas\n",
" Using cached pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)\n",
"Collecting geopandas\n",
" Using cached geopandas-1.0.1-py3-none-any.whl.metadata (2.2 kB)\n",
"Collecting sqlalchemy\n",
" Using cached SQLAlchemy-2.0.37-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)\n",
"Collecting psycopg2-binary\n",
" Using cached psycopg2_binary-2.9.10-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)\n",
"Requirement already satisfied: numpy>=1.26.0 in /home/dadams/Repos/california_equity_git/.venv/lib/python3.12/site-packages (from pandas) (2.2.2)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in /home/dadams/Repos/california_equity_git/.venv/lib/python3.12/site-packages (from pandas) (2.9.0.post0)\n",
"Collecting pytz>=2020.1 (from pandas)\n",
" Using cached pytz-2024.2-py2.py3-none-any.whl.metadata (22 kB)\n",
"Collecting tzdata>=2022.7 (from pandas)\n",
" Downloading tzdata-2025.1-py2.py3-none-any.whl.metadata (1.4 kB)\n",
"Collecting pyogrio>=0.7.2 (from geopandas)\n",
" Using cached pyogrio-0.10.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (5.5 kB)\n",
"Requirement already satisfied: packaging in /home/dadams/Repos/california_equity_git/.venv/lib/python3.12/site-packages (from geopandas) (24.2)\n",
"Collecting pyproj>=3.3.0 (from geopandas)\n",
" Using cached pyproj-3.7.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (31 kB)\n",
"Collecting shapely>=2.0.0 (from geopandas)\n",
" Using cached shapely-2.0.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)\n",
"Collecting greenlet!=0.4.17 (from sqlalchemy)\n",
" Using cached greenlet-3.1.1-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (3.8 kB)\n",
"Collecting typing-extensions>=4.6.0 (from sqlalchemy)\n",
" Using cached typing_extensions-4.12.2-py3-none-any.whl.metadata (3.0 kB)\n",
"Collecting certifi (from pyogrio>=0.7.2->geopandas)\n",
" Using cached certifi-2024.12.14-py3-none-any.whl.metadata (2.3 kB)\n",
"Requirement already satisfied: six>=1.5 in /home/dadams/Repos/california_equity_git/.venv/lib/python3.12/site-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)\n",
"Using cached pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.7 MB)\n",
"Using cached geopandas-1.0.1-py3-none-any.whl (323 kB)\n",
"Using cached SQLAlchemy-2.0.37-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)\n",
"Using cached psycopg2_binary-2.9.10-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)\n",
"Using cached greenlet-3.1.1-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (613 kB)\n",
"Using cached pyogrio-0.10.0-cp312-cp312-manylinux_2_28_x86_64.whl (24.0 MB)\n",
"Using cached pyproj-3.7.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.5 MB)\n",
"Using cached pytz-2024.2-py2.py3-none-any.whl (508 kB)\n",
"Using cached shapely-2.0.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.5 MB)\n",
"Using cached typing_extensions-4.12.2-py3-none-any.whl (37 kB)\n",
"Downloading tzdata-2025.1-py2.py3-none-any.whl (346 kB)\n",
"Using cached certifi-2024.12.14-py3-none-any.whl (164 kB)\n",
"Installing collected packages: pytz, tzdata, typing-extensions, shapely, psycopg2-binary, greenlet, certifi, sqlalchemy, pyproj, pyogrio, pandas, geopandas\n",
"Successfully installed certifi-2024.12.14 geopandas-1.0.1 greenlet-3.1.1 pandas-2.2.3 psycopg2-binary-2.9.10 pyogrio-0.10.0 pyproj-3.7.0 pytz-2024.2 shapely-2.0.6 sqlalchemy-2.0.37 typing-extensions-4.12.2 tzdata-2025.1\n"
]
}
],
"source": [
"!pip install pandas geopandas sqlalchemy psycopg2-binary"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"A module that was compiled using NumPy 1.x cannot be run in\n",
"NumPy 2.2.2 as it may crash. To support both 1.x and 2.x\n",
"versions of NumPy, modules must be compiled with NumPy 2.0.\n",
"Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.\n",
"\n",
"If you are a user of the module, the easiest solution will be to\n",
"downgrade to 'numpy<2' or try to upgrade the affected module.\n",
"We expect that some modules will need time to support NumPy 2.\n",
"\n",
"Traceback (most recent call last): File \"<frozen runpy>\", line 198, in _run_module_as_main\n",
" File \"<frozen runpy>\", line 88, in _run_code\n",
" File \"/home/dadams/Repos/california_equity_git/.venv/lib/python3.12/site-packages/ipykernel_launcher.py\", line 18, in <module>\n",
" app.launch_new_instance()\n",
" File \"/home/dadams/Repos/california_equity_git/.venv/lib/python3.12/site-packages/traitlets/config/application.py\", line 1075, in launch_instance\n",
" app.start()\n",
" File \"/home/dadams/Repos/california_equity_git/.venv/lib/python3.12/site-packages/ipykernel/kernelapp.py\", line 739, in start\n",
" self.io_loop.start()\n",
" File \"/home/dadams/Repos/california_equity_git/.venv/lib/python3.12/site-packages/tornado/platform/asyncio.py\", line 205, in start\n",
" self.asyncio_loop.run_forever()\n",
" File \"/home/dadams/miniconda3/lib/python3.12/asyncio/base_events.py\", line 641, in run_forever\n",
" self._run_once()\n",
" File \"/home/dadams/miniconda3/lib/python3.12/asyncio/base_events.py\", line 1986, in _run_once\n",
" handle._run()\n",
" File \"/home/dadams/miniconda3/lib/python3.12/asyncio/events.py\", line 88, in _run\n",
" self._context.run(self._callback, *self._args)\n",
" File \"/home/dadams/Repos/california_equity_git/.venv/lib/python3.12/site-packages/ipykernel/kernelbase.py\", line 545, in dispatch_queue\n",
" await self.process_one()\n",
" File \"/home/dadams/Repos/california_equity_git/.venv/lib/python3.12/site-packages/ipykernel/kernelbase.py\", line 534, in process_one\n",
" await dispatch(*args)\n",
" File \"/home/dadams/Repos/california_equity_git/.venv/lib/python3.12/site-packages/ipykernel/kernelbase.py\", line 437, in dispatch_shell\n",
" await result\n",
" File \"/home/dadams/Repos/california_equity_git/.venv/lib/python3.12/site-packages/ipykernel/ipkernel.py\", line 362, in execute_request\n",
" await super().execute_request(stream, ident, parent)\n",
" File \"/home/dadams/Repos/california_equity_git/.venv/lib/python3.12/site-packages/ipykernel/kernelbase.py\", line 778, in execute_request\n",
" reply_content = await reply_content\n",
" File \"/home/dadams/Repos/california_equity_git/.venv/lib/python3.12/site-packages/ipykernel/ipkernel.py\", line 449, in do_execute\n",
" res = shell.run_cell(\n",
" File \"/home/dadams/Repos/california_equity_git/.venv/lib/python3.12/site-packages/ipykernel/zmqshell.py\", line 549, in run_cell\n",
" return super().run_cell(*args, **kwargs)\n",
" File \"/home/dadams/Repos/california_equity_git/.venv/lib/python3.12/site-packages/IPython/core/interactiveshell.py\", line 3075, in run_cell\n",
" result = self._run_cell(\n",
" File \"/home/dadams/Repos/california_equity_git/.venv/lib/python3.12/site-packages/IPython/core/interactiveshell.py\", line 3130, in _run_cell\n",
" result = runner(coro)\n",
" File \"/home/dadams/Repos/california_equity_git/.venv/lib/python3.12/site-packages/IPython/core/async_helpers.py\", line 128, in _pseudo_sync_runner\n",
" coro.send(None)\n",
" File \"/home/dadams/Repos/california_equity_git/.venv/lib/python3.12/site-packages/IPython/core/interactiveshell.py\", line 3334, in run_cell_async\n",
" has_raised = await self.run_ast_nodes(code_ast.body, cell_name,\n",
" File \"/home/dadams/Repos/california_equity_git/.venv/lib/python3.12/site-packages/IPython/core/interactiveshell.py\", line 3517, in run_ast_nodes\n",
" if await self.run_code(code, result, async_=asy):\n",
" File \"/home/dadams/Repos/california_equity_git/.venv/lib/python3.12/site-packages/IPython/core/interactiveshell.py\", line 3577, in run_code\n",
" exec(code_obj, self.user_global_ns, self.user_ns)\n",
" File \"/tmp/ipykernel_794405/3254404226.py\", line 2, in <module>\n",
" import geopandas as gpd\n",
" File \"/home/dadams/Repos/california_equity_git/.venv/lib/python3.12/site-packages/geopandas/__init__.py\", line 1, in <module>\n",
" from geopandas._config import options\n",
" File \"/home/dadams/Repos/california_equity_git/.venv/lib/python3.12/site-packages/geopandas/_config.py\", line 109, in <module>\n",
" default_value=_default_use_pygeos(),\n",
" File \"/home/dadams/Repos/california_equity_git/.venv/lib/python3.12/site-packages/geopandas/_config.py\", line 95, in _default_use_pygeos\n",
" import geopandas._compat as compat\n",
" File \"/home/dadams/Repos/california_equity_git/.venv/lib/python3.12/site-packages/geopandas/_compat.py\", line 9, in <module>\n",
" import shapely\n",
" File \"/home/dadams/Repos/california_equity_git/.venv/lib/python3.12/site-packages/shapely/__init__.py\", line 1, in <module>\n",
" from shapely.lib import GEOSException # NOQA\n"
]
},
{
"ename": "AttributeError",
"evalue": "_ARRAY_API not found",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;31mAttributeError\u001b[0m: _ARRAY_API not found"
]
},
{
"ename": "ImportError",
"evalue": "numpy.core.multiarray failed to import",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[1], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mpandas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mpd\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mgeopandas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mgpd\u001b[39;00m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01msqlalchemy\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m create_engine\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mnumpy\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mnp\u001b[39;00m\n",
"File \u001b[0;32m~/Repos/california_equity_git/.venv/lib/python3.12/site-packages/geopandas/__init__.py:1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mgeopandas\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_config\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m options\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mgeopandas\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mgeoseries\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m GeoSeries\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mgeopandas\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mgeodataframe\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m GeoDataFrame\n",
"File \u001b[0;32m~/Repos/california_equity_git/.venv/lib/python3.12/site-packages/geopandas/_config.py:109\u001b[0m\n\u001b[1;32m 102\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mgeopandas\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_compat\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mcompat\u001b[39;00m\n\u001b[1;32m 104\u001b[0m compat\u001b[38;5;241m.\u001b[39mset_use_pygeos(value)\n\u001b[1;32m 107\u001b[0m use_pygeos \u001b[38;5;241m=\u001b[39m Option(\n\u001b[1;32m 108\u001b[0m key\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124muse_pygeos\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m--> 109\u001b[0m default_value\u001b[38;5;241m=\u001b[39m\u001b[43m_default_use_pygeos\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m,\n\u001b[1;32m 110\u001b[0m doc\u001b[38;5;241m=\u001b[39m(\n\u001b[1;32m 111\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mWhether to use PyGEOS to speed up spatial operations. The default is True \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 112\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mif PyGEOS is installed, and follows the USE_PYGEOS environment variable \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 113\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mif set.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 114\u001b[0m ),\n\u001b[1;32m 115\u001b[0m validator\u001b[38;5;241m=\u001b[39m_validate_bool,\n\u001b[1;32m 116\u001b[0m callback\u001b[38;5;241m=\u001b[39m_callback_use_pygeos,\n\u001b[1;32m 117\u001b[0m )\n\u001b[1;32m 120\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21m_validate_io_engine\u001b[39m(value):\n\u001b[1;32m 121\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m value \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
"File \u001b[0;32m~/Repos/california_equity_git/.venv/lib/python3.12/site-packages/geopandas/_config.py:95\u001b[0m, in \u001b[0;36m_default_use_pygeos\u001b[0;34m()\u001b[0m\n\u001b[1;32m 94\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21m_default_use_pygeos\u001b[39m():\n\u001b[0;32m---> 95\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mgeopandas\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_compat\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mcompat\u001b[39;00m\n\u001b[1;32m 97\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m compat\u001b[38;5;241m.\u001b[39mUSE_PYGEOS\n",
"File \u001b[0;32m~/Repos/california_equity_git/.venv/lib/python3.12/site-packages/geopandas/_compat.py:9\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mnumpy\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mnp\u001b[39;00m\n\u001b[1;32m 8\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mpandas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mpd\u001b[39;00m\n\u001b[0;32m----> 9\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mshapely\u001b[39;00m\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mshapely\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mgeos\u001b[39;00m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;66;03m# -----------------------------------------------------------------------------\u001b[39;00m\n\u001b[1;32m 14\u001b[0m \u001b[38;5;66;03m# pandas compat\u001b[39;00m\n\u001b[1;32m 15\u001b[0m \u001b[38;5;66;03m# -----------------------------------------------------------------------------\u001b[39;00m\n",
"File \u001b[0;32m~/Repos/california_equity_git/.venv/lib/python3.12/site-packages/shapely/__init__.py:1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mshapely\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mlib\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m GEOSException \u001b[38;5;66;03m# NOQA\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mshapely\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mlib\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m Geometry \u001b[38;5;66;03m# NOQA\u001b[39;00m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mshapely\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mlib\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m geos_version, geos_version_string \u001b[38;5;66;03m# NOQA\u001b[39;00m\n",
"\u001b[0;31mImportError\u001b[0m: numpy.core.multiarray failed to import"
]
}
],
"source": [
"import pandas as pd\n",
"import geopandas as gpd\n",
"from sqlalchemy import create_engine\n",
"import numpy as np\n",
"from datetime import datetime\n",
"\n",
"import os\n",
"\n",
"# database variables\n",
"DB_USER = os.getenv('DB_USER')\n",
"DB_PASSWORD = os.getenv('DB_PASSWORD')\n",
"DB_HOST = os.getenv('DB_HOST')\n",
"DB_PORT = os.getenv('DB_PORT')\n",
"\n",
"\n",
"# Create database connection\n",
"def create_db_engine():\n",
" return create_engine('postgresql://' + DB_USER + ':' + DB_PASSWORD + '@' + DB_HOST + ':' + DB_PORT + '/cci_db')\n",
"\n",
"# Load and clean CES data\n",
"def process_ces_data(filepath):\n",
" print(\"Processing CalEnviroScreen data...\")\n",
" gdf = gpd.read_file(filepath)\n",
" \n",
" # Clean and standardize column names\n",
" gdf.columns = [col.lower().replace(' ', '_') for col in gdf.columns]\n",
" \n",
" # Convert tract ID to string and ensure it's clean\n",
" gdf['tract'] = gdf['tract'].astype(str).str.strip()\n",
" \n",
" # Select and rename relevant columns\n",
" ces_data = gdf[['tract', 'zip', 'county', 'approxloc', 'totpop19',\n",
" 'ciscore', 'ciscorep', 'ozone', 'ozonep', 'pm2_5',\n",
" 'pm2_5_p', 'drinkwat', 'drinkwatp', 'poverty',\n",
" 'povertyp', 'unempl', 'unemplp', 'housburd',\n",
" 'housburdp', 'geometry']]\n",
" \n",
" # Rename columns to match database schema\n",
" column_map = {\n",
" 'tract': 'tract_id',\n",
" 'zip': 'zip_code',\n",
" 'approxloc': 'approx_loc',\n",
" 'totpop19': 'total_pop_19',\n",
" 'ciscore': 'ci_score',\n",
" 'ciscorep': 'ci_score_pctl',\n",
" 'pm2_5': 'pm25',\n",
" 'pm2_5_p': 'pm25_pctl',\n",
" 'drinkwat': 'drinking_water',\n",
" 'drinkwatp': 'drinking_water_pctl',\n",
" 'housburd': 'housing_burden',\n",
" 'housburdp': 'housing_burden_pctl',\n",
" 'geometry': 'geom'\n",
" }\n",
" ces_data = ces_data.rename(columns=column_map)\n",
" \n",
" return ces_data\n",
"\n",
"# Load and clean CCI data\n",
"def process_cci_data(filepath):\n",
" print(\"Processing CCI project data...\")\n",
" df = pd.read_csv(filepath, low_memory=False)\n",
" \n",
" # Clean column names\n",
" df.columns = [col.lower().replace(' ', '_') for col in df.columns]\n",
" \n",
" # Convert date columns\n",
" df['date_operational'] = pd.to_datetime(df['date_operational'])\n",
" \n",
" # Filter date range\n",
" df = df[\n",
" (df['date_operational'] >= '2015-01-01') &\n",
" (df['date_operational'] <= '2024-12-31')\n",
" ]\n",
" \n",
" # Process project partners into array\n",
" df['project_partners'] = df['project_partners'].fillna('')\n",
" df['project_partners'] = df['project_partners'].apply(\n",
" lambda x: '{' + ','.join([p.strip() for p in str(x).split(',')]) + '}'\n",
" if x else '{}'\n",
" )\n",
" \n",
" # Select and prepare relevant columns\n",
" cci_data = df[[\n",
" 'project_idnumber', 'reporting_cycle_name', 'agency_name',\n",
" 'program_name', 'program_description', 'project_name',\n",
" 'project_type', 'project_description', 'date_operational',\n",
" 'census_tract', 'county', 'total_program_ggrffunding',\n",
" 'total_project_ghgreductions', 'is_benefit_disadvantaged_communities',\n",
" 'project_partners'\n",
" ]]\n",
" \n",
" # Rename columns to match schema\n",
" column_map = {\n",
" 'project_idnumber': 'project_id',\n",
" 'reporting_cycle_name': 'reporting_cycle',\n",
" 'total_program_ggrffunding': 'total_funding',\n",
" 'total_project_ghgreductions': 'ghg_reduction',\n",
" 'is_benefit_disadvantaged_communities': 'dac_benefit'\n",
" }\n",
" cci_data = cci_data.rename(columns=column_map)\n",
" \n",
" # Convert boolean columns\n",
" cci_data['dac_benefit'] = cci_data['dac_benefit'].astype(bool)\n",
" \n",
" return cci_data\n",
"\n",
"# Main loading function\n",
"def load_data_to_db():\n",
" try:\n",
" engine = create_db_engine()\n",
" \n",
" # Load CES data\n",
" ces_data = process_ces_data('california_enviroscreen/calif_enviroscreen_shape/CES4 Final Shapefile.shp')\n",
" print(\"Loading CES data to database...\")\n",
" ces_data.to_postgis('ces_data', engine, if_exists='replace', index=False)\n",
" \n",
" # Load CCI data\n",
" cci_data = process_cci_data('data_raw/cci_programs_data.csv')\n",
" print(\"Loading CCI data to database...\")\n",
" cci_data.to_sql('cci_projects', engine, if_exists='replace', index=False)\n",
" \n",
" print(\"Data loading completed successfully!\")\n",
" \n",
" # Return sample counts for verification\n",
" return {\n",
" 'ces_records': len(ces_data),\n",
" 'cci_records': len(cci_data)\n",
" }\n",
" \n",
" except Exception as e:\n",
" print(f\"Error loading data: {str(e)}\")\n",
" raise\n",
"\n",
"# Execute loading\n",
"record_counts = load_data_to_db()\n",
"print(\"\\nRecord counts:\")\n",
"print(f\"CES data: {record_counts['ces_records']} records\")\n",
"print(f\"CCI projects: {record_counts['cci_records']} records\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}