file organizations

This commit is contained in:
2025-04-09 20:26:45 -07:00
parent 3dd1bd6dee
commit c7c3d75ec9
26 changed files with 141960 additions and 466099 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,261 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Successfully connected to the database!\n",
"Dropping existing database objects...\n",
"Processing CalEnviroScreen data...\n",
"Loading CES data to database...\n",
"Processing CCI project data...\n",
"Loading CCI data to database...\n",
"Creating analysis views...\n",
"Data loading completed successfully!\n",
"\n",
"Record counts:\n",
"CES data: 8035 records\n",
"CCI projects: 120715 records\n"
]
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"import os\n",
"import geopandas as gpd\n",
"from sqlalchemy import create_engine, text\n",
"from datetime import datetime\n",
"\n",
"# Database configuration\n",
"DB_USER = os.getenv('DB_USER', 'postgres')\n",
"DB_PASSWORD = os.getenv('DB_PASSWORD', 'MandyLinkToby3')\n",
"DB_HOST = os.getenv('DB_HOST', '192.168.0.74')\n",
"DB_PORT = os.getenv('DB_PORT', '5432')\n",
"DB_NAME = 'calif_equity'\n",
"\n",
"# Set working directory\n",
"os.chdir('/home/dadams/Repos/california_equity_git')\n",
"\n",
"# Create database connection\n",
"def create_db_engine():\n",
" connection_string = f'postgresql://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}'\n",
" return create_engine(connection_string)\n",
"\n",
"def drop_existing_objects(engine):\n",
" \"\"\"Drop existing database objects in the correct order\"\"\"\n",
" with engine.connect() as connection:\n",
" # Drop views first\n",
" connection.execute(text(\"DROP VIEW IF EXISTS project_efficiency CASCADE\"))\n",
" connection.execute(text(\"DROP VIEW IF EXISTS regional_collaboration CASCADE\"))\n",
" # Then drop tables\n",
" connection.execute(text(\"DROP TABLE IF EXISTS agency_partnerships CASCADE\"))\n",
" connection.execute(text(\"DROP TABLE IF EXISTS cci_projects CASCADE\"))\n",
" connection.execute(text(\"DROP TABLE IF EXISTS ces_data CASCADE\"))\n",
" connection.commit()\n",
"\n",
"def process_ces_data(filepath):\n",
" print(\"Processing CalEnviroScreen data...\")\n",
" gdf = gpd.read_file(filepath)\n",
" \n",
" # Clean and standardize column names\n",
" gdf.columns = [col.lower().replace(' ', '_') for col in gdf.columns]\n",
" \n",
" # Convert tract ID to string and ensure it's clean\n",
" gdf['tract'] = gdf['tract'].astype(str).str.strip()\n",
" \n",
" # Select and rename relevant columns\n",
" ces_data = gdf[['tract', 'zip', 'county', 'approxloc', 'totpop19',\n",
" 'ciscore', 'ciscorep', 'ozone', 'ozonep', 'pm2_5',\n",
" 'pm2_5_p', 'drinkwat', 'drinkwatp', 'poverty',\n",
" 'povertyp', 'unempl', 'unemplp', 'housburd',\n",
" 'housburdp', 'geometry']]\n",
" \n",
" # Rename columns to match database schema\n",
" column_map = {\n",
" 'tract': 'tract_id',\n",
" 'zip': 'zip_code',\n",
" 'approxloc': 'approx_loc',\n",
" 'totpop19': 'total_pop_19',\n",
" 'ciscore': 'ci_score',\n",
" 'ciscorep': 'ci_score_pctl',\n",
" 'pm2_5': 'pm25',\n",
" 'pm2_5_p': 'pm25_pctl',\n",
" 'drinkwat': 'drinking_water',\n",
" 'drinkwatp': 'drinking_water_pctl',\n",
" 'housburd': 'housing_burden',\n",
" 'housburdp': 'housing_burden_pctl',\n",
" 'geometry': 'geom'\n",
" }\n",
" ces_data = ces_data.rename(columns=column_map)\n",
" \n",
" # Set the geometry column explicitly\n",
" ces_data = ces_data.set_geometry('geom')\n",
" \n",
" return ces_data\n",
"\n",
"def process_cci_data(filepath):\n",
" print(\"Processing CCI project data...\")\n",
" df = pd.read_csv(filepath, low_memory=False)\n",
" \n",
" # Clean column names\n",
" df.columns = [col.lower().replace(' ', '_') for col in df.columns]\n",
" \n",
" # Convert date columns\n",
" df['date_operational'] = pd.to_datetime(df['date_operational'])\n",
" \n",
" # Filter date range\n",
" df = df[\n",
" (df['date_operational'] >= '2015-01-01') &\n",
" (df['date_operational'] <= '2024-12-31')\n",
" ]\n",
" \n",
" # Process project partners\n",
" df['project_partners'] = df['project_partners'].fillna('')\n",
" \n",
" # Select and prepare relevant columns\n",
" cci_data = df[[\n",
" 'project_idnumber', 'reporting_cycle_name', 'agency_name',\n",
" 'program_name', 'program_description', 'project_name',\n",
" 'project_type', 'project_description', 'date_operational',\n",
" 'census_tract', 'county', 'total_program_ggrffunding',\n",
" 'total_project_ghgreductions', 'is_benefit_disadvantaged_communities',\n",
" 'project_partners'\n",
" ]]\n",
" \n",
" # Rename columns to match schema\n",
" column_map = {\n",
" 'project_idnumber': 'project_id',\n",
" 'reporting_cycle_name': 'reporting_cycle',\n",
" 'total_program_ggrffunding': 'total_funding',\n",
" 'total_project_ghgreductions': 'ghg_reduction',\n",
" 'is_benefit_disadvantaged_communities': 'dac_benefit'\n",
" }\n",
" cci_data = cci_data.rename(columns=column_map)\n",
" \n",
" # Convert boolean columns\n",
" cci_data['dac_benefit'] = cci_data['dac_benefit'].astype(bool)\n",
" \n",
" return cci_data\n",
"\n",
"def create_views(engine):\n",
" with engine.connect() as connection:\n",
" # Project efficiency view\n",
" connection.execute(text(\"\"\"\n",
" CREATE VIEW project_efficiency AS\n",
" SELECT \n",
" p.project_id,\n",
" p.program_name,\n",
" p.agency_name,\n",
" p.total_funding,\n",
" p.ghg_reduction,\n",
" p.dac_benefit,\n",
" CASE \n",
" WHEN p.total_funding > 0 THEN p.ghg_reduction / p.total_funding \n",
" ELSE 0 \n",
" END as ghg_efficiency,\n",
" c.ci_score as ces_score,\n",
" CASE \n",
" WHEN p.project_partners = '' THEN 0\n",
" ELSE (length(p.project_partners) - length(replace(p.project_partners, ',', '')) + 1)\n",
" END as partner_count\n",
" FROM cci_projects p\n",
" LEFT JOIN ces_data c ON cast(p.census_tract as text) = cast(c.tract_id as text)\n",
" \"\"\"))\n",
" \n",
" # Regional collaboration view\n",
" connection.execute(text(\"\"\"\n",
" CREATE VIEW regional_collaboration AS\n",
" SELECT \n",
" county,\n",
" COUNT(DISTINCT project_id) as project_count,\n",
" AVG(CASE \n",
" WHEN project_partners = '' THEN 0\n",
" ELSE (length(project_partners) - length(replace(project_partners, ',', '')) + 1)\n",
" END) as avg_partners,\n",
" SUM(total_funding) as total_funding,\n",
" SUM(CASE WHEN dac_benefit THEN 1 ELSE 0 END)::FLOAT / COUNT(*) as dac_rate,\n",
" SUM(ghg_reduction) / NULLIF(SUM(total_funding), 0) as region_efficiency\n",
" FROM cci_projects\n",
" GROUP BY county\n",
" \"\"\"))\n",
" \n",
" connection.commit()\n",
"\n",
"def load_data_to_db():\n",
" try:\n",
" engine = create_db_engine()\n",
" \n",
" # Drop existing objects first\n",
" print(\"Dropping existing database objects...\")\n",
" drop_existing_objects(engine)\n",
" \n",
" # Load CES data\n",
" ces_data = process_ces_data('/home/dadams/Repos/california_equity_git/california_enviroscreen/calif_enviroscreen_shape/CES4_Final_Shapefile.shp')\n",
" print(\"Loading CES data to database...\")\n",
" ces_data.to_postgis('ces_data', engine, if_exists='replace', index=False)\n",
" \n",
" # Load CCI data\n",
" cci_data = process_cci_data('/home/dadams/Repos/california_equity_git/data_raw/cci_programs_data.csv')\n",
" print(\"Loading CCI data to database...\")\n",
" cci_data.to_sql('cci_projects', engine, if_exists='replace', index=False)\n",
" \n",
" print(\"Creating analysis views...\")\n",
" create_views(engine)\n",
" \n",
" print(\"Data loading completed successfully!\")\n",
" \n",
" return {\n",
" 'ces_records': len(ces_data),\n",
" 'cci_records': len(cci_data)\n",
" }\n",
" \n",
" except Exception as e:\n",
" print(f\"Error loading data: {e}\")\n",
" raise\n",
"\n",
"# Test database connection\n",
"try:\n",
" engine = create_db_engine()\n",
" with engine.connect() as conn:\n",
" print(\"Successfully connected to the database!\")\n",
"except Exception as e:\n",
" print(f\"Error connecting to the database: {str(e)}\")\n",
"\n",
"# Execute loading\n",
"record_counts = load_data_to_db()\n",
"print(\"\\nRecord counts:\")\n",
"print(f\"CES data: {record_counts['ces_records']} records\")\n",
"print(f\"CCI projects: {record_counts['cci_records']} records\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -0,0 +1,321 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# calif equity\n",
"## Looking at collaboration components \n",
"Date: 2024-12-19"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Importing the necessary libraries\n",
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"## set directory\n",
"import os\n",
"os.chdir('/home/dadams/Repos/california_equity_git')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"data = pd.read_csv('data_raw/cci_programs_data.csv', low_memory=False)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['Project IDNumber', 'Reporting Cycle Name', 'Agency Name',\n",
" 'Program Name', 'Program Description', 'Sub Program Name',\n",
" 'Record Type', 'Project Name', 'Project Type', 'Project Description',\n",
" ...\n",
" 'Net Density DUA', 'Applicants Assisted', 'Invasive Cover 12 Months',\n",
" 'Invasive Cover 36 Months', 'Project Acreage', 'IS IAE',\n",
" 'Intermediary Admin Expenses Calc', 'PRIMARY_FUNDING_RECIPIENT_TYPE',\n",
" 'TRIBAL AFFILIATION', 'PROJECT PARTNERS'],\n",
" dtype='object', length=127)"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.columns"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"import geopandas as gpd\n",
"import os\n",
"from pathlib import Path\n",
"\n",
"def debug_shapefile(shapefile_path):\n",
" \"\"\"\n",
" Debug shapefile reading issues by checking file existence and required components.\n",
" \n",
" Args:\n",
" shapefile_path (str): Path to the .shp file\n",
" \n",
" Returns:\n",
" dict: Dictionary containing debug information\n",
" \"\"\"\n",
" base_path = Path(shapefile_path).parent\n",
" file_name = Path(shapefile_path).stem\n",
" \n",
" # Required shapefile components\n",
" required_extensions = ['.shp', '.shx', '.dbf']\n",
" optional_extensions = ['.prj', '.cpg', '.sbn', '.sbx']\n",
" \n",
" debug_info = {\n",
" 'file_exists': os.path.exists(shapefile_path),\n",
" 'parent_dir_exists': os.path.exists(base_path),\n",
" 'components': {},\n",
" 'file_sizes': {},\n",
" 'readable': False\n",
" }\n",
" \n",
" # Check for all component files\n",
" for ext in required_extensions + optional_extensions:\n",
" full_path = base_path / f\"{file_name}{ext}\"\n",
" exists = full_path.exists()\n",
" debug_info['components'][ext] = exists\n",
" if exists:\n",
" debug_info['file_sizes'][ext] = os.path.getsize(full_path)\n",
" \n",
" # Try reading with explicit driver\n",
" try:\n",
" gdf = gpd.read_file(shapefile_path, driver='ESRI Shapefile')\n",
" debug_info['readable'] = True\n",
" debug_info['num_features'] = len(gdf)\n",
" except Exception as e:\n",
" debug_info['error'] = str(e)\n",
" \n",
" return debug_info\n",
"\n",
"# Usage example\n",
"shapefile_path = \"/home/dadams/Repos/california_equity_git/california_enviroscreen/calif_enviroscreen_shape/CES4 Final Shapefile.shp\"\n",
"debug_results = debug_shapefile(shapefile_path)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Checking .shp file: True\n",
"File permissions: 644\n",
"Checking .shx file: True\n",
"File permissions: 644\n",
"Checking .dbf file: True\n",
"File permissions: 644\n"
]
}
],
"source": [
"import os\n",
"from pathlib import Path\n",
"\n",
"# Define base path and file name\n",
"base_path = '/home/dadams/Repos/california_equity_git/california_enviroscreen/calif_enviroscreen_shape'\n",
"file_name = 'CES4 Final Shapefile'\n",
"\n",
"# Check for existence of all required files\n",
"required_files = ['.shp', '.shx', '.dbf']\n",
"for ext in required_files:\n",
" full_path = os.path.join(base_path, file_name + ext)\n",
" print(f\"Checking {ext} file: {os.path.exists(full_path)}\")\n",
" if os.path.exists(full_path):\n",
" print(f\"File permissions: {oct(os.stat(full_path).st_mode)[-3:]}\")"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Approach 1 failed: '/home/dadams/Repos/california_equity_git/california_enviroscreen/calif_enviroscreen_shape/CES4 Final Shapefile.shp' not recognized as being in a supported file format. It might help to specify the correct driver explicitly by prefixing the file path with '<DRIVER>:', e.g. 'CSV:path'.\n",
"Approach 2 failed: '/home/dadams/Repos/california_equity_git/california_enviroscreen/calif_enviroscreen_shape/CES4 Final Shapefile.shp' not recognized as being in a supported file format. It might help to specify the correct driver explicitly by prefixing the file path with '<DRIVER>:', e.g. 'CSV:path'.\n",
"Directory contents: ['CES4 Final Shapefile.sbx', 'CES4 Final Shapefile.shp', 'CES4 Final Shapefile.sbn', 'CES4 Final Shapefile.shp.xml', 'CES4 Final Shapefile.shx', 'CES4 Final Shapefile.prj', 'CES4 Final Shapefile.cpg', 'CES4 Final Shapefile.dbf']\n"
]
}
],
"source": [
"import geopandas as gpd\n",
"\n",
"# Approach 1: Using absolute path with normalized separators\n",
"shapefile_path = Path(base_path) / f\"{file_name}.shp\"\n",
"try:\n",
" gdf = gpd.read_file(shapefile_path)\n",
"except Exception as e:\n",
" print(f\"Approach 1 failed: {e}\")\n",
"\n",
"# Approach 2: Using explicit ESRI Shapefile driver\n",
"try:\n",
" gdf = gpd.read_file(shapefile_path, driver='ESRI Shapefile')\n",
"except Exception as e:\n",
" print(f\"Approach 2 failed: {e}\")\n",
"\n",
"# Approach 3: Check if the directory is readable\n",
"try:\n",
" print(f\"Directory contents: {os.listdir(base_path)}\")\n",
"except Exception as e:\n",
" print(f\"Cannot list directory: {e}\")"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"GeoPandas version: 1.0.1\n",
"Fiona version: 1.10.1\n"
]
}
],
"source": [
"import geopandas as gpd\n",
"import fiona\n",
"print(f\"GeoPandas version: {gpd.__version__}\")\n",
"print(f\"Fiona version: {fiona.__version__}\")"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"ename": "DataSourceError",
"evalue": "'/home/dadams/Repos/california_equity_git/california_enviroscreen/calif_enviroscreen_shape/CES4 Final Shapefile.shp' not recognized as being in a supported file format. It might help to specify the correct driver explicitly by prefixing the file path with '<DRIVER>:', e.g. 'CSV:path'.",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mDataSourceError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[30], line 5\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;66;03m# Load the shapefile\u001b[39;00m\n\u001b[1;32m 4\u001b[0m shapefile_path \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m/home/dadams/Repos/california_equity_git/california_enviroscreen/calif_enviroscreen_shape/CES4 Final Shapefile.shp\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[0;32m----> 5\u001b[0m gdf \u001b[38;5;241m=\u001b[39m \u001b[43mgpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_file\u001b[49m\u001b[43m(\u001b[49m\u001b[43mshapefile_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 7\u001b[0m \u001b[38;5;66;03m# Print the head of the GeoDataFrame\u001b[39;00m\n\u001b[1;32m 8\u001b[0m \u001b[38;5;28mprint\u001b[39m(gdf\u001b[38;5;241m.\u001b[39mhead())\n",
"File \u001b[0;32m~/Repos/california_equity_git/.venv/lib/python3.12/site-packages/geopandas/io/file.py:294\u001b[0m, in \u001b[0;36m_read_file\u001b[0;34m(filename, bbox, mask, columns, rows, engine, **kwargs)\u001b[0m\n\u001b[1;32m 291\u001b[0m from_bytes \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m 293\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m engine \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpyogrio\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[0;32m--> 294\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_read_file_pyogrio\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 295\u001b[0m \u001b[43m \u001b[49m\u001b[43mfilename\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbbox\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbbox\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmask\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrows\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrows\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\n\u001b[1;32m 296\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 298\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m engine \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfiona\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 299\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m pd\u001b[38;5;241m.\u001b[39mapi\u001b[38;5;241m.\u001b[39mtypes\u001b[38;5;241m.\u001b[39mis_file_like(filename):\n",
"File \u001b[0;32m~/Repos/california_equity_git/.venv/lib/python3.12/site-packages/geopandas/io/file.py:547\u001b[0m, in \u001b[0;36m_read_file_pyogrio\u001b[0;34m(path_or_bytes, bbox, mask, rows, **kwargs)\u001b[0m\n\u001b[1;32m 538\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[1;32m 539\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThe \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124minclude_fields\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m and \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mignore_fields\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m keywords are deprecated, and \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 540\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwill be removed in a future release. You can use the \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcolumns\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m keyword \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 543\u001b[0m stacklevel\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m3\u001b[39m,\n\u001b[1;32m 544\u001b[0m )\n\u001b[1;32m 545\u001b[0m kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcolumns\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m kwargs\u001b[38;5;241m.\u001b[39mpop(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minclude_fields\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 547\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mpyogrio\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_dataframe\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpath_or_bytes\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbbox\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbbox\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/Repos/california_equity_git/.venv/lib/python3.12/site-packages/pyogrio/geopandas.py:265\u001b[0m, in \u001b[0;36mread_dataframe\u001b[0;34m(path_or_buffer, layer, encoding, columns, read_geometry, force_2d, skip_features, max_features, where, bbox, mask, fids, sql, sql_dialect, fid_as_index, use_arrow, on_invalid, arrow_to_pandas_kwargs, **kwargs)\u001b[0m\n\u001b[1;32m 260\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m use_arrow:\n\u001b[1;32m 261\u001b[0m \u001b[38;5;66;03m# For arrow, datetimes are read as is.\u001b[39;00m\n\u001b[1;32m 262\u001b[0m \u001b[38;5;66;03m# For numpy IO, datetimes are read as string values to preserve timezone info\u001b[39;00m\n\u001b[1;32m 263\u001b[0m \u001b[38;5;66;03m# as numpy does not directly support timezones.\u001b[39;00m\n\u001b[1;32m 264\u001b[0m kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdatetime_as_string\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[0;32m--> 265\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mread_func\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 266\u001b[0m \u001b[43m \u001b[49m\u001b[43mpath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 267\u001b[0m \u001b[43m \u001b[49m\u001b[43mlayer\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlayer\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 268\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mencoding\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 269\u001b[0m \u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 270\u001b[0m \u001b[43m \u001b[49m\u001b[43mread_geometry\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mread_geometry\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 271\u001b[0m \u001b[43m \u001b[49m\u001b[43mforce_2d\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgdal_force_2d\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 272\u001b[0m \u001b[43m \u001b[49m\u001b[43mskip_features\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mskip_features\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 273\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_features\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmax_features\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 274\u001b[0m \u001b[43m \u001b[49m\u001b[43mwhere\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mwhere\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 275\u001b[0m \u001b[43m \u001b[49m\u001b[43mbbox\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbbox\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 276\u001b[0m \u001b[43m \u001b[49m\u001b[43mmask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 277\u001b[0m \u001b[43m \u001b[49m\u001b[43mfids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 278\u001b[0m \u001b[43m \u001b[49m\u001b[43msql\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msql\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 279\u001b[0m \u001b[43m \u001b[49m\u001b[43msql_dialect\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msql_dialect\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 280\u001b[0m \u001b[43m \u001b[49m\u001b[43mreturn_fids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfid_as_index\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 281\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 282\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 284\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m use_arrow:\n\u001b[1;32m 285\u001b[0m meta, table \u001b[38;5;241m=\u001b[39m result\n",
"File \u001b[0;32m~/Repos/california_equity_git/.venv/lib/python3.12/site-packages/pyogrio/raw.py:198\u001b[0m, in \u001b[0;36mread\u001b[0;34m(path_or_buffer, layer, encoding, columns, read_geometry, force_2d, skip_features, max_features, where, bbox, mask, fids, sql, sql_dialect, return_fids, datetime_as_string, **kwargs)\u001b[0m\n\u001b[1;32m 59\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Read OGR data source into numpy arrays.\u001b[39;00m\n\u001b[1;32m 60\u001b[0m \n\u001b[1;32m 61\u001b[0m \u001b[38;5;124;03mIMPORTANT: non-linear geometry types (e.g., MultiSurface) are converted\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 194\u001b[0m \n\u001b[1;32m 195\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 196\u001b[0m dataset_kwargs \u001b[38;5;241m=\u001b[39m _preprocess_options_key_value(kwargs) \u001b[38;5;28;01mif\u001b[39;00m kwargs \u001b[38;5;28;01melse\u001b[39;00m {}\n\u001b[0;32m--> 198\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mogr_read\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 199\u001b[0m \u001b[43m \u001b[49m\u001b[43mget_vsi_path_or_buffer\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpath_or_buffer\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 200\u001b[0m \u001b[43m \u001b[49m\u001b[43mlayer\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlayer\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 201\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mencoding\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 202\u001b[0m \u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 203\u001b[0m \u001b[43m \u001b[49m\u001b[43mread_geometry\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mread_geometry\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 204\u001b[0m \u001b[43m \u001b[49m\u001b[43mforce_2d\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mforce_2d\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 205\u001b[0m \u001b[43m \u001b[49m\u001b[43mskip_features\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mskip_features\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 206\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_features\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmax_features\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 207\u001b[0m \u001b[43m \u001b[49m\u001b[43mwhere\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mwhere\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 208\u001b[0m \u001b[43m \u001b[49m\u001b[43mbbox\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbbox\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 209\u001b[0m \u001b[43m \u001b[49m\u001b[43mmask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m_mask_to_wkb\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmask\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 210\u001b[0m \u001b[43m \u001b[49m\u001b[43mfids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 211\u001b[0m \u001b[43m \u001b[49m\u001b[43msql\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msql\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 212\u001b[0m \u001b[43m \u001b[49m\u001b[43msql_dialect\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msql_dialect\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 213\u001b[0m \u001b[43m \u001b[49m\u001b[43mreturn_fids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_fids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 214\u001b[0m \u001b[43m \u001b[49m\u001b[43mdataset_kwargs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdataset_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 215\u001b[0m \u001b[43m \u001b[49m\u001b[43mdatetime_as_string\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdatetime_as_string\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 216\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/Repos/california_equity_git/.venv/lib/python3.12/site-packages/pyogrio/_io.pyx:1240\u001b[0m, in \u001b[0;36mpyogrio._io.ogr_read\u001b[0;34m()\u001b[0m\n",
"File \u001b[0;32m~/Repos/california_equity_git/.venv/lib/python3.12/site-packages/pyogrio/_io.pyx:216\u001b[0m, in \u001b[0;36mpyogrio._io.ogr_open\u001b[0;34m()\u001b[0m\n",
"\u001b[0;31mDataSourceError\u001b[0m: '/home/dadams/Repos/california_equity_git/california_enviroscreen/calif_enviroscreen_shape/CES4 Final Shapefile.shp' not recognized as being in a supported file format. It might help to specify the correct driver explicitly by prefixing the file path with '<DRIVER>:', e.g. 'CSV:path'."
]
}
],
"source": [
"import geopandas as gpd\n",
"\n",
"# Load the shapefile\n",
"shapefile_path = '/home/dadams/Repos/california_equity_git/california_enviroscreen/calif_enviroscreen_shape/CES4 Final Shapefile.shp'\n",
"gdf = gpd.read_file(shapefile_path)\n",
"\n",
"# Print the head of the GeoDataFrame\n",
"print(gdf.head())"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Total GGRF Funding: $8.13B\n",
"Number of projects: 131428\n"
]
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# Basic cleaning\n",
"data['Date Operational'] = pd.to_datetime(data['Date Operational'])\n",
"data = data[\n",
" (data['Date Operational'] >= '2010-01-01') & \n",
" (data['Date Operational'] <= '2024-11-01')\n",
"].copy()\n",
"\n",
"# Remove rows with no GGRF funding\n",
"data = data.dropna(subset=['Total Program GGRFFunding'])\n",
"\n",
"# Add derived columns\n",
"data['Year'] = data['Date Operational'].dt.year\n",
"data['is_multi_county'] = data['County'].str.contains(',', na=False)\n",
"data['partnership_size'] = data['County'].str.count(',').fillna(0) + 1\n",
"\n",
"# Quick validation\n",
"print(f\"Total GGRF Funding: ${data['Total Program GGRFFunding'].sum()/1e9:.2f}B\")\n",
"print(f\"Number of projects: {len(data)}\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

File diff suppressed because one or more lines are too long

Binary file not shown.

After

Width:  |  Height:  |  Size: 23 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 127 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 124 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 36 KiB

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

Binary file not shown.

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

Binary file not shown.

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

104
archive/two_way_table.csv Normal file

File diff suppressed because one or more lines are too long