In [1]:
# Project: California Equity
## File: initial_view/overview_hypotesting_20241031.ipynb
### Author: David P. Adams
### Date: 2024-10-31
In [2]:
# Importing the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
In [3]:
## set directory
import os
os.chdir('/home/dadams/Repos/california_equity_git')
In [4]:
# read in the data
data = pd.read_csv('data_raw/cci_programs_data.csv', low_memory=False)
In [5]:
columns_data = pd.DataFrame(data.columns)
# export the 'DataFrame' to a csv file
columns_data.to_csv('data_raw/columns.csv', index=False)
In [6]:
# check the data types
data.dtypes
# save the data types to a csv file
data.dtypes.to_csv('data_raw/data_types.csv', header = False)
In [7]:
# describe the data
data.describe()
Out[7]:
| Census Tract | Total Project Cost | Total Program GGRFFunding | Total Project GHGReductions | Annual Project GHGReductions | Project Count | Total GGRFDisadvantaged Community Funding | Funding Benefiting Disadvantaged Communities | Estimated Num Vehicles In Service | Funding Within Disadvantage Communities | ... | Indirect Jobs Fte | Induced Jobs Fte | Compost Produced Tons | Compost Produced Tons Yr | Net Density DUA | Applicants Assisted | Invasive Cover 12 Months | Invasive Cover 36 Months | Project Acreage | Intermediary Admin Expenses Calc | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1.193700e+05 | 1.414290e+05 | 1.414290e+05 | 1.414290e+05 | 141429.000000 | 141429.000000 | 5.518700e+04 | 55187.000000 | 141429.000000 | 5.518700e+04 | ... | 141429.000000 | 141429.000000 | 141429.000000 | 141429.0 | 141429.000000 | 141429.000000 | 141429.000000 | 141429.000000 | 1.414290e+05 | 1.414290e+05 |
| mean | 6.053889e+09 | 9.206412e+05 | 7.791664e+04 | 7.717972e+02 | 0.205389 | 4.090872 | 2.736820e+04 | 110.217551 | 0.047331 | 2.030028e+04 | ... | 0.064567 | 0.117056 | 0.440977 | 0.0 | 0.082260 | 0.019642 | 0.010656 | 0.010578 | 9.771087e+00 | 1.911114e+03 |
| std | 2.641870e+07 | 3.736191e+07 | 1.011645e+06 | 2.371604e+04 | 3.361723 | 18.381861 | 6.327936e+05 | 1738.772195 | 1.972262 | 5.590536e+05 | ... | 1.444316 | 2.261146 | 45.712955 | 0.0 | 4.382593 | 1.406914 | 0.758250 | 0.757677 | 3.669526e+03 | 1.196317e+05 |
| min | 6.001400e+09 | 0.000000e+00 | 0.000000e+00 | -2.785930e+05 | 0.000000 | 0.000000 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000e+00 | ... | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000e+00 | 0.000000e+00 |
| 25% | 6.037271e+09 | 4.000000e+03 | 3.500000e+03 | 8.000000e+00 | 0.000000 | 1.000000 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000e+00 | ... | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000e+00 | 0.000000e+00 |
| 50% | 6.059022e+09 | 8.000000e+03 | 7.500000e+03 | 1.500000e+01 | 0.000000 | 1.000000 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000e+00 | ... | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000e+00 | 0.000000e+00 |
| 75% | 6.073016e+09 | 2.150000e+04 | 1.900000e+04 | 4.000000e+01 | 0.000000 | 3.000000 | 5.500000e+03 | 0.000000 | 0.000000 | 1.500000e+03 | ... | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000e+00 | 0.000000e+00 |
| max | 6.115041e+09 | 5.767700e+09 | 1.412670e+08 | 4.748924e+06 | 336.000000 | 2072.000000 | 6.443700e+07 | 102348.000000 | 503.000000 | 6.443700e+07 | ... | 110.170000 | 151.000000 | 10365.000000 | 0.0 | 706.000000 | 320.000000 | 85.000000 | 85.000000 | 1.380000e+06 | 2.000000e+07 |
8 rows × 82 columns
In [8]:
import geopandas as gpd
# Load the shapefile
shapefile_path = '/home/dadams/Repos/california_equity_git/california_enviroscreen/calif_enviroscreen_shape/CES4 Final Shapefile.shp'
gdf = gpd.read_file(shapefile_path)
# Print the head of the GeoDataFrame
print(gdf.head())
Tract ZIP County ApproxLoc TotPop19 CIscore \
0 6.083002e+09 93454 Santa Barbara Santa Maria 4495 36.019653
1 6.083002e+09 93455 Santa Barbara Santa Maria 13173 37.030667
2 6.083002e+09 93454 Santa Barbara Santa Maria 2398 31.213140
3 6.083002e+09 93455 Santa Barbara Orcutt 4496 6.639331
4 6.083002e+09 93455 Santa Barbara Orcutt 4008 14.022852
CIscoreP Ozone OzoneP PM2_5 ... Elderly65 Hispanic \
0 69.162885 0.034190 10.566273 7.567724 ... 12.5028 68.9210
1 70.637922 0.035217 11.561917 7.624775 ... 5.3519 78.6229
2 61.069087 0.034190 10.566273 7.548835 ... 12.8857 65.7214
3 5.988401 0.036244 13.615432 7.660570 ... 14.4128 22.9537
4 23.121533 0.036244 13.615432 7.663210 ... 18.8872 33.4082
White AfricanAm NativeAm OtherMult Shape_Leng Shape_Area \
0 20.8899 0.4004 0.2670 1.3126 6999.357689 2.847611e+06
1 13.2240 2.5051 0.0000 0.9489 19100.578232 1.635292e+07
2 30.6088 0.9591 0.0000 2.1685 4970.985897 1.352329e+06
3 69.1948 0.9342 0.7117 2.5356 6558.956012 2.417717e+06
4 59.7804 0.6986 1.4721 1.3723 6570.368730 2.608422e+06
AAPI geometry
0 8.2091 POLYGON ((-39795.07 -341919.191, -38126.384 -3...
1 4.6990 POLYGON ((-39795.07 -341919.191, -39803.632 -3...
2 0.5421 POLYGON ((-38115.747 -341130.248, -38126.384 -...
3 3.6699 POLYGON ((-37341.662 -348530.437, -37252.307 -...
4 3.2685 POLYGON ((-39465.107 -348499.262, -38244.305 -...
[5 rows x 67 columns]
In [9]:
# 1. Fix the classification function
def classify_project_type_v2(row):
program = str(row['Program Name']).lower()
project_type = str(row['Project Type']).lower()
# Individual incentive programs
if program == 'low carbon transportation':
if 'voucher' in project_type.lower():
return 'Individual - Vehicle'
return 'Transportation - Other'
elif program in ['low-income weatherization program', 'woodsmoke reduction program']:
return 'Individual - Home Energy'
elif program == 'water-energy efficiency':
return 'Water-Energy Projects'
elif program == 'community air protection':
return 'Air Quality Projects'
elif program == 'climate smart agriculture':
return 'Agricultural Projects'
elif 'voucher' in project_type.lower():
return 'Individual - Other'
elif program == 'transformative climate communities':
return 'Community Projects'
# Institutional by recipient type
elif pd.notna(row['PRIMARY_FUNDING_RECIPIENT_TYPE']):
recipient = str(row['PRIMARY_FUNDING_RECIPIENT_TYPE'])
if recipient in ['Government Agency', 'Government agency', 'California Native American Tribe']:
return 'Institutional - Government'
elif recipient == 'Educational institution':
return 'Institutional - Education'
elif recipient == 'Nonprofit':
return 'Institutional - Nonprofit'
elif recipient == 'Business':
if row['Total Project Cost'] > 100000:
return 'Institutional - Large Business'
return 'Business - Small'
return 'Other'
# 2. Add safe division function for metrics
def safe_divide(a, b):
if b == 0:
return 0
return a / b
# 3. Recalculate metrics
data['project_category'] = data.apply(classify_project_type_v2, axis=1)
data['dac_funding_ratio'] = data.apply(lambda x: safe_divide(x['Total GGRFDisadvantaged Community Funding'], x['Total Project Cost']), axis=1)
data['ghg_per_dollar'] = data.apply(lambda x: safe_divide(x['Total Project GHGReductions'], x['Total Project Cost']), axis=1)
# 4. Rerun analysis
updated_analysis = data.groupby('project_category').agg({
'Total Project Cost': ['count', 'sum', 'mean'],
'Total Project GHGReductions': ['sum', 'mean'],
'dac_funding_ratio': ['mean', 'median'],
'ghg_per_dollar': ['mean', 'median']
}).round(4)
print("\nUpdated Project Category Distribution:")
print(data['project_category'].value_counts())
print("\nUpdated Analysis:")
print(updated_analysis)
Updated Project Category Distribution:
project_category
Transportation - Other 101427
Other 14340
Individual - Home Energy 7427
Air Quality Projects 5514
Water-Energy Projects 5111
Individual - Vehicle 4538
Agricultural Projects 1688
Community Projects 973
Institutional - Government 269
Institutional - Nonprofit 100
Institutional - Large Business 23
Institutional - Education 17
Business - Small 2
Name: count, dtype: int64
Updated Analysis:
Total Project Cost \
count sum mean
project_category
Agricultural Projects 1688 886514974 5.251866e+05
Air Quality Projects 5514 1211608583 2.197331e+05
Business - Small 2 65376 3.268800e+04
Community Projects 973 508817727 5.229370e+05
Individual - Home Energy 7427 320008647 4.308720e+04
Individual - Vehicle 4538 1286418141 2.834769e+05
Institutional - Education 17 5561672 3.271572e+05
Institutional - Government 269 16939974420 6.297388e+07
Institutional - Large Business 23 513921827 2.234443e+07
Institutional - Nonprofit 100 1402247011 1.402247e+07
Other 14340 104572013338 7.292330e+06
Transportation - Other 101427 2473090031 2.438296e+04
Water-Energy Projects 5111 85125916 1.665543e+04
Total Project GHGReductions \
sum mean
project_category
Agricultural Projects 24249378 14365.7453
Air Quality Projects 250801 45.4844
Business - Small 0 0.0000
Community Projects 154010 158.2837
Individual - Home Energy 693914 93.4313
Individual - Vehicle 993992 219.0375
Institutional - Education 457 26.8824
Institutional - Government 6580289 24462.0409
Institutional - Large Business 388663 16898.3913
Institutional - Nonprofit 4791834 47918.3400
Other 67313019 4694.0738
Transportation - Other 3312930 32.6632
Water-Energy Projects 425225 83.1980
dac_funding_ratio ghg_per_dollar \
mean median mean
project_category
Agricultural Projects 0.2388 0.0000 0.0106
Air Quality Projects NaN NaN 0.0002
Business - Small NaN NaN 0.0000
Community Projects NaN NaN 0.0004
Individual - Home Energy 0.6504 0.7405 0.0038
Individual - Vehicle 0.1660 0.0245 0.0017
Institutional - Education NaN NaN 0.0008
Institutional - Government 0.0000 0.0000 0.0022
Institutional - Large Business NaN NaN 0.0038
Institutional - Nonprofit 0.0000 0.0000 0.0182
Other 0.1483 0.0000 0.0023
Transportation - Other 0.4190 0.0000 0.0027
Water-Energy Projects 0.4359 0.0000 0.1027
median
project_category
Agricultural Projects 0.0051
Air Quality Projects 0.0000
Business - Small 0.0000
Community Projects 0.0000
Individual - Home Energy 0.0026
Individual - Vehicle 0.0010
Institutional - Education 0.0000
Institutional - Government 0.0001
Institutional - Large Business 0.0005
Institutional - Nonprofit 0.0004
Other 0.0001
Transportation - Other 0.0032
Water-Energy Projects 0.0096
This refined analysis reveals some fascinating patterns about California's climate investments and their environmental justice implications. Let's break down the key findings:
- Program Distribution and Scale:
- Transportation dominates numerically:
- Transportation-Other: 101,427 projects (71.6% of all projects)
- Individual Vehicle incentives: 4,538 projects
- Individual programs have high reach but moderate costs:
- Home Energy: 7,427 projects (~$43K average)
- Vehicle incentives: 4,538 projects (~$283K average)
- Institutional projects are fewer but larger:
- Government: 269 projects (~$63M average)
- Nonprofits: 100 projects (~$14M average)
- Environmental Justice (DAC Funding Ratio):
- Home Energy programs show strongest DAC focus (65% DAC funding ratio)
- Water-Energy (44%) and Transportation-Other (42%) also show good DAC targeting
- Individual Vehicle programs have lower DAC funding (17%)
- Institutional programs show minimal DAC funding ratios
- GHG Reduction Efficiency (GHG per dollar):
- Water-Energy Projects are most efficient (0.1027 tons/$)
- Nonprofit projects show high efficiency (0.0182 tons/$)
- Agricultural Projects also perform well (0.0106 tons/$)
- Individual incentives show lower efficiency:
- Home Energy: 0.0038 tons/$
- Vehicle: 0.0017 tons/$
Key Environmental Justice Implications:
- Individual vs Institutional Trade-offs:
- Individual programs (especially home energy) are better at reaching disadvantaged communities
- But institutional programs tend to be more cost-effective for GHG reductions
- This suggests a potential tension between equity and efficiency goals
- Program-Specific Patterns:
- Home Energy programs show the best balance of DAC reach and moderate GHG efficiency
- Vehicle incentives have lower DAC reach and efficiency, supporting your hypothesis about wealth bias
- Water-Energy projects show promising combination of good DAC reach and high efficiency
- Policy Implications:
- Consider expanding successful programs like Water-Energy that combine good DAC reach with high efficiency
- Look for ways to improve DAC access to vehicle incentives
- Consider bundling institutional and individual programs to balance efficiency and equity goals
Would you like to explore any of these aspects in more detail? We could:
- Drill deeper into geographic distribution
- Analyze temporal trends in these patterns
- Look at specific program characteristics that drive DAC success
In [10]:
import matplotlib.pyplot as plt
import seaborn as sns
# Prepare data for plotting
plot_data = data.groupby('project_category').agg({
'Total Project Cost': ['count', 'mean'],
'dac_funding_ratio': 'mean',
'ghg_per_dollar': 'mean'
}).reset_index()
# Flatten column names
plot_data.columns = ['project_category', 'count', 'avg_cost', 'dac_ratio', 'ghg_efficiency']
# Create figure with multiple subplots
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(15, 20))
# 1. Project Counts
sns.barplot(data=plot_data,
x='count',
y='project_category',
ax=ax1,
palette='viridis')
ax1.set_title('Number of Projects by Category')
ax1.set_xlabel('Number of Projects')
# 2. DAC Funding Ratio
mask = plot_data['dac_ratio'].notna() # Filter out NaN values
sns.barplot(data=plot_data[mask],
x='dac_ratio',
y='project_category',
ax=ax2,
palette='viridis')
ax2.set_title('Disadvantaged Community Funding Ratio by Category')
ax2.set_xlabel('DAC Funding Ratio')
# 3. GHG Efficiency
mask = plot_data['ghg_efficiency'].notna() # Filter out NaN values
sns.barplot(data=plot_data[mask],
x='ghg_efficiency',
y='project_category',
ax=ax3,
palette='viridis')
ax3.set_title('GHG Reduction Efficiency by Category (tons CO2e per dollar)')
ax3.set_xlabel('GHG Reduction per Dollar')
plt.tight_layout()
plt.show()
# Create a scatter plot to show relationship between project size and efficiency
plt.figure(figsize=(12, 8))
plt.scatter(plot_data['avg_cost'],
plot_data['ghg_efficiency'],
alpha=0.6)
# Add labels for each point
for i, row in plot_data.iterrows():
plt.annotate(row['project_category'],
(row['avg_cost'], row['ghg_efficiency']),
xytext=(5, 5),
textcoords='offset points')
plt.xscale('log') # Use log scale for cost due to wide range
plt.xlabel('Average Project Cost (log scale)')
plt.ylabel('GHG Reduction Efficiency (tons CO2e per dollar)')
plt.title('Project Cost vs GHG Reduction Efficiency')
plt.grid(True)
plt.show()
/tmp/ipykernel_846869/3534785368.py:18: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. sns.barplot(data=plot_data, /tmp/ipykernel_846869/3534785368.py:28: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. sns.barplot(data=plot_data[mask], /tmp/ipykernel_846869/3534785368.py:38: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. sns.barplot(data=plot_data[mask],
In [11]:
# First let's see how many counties we have data for and their distribution
county_analysis = data.groupby(['County', 'project_category']).agg({
'Total Project Cost': ['count', 'sum'],
'Total Project GHGReductions': 'sum',
'dac_funding_ratio': 'mean'
}).round(2)
# Reset index for easier manipulation
county_analysis = county_analysis.reset_index()
# Flatten column names
county_analysis.columns = ['County', 'project_category', 'project_count', 'total_cost', 'total_ghg', 'dac_ratio']
print("\nTop 10 Counties by Project Count:")
print(data['County'].value_counts().head(10))
print("\nProject Categories by County (top 5 counties):")
top_counties = data['County'].value_counts().head(5).index
for county in top_counties:
print(f"\n{county}:")
county_cats = data[data['County'] == county]['project_category'].value_counts()
print(county_cats)
# Calculate percentage of projects in each county by category
county_category_pcts = pd.crosstab(data['County'],
data['project_category'],
values=data['Total Project Cost'],
aggfunc='sum',
normalize='index') * 100
print("\nPercentage of Funding by Category in Top 5 Counties:")
print(county_category_pcts.loc[top_counties].round(2))
Top 10 Counties by Project Count: County Los Angeles 38032 Orange 10694 San Diego 8468 Fresno 7854 Santa Clara 7154 Alameda 6381 Riverside 6190 San Bernardino 5159 Sacramento 4781 San Joaquin 4202 Name: count, dtype: int64 Project Categories by County (top 5 counties): Los Angeles: project_category Transportation - Other 30411 Individual - Home Energy 2233 Individual - Vehicle 2187 Water-Energy Projects 1945 Other 794 Air Quality Projects 352 Community Projects 79 Institutional - Government 19 Institutional - Nonprofit 6 Agricultural Projects 3 Institutional - Large Business 3 Name: count, dtype: int64 Orange: project_category Transportation - Other 9866 Individual - Vehicle 290 Individual - Home Energy 184 Water-Energy Projects 169 Other 129 Air Quality Projects 51 Institutional - Government 4 Community Projects 1 Name: count, dtype: int64 San Diego: project_category Transportation - Other 7571 Other 375 Individual - Vehicle 294 Air Quality Projects 105 Individual - Home Energy 83 Agricultural Projects 30 Institutional - Government 6 Institutional - Nonprofit 3 Community Projects 1 Name: count, dtype: int64 Fresno: project_category Transportation - Other 2541 Other 2144 Air Quality Projects 1747 Individual - Home Energy 769 Water-Energy Projects 338 Agricultural Projects 142 Individual - Vehicle 96 Community Projects 64 Institutional - Government 7 Institutional - Education 3 Institutional - Large Business 2 Institutional - Nonprofit 1 Name: count, dtype: int64 Santa Clara: project_category Transportation - Other 6214 Water-Energy Projects 530 Individual - Vehicle 177 Other 89 Individual - Home Energy 74 Air Quality Projects 50 Agricultural Projects 14 Institutional - Government 3 Institutional - Nonprofit 2 Institutional - Large Business 1 Name: count, dtype: int64 Percentage of Funding by Category in Top 5 Counties: project_category Agricultural Projects Air Quality Projects \ County Los Angeles 0.00 0.36 Orange 0.00 1.44 San Diego 0.05 1.61 Fresno 6.12 10.44 Santa Clara 0.03 0.61 project_category Business - Small Community Projects \ County Los Angeles 0.0 0.14 Orange 0.0 0.15 San Diego 0.0 0.00 Fresno 0.0 17.55 Santa Clara 0.0 0.00 project_category Individual - Home Energy Individual - Vehicle \ County Los Angeles 0.06 0.91 Orange 0.09 2.90 San Diego 0.10 1.45 Fresno 4.74 2.50 Santa Clara 0.12 2.07 project_category Institutional - Education Institutional - Government \ County Los Angeles 0.00 5.35 Orange 0.00 37.65 San Diego 0.00 2.90 Fresno 0.02 0.73 Santa Clara 0.00 15.35 project_category Institutional - Large Business Institutional - Nonprofit \ County Los Angeles 0.26 0.71 Orange 0.00 0.00 San Diego 0.00 2.54 Fresno 4.07 0.01 Santa Clara 3.88 0.04 project_category Other Transportation - Other Water-Energy Projects County Los Angeles 91.48 0.73 0.00 Orange 51.20 6.46 0.12 San Diego 88.01 3.33 0.00 Fresno 47.73 5.73 0.36 Santa Clara 73.37 4.43 0.09
In [12]:
# Define a more distinct color palette that groups similar categories
colors = {
'Agricultural Projects': '#228B22', # Forest Green
'Air Quality Projects': '#87CEEB', # Sky Blue
'Business - Small': '#DDA0DD', # Plum
'Community Projects': '#FF7F50', # Coral
'Individual - Home Energy': '#FFD700', # Gold
'Individual - Vehicle': '#DAA520', # Goldenrod
'Institutional - Education': '#4B0082', # Indigo
'Institutional - Government': '#800000', # Maroon
'Institutional - Large Business': '#4682B4', # Steel Blue
'Institutional - Nonprofit': '#2E8B57', # Sea Green
'Other': '#808080', # Grey
'Transportation - Other': '#FF4500', # Orange Red
'Water-Energy Projects': '#00CED1' # Turquoise
}
plt.figure(figsize=(15, 8))
county_category_pcts.loc[top_counties].plot(kind='bar',
stacked=True,
color=[colors[x] for x in county_category_pcts.columns])
plt.title('Distribution of Project Funding by Category Across Top Counties')
plt.xlabel('County')
plt.ylabel('Percentage of Total Funding')
plt.legend(bbox_to_anchor=(1.05, 1),
loc='upper left',
title='Project Categories')
plt.tight_layout()
plt.show()
# Create a visualization for DAC funding ratio by county
plt.figure(figsize=(15, 8))
sns.boxplot(data=data[data['dac_funding_ratio'].notna()],
x='County',
y='dac_funding_ratio',
order=top_counties,
palette='viridis')
plt.title('DAC Funding Ratio Distribution by County')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
<Figure size 1500x800 with 0 Axes>
/tmp/ipykernel_846869/2919179271.py:33: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(data=data[data['dac_funding_ratio'].notna()],
In [13]:
# Clearer organization
def classify_for_ej_analysis(row):
program = str(row['Program Name']).lower()
project_type = str(row['Project Type']).lower()
# Individual incentives
if program == 'low carbon transportation' and 'voucher' in project_type.lower():
return 'Individual - Vehicle'
elif program == 'low-income weatherization program':
return 'Individual - Solar/Energy'
elif program == 'woodsmoke reduction program':
return 'Individual - Solar/Energy'
# Large projects
elif row['Total Project Cost'] > 100000: # threshold for "large" projects
return 'Large Projects'
else:
return 'Other'
data['ej_category'] = data.apply(classify_for_ej_analysis, axis=1)
# Analyze environmental justice metrics by category
ej_analysis = data.groupby('ej_category').agg({
'Total Project Cost': ['count', 'sum', 'mean'],
'dac_funding_ratio': ['mean', 'median'],
'Is Benefit Disadvantaged Communities': 'mean',
'Is Low Income Communities': 'mean',
'Total Project GHGReductions': 'sum'
}).round(4)
print("Environmental Justice Analysis by Category:")
print(ej_analysis)
# Calculate percentage of funds going to different income communities
ej_distribution = data.groupby('ej_category').agg({
'Total Project Cost': 'sum',
'Total GGRFDisadvantaged Community Funding': 'sum',
'Low Income Amount': 'sum'
})
ej_distribution['DAC_percentage'] = (ej_distribution['Total GGRFDisadvantaged Community Funding'] /
ej_distribution['Total Project Cost'] * 100)
ej_distribution['LowIncome_percentage'] = (ej_distribution['Low Income Amount'] /
ej_distribution['Total Project Cost'] * 100)
print("\nPercentage of Funding to Disadvantaged and Low-Income Communities:")
print(ej_distribution[['DAC_percentage', 'LowIncome_percentage']].round(2))
# Visualize the distribution
plt.figure(figsize=(12, 6))
bar_width = 0.35
categories = ej_distribution.index
x = np.arange(len(categories))
plt.bar(x - bar_width/2, ej_distribution['DAC_percentage'],
bar_width, label='DAC Funding %', alpha=0.8)
plt.bar(x + bar_width/2, ej_distribution['LowIncome_percentage'],
bar_width, label='Low Income Funding %', alpha=0.8)
plt.xlabel('Project Category')
plt.ylabel('Percentage of Total Funding')
plt.title('Distribution of Funding to Disadvantaged and Low-Income Communities')
plt.xticks(x, categories, rotation=45)
plt.legend()
plt.tight_layout()
plt.show()
# Let's also look at the per-project impact
impact_analysis = data.groupby('ej_category').agg({
'Total Project GHGReductions': 'mean',
'Total Project Cost': 'mean'
}).round(2)
impact_analysis['GHG_reduction_per_dollar'] = (impact_analysis['Total Project GHGReductions'] /
impact_analysis['Total Project Cost']).round(4)
print("\nImpact Analysis:")
print(impact_analysis)
Environmental Justice Analysis by Category:
Total Project Cost \
count sum mean
ej_category
Individual - Solar/Energy 7427 320008647 4.308720e+04
Individual - Vehicle 4538 1286418141 2.834769e+05
Large Projects 10009 127060954384 1.269467e+07
Other 119455 1537986491 1.287503e+04
dac_funding_ratio \
mean median
ej_category
Individual - Solar/Energy 0.6504 0.7405
Individual - Vehicle 0.1660 0.0245
Large Projects 0.2761 0.0000
Other 0.4129 0.0000
Is Benefit Disadvantaged Communities \
mean
ej_category
Individual - Solar/Energy 0.7824
Individual - Vehicle 0.3455
Large Projects 0.0664
Other 0.3275
Is Low Income Communities \
mean
ej_category
Individual - Solar/Energy 0.9338
Individual - Vehicle 0.4125
Large Projects 0.5602
Other 0.5708
Total Project GHGReductions
sum
ej_category
Individual - Solar/Energy 693914
Individual - Vehicle 993992
Large Projects 103527784
Other 3938822
Percentage of Funding to Disadvantaged and Low-Income Communities:
DAC_percentage LowIncome_percentage
ej_category
Individual - Solar/Energy 41.46 4.97
Individual - Vehicle 5.13 2.01
Large Projects 0.91 1.47
Other 10.11 16.38
Impact Analysis:
Total Project GHGReductions Total Project Cost \
ej_category
Individual - Solar/Energy 93.43 43087.20
Individual - Vehicle 219.04 283476.89
Large Projects 10343.47 12694670.24
Other 32.97 12875.03
GHG_reduction_per_dollar
ej_category
Individual - Solar/Energy 0.0022
Individual - Vehicle 0.0008
Large Projects 0.0008
Other 0.0026
Patterns of Equity in California's Climate Investments¶
- Individual Solar/Energy Programs Show Strong Equity Focus:
- Highest DAC funding ratio (0.65 mean, 0.74 median)
- 78% benefit disadvantaged communities
- 93% benefit low-income communities
- 41.46% of funds go to DACs, highest among all categories
- However, relatively small total investment ($320M)
- Individual Vehicle Programs Show Wealth Disparity:
- Much lower DAC funding ratio (0.17 mean, 0.02 median)
- Only 34.5% benefit disadvantaged communities
- Only 5.13% of funds go to DACs
- Larger individual project costs (~$283K vs $43K for solar)
- This supports your hypothesis about vehicle incentives benefiting wealthier communities
- Large Projects Show Mixed Results:
- Low DAC funding ratio (0.28 mean, 0.00 median)
- Only 6.6% benefit disadvantaged communities
- But 56% benefit low-income communities
- Largest total investment ($127B)
- Highest total GHG reductions (103.5M tons)
Key Findings:
- the hypothesis about vehicle incentives primarily benefiting wealthier communities is strongly supported by the data
- However, individual solar/energy programs are actually very successful at reaching disadvantaged communities
- Large projects, while achieving the most total GHG reductions, have mixed equity outcomes - good at reaching low-income communities but not DACs
In [14]:
# 1. Analyze the overall spatial distribution of investments
county_analysis = data.groupby('County').agg({
'Total Project Cost': ['count', 'sum'],
'Total Project GHGReductions': 'sum',
'dac_funding_ratio': 'mean',
'Is Benefit Disadvantaged Communities': 'mean',
'Is Low Income Communities': 'mean'
}).round(4)
# Add per capita metrics (we would need to merge with county population data)
# Add GHG reduction per dollar by county
# 2. Analyze vulnerable communities specifically
vulnerability_analysis = pd.DataFrame({
'Total Projects': [
len(data),
len(data[data['Is Benefit Disadvantaged Communities'] == 1]),
len(data[data['Is Low Income Communities'] == 1])
],
'Total Investment': [
data['Total Project Cost'].sum(),
data[data['Is Benefit Disadvantaged Communities'] == 1]['Total Project Cost'].sum(),
data[data['Is Low Income Communities'] == 1]['Total Project Cost'].sum()
],
'GHG Reduction': [
data['Total Project GHGReductions'].sum(),
data[data['Is Benefit Disadvantaged Communities'] == 1]['Total Project GHGReductions'].sum(),
data[data['Is Low Income Communities'] == 1]['Total Project GHGReductions'].sum()
]
}, index=['All Communities', 'Disadvantaged Communities', 'Low Income Communities'])
# 3. Project type distribution in vulnerable communities
project_vulnerability = data.groupby(['project_category', 'Is Benefit Disadvantaged Communities']).agg({
'Total Project Cost': ['count', 'sum'],
'Total Project GHGReductions': 'sum'
}).round(4)
print("Overall Project Distribution by County:")
print(county_analysis.sort_values(('Total Project Cost', 'sum'), ascending=False).head(10))
print("\nVulnerable Communities Analysis:")
print(vulnerability_analysis)
print("\nProject Types in Vulnerable Communities:")
print(project_vulnerability)
# Visualizations
plt.figure(figsize=(15, 8))
# Plot investment distribution...
plt.subplot(1, 2, 1)
county_analysis[('Total Project Cost', 'sum')].sort_values(ascending=False).head(10).plot(kind='bar')
plt.title('Top 10 Counties by Total Investment')
plt.ylabel('Total Investment')
plt.xlabel('County')
# ...and GHG reductions
plt.subplot(1, 2, 2)
county_analysis[('Total Project GHGReductions', 'sum')].sort_values(ascending=False).head(10).plot(kind='bar')
plt.title('Top 10 Counties by GHG Reductions')
plt.ylabel('Total GHG Reductions')
plt.xlabel('County')
plt.tight_layout()
plt.show()
Overall Project Distribution by County:
Total Project Cost \
count
County
Los Angeles 38032
Alameda 6381
Alameda, Contra Costa, Fresno, Kern, Kings, Mad... 9
Sacramento 4781
San Diego 8468
San Francisco 2748
Santa Clara 7154
San Francisco, Santa Clara 6
Orange 10694
San Diego, Santa Barbara, Ventura 23
\
sum
County
Los Angeles 69936060480
Alameda 10379456877
Alameda, Contra Costa, Fresno, Kern, Kings, Mad... 7333200000
Sacramento 5858784824
San Diego 4870580185
San Francisco 3813460158
Santa Clara 3178166479
San Francisco, Santa Clara 2998442533
Orange 2383485407
San Diego, Santa Barbara, Ventura 2235315130
Total Project GHGReductions \
sum
County
Los Angeles 11307708
Alameda 10518799
Alameda, Contra Costa, Fresno, Kern, Kings, Mad... 546406
Sacramento 1975903
San Diego 2233426
San Francisco 1850982
Santa Clara 1793848
San Francisco, Santa Clara 1538067
Orange 1180528
San Diego, Santa Barbara, Ventura 791432
dac_funding_ratio \
mean
County
Los Angeles 0.5751
Alameda 0.2983
Alameda, Contra Costa, Fresno, Kern, Kings, Mad... NaN
Sacramento 0.4194
San Diego 0.1004
San Francisco 0.1057
Santa Clara 0.2247
San Francisco, Santa Clara 0.0101
Orange 0.3994
San Diego, Santa Barbara, Ventura NaN
Is Benefit Disadvantaged Communities \
mean
County
Los Angeles 0.5420
Alameda 0.2612
Alameda, Contra Costa, Fresno, Kern, Kings, Mad... 0.0000
Sacramento 0.3403
San Diego 0.0993
San Francisco 0.1121
Santa Clara 0.2200
San Francisco, Santa Clara 0.1667
Orange 0.3571
San Diego, Santa Barbara, Ventura 0.0000
Is Low Income Communities
mean
County
Los Angeles 0.6534
Alameda 0.5557
Alameda, Contra Costa, Fresno, Kern, Kings, Mad... 1.0000
Sacramento 0.5162
San Diego 0.5265
San Francisco 0.5761
Santa Clara 0.4911
San Francisco, Santa Clara 0.8333
Orange 0.6461
San Diego, Santa Barbara, Ventura 1.0000
Vulnerable Communities Analysis:
Total Projects Total Investment GHG Reduction
All Communities 141429 130205367663 109154512
Disadvantaged Communities 47167 14111685888 12626682
Low Income Communities 82595 103691488875 55739585
Project Types in Vulnerable Communities:
Total Project Cost \
count
project_category Is Benefit Disadvantaged Communities
Agricultural Projects False 1490
True 198
Air Quality Projects False 5514
Business - Small False 2
Community Projects False 973
Individual - Home Energy False 1616
True 5811
Individual - Vehicle False 2970
True 1568
Institutional - Education False 17
Institutional - Government False 269
Institutional - Large Business False 23
Institutional - Nonprofit False 100
Other False 13788
True 552
Transportation - Other False 64674
True 36753
Water-Energy Projects False 2826
True 2285
\
sum
project_category Is Benefit Disadvantaged Communities
Agricultural Projects False 824640269
True 61874705
Air Quality Projects False 1211608583
Business - Small False 65376
Community Projects False 508817727
Individual - Home Energy False 117426101
True 202582546
Individual - Vehicle False 839347111
True 447071030
Institutional - Education False 5561672
Institutional - Government False 16939974420
Institutional - Large Business False 513921827
Institutional - Nonprofit False 1402247011
Other False 91784138193
True 12787875145
Transportation - Other False 1888149934
True 584940097
Water-Energy Projects False 57783551
True 27342365
Total Project GHGReductions
sum
project_category Is Benefit Disadvantaged Communities
Agricultural Projects False 22903912
True 1345466
Air Quality Projects False 250801
Business - Small False 0
Community Projects False 154010
Individual - Home Energy False 203507
True 490407
Individual - Vehicle False 699758
True 294234
Institutional - Education False 457
Institutional - Government False 6580289
Institutional - Large Business False 388663
Institutional - Nonprofit False 4791834
Other False 57935224
True 9377795
Transportation - Other False 2392628
True 920302
Water-Energy Projects False 226747
True 198478
/tmp/ipykernel_846869/3616794259.py:63: UserWarning: Tight layout not applied. The bottom and top margins cannot be made large enough to accommodate all Axes decorations. plt.tight_layout()
In [15]:
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
# First, summarize the data at tract level
tract_summary = data.groupby('Census Tract').agg({
'Total Project Cost': ['count', 'sum'],
'Total Project GHGReductions': 'sum',
'Is Benefit Disadvantaged Communities': 'mean',
'Is Low Income Communities': 'mean'
}).round(4)
# Reset the index to make Census Tract a column
tract_summary = tract_summary.reset_index()
# Create a few different maps to show different aspects
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(20, 20))
# Function to create a map with proper legend
def create_map(data, column, title, ax, cmap):
im = ax.scatter(data['Census Tract'],
data[column],
c=data[column],
cmap=cmap)
plt.colorbar(im, ax=ax, label=title)
ax.set_title(title)
ax.axis('off')
# 1. Project Count Map
create_map(tract_summary,
('Total Project Cost', 'count'),
'Number of CCI Projects by Census Tract',
ax1,
'YlOrRd')
# 2. Total Investment Map
create_map(tract_summary,
('Total Project Cost', 'sum'),
'Total CCI Investment by Census Tract',
ax2,
'viridis')
# 3. DAC Benefits Map
create_map(tract_summary,
('Is Benefit Disadvantaged Communities', 'mean'),
'Proportion of Projects Benefiting DACs',
ax3,
'RdYlBu')
# 4. GHG Reductions Map
create_map(tract_summary,
('Total Project GHGReductions', 'sum'),
'Total GHG Reductions by Census Tract',
ax4,
'YlGn')
plt.tight_layout()
plt.show()
# Let's also look at the basic statistics
print("\nSpatial Distribution Statistics:")
print("\nTop 10 Census Tracts by Total Investment:")
print(tract_summary.nlargest(10, ('Total Project Cost', 'sum')))
print("\nTop 10 Census Tracts by Project Count:")
print(tract_summary.nlargest(10, ('Total Project Cost', 'count')))
print("\nTop 10 Census Tracts by GHG Reductions:")
print(tract_summary.nlargest(10, ('Total Project GHGReductions', 'sum')))
# Additional visualization of investment intensity
plt.figure(figsize=(15, 10))
tract_summary['investment_per_project'] = tract_summary[('Total Project Cost', 'sum')] / tract_summary[('Total Project Cost', 'count')]
plt.scatter(tract_summary[('Is Benefit Disadvantaged Communities', 'mean')],
tract_summary['investment_per_project'],
alpha=0.5)
plt.xlabel('Proportion of Projects Benefiting DACs')
plt.ylabel('Investment per Project ($)')
plt.title('Investment Intensity vs DAC Benefits')
plt.show()
Spatial Distribution Statistics:
Top 10 Census Tracts by Total Investment:
Census Tract Total Project Cost Total Project GHGReductions \
count sum sum
1686 6.037206e+09 369 128838282 71147
1701 6.037208e+09 246 79854037 21955
1250 6.037113e+09 115 67551314 45625
5715 6.073006e+09 38 65309556 1546
3340 6.037901e+09 61 37506825 44030
2391 6.037408e+09 207 34961176 31158
3482 6.037980e+09 67 30672161 23886
6642 6.081602e+09 61 26686653 14972
7164 6.085512e+09 49 24118370 17130
2268 6.037403e+09 81 23674333 11183
Is Benefit Disadvantaged Communities Is Low Income Communities
mean mean
1686 0.5176 0.0108
1701 0.7886 0.0447
1250 0.0087 0.0435
5715 0.0000 0.5789
3340 0.0000 0.9672
2391 0.8164 0.7923
3482 0.0000 0.0149
6642 0.0000 0.9508
7164 0.0000 0.0204
2268 0.3333 0.8148
Top 10 Census Tracts by Project Count:
Census Tract Total Project Cost Total Project GHGReductions \
count sum sum
1686 6.037206e+09 369 128838282 71147
1701 6.037208e+09 246 79854037 21955
2391 6.037408e+09 207 34961176 31158
6443 6.077002e+09 132 616605 650
6448 6.077002e+09 121 401633 284
1250 6.037113e+09 115 67551314 45625
6452 6.077003e+09 113 438571 580
2670 6.037504e+09 103 13744994 12914
6449 6.077002e+09 102 534104 625
2276 6.037403e+09 99 5585697 12839
Is Benefit Disadvantaged Communities Is Low Income Communities
mean mean
1686 0.5176 0.0108
1701 0.7886 0.0447
2391 0.8164 0.7923
6443 0.0985 0.9848
6448 0.0661 0.9835
1250 0.0087 0.0435
6452 0.0885 0.9469
2670 0.1942 0.0583
6449 0.1275 0.9804
2276 0.0000 0.1818
Top 10 Census Tracts by GHG Reductions:
Census Tract Total Project Cost Total Project GHGReductions \
count sum sum
419 6.009000e+09 13 6718556 735067
7330 6.091010e+09 8 1842014 730641
7963 6.113011e+09 21 9324110 297675
7411 6.095253e+09 17 2631219 163765
4432 6.061022e+09 13 1199979 78245
1686 6.037206e+09 369 128838282 71147
3677 6.053011e+09 26 5634176 53453
1250 6.037113e+09 115 67551314 45625
3340 6.037901e+09 61 37506825 44030
4166 6.059086e+09 84 18951369 32458
Is Benefit Disadvantaged Communities Is Low Income Communities
mean mean
419 0.0000 0.1538
7330 0.0000 0.7500
7963 0.0000 0.1429
7411 0.0000 0.2353
4432 0.0000 0.0000
1686 0.5176 0.0108
3677 0.7692 0.3462
1250 0.0087 0.0435
3340 0.0000 0.9672
4166 0.3214 0.2262
Analysis of Spatial Distribution and Environmental Justice Implications¶
Key Findings:¶
- Individual vs. Large Project Analysis Key Findings:
Individual Program Distribution:
- Home Energy/Solar Programs:
- Strong equity performance (78% benefit DACs)
- High DAC funding ratio (0.65)
- 93% benefit low-income communities
- Lower total investment ($320M)
- Vehicle Programs:
- Lower equity performance (35% benefit DACs)
- Low DAC funding ratio (0.17)
- Only 41% benefit low-income communities
- Higher individual costs (~$283K vs $43K for solar)
- Home Energy/Solar Programs:
Large Project Performance:
- Higher total investment ($127B)
- Highest total GHG reductions (103.5M tons)
- Mixed equity outcomes:
- Low DAC funding ratio (0.28)
- Only 6.6% benefit DACs
- But 56% benefit low-income communities
- Spatial Distribution Analysis Key Findings:
Geographic Concentration:
- Urban dominance (LA: 27% of projects, 54% of funds)
- Top 3 counties (LA, Alameda, San Diego) = 37% of projects
- Clear urban-rural divide in investment distribution
Environmental Justice Implications:
- DACs receive 33% of projects but only 11% of funding
- Low-income communities show better performance (58% of projects, 80% of funding)
- Significant variation in DAC benefit rates across regions
Next Steps:
- Individual vs. Large Project Analysis
- Detailed program design analysis:
- What makes home energy programs more successful at reaching DACs?
- What barriers exist in vehicle programs?
- How can large project DAC benefits be improved?
- Cost-effectiveness comparison:
- Calculate and compare GHG reduction per dollar across programs
- Analyze trade-offs between equity and efficiency
- Policy recommendations:
- Identify best practices from successful programs
- Suggest modifications for underperforming programs
- Spatial Distribution Analysis
- Enhanced geographic analysis:
- Create detailed maps showing investment patterns
- Analyze urban-rural disparities
- Examine relationship between CalEnviroScreen scores and investment patterns
- Environmental justice metrics:
- Develop composite indicators of investment equity
- Analyze temporal changes in distribution
- Compare to demographic and socioeconomic patterns
- Integration and Synthesis
- Combine findings from both approaches to:
- Identify overlapping patterns
- Develop comprehensive policy recommendations
- Create framework for evaluating climate investment equity
- Specific Data Analysis Tasks:
# 1. Create program efficiency metrics
program_efficiency = data.groupby('project_category').agg({
'Total Project GHGReductions': 'sum',
'Total Project Cost': 'sum'
}).assign(
efficiency = lambda x: x['Total Project GHGReductions'] / x['Total Project Cost']
)
# 2. Analyze spatial equity
spatial_equity = merged_data.groupby('CES_quartile').agg({
'Total Project Cost': ['sum', 'mean', 'count'],
'Total Project GHGReductions': ['sum', 'mean']
})
# 3. Create temporal analysis
temporal_patterns = data.groupby(['project_category', 'Date Operational']).agg({
'Total Project Cost': 'sum',
'Is Benefit Disadvantaged Communities': 'mean'
})
In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
# Read the data (I see you already have this loaded as 'data')
# 1. First, let's create our core analytical metrics
def calculate_program_metrics(df):
metrics = df.groupby('Program Name').agg({
'Total Project Cost': ['count', 'sum', 'mean'],
'Total Project GHGReductions': ['sum', 'mean'],
'Total GGRFDisadvantaged Community Funding': ['sum', 'mean'],
'Is Benefit Disadvantaged Communities': 'mean',
'Is Low Income Communities': 'mean'
}).round(2)
# Add efficiency metrics
metrics['GHG_per_dollar'] = (metrics[('Total Project GHGReductions', 'sum')] /
metrics[('Total Project Cost', 'sum')]).round(4)
metrics['DAC_funding_ratio'] = (metrics[('Total GGRFDisadvantaged Community Funding', 'sum')] /
metrics[('Total Project Cost', 'sum')]).round(4)
return metrics
program_metrics = calculate_program_metrics(data)
# Display top programs by different metrics
print("\nTop 5 Programs by Total Investment:")
print(program_metrics.sort_values(('Total Project Cost', 'sum'), ascending=False).head())
print("\nTop 5 Programs by GHG Reduction Efficiency:")
print(program_metrics.sort_values('GHG_per_dollar', ascending=False).head())
print("\nTop 5 Programs by DAC Funding Ratio:")
print(program_metrics.sort_values('DAC_funding_ratio', ascending=False).head())
Top 5 Programs by Total Investment:
Total Project Cost \
count
Program Name
Transit and Intercity Rail Capital Program 245
Low Carbon Transit Operations Program 1003
Affordable Housing and Sustainable Communities ... 151
Low Carbon Transportation 105965
Community Air Protection 5514
\
sum
Program Name
Transit and Intercity Rail Capital Program 101461750666
Low Carbon Transit Operations Program 8777590799
Affordable Housing and Sustainable Communities ... 8492944393
Low Carbon Transportation 3759508172
Community Air Protection 1211608583
\
mean
Program Name
Transit and Intercity Rail Capital Program 4.141296e+08
Low Carbon Transit Operations Program 8.751337e+06
Affordable Housing and Sustainable Communities ... 5.624466e+07
Low Carbon Transportation 3.547877e+04
Community Air Protection 2.197331e+05
Total Project GHGReductions \
sum
Program Name
Transit and Intercity Rail Capital Program 23458701
Low Carbon Transit Operations Program 6971510
Affordable Housing and Sustainable Communities ... 3590596
Low Carbon Transportation 4306922
Community Air Protection 250801
\
mean
Program Name
Transit and Intercity Rail Capital Program 95749.80
Low Carbon Transit Operations Program 6950.66
Affordable Housing and Sustainable Communities ... 23778.78
Low Carbon Transportation 40.64
Community Air Protection 45.48
Total GGRFDisadvantaged Community Funding \
sum
Program Name
Transit and Intercity Rail Capital Program 477700000.0
Low Carbon Transit Operations Program 127826618.0
Affordable Housing and Sustainable Communities ... 241985013.0
Low Carbon Transportation 324003740.0
Community Air Protection 0.0
\
mean
Program Name
Transit and Intercity Rail Capital Program 9952083.33
Low Carbon Transit Operations Program 399458.18
Affordable Housing and Sustainable Communities ... 4033083.55
Low Carbon Transportation 7644.66
Community Air Protection NaN
Is Benefit Disadvantaged Communities \
mean
Program Name
Transit and Intercity Rail Capital Program 0.16
Low Carbon Transit Operations Program 0.21
Affordable Housing and Sustainable Communities ... 0.22
Low Carbon Transportation 0.36
Community Air Protection 0.00
Is Low Income Communities \
mean
Program Name
Transit and Intercity Rail Capital Program 0.73
Low Carbon Transit Operations Program 0.64
Affordable Housing and Sustainable Communities ... 0.60
Low Carbon Transportation 0.59
Community Air Protection 0.87
GHG_per_dollar \
Program Name
Transit and Intercity Rail Capital Program 0.0002
Low Carbon Transit Operations Program 0.0008
Affordable Housing and Sustainable Communities ... 0.0004
Low Carbon Transportation 0.0011
Community Air Protection 0.0002
DAC_funding_ratio
Program Name
Transit and Intercity Rail Capital Program 0.0047
Low Carbon Transit Operations Program 0.0146
Affordable Housing and Sustainable Communities ... 0.0285
Low Carbon Transportation 0.0862
Community Air Protection 0.0000
Top 5 Programs by GHG Reduction Efficiency:
Total Project Cost \
count
Program Name
Fluorinated Gases Emission Reduction Incentives 15
Sustainable Agricultural Lands Conservation Pro... 96
Climate Smart Agriculture 1688
Forest Health Program 258
Food Production Investment Program 56
\
sum mean
Program Name
Fluorinated Gases Emission Reduction Incentives 0 0.00
Sustainable Agricultural Lands Conservation Pro... 212276194 2211210.35
Climate Smart Agriculture 886514974 525186.60
Forest Health Program 837651776 3246712.31
Food Production Investment Program 181509329 3241238.02
Total Project GHGReductions \
sum
Program Name
Fluorinated Gases Emission Reduction Incentives 36715
Sustainable Agricultural Lands Conservation Pro... 15080237
Climate Smart Agriculture 24249378
Forest Health Program 20575142
Food Production Investment Program 2974529
\
mean
Program Name
Fluorinated Gases Emission Reduction Incentives 2447.67
Sustainable Agricultural Lands Conservation Pro... 157085.80
Climate Smart Agriculture 14365.75
Forest Health Program 79748.61
Food Production Investment Program 53116.59
Total GGRFDisadvantaged Community Funding \
sum
Program Name
Fluorinated Gases Emission Reduction Incentives 0.0
Sustainable Agricultural Lands Conservation Pro... 4264000.0
Climate Smart Agriculture 29472350.0
Forest Health Program 0.0
Food Production Investment Program 0.0
\
mean
Program Name
Fluorinated Gases Emission Reduction Incentives NaN
Sustainable Agricultural Lands Conservation Pro... 101523.81
Climate Smart Agriculture 50814.40
Forest Health Program 0.00
Food Production Investment Program NaN
Is Benefit Disadvantaged Communities \
mean
Program Name
Fluorinated Gases Emission Reduction Incentives 0.00
Sustainable Agricultural Lands Conservation Pro... 0.01
Climate Smart Agriculture 0.12
Forest Health Program 0.00
Food Production Investment Program 0.00
Is Low Income Communities \
mean
Program Name
Fluorinated Gases Emission Reduction Incentives 0.60
Sustainable Agricultural Lands Conservation Pro... 0.03
Climate Smart Agriculture 0.35
Forest Health Program 0.30
Food Production Investment Program 0.71
GHG_per_dollar \
Program Name
Fluorinated Gases Emission Reduction Incentives inf
Sustainable Agricultural Lands Conservation Pro... 0.0710
Climate Smart Agriculture 0.0274
Forest Health Program 0.0246
Food Production Investment Program 0.0164
DAC_funding_ratio
Program Name
Fluorinated Gases Emission Reduction Incentives NaN
Sustainable Agricultural Lands Conservation Pro... 0.0201
Climate Smart Agriculture 0.0332
Forest Health Program 0.0000
Food Production Investment Program 0.0000
Top 5 Programs by DAC Funding Ratio:
Total Project Cost \
count sum
Program Name
Low-Income Weatherization Program 6468 307899153
Urban and Community Forestry Program 967 107767408
Water-Energy Efficiency 5111 85125916
Urban Greening Program 177 270240730
Wetlands and Watershed Restoration 22 86150135
Total Project GHGReductions \
mean sum
Program Name
Low-Income Weatherization Program 47603.46 594567
Urban and Community Forestry Program 111445.10 479295
Water-Energy Efficiency 16655.43 425225
Urban Greening Program 1526783.79 54285
Wetlands and Watershed Restoration 3915915.23 999950
\
mean
Program Name
Low-Income Weatherization Program 91.92
Urban and Community Forestry Program 495.65
Water-Energy Efficiency 83.20
Urban Greening Program 306.69
Wetlands and Watershed Restoration 45452.27
Total GGRFDisadvantaged Community Funding \
sum
Program Name
Low-Income Weatherization Program 132666048.0
Urban and Community Forestry Program 34138019.0
Water-Energy Efficiency 23286225.0
Urban Greening Program 68717491.0
Wetlands and Watershed Restoration 13382907.0
\
mean
Program Name
Low-Income Weatherization Program 22383.34
Urban and Community Forestry Program 60635.91
Water-Energy Efficiency 4556.10
Urban Greening Program 848364.09
Wetlands and Watershed Restoration 1115242.25
Is Benefit Disadvantaged Communities \
mean
Program Name
Low-Income Weatherization Program 0.90
Urban and Community Forestry Program 0.21
Water-Energy Efficiency 0.45
Urban Greening Program 0.19
Wetlands and Watershed Restoration 0.09
Is Low Income Communities GHG_per_dollar \
mean
Program Name
Low-Income Weatherization Program 0.97 0.0019
Urban and Community Forestry Program 0.09 0.0044
Water-Energy Efficiency 0.00 0.0050
Urban Greening Program 0.27 0.0002
Wetlands and Watershed Restoration 0.27 0.0116
DAC_funding_ratio
Program Name
Low-Income Weatherization Program 0.4309
Urban and Community Forestry Program 0.3168
Water-Energy Efficiency 0.2736
Urban Greening Program 0.2543
Wetlands and Watershed Restoration 0.1553
In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Let's first get a clear view of our data
print("\nTotal Investment by Program (Top 10):")
investment_by_program = data.groupby('Program Name')['Total Project Cost'].agg(['count', 'sum', 'mean'])
print(investment_by_program.nlargest(10, 'sum'))
# Calculate our key metrics
program_metrics = data.groupby('Program Name').agg({
'Total Project Cost': ['count', 'sum', 'mean'],
'Total Project GHGReductions': ['sum', 'mean'],
'Total GGRFDisadvantaged Community Funding': ['sum', 'mean'],
'Is Benefit Disadvantaged Communities': 'mean'
}).round(2)
# Add efficiency metrics
program_metrics['GHG_per_dollar'] = (program_metrics[('Total Project GHGReductions', 'sum')] /
program_metrics[('Total Project Cost', 'sum')]).round(4)
program_metrics['DAC_funding_ratio'] = (program_metrics[('Total GGRFDisadvantaged Community Funding', 'sum')] /
program_metrics[('Total Project Cost', 'sum')]).round(4)
# Clean data for visualization
clean_data = program_metrics[
(program_metrics['GHG_per_dollar'] != float('inf')) &
(program_metrics['DAC_funding_ratio'].notna())
]
# Create visualization
plt.figure(figsize=(15, 10))
# Basic scatter plot
scatter = plt.scatter(clean_data['DAC_funding_ratio'],
clean_data['GHG_per_dollar'],
s=clean_data[('Total Project Cost', 'sum')]/1e7,
alpha=0.6)
# Add labels for major programs
for idx, row in clean_data.iterrows():
if row[('Total Project Cost', 'sum')] > 500000000: # Only label major programs
plt.annotate(idx[:30] + '...' if len(idx) > 30 else idx,
(row['DAC_funding_ratio'], row['GHG_per_dollar']),
xytext=(5, 5),
textcoords='offset points',
fontsize=8)
plt.xlabel('DAC Funding Ratio')
plt.ylabel('GHG Reduction per Dollar')
plt.title('Program Performance: Equity vs. Climate Impact\nSize = Total Investment')
plt.tight_layout()
plt.show()
# Analysis of key metrics
print("\nProgram Performance Analysis:")
# Top programs by equity
print("\n1. Top 5 Programs by DAC Funding Ratio:")
print(clean_data.nlargest(5, 'DAC_funding_ratio')[
['DAC_funding_ratio', 'GHG_per_dollar', ('Total Project Cost', 'sum')]
])
# Top programs by GHG efficiency
print("\n2. Top 5 Programs by GHG Reduction Efficiency:")
print(clean_data.nlargest(5, 'GHG_per_dollar')[
['DAC_funding_ratio', 'GHG_per_dollar', ('Total Project Cost', 'sum')]
])
# Investment distribution
print("\n3. Investment Distribution Summary:")
print(clean_data[('Total Project Cost', 'sum')].describe())
Total Investment by Program (Top 10):
count sum \
Program Name
Transit and Intercity Rail Capital Program 245 101461750666
Low Carbon Transit Operations Program 1003 8777590799
Affordable Housing and Sustainable Communities ... 151 8492944393
Low Carbon Transportation 105965 3759508172
Community Air Protection 5514 1211608583
Climate Smart Agriculture 1688 886514974
Forest Health Program 258 837651776
Funding Agricultural Replacement Measures for E... 8554 771297976
Waste Diversion 265 643549843
Fire Prevention Program 600 614281793
mean
Program Name
Transit and Intercity Rail Capital Program 4.141296e+08
Low Carbon Transit Operations Program 8.751337e+06
Affordable Housing and Sustainable Communities ... 5.624466e+07
Low Carbon Transportation 3.547877e+04
Community Air Protection 2.197331e+05
Climate Smart Agriculture 5.251866e+05
Forest Health Program 3.246712e+06
Funding Agricultural Replacement Measures for E... 9.016811e+04
Waste Diversion 2.428490e+06
Fire Prevention Program 1.023803e+06
/home/dadams/Repos/california_equity_git/.venv/lib/python3.12/site-packages/matplotlib/text.py:1465: FutureWarning: Calling float on a single element Series is deprecated and will raise a TypeError in the future. Use float(ser.iloc[0]) instead x = float(self.convert_xunits(x)) /home/dadams/Repos/california_equity_git/.venv/lib/python3.12/site-packages/matplotlib/text.py:1467: FutureWarning: Calling float on a single element Series is deprecated and will raise a TypeError in the future. Use float(ser.iloc[0]) instead y = float(self.convert_yunits(y))
Program Performance Analysis: 1. Top 5 Programs by DAC Funding Ratio:
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) Cell In[27], line 62 60 # Top programs by equity 61 print("\n1. Top 5 Programs by DAC Funding Ratio:") ---> 62 print(clean_data.nlargest(5, 'DAC_funding_ratio')[ 63 ['DAC_funding_ratio', 'GHG_per_dollar', ('Total Project Cost', 'sum')] 64 ]) 66 # Top programs by GHG efficiency 67 print("\n2. Top 5 Programs by GHG Reduction Efficiency:") File ~/Repos/california_equity_git/.venv/lib/python3.12/site-packages/pandas/core/frame.py:4108, in DataFrame.__getitem__(self, key) 4106 if is_iterator(key): 4107 key = list(key) -> 4108 indexer = self.columns._get_indexer_strict(key, "columns")[1] 4110 # take() does not accept boolean indexers 4111 if getattr(indexer, "dtype", None) == bool: File ~/Repos/california_equity_git/.venv/lib/python3.12/site-packages/pandas/core/indexes/multi.py:2763, in MultiIndex._get_indexer_strict(self, key, axis_name) 2760 if len(keyarr) and not isinstance(keyarr[0], tuple): 2761 indexer = self._get_indexer_level_0(keyarr) -> 2763 self._raise_if_missing(key, indexer, axis_name) 2764 return self[indexer], indexer 2766 return super()._get_indexer_strict(key, axis_name) File ~/Repos/california_equity_git/.venv/lib/python3.12/site-packages/pandas/core/indexes/multi.py:2781, in MultiIndex._raise_if_missing(self, key, indexer, axis_name) 2779 cmask = check == -1 2780 if cmask.any(): -> 2781 raise KeyError(f"{keyarr[cmask]} not in index") 2782 # We get here when levels still contain values which are not 2783 # actually in Index anymore 2784 raise KeyError(f"{keyarr} not in index") KeyError: "[('Total Project Cost', 'sum')] not in index"
In [23]:
# install sklearn library
!pip install scikit-learn
Collecting scikit-learn Using cached scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB) Requirement already satisfied: numpy>=1.19.5 in ./.venv/lib/python3.12/site-packages (from scikit-learn) (2.1.2) Requirement already satisfied: scipy>=1.6.0 in ./.venv/lib/python3.12/site-packages (from scikit-learn) (1.14.1) Collecting joblib>=1.2.0 (from scikit-learn) Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB) Collecting threadpoolctl>=3.1.0 (from scikit-learn) Using cached threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB) Using cached scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.9 MB) Using cached joblib-1.4.2-py3-none-any.whl (301 kB) Using cached threadpoolctl-3.5.0-py3-none-any.whl (18 kB) Installing collected packages: threadpoolctl, joblib, scikit-learn Successfully installed joblib-1.4.2 scikit-learn-1.5.2 threadpoolctl-3.5.0
In [26]:
# Let's first look at the raw total investment amounts
print("Raw Investment Totals by Program:")
print(clean_data[('Total Project Cost', 'sum')].sort_values(ascending=False).head(10))
# Modified analysis with corrected calculations
def create_program_typology_v2(df):
# Calculate thresholds
ghg_med = df['GHG_per_dollar'].median()
dac_med = df['DAC_funding_ratio'].median()
size_med = df[('Total Project Cost', 'sum')].median()
# Create categories with corrected investment calculations
conditions = [
(df['GHG_per_dollar'] > ghg_med) & (df['DAC_funding_ratio'] > dac_med),
(df['GHG_per_dollar'] > ghg_med) & (df['DAC_funding_ratio'] <= dac_med),
(df['GHG_per_dollar'] <= ghg_med) & (df['DAC_funding_ratio'] > dac_med),
(df['GHG_per_dollar'] <= ghg_med) & (df['DAC_funding_ratio'] <= dac_med)
]
choices = [
'High Performer (Both)',
'GHG Efficient',
'Equity Focused',
'Below Median'
]
df['Performance_Type'] = np.select(conditions, choices, default='Other')
df['Scale_Type'] = np.where(df[('Total Project Cost', 'sum')] > size_med,
'Large Scale',
'Small Scale')
return df
# Rerun analysis with corrected calculations
analysis_data_v2 = clean_data.copy()
analysis_data_v2 = create_program_typology_v2(analysis_data_v2)
# Summary statistics with corrected investment amounts
print("\nRevised Program Type Characteristics:")
for ptype in analysis_data_v2['Performance_Type'].unique():
subset = analysis_data_v2[analysis_data_v2['Performance_Type'] == ptype]
print(f"\n{ptype}:")
print(f"Number of programs: {len(subset)}")
print(f"Average GHG efficiency: {subset['GHG_per_dollar'].mean():.4f}")
print(f"Average DAC ratio: {subset['DAC_funding_ratio'].mean():.4f}")
print(f"Average project cost: ${subset[('Total Project Cost', 'mean')].mean():,.2f}")
print(f"Largest program: ${subset[('Total Project Cost', 'sum')].max():,.2f}")
print("\nExample programs:")
print(subset.index.tolist()[:3])
# Create visualization with corrected data
plt.figure(figsize=(15, 10))
# Plot each program type with different colors and corrected sizes
for ptype in analysis_data_v2['Performance_Type'].unique():
mask = analysis_data_v2['Performance_Type'] == ptype
subset = analysis_data_v2[mask]
plt.scatter(subset['DAC_funding_ratio'],
subset['GHG_per_dollar'],
s=subset[('Total Project Cost', 'mean')]/1e5, # Adjusted scaling factor
alpha=0.6,
label=ptype)
plt.xlabel('DAC Funding Ratio')
plt.ylabel('GHG Reduction per Dollar')
plt.title('Program Performance Types (Corrected Investment Scaling)')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
# Add annotations for major programs
for idx, row in analysis_data_v2.iterrows():
if row[('Total Project Cost', 'mean')] > 1e6: # Adjust threshold as needed
plt.annotate(idx[:30] + '...' if len(idx) > 30 else idx,
(row['DAC_funding_ratio'], row['GHG_per_dollar']),
xytext=(5, 5),
textcoords='offset points',
fontsize=8)
plt.tight_layout()
plt.show()
Raw Investment Totals by Program: Program Name Transit and Intercity Rail Capital Program 101461750666 Low Carbon Transit Operations Program 8777590799 Affordable Housing and Sustainable Communities Program 8492944393 Low Carbon Transportation 3759508172 Community Air Protection 1211608583 Climate Smart Agriculture 886514974 Forest Health Program 837651776 Funding Agricultural Replacement Measures for Emission Reductions Program 771297976 Waste Diversion 643549843 Fire Prevention Program 614281793 Name: (Total Project Cost, sum), dtype: int64 Revised Program Type Characteristics: Below Median: Number of programs: 16 Average GHG efficiency: 0.0000 Average DAC ratio: 0.0000 Average project cost: $1,352,617.47 Largest program: $1,211,608,583.00 Example programs: ['Active Transportation Program', 'Climate Change Research Program', 'Climate Ready Program '] High Performer (Both): Number of programs: 10 Average GHG efficiency: 0.0129 Average DAC ratio: 0.1410 Average project cost: $7,428,798.66 Largest program: $8,777,590,799.00 Example programs: ['Affordable Housing and Sustainable Communities Program', 'Climate Smart Agriculture', 'Low Carbon Transit Operations Program'] GHG Efficient: Number of programs: 9 Average GHG efficiency: 0.0089 Average DAC ratio: 0.0000 Average project cost: $1,967,095.81 Largest program: $837,651,776.00 Example programs: ['Climate Adaptation and Resiliency Program', 'Food Production Investment Program', 'Forest Health Program'] Equity Focused: Number of programs: 3 Average GHG efficiency: 0.0001 Average DAC ratio: 0.1038 Average project cost: $138,638,075.83 Largest program: $101,461,750,666.00 Example programs: ['Community Assistance for Climate Equity Program', 'Transit and Intercity Rail Capital Program', 'Urban Greening Program']
/home/dadams/Repos/california_equity_git/.venv/lib/python3.12/site-packages/matplotlib/text.py:1465: FutureWarning: Calling float on a single element Series is deprecated and will raise a TypeError in the future. Use float(ser.iloc[0]) instead x = float(self.convert_xunits(x)) /home/dadams/Repos/california_equity_git/.venv/lib/python3.12/site-packages/matplotlib/text.py:1467: FutureWarning: Calling float on a single element Series is deprecated and will raise a TypeError in the future. Use float(ser.iloc[0]) instead y = float(self.convert_yunits(y))
In [28]:
# Let's create a summary visualization that highlights our main findings
plt.figure(figsize=(15, 12))
# Create four subplots
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(20, 16))
# 1. Program Scale Distribution (Top 10 by Investment)
top_10_programs = clean_data[('Total Project Cost', 'sum')].nlargest(10)
ax1.barh(range(len(top_10_programs)), top_10_programs/1e9) # Convert to billions
ax1.set_yticks(range(len(top_10_programs)))
ax1.set_yticklabels([name[:30] + '...' if len(name) > 30 else name for name in top_10_programs.index])
ax1.set_xlabel('Total Investment (Billions $)')
ax1.set_title('Top 10 Programs by Investment')
# 2. GHG Efficiency vs Project Size
ax2.scatter(clean_data[('Total Project Cost', 'mean')]/1e6,
clean_data['GHG_per_dollar'],
alpha=0.6)
ax2.set_xlabel('Average Project Size (Millions $)')
ax2.set_ylabel('GHG Reduction per Dollar')
ax2.set_title('GHG Efficiency vs Project Size')
# 3. DAC Funding Ratio vs Program Size
ax3.scatter(clean_data[('Total Project Cost', 'sum')]/1e9,
clean_data['DAC_funding_ratio'],
alpha=0.6)
ax3.set_xlabel('Total Program Investment (Billions $)')
ax3.set_ylabel('DAC Funding Ratio')
ax3.set_title('Equity vs Program Size')
# 4. Project Count vs Average Size
ax4.scatter(clean_data[('Total Project Cost', 'count')],
clean_data[('Total Project Cost', 'mean')]/1e6,
alpha=0.6)
ax4.set_xlabel('Number of Projects')
ax4.set_ylabel('Average Project Size (Millions $)')
ax4.set_title('Project Count vs Average Size')
plt.tight_layout()
plt.show()
<Figure size 1500x1200 with 0 Axes>
In [36]:
# install seaborn library
!pip install seaborn
Requirement already satisfied: seaborn in ./.venv/lib/python3.12/site-packages (0.13.2) Requirement already satisfied: numpy!=1.24.0,>=1.20 in ./.venv/lib/python3.12/site-packages (from seaborn) (2.1.2) Requirement already satisfied: pandas>=1.2 in ./.venv/lib/python3.12/site-packages (from seaborn) (2.2.3) Requirement already satisfied: matplotlib!=3.6.1,>=3.4 in ./.venv/lib/python3.12/site-packages (from seaborn) (3.9.2) Requirement already satisfied: contourpy>=1.0.1 in ./.venv/lib/python3.12/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (1.3.0) Requirement already satisfied: cycler>=0.10 in ./.venv/lib/python3.12/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (0.12.1) Requirement already satisfied: fonttools>=4.22.0 in ./.venv/lib/python3.12/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (4.54.1) Requirement already satisfied: kiwisolver>=1.3.1 in ./.venv/lib/python3.12/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (1.4.7) Requirement already satisfied: packaging>=20.0 in ./.venv/lib/python3.12/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (24.1) Requirement already satisfied: pillow>=8 in ./.venv/lib/python3.12/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (11.0.0) Requirement already satisfied: pyparsing>=2.3.1 in ./.venv/lib/python3.12/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (3.2.0) Requirement already satisfied: python-dateutil>=2.7 in ./.venv/lib/python3.12/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (2.9.0.post0) Requirement already satisfied: pytz>=2020.1 in ./.venv/lib/python3.12/site-packages (from pandas>=1.2->seaborn) (2024.2) Requirement already satisfied: tzdata>=2022.7 in ./.venv/lib/python3.12/site-packages (from pandas>=1.2->seaborn) (2024.2) Requirement already satisfied: six>=1.5 in ./.venv/lib/python3.12/site-packages (from python-dateutil>=2.7->matplotlib!=3.6.1,>=3.4->seaborn) (1.16.0)
In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Set up the figure with a clean, modern style
plt.rcParams['font.family'] = 'DejaVu Sans'
plt.rcParams['axes.linewidth'] = 1.0
plt.rcParams['axes.grid'] = True
plt.rcParams['grid.alpha'] = 0.3
fig = plt.figure(figsize=(20, 15))
# 1. Project Volume vs Investment (Enhanced)
ax1 = plt.subplot(2, 2, 1)
scatter = plt.scatter(clean_data[('Total Project Cost', 'count')],
clean_data[('Total Project Cost', 'mean')]/1e6,
s=clean_data[('Total Project Cost', 'sum')]/1e8,
alpha=0.7,
c=range(len(clean_data)),
cmap='viridis')
ax1.set_xlabel('Number of Projects', fontsize=12, fontweight='bold')
ax1.set_ylabel('Average Project Size (Millions $)', fontsize=12, fontweight='bold')
ax1.set_title('Program Scale Analysis\nBubble size represents total investment',
fontsize=14, pad=20, fontweight='bold')
# Add annotations for notable programs
for idx, row in clean_data.iterrows():
if row[('Total Project Cost', 'sum')] > 5e9:
plt.annotate(idx[:30],
(row[('Total Project Cost', 'count')],
row[('Total Project Cost', 'mean')]/1e6),
xytext=(5, 5), textcoords='offset points',
fontsize=8,
bbox=dict(facecolor='white', edgecolor='none', alpha=0.8))
# 2. Program Size Distribution (Enhanced)
ax2 = plt.subplot(2, 2, 2)
sizes = clean_data[('Total Project Cost', 'sum')]/1e6
plt.hist(np.log10(sizes[sizes > 0]),
bins=20,
color='dodgerblue',
edgecolor='white',
alpha=0.7)
ax2.set_xlabel('Log10(Total Program Size) in Millions $', fontsize=12, fontweight='bold')
ax2.set_ylabel('Count', fontsize=12, fontweight='bold')
ax2.set_title('Distribution of Program Sizes\nNatural groupings by investment scale',
fontsize=14, pad=20, fontweight='bold')
# 3. Top 10 Programs (Enhanced)
ax3 = plt.subplot(2, 2, 3)
top_10 = clean_data[('Total Project Cost', 'sum')].nlargest(10)/1e9
colors = plt.cm.viridis(np.linspace(0, 0.8, len(top_10)))
bars = ax3.barh(range(len(top_10)), top_10, color=colors)
ax3.set_yticks(range(len(top_10)))
ax3.set_yticklabels([name[:40] + '...' if len(name) > 40 else name
for name in top_10.index],
fontsize=8)
ax3.set_xlabel('Total Investment (Billions $)', fontsize=12, fontweight='bold')
ax3.set_title('Top 10 Programs by Investment\nHighlighting investment concentration',
fontsize=14, pad=20, fontweight='bold')
# Add value labels to bars
for i, bar in enumerate(bars):
width = bar.get_width()
ax3.text(width, bar.get_y() + bar.get_height()/2,
f'${width:.1f}B',
ha='left', va='center', fontsize=8,
bbox=dict(facecolor='white', edgecolor='none', alpha=0.8))
# 4. Geographic Distribution (Enhanced)
ax4 = plt.subplot(2, 2, 4)
top_15_counties = county_analysis['total_investment'].nlargest(15)/1e6
colors = plt.cm.viridis(np.linspace(0, 0.8, len(top_15_counties)))
bars = ax4.barh(range(len(top_15_counties)), top_15_counties, color=colors)
ax4.set_yticks(range(len(top_15_counties)))
ax4.set_yticklabels(top_15_counties.index, fontsize=8)
ax4.set_xlabel('Total Investment (Millions $)', fontsize=12, fontweight='bold')
ax4.set_title('Investment by County\nTop 15 counties by total investment',
fontsize=14, pad=20, fontweight='bold')
# Add value labels
for i, bar in enumerate(bars):
width = bar.get_width()
ax4.text(width, bar.get_y() + bar.get_height()/2,
f'${width:.0f}M',
ha='left', va='center', fontsize=8,
bbox=dict(facecolor='white', edgecolor='none', alpha=0.8))
plt.tight_layout()
plt.show()
# Print summary statistics for context
print("\nProgram Scale Categories:")
quartiles = np.percentile(sizes[sizes > 0], [25, 50, 75])
print("\nInvestment Scale Categories (in Millions $):")
print(f"Small Programs: < ${quartiles[0]:.1f}M")
print(f"Medium Programs: ${quartiles[0]:.1f}M - ${quartiles[1]:.1f}M")
print(f"Large Programs: ${quartiles[1]:.1f}M - ${quartiles[2]:.1f}M")
print(f"Mega Programs: > ${quartiles[2]:.1f}M")
Program Scale Categories: Investment Scale Categories (in Millions $): Small Programs: < $20.0M Medium Programs: $20.0M - $97.0M Large Programs: $97.0M - $587.9M Mega Programs: > $587.9M
In [39]:
# First, let's prepare our county-level metrics
county_analysis = data.groupby('County').agg({
'Total Project Cost': ['count', 'sum', 'mean'],
'Total Project GHGReductions': 'sum',
'Is Benefit Disadvantaged Communities': 'mean',
'Is Low Income Communities': 'mean'
})
# Flatten column names
county_analysis.columns = [
'project_count', 'total_investment', 'avg_project_size',
'total_ghg_reduction', 'dac_rate', 'low_income_rate'
]
# Calculate per-project metrics
county_analysis['ghg_per_dollar'] = county_analysis['total_ghg_reduction'] / county_analysis['total_investment']
county_analysis['investment_per_project'] = county_analysis['total_investment'] / county_analysis['project_count']
# Create visualization of spatial patterns
fig = plt.figure(figsize=(20, 15))
# 1. Investment Distribution
plt.subplot(2, 2, 1)
top_15 = county_analysis['total_investment'].sort_values(ascending=True).tail(15)/1e6
plt.barh(range(len(top_15)), top_15, color='dodgerblue', alpha=0.7)
plt.yticks(range(len(top_15)), top_15.index)
plt.xlabel('Total Investment (Millions $)')
plt.title('Top 15 Counties by Total Investment', fontsize=12, pad=15)
# 2. Project Count Distribution
plt.subplot(2, 2, 2)
top_15_count = county_analysis['project_count'].sort_values(ascending=True).tail(15)
plt.barh(range(len(top_15_count)), top_15_count, color='dodgerblue', alpha=0.7)
plt.yticks(range(len(top_15_count)), top_15_count.index)
plt.xlabel('Number of Projects')
plt.title('Top 15 Counties by Project Count', fontsize=12, pad=15)
# 3. DAC Benefit Rate
plt.subplot(2, 2, 3)
top_15_dac = county_analysis['dac_rate'].sort_values(ascending=True).tail(15)
plt.barh(range(len(top_15_dac)), top_15_dac*100, color='dodgerblue', alpha=0.7)
plt.yticks(range(len(top_15_dac)), top_15_dac.index)
plt.xlabel('Percentage of Projects Benefiting DACs')
plt.title('Top 15 Counties by DAC Benefit Rate', fontsize=12, pad=15)
# 4. GHG Efficiency
plt.subplot(2, 2, 4)
top_15_ghg = county_analysis['ghg_per_dollar'].sort_values(ascending=True).tail(15)
plt.barh(range(len(top_15_ghg)), top_15_ghg, color='dodgerblue', alpha=0.7)
plt.yticks(range(len(top_15_ghg)), top_15_ghg.index)
plt.xlabel('GHG Reduction per Dollar')
plt.title('Top 15 Counties by GHG Efficiency', fontsize=12, pad=15)
plt.tight_layout()
plt.show()
# Print detailed analysis
print("\nSpatial Distribution Analysis:")
print("\n1. Investment Concentration:")
investment_share = (county_analysis['total_investment'].nlargest(5).sum() /
county_analysis['total_investment'].sum() * 100)
print(f"Top 5 counties account for {investment_share:.1f}% of total investment")
print("\n2. Project Distribution:")
project_share = (county_analysis['project_count'].nlargest(5).sum() /
county_analysis['project_count'].sum() * 100)
print(f"Top 5 counties account for {project_share:.1f}% of all projects")
print("\n3. Regional Equity Analysis:")
print("\nDAC Benefit Rate by Region:")
print(county_analysis['dac_rate'].describe().round(3))
print("\n4. Project Type Distribution:")
# Look at program types in top 5 counties
top_5_counties = county_analysis['total_investment'].nlargest(5).index
for county in top_5_counties:
print(f"\n{county} Program Distribution:")
county_programs = data[data['County'] == county]['Program Name'].value_counts().head(3)
print(county_programs)
Spatial Distribution Analysis: 1. Investment Concentration: Top 5 counties account for 75.6% of total investment 2. Project Distribution: Top 5 counties account for 51.1% of all projects 3. Regional Equity Analysis: DAC Benefit Rate by Region: count 299.000 mean 0.149 std 0.297 min 0.000 25% 0.000 50% 0.000 75% 0.125 max 1.000 Name: dac_rate, dtype: float64 4. Project Type Distribution: Los Angeles Program Distribution: Program Name Low Carbon Transportation 32598 Low-Income Weatherization Program 2233 Water-Energy Efficiency 1945 Name: count, dtype: int64 Alameda Program Distribution: Program Name Low Carbon Transportation 5677 Water-Energy Efficiency 244 Community Air Protection 223 Name: count, dtype: int64 Alameda, Contra Costa, Fresno, Kern, Kings, Madera, Merced, Sacramento, San Joaquin, Santa Clara, Stanislaus, Tulare Program Distribution: Program Name Transit and Intercity Rail Capital Program 9 Name: count, dtype: int64 Sacramento Program Distribution: Program Name Low Carbon Transportation 3556 Water-Energy Efficiency 422 Low-Income Weatherization Program 297 Name: count, dtype: int64 San Diego Program Distribution: Program Name Low Carbon Transportation 7865 Community Air Protection 105 Low-Income Weatherization Program 83 Name: count, dtype: int64
In [40]:
# First, let's identify multi-county projects
def is_multi_county(county):
return ',' in str(county)
# Create multi-county analysis dataframe
multi_county_data = data[data['County'].apply(is_multi_county)].copy()
# Analyze these collaborations
print("\nMulti-County Collaboration Analysis:")
print(f"\nTotal number of multi-county projects: {len(multi_county_data)}")
print(f"Total investment in multi-county projects: ${multi_county_data['Total Project Cost'].sum()/1e9:.2f}B")
# Look at the specific collaborations
print("\nMajor Multi-County Collaborations:")
collaboration_summary = multi_county_data.groupby('County').agg({
'Total Project Cost': ['count', 'sum', 'mean'],
'Total Project GHGReductions': 'sum',
'Is Benefit Disadvantaged Communities': 'mean',
'Total GGRFDisadvantaged Community Funding': 'sum'
}).round(2)
# Flatten column names
collaboration_summary.columns = [
'project_count', 'total_investment', 'avg_project_size',
'total_ghg_reduction', 'dac_benefit_rate', 'dac_funding'
]
# Calculate efficiency metrics
collaboration_summary['ghg_per_dollar'] = collaboration_summary['total_ghg_reduction'] / collaboration_summary['total_investment']
collaboration_summary['dac_funding_ratio'] = collaboration_summary['dac_funding'] / collaboration_summary['total_investment']
# Sort by total investment
print(collaboration_summary.sort_values('total_investment', ascending=False).head(10))
# Create visualizations
plt.figure(figsize=(20, 12))
# 1. Project Size Comparison
plt.subplot(2, 2, 1)
avg_size = data['Total Project Cost'].mean()
multi_avg_size = multi_county_data['Total Project Cost'].mean()
plt.bar(['Single-County Projects', 'Multi-County Projects'],
[avg_size/1e6, multi_avg_size/1e6],
color=['lightblue', 'darkblue'])
plt.ylabel('Average Project Size (Millions $)')
plt.title('Project Size Comparison')
# 2. GHG Efficiency Comparison
plt.subplot(2, 2, 2)
single_ghg_efficiency = (data[~data['County'].apply(is_multi_county)]['Total Project GHGReductions'].sum() /
data[~data['County'].apply(is_multi_county)]['Total Project Cost'].sum())
multi_ghg_efficiency = (multi_county_data['Total Project GHGReductions'].sum() /
multi_county_data['Total Project Cost'].sum())
plt.bar(['Single-County Projects', 'Multi-County Projects'],
[single_ghg_efficiency, multi_ghg_efficiency],
color=['lightblue', 'darkblue'])
plt.ylabel('GHG Reduction per Dollar')
plt.title('GHG Efficiency Comparison')
# 3. Top Multi-County Collaborations
plt.subplot(2, 2, 3)
top_collaborations = collaboration_summary.sort_values('total_investment', ascending=True).tail(5)
plt.barh(range(len(top_collaborations)), top_collaborations['total_investment']/1e9)
plt.yticks(range(len(top_collaborations)),
[name[:50] + '...' if len(name) > 50 else name for name in top_collaborations.index])
plt.xlabel('Total Investment (Billions $)')
plt.title('Top 5 Multi-County Collaborations by Investment')
# 4. DAC Benefit Rate Comparison
plt.subplot(2, 2, 4)
plt.bar(['Single-County Projects', 'Multi-County Projects'],
[data[~data['County'].apply(is_multi_county)]['Is Benefit Disadvantaged Communities'].mean(),
multi_county_data['Is Benefit Disadvantaged Communities'].mean()],
color=['lightblue', 'darkblue'])
plt.ylabel('Average DAC Benefit Rate')
plt.title('DAC Benefit Rate Comparison')
plt.tight_layout()
plt.show()
# Analyze types of projects that work well in collaboration
print("\nMost Common Types of Multi-County Projects:")
print(multi_county_data['Program Name'].value_counts().head())
# Look at success factors
print("\nSuccess Metrics for Multi-County Projects:")
print("\nDAC Benefit Rate Distribution:")
print(multi_county_data['Is Benefit Disadvantaged Communities'].describe())
print("\nGHG Reduction Distribution (tons per $):")
multi_county_data['ghg_efficiency'] = multi_county_data['Total Project GHGReductions'] / multi_county_data['Total Project Cost']
print(multi_county_data['ghg_efficiency'].describe())
Multi-County Collaboration Analysis:
Total number of multi-county projects: 874
Total investment in multi-county projects: $18.31B
Major Multi-County Collaborations:
project_count \
County
Alameda, Contra Costa, Fresno, Kern, Kings, Mad... 9
San Francisco, Santa Clara 6
San Diego, Santa Barbara, Ventura 23
San Francisco, San Mateo, Santa Clara 3
Los Angeles, Ventura 3
Marin, Sonoma 15
Los Angeles, San Bernardino 29
Humboldt, Mendocino 5
Contra Costa, San Francisco 3
Los Angeles, San Diego, San Luis Obispo, Santa ... 2
total_investment \
County
Alameda, Contra Costa, Fresno, Kern, Kings, Mad... 7333200000
San Francisco, Santa Clara 2998442533
San Diego, Santa Barbara, Ventura 2235315130
San Francisco, San Mateo, Santa Clara 2145553263
Los Angeles, Ventura 877252312
Marin, Sonoma 363189831
Los Angeles, San Bernardino 177831134
Humboldt, Mendocino 155426149
Contra Costa, San Francisco 135489931
Los Angeles, San Diego, San Luis Obispo, Santa ... 130936629
avg_project_size \
County
Alameda, Contra Costa, Fresno, Kern, Kings, Mad... 8.148000e+08
San Francisco, Santa Clara 4.997404e+08
San Diego, Santa Barbara, Ventura 9.718761e+07
San Francisco, San Mateo, Santa Clara 7.151844e+08
Los Angeles, Ventura 2.924174e+08
Marin, Sonoma 2.421266e+07
Los Angeles, San Bernardino 6.132108e+06
Humboldt, Mendocino 3.108523e+07
Contra Costa, San Francisco 4.516331e+07
Los Angeles, San Diego, San Luis Obispo, Santa ... 6.546831e+07
total_ghg_reduction \
County
Alameda, Contra Costa, Fresno, Kern, Kings, Mad... 546406
San Francisco, Santa Clara 1538067
San Diego, Santa Barbara, Ventura 791432
San Francisco, San Mateo, Santa Clara 2100845
Los Angeles, Ventura 74882
Marin, Sonoma 234508
Los Angeles, San Bernardino 22220
Humboldt, Mendocino 24160
Contra Costa, San Francisco 60369
Los Angeles, San Diego, San Luis Obispo, Santa ... 104667
dac_benefit_rate \
County
Alameda, Contra Costa, Fresno, Kern, Kings, Mad... 0.00
San Francisco, Santa Clara 0.17
San Diego, Santa Barbara, Ventura 0.00
San Francisco, San Mateo, Santa Clara 0.33
Los Angeles, Ventura 0.00
Marin, Sonoma 0.00
Los Angeles, San Bernardino 0.48
Humboldt, Mendocino 0.00
Contra Costa, San Francisco 0.33
Los Angeles, San Diego, San Luis Obispo, Santa ... 0.00
dac_funding \
County
Alameda, Contra Costa, Fresno, Kern, Kings, Mad... 0.0
San Francisco, Santa Clara 20000000.0
San Diego, Santa Barbara, Ventura 0.0
San Francisco, San Mateo, Santa Clara 935322.0
Los Angeles, Ventura 0.0
Marin, Sonoma 0.0
Los Angeles, San Bernardino 10812422.0
Humboldt, Mendocino 0.0
Contra Costa, San Francisco 631879.0
Los Angeles, San Diego, San Luis Obispo, Santa ... 0.0
ghg_per_dollar \
County
Alameda, Contra Costa, Fresno, Kern, Kings, Mad... 0.000075
San Francisco, Santa Clara 0.000513
San Diego, Santa Barbara, Ventura 0.000354
San Francisco, San Mateo, Santa Clara 0.000979
Los Angeles, Ventura 0.000085
Marin, Sonoma 0.000646
Los Angeles, San Bernardino 0.000125
Humboldt, Mendocino 0.000155
Contra Costa, San Francisco 0.000446
Los Angeles, San Diego, San Luis Obispo, Santa ... 0.000799
dac_funding_ratio
County
Alameda, Contra Costa, Fresno, Kern, Kings, Mad... 0.000000
San Francisco, Santa Clara 0.006670
San Diego, Santa Barbara, Ventura 0.000000
San Francisco, San Mateo, Santa Clara 0.000436
Los Angeles, Ventura 0.000000
Marin, Sonoma 0.000000
Los Angeles, San Bernardino 0.060802
Humboldt, Mendocino 0.000000
Contra Costa, San Francisco 0.004664
Los Angeles, San Diego, San Luis Obispo, Santa ... 0.000000
Most Common Types of Multi-County Projects: Program Name Urban and Community Forestry Program 333 Low Carbon Transit Operations Program 189 Transit and Intercity Rail Capital Program 87 Safe and Affordable Drinking Water Fund 47 Climate Smart Agriculture 43 Name: count, dtype: int64 Success Metrics for Multi-County Projects: DAC Benefit Rate Distribution: count 874 unique 2 top False freq 756 Name: Is Benefit Disadvantaged Communities, dtype: object GHG Reduction Distribution (tons per $): count 578.000000 mean 0.003167 std 0.017891 min -0.002485 25% 0.000000 50% 0.000220 75% 0.002056 max 0.400316 Name: ghg_efficiency, dtype: float64
In [41]:
# 1. Deep Dive into Successful Collaborations
# Let's create a success metric combining equity and efficiency
def calculate_success_score(row):
# Normalize GHG efficiency and DAC rates to 0-1 scale
ghg_norm = (row['ghg_per_dollar'] - min_ghg) / (max_ghg - min_ghg)
dac_norm = row['dac_benefit_rate']
return (ghg_norm + dac_norm) / 2
# Add success scores
collaboration_success = collaboration_summary.copy()
min_ghg = collaboration_summary['ghg_per_dollar'].min()
max_ghg = collaboration_summary['ghg_per_dollar'].max()
collaboration_success['success_score'] = collaboration_summary.apply(calculate_success_score, axis=1)
# Create visualization of success patterns
plt.figure(figsize=(20, 15))
# 1. Success Stories Matrix
plt.subplot(2, 2, 1)
plt.scatter(collaboration_success['ghg_per_dollar'],
collaboration_success['dac_benefit_rate'],
s=collaboration_success['total_investment']/1e7,
alpha=0.6)
plt.xlabel('GHG Efficiency (Reduction per Dollar)')
plt.ylabel('DAC Benefit Rate')
plt.title('Multi-County Collaboration Performance Matrix\nSize = Total Investment')
# 2. Program Type Analysis
program_type_analysis = multi_county_data.groupby('Program Name').agg({
'Total Project Cost': ['count', 'sum', 'mean'],
'Total Project GHGReductions': 'sum',
'Is Benefit Disadvantaged Communities': 'mean'
}).round(2)
program_type_analysis.columns = ['project_count', 'total_investment',
'avg_project_size', 'total_ghg', 'dac_rate']
program_type_analysis['ghg_efficiency'] = program_type_analysis['total_ghg'] / program_type_analysis['total_investment']
# Plot program type performance
plt.subplot(2, 2, 2)
top_programs = program_type_analysis.nlargest(5, 'total_investment')
plt.bar(range(len(top_programs)), top_programs['ghg_efficiency'])
plt.xticks(range(len(top_programs)),
[name[:20] + '...' if len(name) > 20 else name for name in top_programs.index],
rotation=45)
plt.title('GHG Efficiency by Program Type')
# 3. Partnership Size Analysis
plt.subplot(2, 2, 3)
partnership_sizes = multi_county_data['County'].str.count(',') + 1
size_performance = pd.DataFrame({
'partner_count': partnership_sizes,
'ghg_efficiency': multi_county_data['Total Project GHGReductions'] / multi_county_data['Total Project Cost'],
'dac_rate': multi_county_data['Is Benefit Disadvantaged Communities']
})
avg_by_size = size_performance.groupby('partner_count').mean()
plt.plot(avg_by_size.index, avg_by_size['ghg_efficiency'], marker='o')
plt.xlabel('Number of Partner Counties')
plt.ylabel('Average GHG Efficiency')
plt.title('Performance by Partnership Size')
# 4. Regional Patterns
plt.subplot(2, 2, 4)
# Create regional categories based on county groupings
def categorize_region(counties):
if 'Los Angeles' in counties:
return 'Southern California'
elif any(county in counties for county in ['San Francisco', 'Alameda', 'Santa Clara']):
return 'Bay Area'
elif any(county in counties for county in ['Fresno', 'Kern', 'Kings']):
return 'Central Valley'
else:
return 'Other'
collaboration_success['region'] = collaboration_success.index.map(categorize_region)
regional_performance = collaboration_success.groupby('region').agg({
'ghg_per_dollar': 'mean',
'dac_benefit_rate': 'mean',
'total_investment': 'sum'
})
plt.bar(regional_performance.index, regional_performance['ghg_per_dollar'])
plt.xticks(rotation=45)
plt.title('GHG Efficiency by Region')
plt.tight_layout()
plt.show()
# Print detailed analysis
print("\n1. Most Successful Collaborations:")
print(collaboration_success.nlargest(5, 'success_score')[
['total_investment', 'ghg_per_dollar', 'dac_benefit_rate', 'success_score']
])
print("\n2. Program Type Success Factors:")
print(program_type_analysis.nlargest(5, 'ghg_efficiency'))
print("\n3. Optimal Partnership Size:")
print(avg_by_size)
print("\n4. Regional Performance Patterns:")
print(regional_performance.round(4))
# Additional insights on equity-efficiency trade-offs
print("\nEquity-Efficiency Trade-off Analysis:")
correlation = collaboration_success['ghg_per_dollar'].corr(collaboration_success['dac_benefit_rate'])
print(f"\nCorrelation between GHG efficiency and DAC benefit rate: {correlation:.4f}")
1. Most Successful Collaborations:
total_investment \
County
Marin, San Francisco 2466450
Los Angeles, Orange, Riverside, San Diego 766345
El Dorado, Placer, Sacramento, Sutter, Yolo, Yuba 991000
Napa, San Joaquin 1557570
Lake, Sacramento 2697479
ghg_per_dollar \
County
Marin, San Francisco 0.010916
Los Angeles, Orange, Riverside, San Diego 0.008612
El Dorado, Placer, Sacramento, Sutter, Yolo, Yuba 0.008322
Napa, San Joaquin 0.005557
Lake, Sacramento 0.004171
dac_benefit_rate \
County
Marin, San Francisco 1.0
Los Angeles, Orange, Riverside, San Diego 1.0
El Dorado, Placer, Sacramento, Sutter, Yolo, Yuba 1.0
Napa, San Joaquin 1.0
Lake, Sacramento 1.0
success_score
County
Marin, San Francisco 0.515503
Los Angeles, Orange, Riverside, San Diego 0.512379
El Dorado, Placer, Sacramento, Sutter, Yolo, Yuba 0.511986
Napa, San Joaquin 0.508237
Lake, Sacramento 0.506358
2. Program Type Success Factors:
project_count \
Program Name
Forest Health Program 6
Food Production Investment Program 9
Wetlands and Watershed Restoration 1
Sustainable Agricultural Lands Conservation Pro... 1
Urban and Community Forestry Program 333
total_investment \
Program Name
Forest Health Program 20238006
Food Production Investment Program 20586793
Wetlands and Watershed Restoration 920666
Sustainable Agricultural Lands Conservation Pro... 17690045
Urban and Community Forestry Program 37197703
avg_project_size \
Program Name
Forest Health Program 3373001.00
Food Production Investment Program 2287421.44
Wetlands and Watershed Restoration 920666.00
Sustainable Agricultural Lands Conservation Pro... 17690045.00
Urban and Community Forestry Program 111704.81
total_ghg dac_rate \
Program Name
Forest Health Program 830634 0.0
Food Production Investment Program 501007 0.0
Wetlands and Watershed Restoration 15166 0.0
Sustainable Agricultural Lands Conservation Pro... 251149 0.0
Urban and Community Forestry Program 195214 0.2
ghg_efficiency
Program Name
Forest Health Program 0.041043
Food Production Investment Program 0.024336
Wetlands and Watershed Restoration 0.016473
Sustainable Agricultural Lands Conservation Pro... 0.014197
Urban and Community Forestry Program 0.005248
3. Optimal Partnership Size:
ghg_efficiency dac_rate
partner_count
2 0.004588 0.149798
3 0.002167 0.064000
4 0.003443 0.052632
5 0.000880 0.130435
6 0.002270 0.236842
7 0.002930 0.281250
8 0.001286 0.307692
9 0.000373 0.000000
10 0.000014 0.250000
11 0.000000 0.000000
12 0.000160 0.000000
13 0.000011 0.000000
14 0.003643 0.111111
16 0.004098 0.111111
19 0.000000 0.000000
22 0.000000 0.000000
24 0.000000 0.000000
25 0.000000 0.000000
29 0.000000 0.000000
34 0.000000 0.000000
47 0.000000 0.000000
4. Regional Performance Patterns:
ghg_per_dollar dac_benefit_rate total_investment
region
Bay Area 0.0015 0.1488 13255750738
Central Valley 0.0022 0.1816 294777141
Other 0.0073 0.1191 3029393261
Southern California 0.0039 0.1862 1731340906
Equity-Efficiency Trade-off Analysis:
Correlation between GHG efficiency and DAC benefit rate: -0.0398
In [47]:
# Filter for more recent period
multi_county_data['Year'] = pd.to_datetime(multi_county_data['Date Operational']).dt.year
recent_data = multi_county_data[multi_county_data['Year'] >= 2010]
# Create temporal analysis
temporal_analysis = recent_data.groupby('Year').agg({
'Total Project Cost': ['count', 'sum', 'mean'],
'Total Project GHGReductions': 'sum',
'Is Benefit Disadvantaged Communities': 'mean'
})
# Visualize evolution
fig, axes = plt.subplots(2, 2, figsize=(20, 15))
# 1. Project Volume
axes[0,0].plot(temporal_analysis.index,
temporal_analysis[('Total Project Cost', 'count')],
marker='o', color='darkblue', linewidth=2)
axes[0,0].set_title('Number of Multi-County Projects by Year (2010-2024)',
fontsize=12, pad=15)
axes[0,0].set_ylabel('Number of Projects')
axes[0,0].grid(True, alpha=0.3)
# 2. Investment Size
axes[0,1].plot(temporal_analysis.index,
temporal_analysis[('Total Project Cost', 'mean')]/1e6,
marker='o', color='darkblue', linewidth=2)
axes[0,1].set_title('Average Project Size by Year',
fontsize=12, pad=15)
axes[0,1].set_ylabel('Average Size (Millions $)')
axes[0,1].grid(True, alpha=0.3)
# 3. Equity Performance
axes[1,0].plot(temporal_analysis.index,
temporal_analysis[('Is Benefit Disadvantaged Communities', 'mean')],
marker='o', color='darkblue', linewidth=2)
axes[1,0].set_title('DAC Benefit Rate by Year',
fontsize=12, pad=15)
axes[1,0].set_ylabel('Proportion Benefiting DACs')
axes[1,0].grid(True, alpha=0.3)
# 4. Partnership Size Evolution
recent_data['partnership_size'] = recent_data['County'].str.count(',') + 1
partnership_evolution = recent_data.groupby('Year')['partnership_size'].mean()
axes[1,1].plot(partnership_evolution.index,
partnership_evolution.values,
marker='o', color='darkblue', linewidth=2)
axes[1,1].set_title('Average Number of Partner Counties by Year',
fontsize=12, pad=15)
axes[1,1].set_ylabel('Average Partners per Project')
axes[1,1].grid(True, alpha=0.3)
# Adjust x-axis for all subplots
for ax in axes.flat:
ax.set_xlim(2014, 2024)
ax.tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.show()
# Print summary statistics for three periods: pre-COVID, COVID, and post-COVID
print("\nPeriod Comparison:")
pre_covid = recent_data[recent_data['Year'] < 2020]
covid = recent_data[recent_data['Year'].isin([2020, 2021])]
post_covid = recent_data[recent_data['Year'] > 2021]
metrics = {
'Number of Projects': lambda x: len(x),
'Average Project Size ($M)': lambda x: x['Total Project Cost'].mean()/1e6,
'DAC Benefit Rate': lambda x: x['Is Benefit Disadvantaged Communities'].mean(),
'Average Partners': lambda x: x['partnership_size'].mean()
}
comparison = pd.DataFrame({
'Pre-COVID (2010-2019)': {metric: func(pre_covid) for metric, func in metrics.items()},
'COVID Period (2020-2021)': {metric: func(covid) for metric, func in metrics.items()},
'Post-COVID (2022+)': {metric: func(post_covid) for metric, func in metrics.items()}
})
print(comparison.round(2))
# Analyze types of projects during COVID period
print("\nMost Common Project Types During COVID Period:")
print(covid['Program Name'].value_counts().head())
# Look at largest COVID-period collaborations
print("\nLargest COVID-Period Collaborations:")
print(covid.nlargest(5, 'Total Project Cost')[
['County', 'Program Name', 'Total Project Cost', 'Total Project GHGReductions']
].round(2))
/tmp/ipykernel_846869/2239187850.py:43: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
recent_data['partnership_size'] = recent_data['County'].str.count(',') + 1
Period Comparison:
Pre-COVID (2010-2019) COVID Period (2020-2021) \
Number of Projects 211.00 304.00
Average Project Size ($M) 8.24 31.78
DAC Benefit Rate 0.37 0.02
Average Partners 3.70 5.53
Post-COVID (2022+)
Number of Projects 286.00
Average Project Size ($M) 23.08
DAC Benefit Rate 0.01
Average Partners 3.92
Most Common Project Types During COVID Period:
Program Name
Urban and Community Forestry Program 144
Low Carbon Transit Operations Program 46
Safe and Affordable Drinking Water Fund 34
Climate Smart Agriculture 26
Transit and Intercity Rail Capital Program 15
Name: count, dtype: int64
Largest COVID-Period Collaborations:
County \
42458 San Francisco, Santa Clara
62279 Alameda, Contra Costa, Fresno, Kern, Kings, Ma...
62280 Alameda, Contra Costa, Fresno, Kern, Kings, Ma...
100710 Alameda, Contra Costa, Fresno, Kern, Kings, Ma...
100711 Alameda, Contra Costa, Fresno, Kern, Kings, Ma...
Program Name Total Project Cost \
42458 Transit and Intercity Rail Capital Program 1980252533
62279 Transit and Intercity Rail Capital Program 904600000
62280 Transit and Intercity Rail Capital Program 904600000
100710 Transit and Intercity Rail Capital Program 904600000
100711 Transit and Intercity Rail Capital Program 904600000
Total Project GHGReductions
42458 734000
62279 52428
62280 6554
100710 21845
100711 43690