In [1]:
# Project: California Equity
## File: initial_view/overview_hypotesting_20241031.ipynb
### Author: David P. Adams
### Date: 2024-10-31
In [2]:
# Importing the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
In [3]:
## set directory
import os
os.chdir('/home/dadams/Repos/california_equity_git')
In [4]:
# read in the data
data = pd.read_csv('data_raw/cci_programs_data.csv', low_memory=False)
In [5]:
columns_data = pd.DataFrame(data.columns)

# export the 'DataFrame' to a csv file
columns_data.to_csv('data_raw/columns.csv', index=False)
In [6]:
# check the data types
data.dtypes

# save the data types to a csv file
data.dtypes.to_csv('data_raw/data_types.csv', header = False)
In [7]:
# describe the data
data.describe()
Out[7]:
Census Tract Total Project Cost Total Program GGRFFunding Total Project GHGReductions Annual Project GHGReductions Project Count Total GGRFDisadvantaged Community Funding Funding Benefiting Disadvantaged Communities Estimated Num Vehicles In Service Funding Within Disadvantage Communities ... Indirect Jobs Fte Induced Jobs Fte Compost Produced Tons Compost Produced Tons Yr Net Density DUA Applicants Assisted Invasive Cover 12 Months Invasive Cover 36 Months Project Acreage Intermediary Admin Expenses Calc
count 1.193700e+05 1.414290e+05 1.414290e+05 1.414290e+05 141429.000000 141429.000000 5.518700e+04 55187.000000 141429.000000 5.518700e+04 ... 141429.000000 141429.000000 141429.000000 141429.0 141429.000000 141429.000000 141429.000000 141429.000000 1.414290e+05 1.414290e+05
mean 6.053889e+09 9.206412e+05 7.791664e+04 7.717972e+02 0.205389 4.090872 2.736820e+04 110.217551 0.047331 2.030028e+04 ... 0.064567 0.117056 0.440977 0.0 0.082260 0.019642 0.010656 0.010578 9.771087e+00 1.911114e+03
std 2.641870e+07 3.736191e+07 1.011645e+06 2.371604e+04 3.361723 18.381861 6.327936e+05 1738.772195 1.972262 5.590536e+05 ... 1.444316 2.261146 45.712955 0.0 4.382593 1.406914 0.758250 0.757677 3.669526e+03 1.196317e+05
min 6.001400e+09 0.000000e+00 0.000000e+00 -2.785930e+05 0.000000 0.000000 0.000000e+00 0.000000 0.000000 0.000000e+00 ... 0.000000 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000e+00 0.000000e+00
25% 6.037271e+09 4.000000e+03 3.500000e+03 8.000000e+00 0.000000 1.000000 0.000000e+00 0.000000 0.000000 0.000000e+00 ... 0.000000 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000e+00 0.000000e+00
50% 6.059022e+09 8.000000e+03 7.500000e+03 1.500000e+01 0.000000 1.000000 0.000000e+00 0.000000 0.000000 0.000000e+00 ... 0.000000 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000e+00 0.000000e+00
75% 6.073016e+09 2.150000e+04 1.900000e+04 4.000000e+01 0.000000 3.000000 5.500000e+03 0.000000 0.000000 1.500000e+03 ... 0.000000 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000e+00 0.000000e+00
max 6.115041e+09 5.767700e+09 1.412670e+08 4.748924e+06 336.000000 2072.000000 6.443700e+07 102348.000000 503.000000 6.443700e+07 ... 110.170000 151.000000 10365.000000 0.0 706.000000 320.000000 85.000000 85.000000 1.380000e+06 2.000000e+07

8 rows × 82 columns

In [8]:
import geopandas as gpd

# Load the shapefile
shapefile_path = '/home/dadams/Repos/california_equity_git/california_enviroscreen/calif_enviroscreen_shape/CES4 Final Shapefile.shp'
gdf = gpd.read_file(shapefile_path)

# Print the head of the GeoDataFrame
print(gdf.head())
          Tract    ZIP         County    ApproxLoc  TotPop19    CIscore  \
0  6.083002e+09  93454  Santa Barbara  Santa Maria      4495  36.019653   
1  6.083002e+09  93455  Santa Barbara  Santa Maria     13173  37.030667   
2  6.083002e+09  93454  Santa Barbara  Santa Maria      2398  31.213140   
3  6.083002e+09  93455  Santa Barbara       Orcutt      4496   6.639331   
4  6.083002e+09  93455  Santa Barbara       Orcutt      4008  14.022852   

    CIscoreP     Ozone     OzoneP     PM2_5  ...  Elderly65  Hispanic  \
0  69.162885  0.034190  10.566273  7.567724  ...    12.5028   68.9210   
1  70.637922  0.035217  11.561917  7.624775  ...     5.3519   78.6229   
2  61.069087  0.034190  10.566273  7.548835  ...    12.8857   65.7214   
3   5.988401  0.036244  13.615432  7.660570  ...    14.4128   22.9537   
4  23.121533  0.036244  13.615432  7.663210  ...    18.8872   33.4082   

     White  AfricanAm  NativeAm  OtherMult    Shape_Leng    Shape_Area  \
0  20.8899     0.4004    0.2670     1.3126   6999.357689  2.847611e+06   
1  13.2240     2.5051    0.0000     0.9489  19100.578232  1.635292e+07   
2  30.6088     0.9591    0.0000     2.1685   4970.985897  1.352329e+06   
3  69.1948     0.9342    0.7117     2.5356   6558.956012  2.417717e+06   
4  59.7804     0.6986    1.4721     1.3723   6570.368730  2.608422e+06   

     AAPI                                           geometry  
0  8.2091  POLYGON ((-39795.07 -341919.191, -38126.384 -3...  
1  4.6990  POLYGON ((-39795.07 -341919.191, -39803.632 -3...  
2  0.5421  POLYGON ((-38115.747 -341130.248, -38126.384 -...  
3  3.6699  POLYGON ((-37341.662 -348530.437, -37252.307 -...  
4  3.2685  POLYGON ((-39465.107 -348499.262, -38244.305 -...  

[5 rows x 67 columns]
In [9]:
# 1. Fix the classification function
def classify_project_type_v2(row):
    program = str(row['Program Name']).lower()
    project_type = str(row['Project Type']).lower()
    
    # Individual incentive programs
    if program == 'low carbon transportation':
        if 'voucher' in project_type.lower():
            return 'Individual - Vehicle'
        return 'Transportation - Other'
    elif program in ['low-income weatherization program', 'woodsmoke reduction program']:
        return 'Individual - Home Energy'
    elif program == 'water-energy efficiency':
        return 'Water-Energy Projects'
    elif program == 'community air protection':
        return 'Air Quality Projects'
    elif program == 'climate smart agriculture':
        return 'Agricultural Projects'
    elif 'voucher' in project_type.lower():
        return 'Individual - Other'
    elif program == 'transformative climate communities':
        return 'Community Projects'
    
    # Institutional by recipient type
    elif pd.notna(row['PRIMARY_FUNDING_RECIPIENT_TYPE']):
        recipient = str(row['PRIMARY_FUNDING_RECIPIENT_TYPE'])
        if recipient in ['Government Agency', 'Government agency', 'California Native American Tribe']:
            return 'Institutional - Government'
        elif recipient == 'Educational institution':
            return 'Institutional - Education'
        elif recipient == 'Nonprofit':
            return 'Institutional - Nonprofit'
        elif recipient == 'Business':
            if row['Total Project Cost'] > 100000:
                return 'Institutional - Large Business'
            return 'Business - Small'
    
    return 'Other'

# 2. Add safe division function for metrics
def safe_divide(a, b):
    if b == 0:
        return 0
    return a / b

# 3. Recalculate metrics
data['project_category'] = data.apply(classify_project_type_v2, axis=1)
data['dac_funding_ratio'] = data.apply(lambda x: safe_divide(x['Total GGRFDisadvantaged Community Funding'], x['Total Project Cost']), axis=1)
data['ghg_per_dollar'] = data.apply(lambda x: safe_divide(x['Total Project GHGReductions'], x['Total Project Cost']), axis=1)

# 4. Rerun analysis
updated_analysis = data.groupby('project_category').agg({
    'Total Project Cost': ['count', 'sum', 'mean'],
    'Total Project GHGReductions': ['sum', 'mean'],
    'dac_funding_ratio': ['mean', 'median'],
    'ghg_per_dollar': ['mean', 'median']
}).round(4)

print("\nUpdated Project Category Distribution:")
print(data['project_category'].value_counts())
print("\nUpdated Analysis:")
print(updated_analysis)
Updated Project Category Distribution:
project_category
Transportation - Other            101427
Other                              14340
Individual - Home Energy            7427
Air Quality Projects                5514
Water-Energy Projects               5111
Individual - Vehicle                4538
Agricultural Projects               1688
Community Projects                   973
Institutional - Government           269
Institutional - Nonprofit            100
Institutional - Large Business        23
Institutional - Education             17
Business - Small                       2
Name: count, dtype: int64

Updated Analysis:
                               Total Project Cost                              \
                                            count           sum          mean   
project_category                                                                
Agricultural Projects                        1688     886514974  5.251866e+05   
Air Quality Projects                         5514    1211608583  2.197331e+05   
Business - Small                                2         65376  3.268800e+04   
Community Projects                            973     508817727  5.229370e+05   
Individual - Home Energy                     7427     320008647  4.308720e+04   
Individual - Vehicle                         4538    1286418141  2.834769e+05   
Institutional - Education                      17       5561672  3.271572e+05   
Institutional - Government                    269   16939974420  6.297388e+07   
Institutional - Large Business                 23     513921827  2.234443e+07   
Institutional - Nonprofit                     100    1402247011  1.402247e+07   
Other                                       14340  104572013338  7.292330e+06   
Transportation - Other                     101427    2473090031  2.438296e+04   
Water-Energy Projects                        5111      85125916  1.665543e+04   

                               Total Project GHGReductions              \
                                                       sum        mean   
project_category                                                         
Agricultural Projects                             24249378  14365.7453   
Air Quality Projects                                250801     45.4844   
Business - Small                                         0      0.0000   
Community Projects                                  154010    158.2837   
Individual - Home Energy                            693914     93.4313   
Individual - Vehicle                                993992    219.0375   
Institutional - Education                              457     26.8824   
Institutional - Government                         6580289  24462.0409   
Institutional - Large Business                      388663  16898.3913   
Institutional - Nonprofit                          4791834  47918.3400   
Other                                             67313019   4694.0738   
Transportation - Other                             3312930     32.6632   
Water-Energy Projects                               425225     83.1980   

                               dac_funding_ratio         ghg_per_dollar  \
                                            mean  median           mean   
project_category                                                          
Agricultural Projects                     0.2388  0.0000         0.0106   
Air Quality Projects                         NaN     NaN         0.0002   
Business - Small                             NaN     NaN         0.0000   
Community Projects                           NaN     NaN         0.0004   
Individual - Home Energy                  0.6504  0.7405         0.0038   
Individual - Vehicle                      0.1660  0.0245         0.0017   
Institutional - Education                    NaN     NaN         0.0008   
Institutional - Government                0.0000  0.0000         0.0022   
Institutional - Large Business               NaN     NaN         0.0038   
Institutional - Nonprofit                 0.0000  0.0000         0.0182   
Other                                     0.1483  0.0000         0.0023   
Transportation - Other                    0.4190  0.0000         0.0027   
Water-Energy Projects                     0.4359  0.0000         0.1027   

                                        
                                median  
project_category                        
Agricultural Projects           0.0051  
Air Quality Projects            0.0000  
Business - Small                0.0000  
Community Projects              0.0000  
Individual - Home Energy        0.0026  
Individual - Vehicle            0.0010  
Institutional - Education       0.0000  
Institutional - Government      0.0001  
Institutional - Large Business  0.0005  
Institutional - Nonprofit       0.0004  
Other                           0.0001  
Transportation - Other          0.0032  
Water-Energy Projects           0.0096  

This refined analysis reveals some fascinating patterns about California's climate investments and their environmental justice implications. Let's break down the key findings:

  1. Program Distribution and Scale:
  • Transportation dominates numerically:
    • Transportation-Other: 101,427 projects (71.6% of all projects)
    • Individual Vehicle incentives: 4,538 projects
  • Individual programs have high reach but moderate costs:
    • Home Energy: 7,427 projects (~$43K average)
    • Vehicle incentives: 4,538 projects (~$283K average)
  • Institutional projects are fewer but larger:
    • Government: 269 projects (~$63M average)
    • Nonprofits: 100 projects (~$14M average)
  1. Environmental Justice (DAC Funding Ratio):
  • Home Energy programs show strongest DAC focus (65% DAC funding ratio)
  • Water-Energy (44%) and Transportation-Other (42%) also show good DAC targeting
  • Individual Vehicle programs have lower DAC funding (17%)
  • Institutional programs show minimal DAC funding ratios
  1. GHG Reduction Efficiency (GHG per dollar):
  • Water-Energy Projects are most efficient (0.1027 tons/$)
  • Nonprofit projects show high efficiency (0.0182 tons/$)
  • Agricultural Projects also perform well (0.0106 tons/$)
  • Individual incentives show lower efficiency:
    • Home Energy: 0.0038 tons/$
    • Vehicle: 0.0017 tons/$

Key Environmental Justice Implications:

  1. Individual vs Institutional Trade-offs:
  • Individual programs (especially home energy) are better at reaching disadvantaged communities
  • But institutional programs tend to be more cost-effective for GHG reductions
  • This suggests a potential tension between equity and efficiency goals
  1. Program-Specific Patterns:
  • Home Energy programs show the best balance of DAC reach and moderate GHG efficiency
  • Vehicle incentives have lower DAC reach and efficiency, supporting your hypothesis about wealth bias
  • Water-Energy projects show promising combination of good DAC reach and high efficiency
  1. Policy Implications:
  • Consider expanding successful programs like Water-Energy that combine good DAC reach with high efficiency
  • Look for ways to improve DAC access to vehicle incentives
  • Consider bundling institutional and individual programs to balance efficiency and equity goals

Would you like to explore any of these aspects in more detail? We could:

  1. Drill deeper into geographic distribution
  2. Analyze temporal trends in these patterns
  3. Look at specific program characteristics that drive DAC success
In [10]:
import matplotlib.pyplot as plt
import seaborn as sns

# Prepare data for plotting
plot_data = data.groupby('project_category').agg({
    'Total Project Cost': ['count', 'mean'],
    'dac_funding_ratio': 'mean',
    'ghg_per_dollar': 'mean'
}).reset_index()

# Flatten column names
plot_data.columns = ['project_category', 'count', 'avg_cost', 'dac_ratio', 'ghg_efficiency']

# Create figure with multiple subplots
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(15, 20))

# 1. Project Counts
sns.barplot(data=plot_data, 
           x='count', 
           y='project_category', 
           ax=ax1,
           palette='viridis')
ax1.set_title('Number of Projects by Category')
ax1.set_xlabel('Number of Projects')

# 2. DAC Funding Ratio
mask = plot_data['dac_ratio'].notna()  # Filter out NaN values
sns.barplot(data=plot_data[mask], 
           x='dac_ratio', 
           y='project_category', 
           ax=ax2,
           palette='viridis')
ax2.set_title('Disadvantaged Community Funding Ratio by Category')
ax2.set_xlabel('DAC Funding Ratio')

# 3. GHG Efficiency
mask = plot_data['ghg_efficiency'].notna()  # Filter out NaN values
sns.barplot(data=plot_data[mask], 
           x='ghg_efficiency', 
           y='project_category', 
           ax=ax3,
           palette='viridis')
ax3.set_title('GHG Reduction Efficiency by Category (tons CO2e per dollar)')
ax3.set_xlabel('GHG Reduction per Dollar')

plt.tight_layout()
plt.show()

# Create a scatter plot to show relationship between project size and efficiency
plt.figure(figsize=(12, 8))
plt.scatter(plot_data['avg_cost'], 
           plot_data['ghg_efficiency'], 
           alpha=0.6)

# Add labels for each point
for i, row in plot_data.iterrows():
    plt.annotate(row['project_category'], 
                (row['avg_cost'], row['ghg_efficiency']),
                xytext=(5, 5), 
                textcoords='offset points')

plt.xscale('log')  # Use log scale for cost due to wide range
plt.xlabel('Average Project Cost (log scale)')
plt.ylabel('GHG Reduction Efficiency (tons CO2e per dollar)')
plt.title('Project Cost vs GHG Reduction Efficiency')
plt.grid(True)
plt.show()
/tmp/ipykernel_846869/3534785368.py:18: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(data=plot_data,
/tmp/ipykernel_846869/3534785368.py:28: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(data=plot_data[mask],
/tmp/ipykernel_846869/3534785368.py:38: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(data=plot_data[mask],
No description has been provided for this image
No description has been provided for this image
In [11]:
# First let's see how many counties we have data for and their distribution
county_analysis = data.groupby(['County', 'project_category']).agg({
    'Total Project Cost': ['count', 'sum'],
    'Total Project GHGReductions': 'sum',
    'dac_funding_ratio': 'mean'
}).round(2)

# Reset index for easier manipulation
county_analysis = county_analysis.reset_index()

# Flatten column names
county_analysis.columns = ['County', 'project_category', 'project_count', 'total_cost', 'total_ghg', 'dac_ratio']

print("\nTop 10 Counties by Project Count:")
print(data['County'].value_counts().head(10))

print("\nProject Categories by County (top 5 counties):")
top_counties = data['County'].value_counts().head(5).index
for county in top_counties:
    print(f"\n{county}:")
    county_cats = data[data['County'] == county]['project_category'].value_counts()
    print(county_cats)

# Calculate percentage of projects in each county by category
county_category_pcts = pd.crosstab(data['County'], 
                                 data['project_category'], 
                                 values=data['Total Project Cost'],
                                 aggfunc='sum',
                                 normalize='index') * 100

print("\nPercentage of Funding by Category in Top 5 Counties:")
print(county_category_pcts.loc[top_counties].round(2))
Top 10 Counties by Project Count:
County
Los Angeles       38032
Orange            10694
San Diego          8468
Fresno             7854
Santa Clara        7154
Alameda            6381
Riverside          6190
San Bernardino     5159
Sacramento         4781
San Joaquin        4202
Name: count, dtype: int64

Project Categories by County (top 5 counties):

Los Angeles:
project_category
Transportation - Other            30411
Individual - Home Energy           2233
Individual - Vehicle               2187
Water-Energy Projects              1945
Other                               794
Air Quality Projects                352
Community Projects                   79
Institutional - Government           19
Institutional - Nonprofit             6
Agricultural Projects                 3
Institutional - Large Business        3
Name: count, dtype: int64

Orange:
project_category
Transportation - Other        9866
Individual - Vehicle           290
Individual - Home Energy       184
Water-Energy Projects          169
Other                          129
Air Quality Projects            51
Institutional - Government       4
Community Projects               1
Name: count, dtype: int64

San Diego:
project_category
Transportation - Other        7571
Other                          375
Individual - Vehicle           294
Air Quality Projects           105
Individual - Home Energy        83
Agricultural Projects           30
Institutional - Government       6
Institutional - Nonprofit        3
Community Projects               1
Name: count, dtype: int64

Fresno:
project_category
Transportation - Other            2541
Other                             2144
Air Quality Projects              1747
Individual - Home Energy           769
Water-Energy Projects              338
Agricultural Projects              142
Individual - Vehicle                96
Community Projects                  64
Institutional - Government           7
Institutional - Education            3
Institutional - Large Business       2
Institutional - Nonprofit            1
Name: count, dtype: int64

Santa Clara:
project_category
Transportation - Other            6214
Water-Energy Projects              530
Individual - Vehicle               177
Other                               89
Individual - Home Energy            74
Air Quality Projects                50
Agricultural Projects               14
Institutional - Government           3
Institutional - Nonprofit            2
Institutional - Large Business       1
Name: count, dtype: int64

Percentage of Funding by Category in Top 5 Counties:
project_category  Agricultural Projects  Air Quality Projects  \
County                                                          
Los Angeles                        0.00                  0.36   
Orange                             0.00                  1.44   
San Diego                          0.05                  1.61   
Fresno                             6.12                 10.44   
Santa Clara                        0.03                  0.61   

project_category  Business - Small  Community Projects  \
County                                                   
Los Angeles                    0.0                0.14   
Orange                         0.0                0.15   
San Diego                      0.0                0.00   
Fresno                         0.0               17.55   
Santa Clara                    0.0                0.00   

project_category  Individual - Home Energy  Individual - Vehicle  \
County                                                             
Los Angeles                           0.06                  0.91   
Orange                                0.09                  2.90   
San Diego                             0.10                  1.45   
Fresno                                4.74                  2.50   
Santa Clara                           0.12                  2.07   

project_category  Institutional - Education  Institutional - Government  \
County                                                                    
Los Angeles                            0.00                        5.35   
Orange                                 0.00                       37.65   
San Diego                              0.00                        2.90   
Fresno                                 0.02                        0.73   
Santa Clara                            0.00                       15.35   

project_category  Institutional - Large Business  Institutional - Nonprofit  \
County                                                                        
Los Angeles                                 0.26                       0.71   
Orange                                      0.00                       0.00   
San Diego                                   0.00                       2.54   
Fresno                                      4.07                       0.01   
Santa Clara                                 3.88                       0.04   

project_category  Other  Transportation - Other  Water-Energy Projects  
County                                                                  
Los Angeles       91.48                    0.73                   0.00  
Orange            51.20                    6.46                   0.12  
San Diego         88.01                    3.33                   0.00  
Fresno            47.73                    5.73                   0.36  
Santa Clara       73.37                    4.43                   0.09  
In [12]:
# Define a more distinct color palette that groups similar categories
colors = {
    'Agricultural Projects': '#228B22',      # Forest Green
    'Air Quality Projects': '#87CEEB',       # Sky Blue
    'Business - Small': '#DDA0DD',          # Plum
    'Community Projects': '#FF7F50',        # Coral
    'Individual - Home Energy': '#FFD700',   # Gold
    'Individual - Vehicle': '#DAA520',       # Goldenrod
    'Institutional - Education': '#4B0082',  # Indigo
    'Institutional - Government': '#800000', # Maroon
    'Institutional - Large Business': '#4682B4', # Steel Blue
    'Institutional - Nonprofit': '#2E8B57',  # Sea Green
    'Other': '#808080',                     # Grey
    'Transportation - Other': '#FF4500',     # Orange Red
    'Water-Energy Projects': '#00CED1'       # Turquoise
}

plt.figure(figsize=(15, 8))
county_category_pcts.loc[top_counties].plot(kind='bar', 
                                          stacked=True,
                                          color=[colors[x] for x in county_category_pcts.columns])
plt.title('Distribution of Project Funding by Category Across Top Counties')
plt.xlabel('County')
plt.ylabel('Percentage of Total Funding')
plt.legend(bbox_to_anchor=(1.05, 1), 
          loc='upper left',
          title='Project Categories')
plt.tight_layout()
plt.show()

# Create a visualization for DAC funding ratio by county
plt.figure(figsize=(15, 8))
sns.boxplot(data=data[data['dac_funding_ratio'].notna()],
            x='County',
            y='dac_funding_ratio',
            order=top_counties,
            palette='viridis')
plt.title('DAC Funding Ratio Distribution by County')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
<Figure size 1500x800 with 0 Axes>
No description has been provided for this image
/tmp/ipykernel_846869/2919179271.py:33: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(data=data[data['dac_funding_ratio'].notna()],
No description has been provided for this image
In [13]:
# Clearer organization 
def classify_for_ej_analysis(row):
    program = str(row['Program Name']).lower()
    project_type = str(row['Project Type']).lower()
    
    # Individual incentives
    if program == 'low carbon transportation' and 'voucher' in project_type.lower():
        return 'Individual - Vehicle'
    elif program == 'low-income weatherization program':
        return 'Individual - Solar/Energy'
    elif program == 'woodsmoke reduction program':
        return 'Individual - Solar/Energy'
    # Large projects
    elif row['Total Project Cost'] > 100000:  # threshold for "large" projects
        return 'Large Projects'
    else:
        return 'Other'

data['ej_category'] = data.apply(classify_for_ej_analysis, axis=1)

# Analyze environmental justice metrics by category
ej_analysis = data.groupby('ej_category').agg({
    'Total Project Cost': ['count', 'sum', 'mean'],
    'dac_funding_ratio': ['mean', 'median'],
    'Is Benefit Disadvantaged Communities': 'mean',
    'Is Low Income Communities': 'mean',
    'Total Project GHGReductions': 'sum'
}).round(4)

print("Environmental Justice Analysis by Category:")
print(ej_analysis)

# Calculate percentage of funds going to different income communities
ej_distribution = data.groupby('ej_category').agg({
    'Total Project Cost': 'sum',
    'Total GGRFDisadvantaged Community Funding': 'sum',
    'Low Income Amount': 'sum'
})

ej_distribution['DAC_percentage'] = (ej_distribution['Total GGRFDisadvantaged Community Funding'] / 
                                   ej_distribution['Total Project Cost'] * 100)
ej_distribution['LowIncome_percentage'] = (ej_distribution['Low Income Amount'] / 
                                         ej_distribution['Total Project Cost'] * 100)

print("\nPercentage of Funding to Disadvantaged and Low-Income Communities:")
print(ej_distribution[['DAC_percentage', 'LowIncome_percentage']].round(2))

# Visualize the distribution
plt.figure(figsize=(12, 6))
bar_width = 0.35
categories = ej_distribution.index

x = np.arange(len(categories))
plt.bar(x - bar_width/2, ej_distribution['DAC_percentage'], 
        bar_width, label='DAC Funding %', alpha=0.8)
plt.bar(x + bar_width/2, ej_distribution['LowIncome_percentage'], 
        bar_width, label='Low Income Funding %', alpha=0.8)

plt.xlabel('Project Category')
plt.ylabel('Percentage of Total Funding')
plt.title('Distribution of Funding to Disadvantaged and Low-Income Communities')
plt.xticks(x, categories, rotation=45)
plt.legend()
plt.tight_layout()
plt.show()

# Let's also look at the per-project impact
impact_analysis = data.groupby('ej_category').agg({
    'Total Project GHGReductions': 'mean',
    'Total Project Cost': 'mean'
}).round(2)

impact_analysis['GHG_reduction_per_dollar'] = (impact_analysis['Total Project GHGReductions'] / 
                                             impact_analysis['Total Project Cost']).round(4)

print("\nImpact Analysis:")
print(impact_analysis)
Environmental Justice Analysis by Category:
                          Total Project Cost                              \
                                       count           sum          mean   
ej_category                                                                
Individual - Solar/Energy               7427     320008647  4.308720e+04   
Individual - Vehicle                    4538    1286418141  2.834769e+05   
Large Projects                         10009  127060954384  1.269467e+07   
Other                                 119455    1537986491  1.287503e+04   

                          dac_funding_ratio          \
                                       mean  median   
ej_category                                           
Individual - Solar/Energy            0.6504  0.7405   
Individual - Vehicle                 0.1660  0.0245   
Large Projects                       0.2761  0.0000   
Other                                0.4129  0.0000   

                          Is Benefit Disadvantaged Communities  \
                                                          mean   
ej_category                                                      
Individual - Solar/Energy                               0.7824   
Individual - Vehicle                                    0.3455   
Large Projects                                          0.0664   
Other                                                   0.3275   

                          Is Low Income Communities  \
                                               mean   
ej_category                                           
Individual - Solar/Energy                    0.9338   
Individual - Vehicle                         0.4125   
Large Projects                               0.5602   
Other                                        0.5708   

                          Total Project GHGReductions  
                                                  sum  
ej_category                                            
Individual - Solar/Energy                      693914  
Individual - Vehicle                           993992  
Large Projects                              103527784  
Other                                         3938822  

Percentage of Funding to Disadvantaged and Low-Income Communities:
                           DAC_percentage  LowIncome_percentage
ej_category                                                    
Individual - Solar/Energy           41.46                  4.97
Individual - Vehicle                 5.13                  2.01
Large Projects                       0.91                  1.47
Other                               10.11                 16.38
No description has been provided for this image
Impact Analysis:
                           Total Project GHGReductions  Total Project Cost  \
ej_category                                                                  
Individual - Solar/Energy                        93.43            43087.20   
Individual - Vehicle                            219.04           283476.89   
Large Projects                                10343.47         12694670.24   
Other                                            32.97            12875.03   

                           GHG_reduction_per_dollar  
ej_category                                          
Individual - Solar/Energy                    0.0022  
Individual - Vehicle                         0.0008  
Large Projects                               0.0008  
Other                                        0.0026  

Patterns of Equity in California's Climate Investments¶

  1. Individual Solar/Energy Programs Show Strong Equity Focus:
  • Highest DAC funding ratio (0.65 mean, 0.74 median)
  • 78% benefit disadvantaged communities
  • 93% benefit low-income communities
  • 41.46% of funds go to DACs, highest among all categories
  • However, relatively small total investment ($320M)
  1. Individual Vehicle Programs Show Wealth Disparity:
  • Much lower DAC funding ratio (0.17 mean, 0.02 median)
  • Only 34.5% benefit disadvantaged communities
  • Only 5.13% of funds go to DACs
  • Larger individual project costs (~$283K vs $43K for solar)
  • This supports your hypothesis about vehicle incentives benefiting wealthier communities
  1. Large Projects Show Mixed Results:
  • Low DAC funding ratio (0.28 mean, 0.00 median)
  • Only 6.6% benefit disadvantaged communities
  • But 56% benefit low-income communities
  • Largest total investment ($127B)
  • Highest total GHG reductions (103.5M tons)

Key Findings:

  1. the hypothesis about vehicle incentives primarily benefiting wealthier communities is strongly supported by the data
  2. However, individual solar/energy programs are actually very successful at reaching disadvantaged communities
  3. Large projects, while achieving the most total GHG reductions, have mixed equity outcomes - good at reaching low-income communities but not DACs
In [14]:
# 1. Analyze the overall spatial distribution of investments
county_analysis = data.groupby('County').agg({
    'Total Project Cost': ['count', 'sum'],
    'Total Project GHGReductions': 'sum',
    'dac_funding_ratio': 'mean',
    'Is Benefit Disadvantaged Communities': 'mean',
    'Is Low Income Communities': 'mean'
}).round(4)

# Add per capita metrics (we would need to merge with county population data)
# Add GHG reduction per dollar by county

# 2. Analyze vulnerable communities specifically
vulnerability_analysis = pd.DataFrame({
    'Total Projects': [
        len(data),
        len(data[data['Is Benefit Disadvantaged Communities'] == 1]),
        len(data[data['Is Low Income Communities'] == 1])
    ],
    'Total Investment': [
        data['Total Project Cost'].sum(),
        data[data['Is Benefit Disadvantaged Communities'] == 1]['Total Project Cost'].sum(),
        data[data['Is Low Income Communities'] == 1]['Total Project Cost'].sum()
    ],
    'GHG Reduction': [
        data['Total Project GHGReductions'].sum(),
        data[data['Is Benefit Disadvantaged Communities'] == 1]['Total Project GHGReductions'].sum(),
        data[data['Is Low Income Communities'] == 1]['Total Project GHGReductions'].sum()
    ]
}, index=['All Communities', 'Disadvantaged Communities', 'Low Income Communities'])

# 3. Project type distribution in vulnerable communities
project_vulnerability = data.groupby(['project_category', 'Is Benefit Disadvantaged Communities']).agg({
    'Total Project Cost': ['count', 'sum'],
    'Total Project GHGReductions': 'sum'
}).round(4)

print("Overall Project Distribution by County:")
print(county_analysis.sort_values(('Total Project Cost', 'sum'), ascending=False).head(10))

print("\nVulnerable Communities Analysis:")
print(vulnerability_analysis)

print("\nProject Types in Vulnerable Communities:")
print(project_vulnerability)

# Visualizations
plt.figure(figsize=(15, 8))
# Plot investment distribution...
plt.subplot(1, 2, 1)
county_analysis[('Total Project Cost', 'sum')].sort_values(ascending=False).head(10).plot(kind='bar')
plt.title('Top 10 Counties by Total Investment')
plt.ylabel('Total Investment')
plt.xlabel('County')

# ...and GHG reductions
plt.subplot(1, 2, 2)
county_analysis[('Total Project GHGReductions', 'sum')].sort_values(ascending=False).head(10).plot(kind='bar')
plt.title('Top 10 Counties by GHG Reductions')
plt.ylabel('Total GHG Reductions')
plt.xlabel('County')

plt.tight_layout()
plt.show()
Overall Project Distribution by County:
                                                   Total Project Cost  \
                                                                count   
County                                                                  
Los Angeles                                                     38032   
Alameda                                                          6381   
Alameda, Contra Costa, Fresno, Kern, Kings, Mad...                  9   
Sacramento                                                       4781   
San Diego                                                        8468   
San Francisco                                                    2748   
Santa Clara                                                      7154   
San Francisco, Santa Clara                                          6   
Orange                                                          10694   
San Diego, Santa Barbara, Ventura                                  23   

                                                                 \
                                                            sum   
County                                                            
Los Angeles                                         69936060480   
Alameda                                             10379456877   
Alameda, Contra Costa, Fresno, Kern, Kings, Mad...   7333200000   
Sacramento                                           5858784824   
San Diego                                            4870580185   
San Francisco                                        3813460158   
Santa Clara                                          3178166479   
San Francisco, Santa Clara                           2998442533   
Orange                                               2383485407   
San Diego, Santa Barbara, Ventura                    2235315130   

                                                   Total Project GHGReductions  \
                                                                           sum   
County                                                                           
Los Angeles                                                           11307708   
Alameda                                                               10518799   
Alameda, Contra Costa, Fresno, Kern, Kings, Mad...                      546406   
Sacramento                                                             1975903   
San Diego                                                              2233426   
San Francisco                                                          1850982   
Santa Clara                                                            1793848   
San Francisco, Santa Clara                                             1538067   
Orange                                                                 1180528   
San Diego, Santa Barbara, Ventura                                       791432   

                                                   dac_funding_ratio  \
                                                                mean   
County                                                                 
Los Angeles                                                   0.5751   
Alameda                                                       0.2983   
Alameda, Contra Costa, Fresno, Kern, Kings, Mad...               NaN   
Sacramento                                                    0.4194   
San Diego                                                     0.1004   
San Francisco                                                 0.1057   
Santa Clara                                                   0.2247   
San Francisco, Santa Clara                                    0.0101   
Orange                                                        0.3994   
San Diego, Santa Barbara, Ventura                                NaN   

                                                   Is Benefit Disadvantaged Communities  \
                                                                                   mean   
County                                                                                    
Los Angeles                                                                      0.5420   
Alameda                                                                          0.2612   
Alameda, Contra Costa, Fresno, Kern, Kings, Mad...                               0.0000   
Sacramento                                                                       0.3403   
San Diego                                                                        0.0993   
San Francisco                                                                    0.1121   
Santa Clara                                                                      0.2200   
San Francisco, Santa Clara                                                       0.1667   
Orange                                                                           0.3571   
San Diego, Santa Barbara, Ventura                                                0.0000   

                                                   Is Low Income Communities  
                                                                        mean  
County                                                                        
Los Angeles                                                           0.6534  
Alameda                                                               0.5557  
Alameda, Contra Costa, Fresno, Kern, Kings, Mad...                    1.0000  
Sacramento                                                            0.5162  
San Diego                                                             0.5265  
San Francisco                                                         0.5761  
Santa Clara                                                           0.4911  
San Francisco, Santa Clara                                            0.8333  
Orange                                                                0.6461  
San Diego, Santa Barbara, Ventura                                     1.0000  

Vulnerable Communities Analysis:
                           Total Projects  Total Investment  GHG Reduction
All Communities                    141429      130205367663      109154512
Disadvantaged Communities           47167       14111685888       12626682
Low Income Communities              82595      103691488875       55739585

Project Types in Vulnerable Communities:
                                                                    Total Project Cost  \
                                                                                 count   
project_category               Is Benefit Disadvantaged Communities                      
Agricultural Projects          False                                              1490   
                               True                                                198   
Air Quality Projects           False                                              5514   
Business - Small               False                                                 2   
Community Projects             False                                               973   
Individual - Home Energy       False                                              1616   
                               True                                               5811   
Individual - Vehicle           False                                              2970   
                               True                                               1568   
Institutional - Education      False                                                17   
Institutional - Government     False                                               269   
Institutional - Large Business False                                                23   
Institutional - Nonprofit      False                                               100   
Other                          False                                             13788   
                               True                                                552   
Transportation - Other         False                                             64674   
                               True                                              36753   
Water-Energy Projects          False                                              2826   
                               True                                               2285   

                                                                                  \
                                                                             sum   
project_category               Is Benefit Disadvantaged Communities                
Agricultural Projects          False                                   824640269   
                               True                                     61874705   
Air Quality Projects           False                                  1211608583   
Business - Small               False                                       65376   
Community Projects             False                                   508817727   
Individual - Home Energy       False                                   117426101   
                               True                                    202582546   
Individual - Vehicle           False                                   839347111   
                               True                                    447071030   
Institutional - Education      False                                     5561672   
Institutional - Government     False                                 16939974420   
Institutional - Large Business False                                   513921827   
Institutional - Nonprofit      False                                  1402247011   
Other                          False                                 91784138193   
                               True                                  12787875145   
Transportation - Other         False                                  1888149934   
                               True                                    584940097   
Water-Energy Projects          False                                    57783551   
                               True                                     27342365   

                                                                    Total Project GHGReductions  
                                                                                            sum  
project_category               Is Benefit Disadvantaged Communities                              
Agricultural Projects          False                                                   22903912  
                               True                                                     1345466  
Air Quality Projects           False                                                     250801  
Business - Small               False                                                          0  
Community Projects             False                                                     154010  
Individual - Home Energy       False                                                     203507  
                               True                                                      490407  
Individual - Vehicle           False                                                     699758  
                               True                                                      294234  
Institutional - Education      False                                                        457  
Institutional - Government     False                                                    6580289  
Institutional - Large Business False                                                     388663  
Institutional - Nonprofit      False                                                    4791834  
Other                          False                                                   57935224  
                               True                                                     9377795  
Transportation - Other         False                                                    2392628  
                               True                                                      920302  
Water-Energy Projects          False                                                     226747  
                               True                                                      198478  
/tmp/ipykernel_846869/3616794259.py:63: UserWarning: Tight layout not applied. The bottom and top margins cannot be made large enough to accommodate all Axes decorations.
  plt.tight_layout()
No description has been provided for this image
In [15]:
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np

# First, summarize the data at tract level
tract_summary = data.groupby('Census Tract').agg({
    'Total Project Cost': ['count', 'sum'],
    'Total Project GHGReductions': 'sum',
    'Is Benefit Disadvantaged Communities': 'mean',
    'Is Low Income Communities': 'mean'
}).round(4)

# Reset the index to make Census Tract a column
tract_summary = tract_summary.reset_index()

# Create a few different maps to show different aspects
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(20, 20))

# Function to create a map with proper legend
def create_map(data, column, title, ax, cmap):
    im = ax.scatter(data['Census Tract'], 
                   data[column],
                   c=data[column],
                   cmap=cmap)
    plt.colorbar(im, ax=ax, label=title)
    ax.set_title(title)
    ax.axis('off')

# 1. Project Count Map
create_map(tract_summary, 
          ('Total Project Cost', 'count'),
          'Number of CCI Projects by Census Tract',
          ax1,
          'YlOrRd')

# 2. Total Investment Map
create_map(tract_summary,
          ('Total Project Cost', 'sum'),
          'Total CCI Investment by Census Tract',
          ax2,
          'viridis')

# 3. DAC Benefits Map
create_map(tract_summary,
          ('Is Benefit Disadvantaged Communities', 'mean'),
          'Proportion of Projects Benefiting DACs',
          ax3,
          'RdYlBu')

# 4. GHG Reductions Map
create_map(tract_summary,
          ('Total Project GHGReductions', 'sum'),
          'Total GHG Reductions by Census Tract',
          ax4,
          'YlGn')

plt.tight_layout()
plt.show()

# Let's also look at the basic statistics
print("\nSpatial Distribution Statistics:")
print("\nTop 10 Census Tracts by Total Investment:")
print(tract_summary.nlargest(10, ('Total Project Cost', 'sum')))

print("\nTop 10 Census Tracts by Project Count:")
print(tract_summary.nlargest(10, ('Total Project Cost', 'count')))

print("\nTop 10 Census Tracts by GHG Reductions:")
print(tract_summary.nlargest(10, ('Total Project GHGReductions', 'sum')))

# Additional visualization of investment intensity
plt.figure(figsize=(15, 10))
tract_summary['investment_per_project'] = tract_summary[('Total Project Cost', 'sum')] / tract_summary[('Total Project Cost', 'count')]
plt.scatter(tract_summary[('Is Benefit Disadvantaged Communities', 'mean')],
           tract_summary['investment_per_project'],
           alpha=0.5)
plt.xlabel('Proportion of Projects Benefiting DACs')
plt.ylabel('Investment per Project ($)')
plt.title('Investment Intensity vs DAC Benefits')
plt.show()
No description has been provided for this image
Spatial Distribution Statistics:

Top 10 Census Tracts by Total Investment:
      Census Tract Total Project Cost            Total Project GHGReductions  \
                                count        sum                         sum   
1686  6.037206e+09                369  128838282                       71147   
1701  6.037208e+09                246   79854037                       21955   
1250  6.037113e+09                115   67551314                       45625   
5715  6.073006e+09                 38   65309556                        1546   
3340  6.037901e+09                 61   37506825                       44030   
2391  6.037408e+09                207   34961176                       31158   
3482  6.037980e+09                 67   30672161                       23886   
6642  6.081602e+09                 61   26686653                       14972   
7164  6.085512e+09                 49   24118370                       17130   
2268  6.037403e+09                 81   23674333                       11183   

     Is Benefit Disadvantaged Communities Is Low Income Communities  
                                     mean                      mean  
1686                               0.5176                    0.0108  
1701                               0.7886                    0.0447  
1250                               0.0087                    0.0435  
5715                               0.0000                    0.5789  
3340                               0.0000                    0.9672  
2391                               0.8164                    0.7923  
3482                               0.0000                    0.0149  
6642                               0.0000                    0.9508  
7164                               0.0000                    0.0204  
2268                               0.3333                    0.8148  

Top 10 Census Tracts by Project Count:
      Census Tract Total Project Cost            Total Project GHGReductions  \
                                count        sum                         sum   
1686  6.037206e+09                369  128838282                       71147   
1701  6.037208e+09                246   79854037                       21955   
2391  6.037408e+09                207   34961176                       31158   
6443  6.077002e+09                132     616605                         650   
6448  6.077002e+09                121     401633                         284   
1250  6.037113e+09                115   67551314                       45625   
6452  6.077003e+09                113     438571                         580   
2670  6.037504e+09                103   13744994                       12914   
6449  6.077002e+09                102     534104                         625   
2276  6.037403e+09                 99    5585697                       12839   

     Is Benefit Disadvantaged Communities Is Low Income Communities  
                                     mean                      mean  
1686                               0.5176                    0.0108  
1701                               0.7886                    0.0447  
2391                               0.8164                    0.7923  
6443                               0.0985                    0.9848  
6448                               0.0661                    0.9835  
1250                               0.0087                    0.0435  
6452                               0.0885                    0.9469  
2670                               0.1942                    0.0583  
6449                               0.1275                    0.9804  
2276                               0.0000                    0.1818  

Top 10 Census Tracts by GHG Reductions:
      Census Tract Total Project Cost            Total Project GHGReductions  \
                                count        sum                         sum   
419   6.009000e+09                 13    6718556                      735067   
7330  6.091010e+09                  8    1842014                      730641   
7963  6.113011e+09                 21    9324110                      297675   
7411  6.095253e+09                 17    2631219                      163765   
4432  6.061022e+09                 13    1199979                       78245   
1686  6.037206e+09                369  128838282                       71147   
3677  6.053011e+09                 26    5634176                       53453   
1250  6.037113e+09                115   67551314                       45625   
3340  6.037901e+09                 61   37506825                       44030   
4166  6.059086e+09                 84   18951369                       32458   

     Is Benefit Disadvantaged Communities Is Low Income Communities  
                                     mean                      mean  
419                                0.0000                    0.1538  
7330                               0.0000                    0.7500  
7963                               0.0000                    0.1429  
7411                               0.0000                    0.2353  
4432                               0.0000                    0.0000  
1686                               0.5176                    0.0108  
3677                               0.7692                    0.3462  
1250                               0.0087                    0.0435  
3340                               0.0000                    0.9672  
4166                               0.3214                    0.2262  
No description has been provided for this image

Analysis of Spatial Distribution and Environmental Justice Implications¶

Key Findings:¶

  1. Individual vs. Large Project Analysis Key Findings:
  • Individual Program Distribution:

    • Home Energy/Solar Programs:
      • Strong equity performance (78% benefit DACs)
      • High DAC funding ratio (0.65)
      • 93% benefit low-income communities
      • Lower total investment ($320M)
    • Vehicle Programs:
      • Lower equity performance (35% benefit DACs)
      • Low DAC funding ratio (0.17)
      • Only 41% benefit low-income communities
      • Higher individual costs (~$283K vs $43K for solar)
  • Large Project Performance:

    • Higher total investment ($127B)
    • Highest total GHG reductions (103.5M tons)
    • Mixed equity outcomes:
      • Low DAC funding ratio (0.28)
      • Only 6.6% benefit DACs
      • But 56% benefit low-income communities
  1. Spatial Distribution Analysis Key Findings:
  • Geographic Concentration:

    • Urban dominance (LA: 27% of projects, 54% of funds)
    • Top 3 counties (LA, Alameda, San Diego) = 37% of projects
    • Clear urban-rural divide in investment distribution
  • Environmental Justice Implications:

    • DACs receive 33% of projects but only 11% of funding
    • Low-income communities show better performance (58% of projects, 80% of funding)
    • Significant variation in DAC benefit rates across regions

Next Steps:

  1. Individual vs. Large Project Analysis
  • Detailed program design analysis:
    • What makes home energy programs more successful at reaching DACs?
    • What barriers exist in vehicle programs?
    • How can large project DAC benefits be improved?
  • Cost-effectiveness comparison:
    • Calculate and compare GHG reduction per dollar across programs
    • Analyze trade-offs between equity and efficiency
  • Policy recommendations:
    • Identify best practices from successful programs
    • Suggest modifications for underperforming programs
  1. Spatial Distribution Analysis
  • Enhanced geographic analysis:
    • Create detailed maps showing investment patterns
    • Analyze urban-rural disparities
    • Examine relationship between CalEnviroScreen scores and investment patterns
  • Environmental justice metrics:
    • Develop composite indicators of investment equity
    • Analyze temporal changes in distribution
    • Compare to demographic and socioeconomic patterns
  1. Integration and Synthesis
  • Combine findings from both approaches to:
    • Identify overlapping patterns
    • Develop comprehensive policy recommendations
    • Create framework for evaluating climate investment equity
  1. Specific Data Analysis Tasks:
# 1. Create program efficiency metrics
program_efficiency = data.groupby('project_category').agg({
    'Total Project GHGReductions': 'sum',
    'Total Project Cost': 'sum'
}).assign(
    efficiency = lambda x: x['Total Project GHGReductions'] / x['Total Project Cost']
)

# 2. Analyze spatial equity
spatial_equity = merged_data.groupby('CES_quartile').agg({
    'Total Project Cost': ['sum', 'mean', 'count'],
    'Total Project GHGReductions': ['sum', 'mean']
})

# 3. Create temporal analysis
temporal_patterns = data.groupby(['project_category', 'Date Operational']).agg({
    'Total Project Cost': 'sum',
    'Is Benefit Disadvantaged Communities': 'mean'
})
In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Read the data (I see you already have this loaded as 'data')

# 1. First, let's create our core analytical metrics
def calculate_program_metrics(df):
    metrics = df.groupby('Program Name').agg({
        'Total Project Cost': ['count', 'sum', 'mean'],
        'Total Project GHGReductions': ['sum', 'mean'],
        'Total GGRFDisadvantaged Community Funding': ['sum', 'mean'],
        'Is Benefit Disadvantaged Communities': 'mean',
        'Is Low Income Communities': 'mean'
    }).round(2)
    
    # Add efficiency metrics
    metrics['GHG_per_dollar'] = (metrics[('Total Project GHGReductions', 'sum')] / 
                                metrics[('Total Project Cost', 'sum')]).round(4)
    
    metrics['DAC_funding_ratio'] = (metrics[('Total GGRFDisadvantaged Community Funding', 'sum')] / 
                                   metrics[('Total Project Cost', 'sum')]).round(4)
    
    return metrics

program_metrics = calculate_program_metrics(data)

# Display top programs by different metrics
print("\nTop 5 Programs by Total Investment:")
print(program_metrics.sort_values(('Total Project Cost', 'sum'), ascending=False).head())

print("\nTop 5 Programs by GHG Reduction Efficiency:")
print(program_metrics.sort_values('GHG_per_dollar', ascending=False).head())

print("\nTop 5 Programs by DAC Funding Ratio:")
print(program_metrics.sort_values('DAC_funding_ratio', ascending=False).head())
Top 5 Programs by Total Investment:
                                                   Total Project Cost  \
                                                                count   
Program Name                                                            
Transit and Intercity Rail Capital Program                        245   
Low Carbon Transit Operations Program                            1003   
Affordable Housing and Sustainable Communities ...                151   
Low Carbon Transportation                                      105965   
Community Air Protection                                         5514   

                                                                  \
                                                             sum   
Program Name                                                       
Transit and Intercity Rail Capital Program          101461750666   
Low Carbon Transit Operations Program                 8777590799   
Affordable Housing and Sustainable Communities ...    8492944393   
Low Carbon Transportation                             3759508172   
Community Air Protection                              1211608583   

                                                                  \
                                                            mean   
Program Name                                                       
Transit and Intercity Rail Capital Program          4.141296e+08   
Low Carbon Transit Operations Program               8.751337e+06   
Affordable Housing and Sustainable Communities ...  5.624466e+07   
Low Carbon Transportation                           3.547877e+04   
Community Air Protection                            2.197331e+05   

                                                   Total Project GHGReductions  \
                                                                           sum   
Program Name                                                                     
Transit and Intercity Rail Capital Program                            23458701   
Low Carbon Transit Operations Program                                  6971510   
Affordable Housing and Sustainable Communities ...                     3590596   
Low Carbon Transportation                                              4306922   
Community Air Protection                                                250801   

                                                              \
                                                        mean   
Program Name                                                   
Transit and Intercity Rail Capital Program          95749.80   
Low Carbon Transit Operations Program                6950.66   
Affordable Housing and Sustainable Communities ...  23778.78   
Low Carbon Transportation                              40.64   
Community Air Protection                               45.48   

                                                   Total GGRFDisadvantaged Community Funding  \
                                                                                         sum   
Program Name                                                                                   
Transit and Intercity Rail Capital Program                                       477700000.0   
Low Carbon Transit Operations Program                                            127826618.0   
Affordable Housing and Sustainable Communities ...                               241985013.0   
Low Carbon Transportation                                                        324003740.0   
Community Air Protection                                                                 0.0   

                                                                \
                                                          mean   
Program Name                                                     
Transit and Intercity Rail Capital Program          9952083.33   
Low Carbon Transit Operations Program                399458.18   
Affordable Housing and Sustainable Communities ...  4033083.55   
Low Carbon Transportation                              7644.66   
Community Air Protection                                   NaN   

                                                   Is Benefit Disadvantaged Communities  \
                                                                                   mean   
Program Name                                                                              
Transit and Intercity Rail Capital Program                                         0.16   
Low Carbon Transit Operations Program                                              0.21   
Affordable Housing and Sustainable Communities ...                                 0.22   
Low Carbon Transportation                                                          0.36   
Community Air Protection                                                           0.00   

                                                   Is Low Income Communities  \
                                                                        mean   
Program Name                                                                   
Transit and Intercity Rail Capital Program                              0.73   
Low Carbon Transit Operations Program                                   0.64   
Affordable Housing and Sustainable Communities ...                      0.60   
Low Carbon Transportation                                               0.59   
Community Air Protection                                                0.87   

                                                   GHG_per_dollar  \
                                                                    
Program Name                                                        
Transit and Intercity Rail Capital Program                 0.0002   
Low Carbon Transit Operations Program                      0.0008   
Affordable Housing and Sustainable Communities ...         0.0004   
Low Carbon Transportation                                  0.0011   
Community Air Protection                                   0.0002   

                                                   DAC_funding_ratio  
                                                                      
Program Name                                                          
Transit and Intercity Rail Capital Program                    0.0047  
Low Carbon Transit Operations Program                         0.0146  
Affordable Housing and Sustainable Communities ...            0.0285  
Low Carbon Transportation                                     0.0862  
Community Air Protection                                      0.0000  

Top 5 Programs by GHG Reduction Efficiency:
                                                   Total Project Cost  \
                                                                count   
Program Name                                                            
Fluorinated Gases Emission Reduction Incentives                    15   
Sustainable Agricultural Lands Conservation Pro...                 96   
Climate Smart Agriculture                                        1688   
Forest Health Program                                             258   
Food Production Investment Program                                 56   

                                                                           \
                                                          sum        mean   
Program Name                                                                
Fluorinated Gases Emission Reduction Incentives             0        0.00   
Sustainable Agricultural Lands Conservation Pro...  212276194  2211210.35   
Climate Smart Agriculture                           886514974   525186.60   
Forest Health Program                               837651776  3246712.31   
Food Production Investment Program                  181509329  3241238.02   

                                                   Total Project GHGReductions  \
                                                                           sum   
Program Name                                                                     
Fluorinated Gases Emission Reduction Incentives                          36715   
Sustainable Agricultural Lands Conservation Pro...                    15080237   
Climate Smart Agriculture                                             24249378   
Forest Health Program                                                 20575142   
Food Production Investment Program                                     2974529   

                                                               \
                                                         mean   
Program Name                                                    
Fluorinated Gases Emission Reduction Incentives       2447.67   
Sustainable Agricultural Lands Conservation Pro...  157085.80   
Climate Smart Agriculture                            14365.75   
Forest Health Program                                79748.61   
Food Production Investment Program                   53116.59   

                                                   Total GGRFDisadvantaged Community Funding  \
                                                                                         sum   
Program Name                                                                                   
Fluorinated Gases Emission Reduction Incentives                                          0.0   
Sustainable Agricultural Lands Conservation Pro...                                 4264000.0   
Climate Smart Agriculture                                                         29472350.0   
Forest Health Program                                                                    0.0   
Food Production Investment Program                                                       0.0   

                                                               \
                                                         mean   
Program Name                                                    
Fluorinated Gases Emission Reduction Incentives           NaN   
Sustainable Agricultural Lands Conservation Pro...  101523.81   
Climate Smart Agriculture                            50814.40   
Forest Health Program                                    0.00   
Food Production Investment Program                        NaN   

                                                   Is Benefit Disadvantaged Communities  \
                                                                                   mean   
Program Name                                                                              
Fluorinated Gases Emission Reduction Incentives                                    0.00   
Sustainable Agricultural Lands Conservation Pro...                                 0.01   
Climate Smart Agriculture                                                          0.12   
Forest Health Program                                                              0.00   
Food Production Investment Program                                                 0.00   

                                                   Is Low Income Communities  \
                                                                        mean   
Program Name                                                                   
Fluorinated Gases Emission Reduction Incentives                         0.60   
Sustainable Agricultural Lands Conservation Pro...                      0.03   
Climate Smart Agriculture                                               0.35   
Forest Health Program                                                   0.30   
Food Production Investment Program                                      0.71   

                                                   GHG_per_dollar  \
                                                                    
Program Name                                                        
Fluorinated Gases Emission Reduction Incentives               inf   
Sustainable Agricultural Lands Conservation Pro...         0.0710   
Climate Smart Agriculture                                  0.0274   
Forest Health Program                                      0.0246   
Food Production Investment Program                         0.0164   

                                                   DAC_funding_ratio  
                                                                      
Program Name                                                          
Fluorinated Gases Emission Reduction Incentives                  NaN  
Sustainable Agricultural Lands Conservation Pro...            0.0201  
Climate Smart Agriculture                                     0.0332  
Forest Health Program                                         0.0000  
Food Production Investment Program                            0.0000  

Top 5 Programs by DAC Funding Ratio:
                                     Total Project Cost             \
                                                  count        sum   
Program Name                                                         
Low-Income Weatherization Program                  6468  307899153   
Urban and Community Forestry Program                967  107767408   
Water-Energy Efficiency                            5111   85125916   
Urban Greening Program                              177  270240730   
Wetlands and Watershed Restoration                   22   86150135   

                                                 Total Project GHGReductions  \
                                            mean                         sum   
Program Name                                                                   
Low-Income Weatherization Program       47603.46                      594567   
Urban and Community Forestry Program   111445.10                      479295   
Water-Energy Efficiency                 16655.43                      425225   
Urban Greening Program                1526783.79                       54285   
Wetlands and Watershed Restoration    3915915.23                      999950   

                                                \
                                          mean   
Program Name                                     
Low-Income Weatherization Program        91.92   
Urban and Community Forestry Program    495.65   
Water-Energy Efficiency                  83.20   
Urban Greening Program                  306.69   
Wetlands and Watershed Restoration    45452.27   

                                     Total GGRFDisadvantaged Community Funding  \
                                                                           sum   
Program Name                                                                     
Low-Income Weatherization Program                                  132666048.0   
Urban and Community Forestry Program                                34138019.0   
Water-Energy Efficiency                                             23286225.0   
Urban Greening Program                                              68717491.0   
Wetlands and Watershed Restoration                                  13382907.0   

                                                  \
                                            mean   
Program Name                                       
Low-Income Weatherization Program       22383.34   
Urban and Community Forestry Program    60635.91   
Water-Energy Efficiency                  4556.10   
Urban Greening Program                 848364.09   
Wetlands and Watershed Restoration    1115242.25   

                                     Is Benefit Disadvantaged Communities  \
                                                                     mean   
Program Name                                                                
Low-Income Weatherization Program                                    0.90   
Urban and Community Forestry Program                                 0.21   
Water-Energy Efficiency                                              0.45   
Urban Greening Program                                               0.19   
Wetlands and Watershed Restoration                                   0.09   

                                     Is Low Income Communities GHG_per_dollar  \
                                                          mean                  
Program Name                                                                    
Low-Income Weatherization Program                         0.97         0.0019   
Urban and Community Forestry Program                      0.09         0.0044   
Water-Energy Efficiency                                   0.00         0.0050   
Urban Greening Program                                    0.27         0.0002   
Wetlands and Watershed Restoration                        0.27         0.0116   

                                     DAC_funding_ratio  
                                                        
Program Name                                            
Low-Income Weatherization Program               0.4309  
Urban and Community Forestry Program            0.3168  
Water-Energy Efficiency                         0.2736  
Urban Greening Program                          0.2543  
Wetlands and Watershed Restoration              0.1553  
In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Let's first get a clear view of our data
print("\nTotal Investment by Program (Top 10):")
investment_by_program = data.groupby('Program Name')['Total Project Cost'].agg(['count', 'sum', 'mean'])
print(investment_by_program.nlargest(10, 'sum'))

# Calculate our key metrics
program_metrics = data.groupby('Program Name').agg({
    'Total Project Cost': ['count', 'sum', 'mean'],
    'Total Project GHGReductions': ['sum', 'mean'],
    'Total GGRFDisadvantaged Community Funding': ['sum', 'mean'],
    'Is Benefit Disadvantaged Communities': 'mean'
}).round(2)

# Add efficiency metrics
program_metrics['GHG_per_dollar'] = (program_metrics[('Total Project GHGReductions', 'sum')] / 
                                   program_metrics[('Total Project Cost', 'sum')]).round(4)

program_metrics['DAC_funding_ratio'] = (program_metrics[('Total GGRFDisadvantaged Community Funding', 'sum')] / 
                                      program_metrics[('Total Project Cost', 'sum')]).round(4)

# Clean data for visualization
clean_data = program_metrics[
    (program_metrics['GHG_per_dollar'] != float('inf')) & 
    (program_metrics['DAC_funding_ratio'].notna())
]

# Create visualization
plt.figure(figsize=(15, 10))

# Basic scatter plot
scatter = plt.scatter(clean_data['DAC_funding_ratio'],
                     clean_data['GHG_per_dollar'],
                     s=clean_data[('Total Project Cost', 'sum')]/1e7,
                     alpha=0.6)

# Add labels for major programs
for idx, row in clean_data.iterrows():
    if row[('Total Project Cost', 'sum')] > 500000000:  # Only label major programs
        plt.annotate(idx[:30] + '...' if len(idx) > 30 else idx,
                    (row['DAC_funding_ratio'], row['GHG_per_dollar']),
                    xytext=(5, 5),
                    textcoords='offset points',
                    fontsize=8)

plt.xlabel('DAC Funding Ratio')
plt.ylabel('GHG Reduction per Dollar')
plt.title('Program Performance: Equity vs. Climate Impact\nSize = Total Investment')

plt.tight_layout()
plt.show()

# Analysis of key metrics
print("\nProgram Performance Analysis:")

# Top programs by equity
print("\n1. Top 5 Programs by DAC Funding Ratio:")
print(clean_data.nlargest(5, 'DAC_funding_ratio')[
    ['DAC_funding_ratio', 'GHG_per_dollar', ('Total Project Cost', 'sum')]
])

# Top programs by GHG efficiency
print("\n2. Top 5 Programs by GHG Reduction Efficiency:")
print(clean_data.nlargest(5, 'GHG_per_dollar')[
    ['DAC_funding_ratio', 'GHG_per_dollar', ('Total Project Cost', 'sum')]
])

# Investment distribution
print("\n3. Investment Distribution Summary:")
print(clean_data[('Total Project Cost', 'sum')].describe())
Total Investment by Program (Top 10):
                                                     count           sum  \
Program Name                                                               
Transit and Intercity Rail Capital Program             245  101461750666   
Low Carbon Transit Operations Program                 1003    8777590799   
Affordable Housing and Sustainable Communities ...     151    8492944393   
Low Carbon Transportation                           105965    3759508172   
Community Air Protection                              5514    1211608583   
Climate Smart Agriculture                             1688     886514974   
Forest Health Program                                  258     837651776   
Funding Agricultural Replacement Measures for E...    8554     771297976   
Waste Diversion                                        265     643549843   
Fire Prevention Program                                600     614281793   

                                                            mean  
Program Name                                                      
Transit and Intercity Rail Capital Program          4.141296e+08  
Low Carbon Transit Operations Program               8.751337e+06  
Affordable Housing and Sustainable Communities ...  5.624466e+07  
Low Carbon Transportation                           3.547877e+04  
Community Air Protection                            2.197331e+05  
Climate Smart Agriculture                           5.251866e+05  
Forest Health Program                               3.246712e+06  
Funding Agricultural Replacement Measures for E...  9.016811e+04  
Waste Diversion                                     2.428490e+06  
Fire Prevention Program                             1.023803e+06  
/home/dadams/Repos/california_equity_git/.venv/lib/python3.12/site-packages/matplotlib/text.py:1465: FutureWarning: Calling float on a single element Series is deprecated and will raise a TypeError in the future. Use float(ser.iloc[0]) instead
  x = float(self.convert_xunits(x))
/home/dadams/Repos/california_equity_git/.venv/lib/python3.12/site-packages/matplotlib/text.py:1467: FutureWarning: Calling float on a single element Series is deprecated and will raise a TypeError in the future. Use float(ser.iloc[0]) instead
  y = float(self.convert_yunits(y))
No description has been provided for this image
Program Performance Analysis:

1. Top 5 Programs by DAC Funding Ratio:
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
Cell In[27], line 62
     60 # Top programs by equity
     61 print("\n1. Top 5 Programs by DAC Funding Ratio:")
---> 62 print(clean_data.nlargest(5, 'DAC_funding_ratio')[
     63     ['DAC_funding_ratio', 'GHG_per_dollar', ('Total Project Cost', 'sum')]
     64 ])
     66 # Top programs by GHG efficiency
     67 print("\n2. Top 5 Programs by GHG Reduction Efficiency:")

File ~/Repos/california_equity_git/.venv/lib/python3.12/site-packages/pandas/core/frame.py:4108, in DataFrame.__getitem__(self, key)
   4106     if is_iterator(key):
   4107         key = list(key)
-> 4108     indexer = self.columns._get_indexer_strict(key, "columns")[1]
   4110 # take() does not accept boolean indexers
   4111 if getattr(indexer, "dtype", None) == bool:

File ~/Repos/california_equity_git/.venv/lib/python3.12/site-packages/pandas/core/indexes/multi.py:2763, in MultiIndex._get_indexer_strict(self, key, axis_name)
   2760 if len(keyarr) and not isinstance(keyarr[0], tuple):
   2761     indexer = self._get_indexer_level_0(keyarr)
-> 2763     self._raise_if_missing(key, indexer, axis_name)
   2764     return self[indexer], indexer
   2766 return super()._get_indexer_strict(key, axis_name)

File ~/Repos/california_equity_git/.venv/lib/python3.12/site-packages/pandas/core/indexes/multi.py:2781, in MultiIndex._raise_if_missing(self, key, indexer, axis_name)
   2779 cmask = check == -1
   2780 if cmask.any():
-> 2781     raise KeyError(f"{keyarr[cmask]} not in index")
   2782 # We get here when levels still contain values which are not
   2783 # actually in Index anymore
   2784 raise KeyError(f"{keyarr} not in index")

KeyError: "[('Total Project Cost', 'sum')] not in index"
In [23]:
# install sklearn library
!pip install scikit-learn
Collecting scikit-learn
  Using cached scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Requirement already satisfied: numpy>=1.19.5 in ./.venv/lib/python3.12/site-packages (from scikit-learn) (2.1.2)
Requirement already satisfied: scipy>=1.6.0 in ./.venv/lib/python3.12/site-packages (from scikit-learn) (1.14.1)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.9 MB)
Using cached joblib-1.4.2-py3-none-any.whl (301 kB)
Using cached threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, joblib, scikit-learn
Successfully installed joblib-1.4.2 scikit-learn-1.5.2 threadpoolctl-3.5.0
In [26]:
# Let's first look at the raw total investment amounts
print("Raw Investment Totals by Program:")
print(clean_data[('Total Project Cost', 'sum')].sort_values(ascending=False).head(10))

# Modified analysis with corrected calculations
def create_program_typology_v2(df):
    # Calculate thresholds
    ghg_med = df['GHG_per_dollar'].median()
    dac_med = df['DAC_funding_ratio'].median()
    size_med = df[('Total Project Cost', 'sum')].median()
    
    # Create categories with corrected investment calculations
    conditions = [
        (df['GHG_per_dollar'] > ghg_med) & (df['DAC_funding_ratio'] > dac_med),
        (df['GHG_per_dollar'] > ghg_med) & (df['DAC_funding_ratio'] <= dac_med),
        (df['GHG_per_dollar'] <= ghg_med) & (df['DAC_funding_ratio'] > dac_med),
        (df['GHG_per_dollar'] <= ghg_med) & (df['DAC_funding_ratio'] <= dac_med)
    ]
    
    choices = [
        'High Performer (Both)',
        'GHG Efficient',
        'Equity Focused',
        'Below Median'
    ]
    
    df['Performance_Type'] = np.select(conditions, choices, default='Other')
    df['Scale_Type'] = np.where(df[('Total Project Cost', 'sum')] > size_med, 
                               'Large Scale', 
                               'Small Scale')
    
    return df

# Rerun analysis with corrected calculations
analysis_data_v2 = clean_data.copy()
analysis_data_v2 = create_program_typology_v2(analysis_data_v2)

# Summary statistics with corrected investment amounts
print("\nRevised Program Type Characteristics:")
for ptype in analysis_data_v2['Performance_Type'].unique():
    subset = analysis_data_v2[analysis_data_v2['Performance_Type'] == ptype]
    print(f"\n{ptype}:")
    print(f"Number of programs: {len(subset)}")
    print(f"Average GHG efficiency: {subset['GHG_per_dollar'].mean():.4f}")
    print(f"Average DAC ratio: {subset['DAC_funding_ratio'].mean():.4f}")
    print(f"Average project cost: ${subset[('Total Project Cost', 'mean')].mean():,.2f}")
    print(f"Largest program: ${subset[('Total Project Cost', 'sum')].max():,.2f}")
    print("\nExample programs:")
    print(subset.index.tolist()[:3])

# Create visualization with corrected data
plt.figure(figsize=(15, 10))

# Plot each program type with different colors and corrected sizes
for ptype in analysis_data_v2['Performance_Type'].unique():
    mask = analysis_data_v2['Performance_Type'] == ptype
    subset = analysis_data_v2[mask]
    
    plt.scatter(subset['DAC_funding_ratio'],
               subset['GHG_per_dollar'],
               s=subset[('Total Project Cost', 'mean')]/1e5,  # Adjusted scaling factor
               alpha=0.6,
               label=ptype)

plt.xlabel('DAC Funding Ratio')
plt.ylabel('GHG Reduction per Dollar')
plt.title('Program Performance Types (Corrected Investment Scaling)')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

# Add annotations for major programs
for idx, row in analysis_data_v2.iterrows():
    if row[('Total Project Cost', 'mean')] > 1e6:  # Adjust threshold as needed
        plt.annotate(idx[:30] + '...' if len(idx) > 30 else idx,
                    (row['DAC_funding_ratio'], row['GHG_per_dollar']),
                    xytext=(5, 5),
                    textcoords='offset points',
                    fontsize=8)

plt.tight_layout()
plt.show()
Raw Investment Totals by Program:
Program Name
Transit and Intercity Rail Capital Program                                   101461750666
Low Carbon Transit Operations Program                                          8777590799
Affordable Housing and Sustainable Communities Program                         8492944393
Low Carbon Transportation                                                      3759508172
Community Air Protection                                                       1211608583
Climate Smart Agriculture                                                       886514974
Forest Health Program                                                           837651776
Funding Agricultural Replacement Measures for Emission Reductions Program       771297976
Waste Diversion                                                                 643549843
Fire Prevention Program                                                         614281793
Name: (Total Project Cost, sum), dtype: int64

Revised Program Type Characteristics:

Below Median:
Number of programs: 16
Average GHG efficiency: 0.0000
Average DAC ratio: 0.0000
Average project cost: $1,352,617.47
Largest program: $1,211,608,583.00

Example programs:
['Active Transportation Program', 'Climate Change Research Program', 'Climate Ready Program ']

High Performer (Both):
Number of programs: 10
Average GHG efficiency: 0.0129
Average DAC ratio: 0.1410
Average project cost: $7,428,798.66
Largest program: $8,777,590,799.00

Example programs:
['Affordable Housing and Sustainable Communities Program', 'Climate Smart Agriculture', 'Low Carbon Transit Operations Program']

GHG Efficient:
Number of programs: 9
Average GHG efficiency: 0.0089
Average DAC ratio: 0.0000
Average project cost: $1,967,095.81
Largest program: $837,651,776.00

Example programs:
['Climate Adaptation and Resiliency Program', 'Food Production Investment Program', 'Forest Health Program']

Equity Focused:
Number of programs: 3
Average GHG efficiency: 0.0001
Average DAC ratio: 0.1038
Average project cost: $138,638,075.83
Largest program: $101,461,750,666.00

Example programs:
['Community Assistance for Climate Equity Program', 'Transit and Intercity Rail Capital Program', 'Urban Greening Program']
/home/dadams/Repos/california_equity_git/.venv/lib/python3.12/site-packages/matplotlib/text.py:1465: FutureWarning: Calling float on a single element Series is deprecated and will raise a TypeError in the future. Use float(ser.iloc[0]) instead
  x = float(self.convert_xunits(x))
/home/dadams/Repos/california_equity_git/.venv/lib/python3.12/site-packages/matplotlib/text.py:1467: FutureWarning: Calling float on a single element Series is deprecated and will raise a TypeError in the future. Use float(ser.iloc[0]) instead
  y = float(self.convert_yunits(y))
No description has been provided for this image
In [28]:
# Let's create a summary visualization that highlights our main findings
plt.figure(figsize=(15, 12))

# Create four subplots
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(20, 16))

# 1. Program Scale Distribution (Top 10 by Investment)
top_10_programs = clean_data[('Total Project Cost', 'sum')].nlargest(10)
ax1.barh(range(len(top_10_programs)), top_10_programs/1e9)  # Convert to billions
ax1.set_yticks(range(len(top_10_programs)))
ax1.set_yticklabels([name[:30] + '...' if len(name) > 30 else name for name in top_10_programs.index])
ax1.set_xlabel('Total Investment (Billions $)')
ax1.set_title('Top 10 Programs by Investment')

# 2. GHG Efficiency vs Project Size
ax2.scatter(clean_data[('Total Project Cost', 'mean')]/1e6, 
           clean_data['GHG_per_dollar'],
           alpha=0.6)
ax2.set_xlabel('Average Project Size (Millions $)')
ax2.set_ylabel('GHG Reduction per Dollar')
ax2.set_title('GHG Efficiency vs Project Size')

# 3. DAC Funding Ratio vs Program Size
ax3.scatter(clean_data[('Total Project Cost', 'sum')]/1e9,
           clean_data['DAC_funding_ratio'],
           alpha=0.6)
ax3.set_xlabel('Total Program Investment (Billions $)')
ax3.set_ylabel('DAC Funding Ratio')
ax3.set_title('Equity vs Program Size')

# 4. Project Count vs Average Size
ax4.scatter(clean_data[('Total Project Cost', 'count')],
           clean_data[('Total Project Cost', 'mean')]/1e6,
           alpha=0.6)
ax4.set_xlabel('Number of Projects')
ax4.set_ylabel('Average Project Size (Millions $)')
ax4.set_title('Project Count vs Average Size')

plt.tight_layout()
plt.show()
<Figure size 1500x1200 with 0 Axes>
No description has been provided for this image
In [36]:
# install seaborn library
!pip install seaborn
Requirement already satisfied: seaborn in ./.venv/lib/python3.12/site-packages (0.13.2)
Requirement already satisfied: numpy!=1.24.0,>=1.20 in ./.venv/lib/python3.12/site-packages (from seaborn) (2.1.2)
Requirement already satisfied: pandas>=1.2 in ./.venv/lib/python3.12/site-packages (from seaborn) (2.2.3)
Requirement already satisfied: matplotlib!=3.6.1,>=3.4 in ./.venv/lib/python3.12/site-packages (from seaborn) (3.9.2)
Requirement already satisfied: contourpy>=1.0.1 in ./.venv/lib/python3.12/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (1.3.0)
Requirement already satisfied: cycler>=0.10 in ./.venv/lib/python3.12/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (0.12.1)
Requirement already satisfied: fonttools>=4.22.0 in ./.venv/lib/python3.12/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (4.54.1)
Requirement already satisfied: kiwisolver>=1.3.1 in ./.venv/lib/python3.12/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (1.4.7)
Requirement already satisfied: packaging>=20.0 in ./.venv/lib/python3.12/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (24.1)
Requirement already satisfied: pillow>=8 in ./.venv/lib/python3.12/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (11.0.0)
Requirement already satisfied: pyparsing>=2.3.1 in ./.venv/lib/python3.12/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (3.2.0)
Requirement already satisfied: python-dateutil>=2.7 in ./.venv/lib/python3.12/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (2.9.0.post0)
Requirement already satisfied: pytz>=2020.1 in ./.venv/lib/python3.12/site-packages (from pandas>=1.2->seaborn) (2024.2)
Requirement already satisfied: tzdata>=2022.7 in ./.venv/lib/python3.12/site-packages (from pandas>=1.2->seaborn) (2024.2)
Requirement already satisfied: six>=1.5 in ./.venv/lib/python3.12/site-packages (from python-dateutil>=2.7->matplotlib!=3.6.1,>=3.4->seaborn) (1.16.0)
In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Set up the figure with a clean, modern style
plt.rcParams['font.family'] = 'DejaVu Sans'
plt.rcParams['axes.linewidth'] = 1.0
plt.rcParams['axes.grid'] = True
plt.rcParams['grid.alpha'] = 0.3

fig = plt.figure(figsize=(20, 15))

# 1. Project Volume vs Investment (Enhanced)
ax1 = plt.subplot(2, 2, 1)
scatter = plt.scatter(clean_data[('Total Project Cost', 'count')],
                     clean_data[('Total Project Cost', 'mean')]/1e6,
                     s=clean_data[('Total Project Cost', 'sum')]/1e8,
                     alpha=0.7,
                     c=range(len(clean_data)),
                     cmap='viridis')
ax1.set_xlabel('Number of Projects', fontsize=12, fontweight='bold')
ax1.set_ylabel('Average Project Size (Millions $)', fontsize=12, fontweight='bold')
ax1.set_title('Program Scale Analysis\nBubble size represents total investment', 
              fontsize=14, pad=20, fontweight='bold')

# Add annotations for notable programs
for idx, row in clean_data.iterrows():
    if row[('Total Project Cost', 'sum')] > 5e9:
        plt.annotate(idx[:30],
                    (row[('Total Project Cost', 'count')], 
                     row[('Total Project Cost', 'mean')]/1e6),
                    xytext=(5, 5), textcoords='offset points',
                    fontsize=8,
                    bbox=dict(facecolor='white', edgecolor='none', alpha=0.8))

# 2. Program Size Distribution (Enhanced)
ax2 = plt.subplot(2, 2, 2)
sizes = clean_data[('Total Project Cost', 'sum')]/1e6
plt.hist(np.log10(sizes[sizes > 0]), 
         bins=20, 
         color='dodgerblue',
         edgecolor='white',
         alpha=0.7)
ax2.set_xlabel('Log10(Total Program Size) in Millions $', fontsize=12, fontweight='bold')
ax2.set_ylabel('Count', fontsize=12, fontweight='bold')
ax2.set_title('Distribution of Program Sizes\nNatural groupings by investment scale', 
              fontsize=14, pad=20, fontweight='bold')

# 3. Top 10 Programs (Enhanced)
ax3 = plt.subplot(2, 2, 3)
top_10 = clean_data[('Total Project Cost', 'sum')].nlargest(10)/1e9
colors = plt.cm.viridis(np.linspace(0, 0.8, len(top_10)))
bars = ax3.barh(range(len(top_10)), top_10, color=colors)
ax3.set_yticks(range(len(top_10)))
ax3.set_yticklabels([name[:40] + '...' if len(name) > 40 else name 
                     for name in top_10.index],
                    fontsize=8)
ax3.set_xlabel('Total Investment (Billions $)', fontsize=12, fontweight='bold')
ax3.set_title('Top 10 Programs by Investment\nHighlighting investment concentration', 
              fontsize=14, pad=20, fontweight='bold')

# Add value labels to bars
for i, bar in enumerate(bars):
    width = bar.get_width()
    ax3.text(width, bar.get_y() + bar.get_height()/2,
             f'${width:.1f}B',
             ha='left', va='center', fontsize=8,
             bbox=dict(facecolor='white', edgecolor='none', alpha=0.8))

# 4. Geographic Distribution (Enhanced)
ax4 = plt.subplot(2, 2, 4)
top_15_counties = county_analysis['total_investment'].nlargest(15)/1e6
colors = plt.cm.viridis(np.linspace(0, 0.8, len(top_15_counties)))
bars = ax4.barh(range(len(top_15_counties)), top_15_counties, color=colors)
ax4.set_yticks(range(len(top_15_counties)))
ax4.set_yticklabels(top_15_counties.index, fontsize=8)
ax4.set_xlabel('Total Investment (Millions $)', fontsize=12, fontweight='bold')
ax4.set_title('Investment by County\nTop 15 counties by total investment', 
              fontsize=14, pad=20, fontweight='bold')

# Add value labels
for i, bar in enumerate(bars):
    width = bar.get_width()
    ax4.text(width, bar.get_y() + bar.get_height()/2,
             f'${width:.0f}M',
             ha='left', va='center', fontsize=8,
             bbox=dict(facecolor='white', edgecolor='none', alpha=0.8))

plt.tight_layout()
plt.show()

# Print summary statistics for context
print("\nProgram Scale Categories:")
quartiles = np.percentile(sizes[sizes > 0], [25, 50, 75])
print("\nInvestment Scale Categories (in Millions $):")
print(f"Small Programs: < ${quartiles[0]:.1f}M")
print(f"Medium Programs: ${quartiles[0]:.1f}M - ${quartiles[1]:.1f}M")
print(f"Large Programs: ${quartiles[1]:.1f}M - ${quartiles[2]:.1f}M")
print(f"Mega Programs: > ${quartiles[2]:.1f}M")
No description has been provided for this image
Program Scale Categories:

Investment Scale Categories (in Millions $):
Small Programs: < $20.0M
Medium Programs: $20.0M - $97.0M
Large Programs: $97.0M - $587.9M
Mega Programs: > $587.9M
In [39]:
# First, let's prepare our county-level metrics
county_analysis = data.groupby('County').agg({
    'Total Project Cost': ['count', 'sum', 'mean'],
    'Total Project GHGReductions': 'sum',
    'Is Benefit Disadvantaged Communities': 'mean',
    'Is Low Income Communities': 'mean'
})

# Flatten column names
county_analysis.columns = [
    'project_count', 'total_investment', 'avg_project_size',
    'total_ghg_reduction', 'dac_rate', 'low_income_rate'
]

# Calculate per-project metrics
county_analysis['ghg_per_dollar'] = county_analysis['total_ghg_reduction'] / county_analysis['total_investment']
county_analysis['investment_per_project'] = county_analysis['total_investment'] / county_analysis['project_count']

# Create visualization of spatial patterns
fig = plt.figure(figsize=(20, 15))

# 1. Investment Distribution
plt.subplot(2, 2, 1)
top_15 = county_analysis['total_investment'].sort_values(ascending=True).tail(15)/1e6
plt.barh(range(len(top_15)), top_15, color='dodgerblue', alpha=0.7)
plt.yticks(range(len(top_15)), top_15.index)
plt.xlabel('Total Investment (Millions $)')
plt.title('Top 15 Counties by Total Investment', fontsize=12, pad=15)

# 2. Project Count Distribution
plt.subplot(2, 2, 2)
top_15_count = county_analysis['project_count'].sort_values(ascending=True).tail(15)
plt.barh(range(len(top_15_count)), top_15_count, color='dodgerblue', alpha=0.7)
plt.yticks(range(len(top_15_count)), top_15_count.index)
plt.xlabel('Number of Projects')
plt.title('Top 15 Counties by Project Count', fontsize=12, pad=15)

# 3. DAC Benefit Rate
plt.subplot(2, 2, 3)
top_15_dac = county_analysis['dac_rate'].sort_values(ascending=True).tail(15)
plt.barh(range(len(top_15_dac)), top_15_dac*100, color='dodgerblue', alpha=0.7)
plt.yticks(range(len(top_15_dac)), top_15_dac.index)
plt.xlabel('Percentage of Projects Benefiting DACs')
plt.title('Top 15 Counties by DAC Benefit Rate', fontsize=12, pad=15)

# 4. GHG Efficiency
plt.subplot(2, 2, 4)
top_15_ghg = county_analysis['ghg_per_dollar'].sort_values(ascending=True).tail(15)
plt.barh(range(len(top_15_ghg)), top_15_ghg, color='dodgerblue', alpha=0.7)
plt.yticks(range(len(top_15_ghg)), top_15_ghg.index)
plt.xlabel('GHG Reduction per Dollar')
plt.title('Top 15 Counties by GHG Efficiency', fontsize=12, pad=15)

plt.tight_layout()
plt.show()

# Print detailed analysis
print("\nSpatial Distribution Analysis:")
print("\n1. Investment Concentration:")
investment_share = (county_analysis['total_investment'].nlargest(5).sum() / 
                   county_analysis['total_investment'].sum() * 100)
print(f"Top 5 counties account for {investment_share:.1f}% of total investment")

print("\n2. Project Distribution:")
project_share = (county_analysis['project_count'].nlargest(5).sum() / 
                county_analysis['project_count'].sum() * 100)
print(f"Top 5 counties account for {project_share:.1f}% of all projects")

print("\n3. Regional Equity Analysis:")
print("\nDAC Benefit Rate by Region:")
print(county_analysis['dac_rate'].describe().round(3))

print("\n4. Project Type Distribution:")
# Look at program types in top 5 counties
top_5_counties = county_analysis['total_investment'].nlargest(5).index
for county in top_5_counties:
    print(f"\n{county} Program Distribution:")
    county_programs = data[data['County'] == county]['Program Name'].value_counts().head(3)
    print(county_programs)
No description has been provided for this image
Spatial Distribution Analysis:

1. Investment Concentration:
Top 5 counties account for 75.6% of total investment

2. Project Distribution:
Top 5 counties account for 51.1% of all projects

3. Regional Equity Analysis:

DAC Benefit Rate by Region:
count    299.000
mean       0.149
std        0.297
min        0.000
25%        0.000
50%        0.000
75%        0.125
max        1.000
Name: dac_rate, dtype: float64

4. Project Type Distribution:

Los Angeles Program Distribution:
Program Name
Low Carbon Transportation            32598
Low-Income Weatherization Program     2233
Water-Energy Efficiency               1945
Name: count, dtype: int64

Alameda Program Distribution:
Program Name
Low Carbon Transportation    5677
Water-Energy Efficiency       244
Community Air Protection      223
Name: count, dtype: int64

Alameda, Contra Costa, Fresno, Kern, Kings, Madera, Merced, Sacramento, San Joaquin, Santa Clara, Stanislaus, Tulare Program Distribution:
Program Name
Transit and Intercity Rail Capital Program    9
Name: count, dtype: int64

Sacramento Program Distribution:
Program Name
Low Carbon Transportation            3556
Water-Energy Efficiency               422
Low-Income Weatherization Program     297
Name: count, dtype: int64

San Diego Program Distribution:
Program Name
Low Carbon Transportation            7865
Community Air Protection              105
Low-Income Weatherization Program      83
Name: count, dtype: int64
In [40]:
# First, let's identify multi-county projects
def is_multi_county(county):
    return ',' in str(county)

# Create multi-county analysis dataframe
multi_county_data = data[data['County'].apply(is_multi_county)].copy()

# Analyze these collaborations
print("\nMulti-County Collaboration Analysis:")
print(f"\nTotal number of multi-county projects: {len(multi_county_data)}")
print(f"Total investment in multi-county projects: ${multi_county_data['Total Project Cost'].sum()/1e9:.2f}B")

# Look at the specific collaborations
print("\nMajor Multi-County Collaborations:")
collaboration_summary = multi_county_data.groupby('County').agg({
    'Total Project Cost': ['count', 'sum', 'mean'],
    'Total Project GHGReductions': 'sum',
    'Is Benefit Disadvantaged Communities': 'mean',
    'Total GGRFDisadvantaged Community Funding': 'sum'
}).round(2)

# Flatten column names
collaboration_summary.columns = [
    'project_count', 'total_investment', 'avg_project_size',
    'total_ghg_reduction', 'dac_benefit_rate', 'dac_funding'
]

# Calculate efficiency metrics
collaboration_summary['ghg_per_dollar'] = collaboration_summary['total_ghg_reduction'] / collaboration_summary['total_investment']
collaboration_summary['dac_funding_ratio'] = collaboration_summary['dac_funding'] / collaboration_summary['total_investment']

# Sort by total investment
print(collaboration_summary.sort_values('total_investment', ascending=False).head(10))

# Create visualizations
plt.figure(figsize=(20, 12))

# 1. Project Size Comparison
plt.subplot(2, 2, 1)
avg_size = data['Total Project Cost'].mean()
multi_avg_size = multi_county_data['Total Project Cost'].mean()

plt.bar(['Single-County Projects', 'Multi-County Projects'], 
       [avg_size/1e6, multi_avg_size/1e6],
       color=['lightblue', 'darkblue'])
plt.ylabel('Average Project Size (Millions $)')
plt.title('Project Size Comparison')

# 2. GHG Efficiency Comparison
plt.subplot(2, 2, 2)
single_ghg_efficiency = (data[~data['County'].apply(is_multi_county)]['Total Project GHGReductions'].sum() / 
                        data[~data['County'].apply(is_multi_county)]['Total Project Cost'].sum())
multi_ghg_efficiency = (multi_county_data['Total Project GHGReductions'].sum() / 
                       multi_county_data['Total Project Cost'].sum())

plt.bar(['Single-County Projects', 'Multi-County Projects'],
       [single_ghg_efficiency, multi_ghg_efficiency],
       color=['lightblue', 'darkblue'])
plt.ylabel('GHG Reduction per Dollar')
plt.title('GHG Efficiency Comparison')

# 3. Top Multi-County Collaborations
plt.subplot(2, 2, 3)
top_collaborations = collaboration_summary.sort_values('total_investment', ascending=True).tail(5)
plt.barh(range(len(top_collaborations)), top_collaborations['total_investment']/1e9)
plt.yticks(range(len(top_collaborations)), 
          [name[:50] + '...' if len(name) > 50 else name for name in top_collaborations.index])
plt.xlabel('Total Investment (Billions $)')
plt.title('Top 5 Multi-County Collaborations by Investment')

# 4. DAC Benefit Rate Comparison
plt.subplot(2, 2, 4)
plt.bar(['Single-County Projects', 'Multi-County Projects'],
       [data[~data['County'].apply(is_multi_county)]['Is Benefit Disadvantaged Communities'].mean(),
        multi_county_data['Is Benefit Disadvantaged Communities'].mean()],
       color=['lightblue', 'darkblue'])
plt.ylabel('Average DAC Benefit Rate')
plt.title('DAC Benefit Rate Comparison')

plt.tight_layout()
plt.show()

# Analyze types of projects that work well in collaboration
print("\nMost Common Types of Multi-County Projects:")
print(multi_county_data['Program Name'].value_counts().head())

# Look at success factors
print("\nSuccess Metrics for Multi-County Projects:")
print("\nDAC Benefit Rate Distribution:")
print(multi_county_data['Is Benefit Disadvantaged Communities'].describe())

print("\nGHG Reduction Distribution (tons per $):")
multi_county_data['ghg_efficiency'] = multi_county_data['Total Project GHGReductions'] / multi_county_data['Total Project Cost']
print(multi_county_data['ghg_efficiency'].describe())
Multi-County Collaboration Analysis:

Total number of multi-county projects: 874
Total investment in multi-county projects: $18.31B

Major Multi-County Collaborations:
                                                    project_count  \
County                                                              
Alameda, Contra Costa, Fresno, Kern, Kings, Mad...              9   
San Francisco, Santa Clara                                      6   
San Diego, Santa Barbara, Ventura                              23   
San Francisco, San Mateo, Santa Clara                           3   
Los Angeles, Ventura                                            3   
Marin, Sonoma                                                  15   
Los Angeles, San Bernardino                                    29   
Humboldt, Mendocino                                             5   
Contra Costa, San Francisco                                     3   
Los Angeles, San Diego, San Luis Obispo, Santa ...              2   

                                                    total_investment  \
County                                                                 
Alameda, Contra Costa, Fresno, Kern, Kings, Mad...        7333200000   
San Francisco, Santa Clara                                2998442533   
San Diego, Santa Barbara, Ventura                         2235315130   
San Francisco, San Mateo, Santa Clara                     2145553263   
Los Angeles, Ventura                                       877252312   
Marin, Sonoma                                              363189831   
Los Angeles, San Bernardino                                177831134   
Humboldt, Mendocino                                        155426149   
Contra Costa, San Francisco                                135489931   
Los Angeles, San Diego, San Luis Obispo, Santa ...         130936629   

                                                    avg_project_size  \
County                                                                 
Alameda, Contra Costa, Fresno, Kern, Kings, Mad...      8.148000e+08   
San Francisco, Santa Clara                              4.997404e+08   
San Diego, Santa Barbara, Ventura                       9.718761e+07   
San Francisco, San Mateo, Santa Clara                   7.151844e+08   
Los Angeles, Ventura                                    2.924174e+08   
Marin, Sonoma                                           2.421266e+07   
Los Angeles, San Bernardino                             6.132108e+06   
Humboldt, Mendocino                                     3.108523e+07   
Contra Costa, San Francisco                             4.516331e+07   
Los Angeles, San Diego, San Luis Obispo, Santa ...      6.546831e+07   

                                                    total_ghg_reduction  \
County                                                                    
Alameda, Contra Costa, Fresno, Kern, Kings, Mad...               546406   
San Francisco, Santa Clara                                      1538067   
San Diego, Santa Barbara, Ventura                                791432   
San Francisco, San Mateo, Santa Clara                           2100845   
Los Angeles, Ventura                                              74882   
Marin, Sonoma                                                    234508   
Los Angeles, San Bernardino                                       22220   
Humboldt, Mendocino                                               24160   
Contra Costa, San Francisco                                       60369   
Los Angeles, San Diego, San Luis Obispo, Santa ...               104667   

                                                    dac_benefit_rate  \
County                                                                 
Alameda, Contra Costa, Fresno, Kern, Kings, Mad...              0.00   
San Francisco, Santa Clara                                      0.17   
San Diego, Santa Barbara, Ventura                               0.00   
San Francisco, San Mateo, Santa Clara                           0.33   
Los Angeles, Ventura                                            0.00   
Marin, Sonoma                                                   0.00   
Los Angeles, San Bernardino                                     0.48   
Humboldt, Mendocino                                             0.00   
Contra Costa, San Francisco                                     0.33   
Los Angeles, San Diego, San Luis Obispo, Santa ...              0.00   

                                                    dac_funding  \
County                                                            
Alameda, Contra Costa, Fresno, Kern, Kings, Mad...          0.0   
San Francisco, Santa Clara                           20000000.0   
San Diego, Santa Barbara, Ventura                           0.0   
San Francisco, San Mateo, Santa Clara                  935322.0   
Los Angeles, Ventura                                        0.0   
Marin, Sonoma                                               0.0   
Los Angeles, San Bernardino                          10812422.0   
Humboldt, Mendocino                                         0.0   
Contra Costa, San Francisco                            631879.0   
Los Angeles, San Diego, San Luis Obispo, Santa ...          0.0   

                                                    ghg_per_dollar  \
County                                                               
Alameda, Contra Costa, Fresno, Kern, Kings, Mad...        0.000075   
San Francisco, Santa Clara                                0.000513   
San Diego, Santa Barbara, Ventura                         0.000354   
San Francisco, San Mateo, Santa Clara                     0.000979   
Los Angeles, Ventura                                      0.000085   
Marin, Sonoma                                             0.000646   
Los Angeles, San Bernardino                               0.000125   
Humboldt, Mendocino                                       0.000155   
Contra Costa, San Francisco                               0.000446   
Los Angeles, San Diego, San Luis Obispo, Santa ...        0.000799   

                                                    dac_funding_ratio  
County                                                                 
Alameda, Contra Costa, Fresno, Kern, Kings, Mad...           0.000000  
San Francisco, Santa Clara                                   0.006670  
San Diego, Santa Barbara, Ventura                            0.000000  
San Francisco, San Mateo, Santa Clara                        0.000436  
Los Angeles, Ventura                                         0.000000  
Marin, Sonoma                                                0.000000  
Los Angeles, San Bernardino                                  0.060802  
Humboldt, Mendocino                                          0.000000  
Contra Costa, San Francisco                                  0.004664  
Los Angeles, San Diego, San Luis Obispo, Santa ...           0.000000  
No description has been provided for this image
Most Common Types of Multi-County Projects:
Program Name
Urban and Community Forestry Program          333
Low Carbon Transit Operations Program         189
Transit and Intercity Rail Capital Program     87
Safe and Affordable Drinking Water Fund        47
Climate Smart Agriculture                      43
Name: count, dtype: int64

Success Metrics for Multi-County Projects:

DAC Benefit Rate Distribution:
count       874
unique        2
top       False
freq        756
Name: Is Benefit Disadvantaged Communities, dtype: object

GHG Reduction Distribution (tons per $):
count    578.000000
mean       0.003167
std        0.017891
min       -0.002485
25%        0.000000
50%        0.000220
75%        0.002056
max        0.400316
Name: ghg_efficiency, dtype: float64
In [41]:
# 1. Deep Dive into Successful Collaborations
# Let's create a success metric combining equity and efficiency
def calculate_success_score(row):
    # Normalize GHG efficiency and DAC rates to 0-1 scale
    ghg_norm = (row['ghg_per_dollar'] - min_ghg) / (max_ghg - min_ghg)
    dac_norm = row['dac_benefit_rate']
    return (ghg_norm + dac_norm) / 2

# Add success scores
collaboration_success = collaboration_summary.copy()
min_ghg = collaboration_summary['ghg_per_dollar'].min()
max_ghg = collaboration_summary['ghg_per_dollar'].max()
collaboration_success['success_score'] = collaboration_summary.apply(calculate_success_score, axis=1)

# Create visualization of success patterns
plt.figure(figsize=(20, 15))

# 1. Success Stories Matrix
plt.subplot(2, 2, 1)
plt.scatter(collaboration_success['ghg_per_dollar'], 
           collaboration_success['dac_benefit_rate'],
           s=collaboration_success['total_investment']/1e7,
           alpha=0.6)
plt.xlabel('GHG Efficiency (Reduction per Dollar)')
plt.ylabel('DAC Benefit Rate')
plt.title('Multi-County Collaboration Performance Matrix\nSize = Total Investment')

# 2. Program Type Analysis
program_type_analysis = multi_county_data.groupby('Program Name').agg({
    'Total Project Cost': ['count', 'sum', 'mean'],
    'Total Project GHGReductions': 'sum',
    'Is Benefit Disadvantaged Communities': 'mean'
}).round(2)

program_type_analysis.columns = ['project_count', 'total_investment', 
                               'avg_project_size', 'total_ghg', 'dac_rate']
program_type_analysis['ghg_efficiency'] = program_type_analysis['total_ghg'] / program_type_analysis['total_investment']

# Plot program type performance
plt.subplot(2, 2, 2)
top_programs = program_type_analysis.nlargest(5, 'total_investment')
plt.bar(range(len(top_programs)), top_programs['ghg_efficiency'])
plt.xticks(range(len(top_programs)), 
          [name[:20] + '...' if len(name) > 20 else name for name in top_programs.index],
          rotation=45)
plt.title('GHG Efficiency by Program Type')

# 3. Partnership Size Analysis
plt.subplot(2, 2, 3)
partnership_sizes = multi_county_data['County'].str.count(',') + 1
size_performance = pd.DataFrame({
    'partner_count': partnership_sizes,
    'ghg_efficiency': multi_county_data['Total Project GHGReductions'] / multi_county_data['Total Project Cost'],
    'dac_rate': multi_county_data['Is Benefit Disadvantaged Communities']
})

avg_by_size = size_performance.groupby('partner_count').mean()
plt.plot(avg_by_size.index, avg_by_size['ghg_efficiency'], marker='o')
plt.xlabel('Number of Partner Counties')
plt.ylabel('Average GHG Efficiency')
plt.title('Performance by Partnership Size')

# 4. Regional Patterns
plt.subplot(2, 2, 4)
# Create regional categories based on county groupings
def categorize_region(counties):
    if 'Los Angeles' in counties:
        return 'Southern California'
    elif any(county in counties for county in ['San Francisco', 'Alameda', 'Santa Clara']):
        return 'Bay Area'
    elif any(county in counties for county in ['Fresno', 'Kern', 'Kings']):
        return 'Central Valley'
    else:
        return 'Other'

collaboration_success['region'] = collaboration_success.index.map(categorize_region)
regional_performance = collaboration_success.groupby('region').agg({
    'ghg_per_dollar': 'mean',
    'dac_benefit_rate': 'mean',
    'total_investment': 'sum'
})

plt.bar(regional_performance.index, regional_performance['ghg_per_dollar'])
plt.xticks(rotation=45)
plt.title('GHG Efficiency by Region')

plt.tight_layout()
plt.show()

# Print detailed analysis
print("\n1. Most Successful Collaborations:")
print(collaboration_success.nlargest(5, 'success_score')[
    ['total_investment', 'ghg_per_dollar', 'dac_benefit_rate', 'success_score']
])

print("\n2. Program Type Success Factors:")
print(program_type_analysis.nlargest(5, 'ghg_efficiency'))

print("\n3. Optimal Partnership Size:")
print(avg_by_size)

print("\n4. Regional Performance Patterns:")
print(regional_performance.round(4))

# Additional insights on equity-efficiency trade-offs
print("\nEquity-Efficiency Trade-off Analysis:")
correlation = collaboration_success['ghg_per_dollar'].corr(collaboration_success['dac_benefit_rate'])
print(f"\nCorrelation between GHG efficiency and DAC benefit rate: {correlation:.4f}")
No description has been provided for this image
1. Most Successful Collaborations:
                                                   total_investment  \
County                                                                
Marin, San Francisco                                        2466450   
Los Angeles, Orange, Riverside, San Diego                    766345   
El Dorado, Placer, Sacramento, Sutter, Yolo, Yuba            991000   
Napa, San Joaquin                                           1557570   
Lake, Sacramento                                            2697479   

                                                   ghg_per_dollar  \
County                                                              
Marin, San Francisco                                     0.010916   
Los Angeles, Orange, Riverside, San Diego                0.008612   
El Dorado, Placer, Sacramento, Sutter, Yolo, Yuba        0.008322   
Napa, San Joaquin                                        0.005557   
Lake, Sacramento                                         0.004171   

                                                   dac_benefit_rate  \
County                                                                
Marin, San Francisco                                            1.0   
Los Angeles, Orange, Riverside, San Diego                       1.0   
El Dorado, Placer, Sacramento, Sutter, Yolo, Yuba               1.0   
Napa, San Joaquin                                               1.0   
Lake, Sacramento                                                1.0   

                                                   success_score  
County                                                            
Marin, San Francisco                                    0.515503  
Los Angeles, Orange, Riverside, San Diego               0.512379  
El Dorado, Placer, Sacramento, Sutter, Yolo, Yuba       0.511986  
Napa, San Joaquin                                       0.508237  
Lake, Sacramento                                        0.506358  

2. Program Type Success Factors:
                                                    project_count  \
Program Name                                                        
Forest Health Program                                           6   
Food Production Investment Program                              9   
Wetlands and Watershed Restoration                              1   
Sustainable Agricultural Lands Conservation Pro...              1   
Urban and Community Forestry Program                          333   

                                                    total_investment  \
Program Name                                                           
Forest Health Program                                       20238006   
Food Production Investment Program                          20586793   
Wetlands and Watershed Restoration                            920666   
Sustainable Agricultural Lands Conservation Pro...          17690045   
Urban and Community Forestry Program                        37197703   

                                                    avg_project_size  \
Program Name                                                           
Forest Health Program                                     3373001.00   
Food Production Investment Program                        2287421.44   
Wetlands and Watershed Restoration                         920666.00   
Sustainable Agricultural Lands Conservation Pro...       17690045.00   
Urban and Community Forestry Program                       111704.81   

                                                    total_ghg  dac_rate  \
Program Name                                                              
Forest Health Program                                  830634       0.0   
Food Production Investment Program                     501007       0.0   
Wetlands and Watershed Restoration                      15166       0.0   
Sustainable Agricultural Lands Conservation Pro...     251149       0.0   
Urban and Community Forestry Program                   195214       0.2   

                                                    ghg_efficiency  
Program Name                                                        
Forest Health Program                                     0.041043  
Food Production Investment Program                        0.024336  
Wetlands and Watershed Restoration                        0.016473  
Sustainable Agricultural Lands Conservation Pro...        0.014197  
Urban and Community Forestry Program                      0.005248  

3. Optimal Partnership Size:
               ghg_efficiency  dac_rate
partner_count                          
2                    0.004588  0.149798
3                    0.002167  0.064000
4                    0.003443  0.052632
5                    0.000880  0.130435
6                    0.002270  0.236842
7                    0.002930  0.281250
8                    0.001286  0.307692
9                    0.000373  0.000000
10                   0.000014  0.250000
11                   0.000000  0.000000
12                   0.000160  0.000000
13                   0.000011  0.000000
14                   0.003643  0.111111
16                   0.004098  0.111111
19                   0.000000  0.000000
22                   0.000000  0.000000
24                   0.000000  0.000000
25                   0.000000  0.000000
29                   0.000000  0.000000
34                   0.000000  0.000000
47                   0.000000  0.000000

4. Regional Performance Patterns:
                     ghg_per_dollar  dac_benefit_rate  total_investment
region                                                                 
Bay Area                     0.0015            0.1488       13255750738
Central Valley               0.0022            0.1816         294777141
Other                        0.0073            0.1191        3029393261
Southern California          0.0039            0.1862        1731340906

Equity-Efficiency Trade-off Analysis:

Correlation between GHG efficiency and DAC benefit rate: -0.0398
In [47]:
# Filter for more recent period
multi_county_data['Year'] = pd.to_datetime(multi_county_data['Date Operational']).dt.year
recent_data = multi_county_data[multi_county_data['Year'] >= 2010]

# Create temporal analysis
temporal_analysis = recent_data.groupby('Year').agg({
    'Total Project Cost': ['count', 'sum', 'mean'],
    'Total Project GHGReductions': 'sum',
    'Is Benefit Disadvantaged Communities': 'mean'
})

# Visualize evolution
fig, axes = plt.subplots(2, 2, figsize=(20, 15))

# 1. Project Volume
axes[0,0].plot(temporal_analysis.index, 
               temporal_analysis[('Total Project Cost', 'count')], 
               marker='o', color='darkblue', linewidth=2)
axes[0,0].set_title('Number of Multi-County Projects by Year (2010-2024)', 
                    fontsize=12, pad=15)
axes[0,0].set_ylabel('Number of Projects')
axes[0,0].grid(True, alpha=0.3)

# 2. Investment Size
axes[0,1].plot(temporal_analysis.index, 
               temporal_analysis[('Total Project Cost', 'mean')]/1e6, 
               marker='o', color='darkblue', linewidth=2)
axes[0,1].set_title('Average Project Size by Year', 
                    fontsize=12, pad=15)
axes[0,1].set_ylabel('Average Size (Millions $)')
axes[0,1].grid(True, alpha=0.3)

# 3. Equity Performance
axes[1,0].plot(temporal_analysis.index, 
               temporal_analysis[('Is Benefit Disadvantaged Communities', 'mean')], 
               marker='o', color='darkblue', linewidth=2)
axes[1,0].set_title('DAC Benefit Rate by Year', 
                    fontsize=12, pad=15)
axes[1,0].set_ylabel('Proportion Benefiting DACs')
axes[1,0].grid(True, alpha=0.3)

# 4. Partnership Size Evolution
recent_data['partnership_size'] = recent_data['County'].str.count(',') + 1
partnership_evolution = recent_data.groupby('Year')['partnership_size'].mean()

axes[1,1].plot(partnership_evolution.index, 
               partnership_evolution.values, 
               marker='o', color='darkblue', linewidth=2)
axes[1,1].set_title('Average Number of Partner Counties by Year', 
                    fontsize=12, pad=15)
axes[1,1].set_ylabel('Average Partners per Project')
axes[1,1].grid(True, alpha=0.3)

# Adjust x-axis for all subplots
for ax in axes.flat:
    ax.set_xlim(2014, 2024)
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# Print summary statistics for three periods: pre-COVID, COVID, and post-COVID
print("\nPeriod Comparison:")
pre_covid = recent_data[recent_data['Year'] < 2020]
covid = recent_data[recent_data['Year'].isin([2020, 2021])]
post_covid = recent_data[recent_data['Year'] > 2021]

metrics = {
    'Number of Projects': lambda x: len(x),
    'Average Project Size ($M)': lambda x: x['Total Project Cost'].mean()/1e6,
    'DAC Benefit Rate': lambda x: x['Is Benefit Disadvantaged Communities'].mean(),
    'Average Partners': lambda x: x['partnership_size'].mean()
}

comparison = pd.DataFrame({
    'Pre-COVID (2010-2019)': {metric: func(pre_covid) for metric, func in metrics.items()},
    'COVID Period (2020-2021)': {metric: func(covid) for metric, func in metrics.items()},
    'Post-COVID (2022+)': {metric: func(post_covid) for metric, func in metrics.items()}
})

print(comparison.round(2))

# Analyze types of projects during COVID period
print("\nMost Common Project Types During COVID Period:")
print(covid['Program Name'].value_counts().head())

# Look at largest COVID-period collaborations
print("\nLargest COVID-Period Collaborations:")
print(covid.nlargest(5, 'Total Project Cost')[
    ['County', 'Program Name', 'Total Project Cost', 'Total Project GHGReductions']
].round(2))
/tmp/ipykernel_846869/2239187850.py:43: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recent_data['partnership_size'] = recent_data['County'].str.count(',') + 1
No description has been provided for this image
Period Comparison:
                           Pre-COVID (2010-2019)  COVID Period (2020-2021)  \
Number of Projects                        211.00                    304.00   
Average Project Size ($M)                   8.24                     31.78   
DAC Benefit Rate                            0.37                      0.02   
Average Partners                            3.70                      5.53   

                           Post-COVID (2022+)  
Number of Projects                     286.00  
Average Project Size ($M)               23.08  
DAC Benefit Rate                         0.01  
Average Partners                         3.92  

Most Common Project Types During COVID Period:
Program Name
Urban and Community Forestry Program          144
Low Carbon Transit Operations Program          46
Safe and Affordable Drinking Water Fund        34
Climate Smart Agriculture                      26
Transit and Intercity Rail Capital Program     15
Name: count, dtype: int64

Largest COVID-Period Collaborations:
                                                   County  \
42458                          San Francisco, Santa Clara   
62279   Alameda, Contra Costa, Fresno, Kern, Kings, Ma...   
62280   Alameda, Contra Costa, Fresno, Kern, Kings, Ma...   
100710  Alameda, Contra Costa, Fresno, Kern, Kings, Ma...   
100711  Alameda, Contra Costa, Fresno, Kern, Kings, Ma...   

                                      Program Name  Total Project Cost  \
42458   Transit and Intercity Rail Capital Program          1980252533   
62279   Transit and Intercity Rail Capital Program           904600000   
62280   Transit and Intercity Rail Capital Program           904600000   
100710  Transit and Intercity Rail Capital Program           904600000   
100711  Transit and Intercity Rail Capital Program           904600000   

        Total Project GHGReductions  
42458                        734000  
62279                         52428  
62280                          6554  
100710                        21845  
100711                        43690