Files
california-equity-git/research_analysis_script.py
2025-04-10 00:03:30 -07:00

2209 lines
111 KiB
Python

# RQ2 report section: how collaboration patterns shifted over the study period.
if 'collaboration_trends' in rq2:
    collab_trends = rq2['collaboration_trends']
    f.write("### Changes in Collaboration Patterns\n\n")
    if 'time_correlation' in collab_trends:
        corr = collab_trends['time_correlation']
        f.write(f"The correlation between time (year) and collaboration intensity (average number of agencies) is {corr.get('correlation', 0):.2f} ")
        significance = 'statistically significant' if corr.get('significant', False) else 'not statistically significant'
        f.write(f"({significance}, p={corr.get('p_value', 1):.4f}). ")
        direction = 'increasing' if corr.get('correlation', 0) > 0 else 'decreasing'
        f.write(f"This indicates a general trend of {direction} collaboration over the study period.\n\n")
    if 'regional_distribution' in collab_trends:
        f.write("The distribution of projects across different regional scopes has changed over time.\n\n")
        f.write("![Regional Trends](regional_trends.png)\n\n")
if 'time_regression' in rq2:
    regression = rq2['time_regression']
    f.write("### Time Series Regression Analysis\n\n")
    # The three fitted trend models share an identical reporting layout,
    # so report them through one loop instead of three copies.
    for model_key, heading in (('agencies_model', 'Collaboration Intensity'),
                               ('efficiency_model', 'GHG Efficiency'),
                               ('dac_model', 'DAC Benefit')):
        if model_key in regression:
            model = regression[model_key]
            f.write(f"#### Trend in {heading}\n\n")
            if 'interpretation' in model:
                f.write(f"{model['interpretation']} ")
            if model.get('significant', False):
                f.write(f"This trend is statistically significant (p={model.get('p_value', 1):.4f}).\n\n")
            else:
                f.write(f"This trend is not statistically significant (p={model.get('p_value', 1):.4f}).\n\n")
# Regional Variations
f.write("## Regional Variations in Collaboration Effectiveness\n\n")
if 'regional_patterns' in findings:
    regional = findings['regional_patterns']
    if 'regional_metrics' in regional:
        metrics = regional['regional_metrics']
        f.write("### Regional Distribution and Characteristics\n\n")
        f.write("The analysis identified significant variations in collaboration patterns and outcomes across different regions of California.\n\n")
        f.write("![Regional Metrics](regional_metrics.png)\n\n")
    f.write("### Efficiency-Equity Tradeoffs by Region\n\n")
    if 'efficiency_equity_tradeoff' in regional:
        tradeoff = regional['efficiency_equity_tradeoff']
        f.write("The analysis examined the tradeoff between GHG efficiency and equity outcomes across regions.\n\n")
        if 'top_regions' in tradeoff and len(tradeoff['top_regions']) > 0:
            top_region = tradeoff['top_regions'][0]
            f.write(f"The {top_region} region achieves the best overall balance between efficiency and equity outcomes. ")
            # Include top 3 regions if available
            if len(tradeoff['top_regions']) >= 3:
                top_3 = ", ".join(tradeoff['top_regions'][:3])
                f.write(f"The top three regions in terms of combined efficiency and equity performance are: {top_3}.\n\n")
            else:
                f.write("\n\n")
        f.write("![Efficiency-Equity Tradeoff](efficiency_equity_tradeoff.png)\n\n")
    if 'efficiency_equity_correlation' in regional:
        corr = regional['efficiency_equity_correlation']
        corr_val = corr.get('correlation', 0)
        f.write(f"The correlation between GHG efficiency cost and DAC benefit percentage across regions is {corr_val:.2f} ")
        significance = 'statistically significant' if corr.get('significant', False) else 'not statistically significant'
        f.write(f"({significance}, p={corr.get('p_value', 1):.4f}). ")
        # |r| > 0.3 is treated as a meaningful association; sign decides the story.
        if corr_val > 0.3:
            f.write("This positive correlation suggests a tradeoff between efficiency and equity, where regions with better efficiency (lower cost per ton) tend to have lower DAC benefits.\n\n")
        elif corr_val < -0.3:
            f.write("This negative correlation suggests synergy between efficiency and equity, where regions with better efficiency (lower cost per ton) tend to have higher DAC benefits.\n\n")
        else:
            f.write("This weak correlation suggests that efficiency and equity outcomes may be largely independent of each other across regions.\n\n")
    if 'regional_anova' in regional:
        anova = regional['regional_anova']
        f.write("### Statistical Comparison of Regions\n\n")
        # Both ANOVA outcomes follow the same reporting template; loop over them.
        for anova_key, heading, phrase in (
                ('ghg_efficiency', 'GHG Efficiency', 'GHG efficiency'),
                ('dac_benefit', 'DAC Benefit', 'DAC benefit percentage')):
            if anova_key in anova:
                result = anova[anova_key]
                f.write(f"#### Regional Differences in {heading}\n\n")
                if result.get('significant', False):
                    f.write(f"There are statistically significant differences in {phrase} across regions (F={result.get('f_value', 0):.2f}, p={result.get('p_value', 1):.4f}).\n\n")
                    if 'pairwise_tests' in result:
                        sig_pairs = [pair for pair, outcome in result['pairwise_tests'].items() if outcome.get('significant', False)]
                        if len(sig_pairs) > 0:
                            f.write("The significant regional differences include:\n\n")
                            for pair in sig_pairs[:3]:  # Show up to 3 significant pairs
                                f.write(f"- {pair}\n")
                            if len(sig_pairs) > 3:
                                f.write(f"- And {len(sig_pairs) - 3} other significant regional differences\n")
                            f.write("\n")
                else:
                    f.write(f"There are no statistically significant differences in {phrase} across regions (F={result.get('f_value', 0):.2f}, p={result.get('p_value', 1):.4f}).\n\n")
# Conclusions and Implications
f.write("## Conclusions and Implications\n\n")
f.write("### Key Findings\n\n")
# Summarize key findings from RQ1
f.write("#### Research Question 1: Collaboration Structure and Scale\n\n")
if 'rq1' in findings:
    rq1 = findings['rq1']
    conclusions_rq1 = []
    # Multi-agency vs single-agency comparison: the sign pair of the two
    # difference metrics selects the narrative (a zero difference adds nothing).
    if 'ghg_efficiency' in rq1 and 'dac_benefit' in rq1:
        eff_diff = rq1['ghg_efficiency'].get('difference', 0)
        dac_diff = rq1['dac_benefit'].get('difference', 0)
        if eff_diff < 0 and dac_diff > 0:
            conclusions_rq1.append("Multi-agency programs demonstrate a clear tradeoff: they are less cost-efficient but achieve higher equity outcomes compared to single-agency programs.")
        elif eff_diff < 0 and dac_diff < 0:
            conclusions_rq1.append("Multi-agency programs underperform single-agency programs in both cost efficiency and equity outcomes, suggesting potential coordination challenges.")
        elif eff_diff > 0 and dac_diff > 0:
            conclusions_rq1.append("Multi-agency programs outperform single-agency programs in both cost efficiency and equity outcomes, demonstrating synergistic benefits of collaboration.")
        elif eff_diff > 0 and dac_diff < 0:
            conclusions_rq1.append("Multi-agency programs show better cost efficiency but lower equity outcomes compared to single-agency programs.")
    # Agency count correlation findings (|r| > 0.1 counts as a reportable trend)
    if 'agency_count_impact' in rq1:
        agency_impact = rq1['agency_count_impact']
        if 'efficiency_correlation' in agency_impact and 'dac_correlation' in agency_impact:
            eff_corr = agency_impact['efficiency_correlation'].get('correlation', 0)
            dac_corr = agency_impact['dac_correlation'].get('correlation', 0)
            if abs(eff_corr) > 0.1:
                conclusions_rq1.append(
                    "Increasing the number of collaborating agencies tends to reduce cost efficiency (higher $ per ton)."
                    if eff_corr > 0 else
                    "Increasing the number of collaborating agencies tends to improve cost efficiency (lower $ per ton).")
            if abs(dac_corr) > 0.1:
                conclusions_rq1.append(
                    "Increasing the number of collaborating agencies tends to improve equity outcomes (higher DAC benefit)."
                    if dac_corr > 0 else
                    "Increasing the number of collaborating agencies tends to reduce equity outcomes (lower DAC benefit).")
    # Regional scope findings
    if 'regional_scope_impact' in rq1:
        scope_impact = rq1['regional_scope_impact']
        if 'avg_agencies' in scope_impact and 'Multi-Regional' in scope_impact['avg_agencies']:
            multi_regional_avg = scope_impact['avg_agencies']['Multi-Regional']
            single_county_avg = scope_impact['avg_agencies'].get('Single County', 0)
            if multi_regional_avg > single_county_avg:
                conclusions_rq1.append(f"Projects with wider geographic scope involve significantly more collaborative partners (Multi-Regional: {multi_regional_avg:.1f} agencies vs. Single County: {single_county_avg:.1f} agencies).")
    for conclusion in conclusions_rq1:
        f.write(f"- {conclusion}\n")
    f.write("\n")
# Summarize key findings from RQ2
f.write("#### Research Question 2: Temporal Trends\n\n")
if 'rq2' in findings:
    rq2 = findings['rq2']
    conclusions_rq2 = []
    # Pre/post 2020 comparison. Each metric carries its own reporting
    # threshold (minimum |% change|) and its increase/decrease templates.
    if 'period_comparison' in rq2:
        period = rq2['period_comparison']
        change_specs = (
            ('avg_agencies_change', 5,
             "Collaboration intensity increased significantly ({:.1f}%) after 2020, indicating a shift toward more collaborative approaches.",
             "Collaboration intensity decreased significantly ({:.1f}%) after 2020, indicating a shift toward more independent approaches."),
            ('avg_funding_change', 10,
             "Average project funding increased substantially ({:.1f}%) after 2020, suggesting a shift toward larger-scale initiatives.",
             "Average project funding decreased substantially ({:.1f}%) after 2020, suggesting a shift toward smaller-scale initiatives."),
            ('dac_benefit_change', 5,
             "Equity outcomes improved ({:.1f}% increase in DAC benefit) after 2020, reflecting increased emphasis on disadvantaged communities.",
             "Equity outcomes declined ({:.1f}% decrease in DAC benefit) after 2020, suggesting potential challenges in maintaining equity focus."),
        )
        for key, threshold, up_msg, down_msg in change_specs:
            if key in period:
                change = period.get(key, 0)
                if abs(change) > threshold:  # Only include if the change is substantial
                    if change > 0:
                        conclusions_rq2.append(up_msg.format(change))
                    else:
                        conclusions_rq2.append(down_msg.format(abs(change)))
    # Time series regression findings: quote interpretations of significant models.
    if 'time_regression' in rq2:
        regression = rq2['time_regression']
        for model_key in ('agencies_model', 'efficiency_model', 'dac_model'):
            if model_key in regression and regression[model_key].get('significant', False):
                conclusions_rq2.append(regression[model_key].get('interpretation', ''))
    for conclusion in conclusions_rq2:
        f.write(f"- {conclusion}\n")
    f.write("\n")
# Summarize regional variations
f.write("#### Regional Variations\n\n")
if 'regional_patterns' in findings:
    regional = findings['regional_patterns']
    conclusions_regional = []
    if 'efficiency_equity_tradeoff' in regional:
        top_regions = regional['efficiency_equity_tradeoff'].get('top_regions', [])
        if top_regions:
            conclusions_regional.append(f"The {top_regions[0]} region achieves the most balanced performance between efficiency and equity outcomes.")
    if 'efficiency_equity_correlation' in regional:
        corr_val = regional['efficiency_equity_correlation'].get('correlation', 0)
        # |r| > 0.3: report either a tradeoff (positive) or a synergy (negative).
        if abs(corr_val) > 0.3:
            conclusions_regional.append(
                "There is evidence of a tradeoff between efficiency and equity across regions, suggesting potential policy tensions."
                if corr_val > 0 else
                "There is evidence of synergy between efficiency and equity across regions, suggesting potential for policies that achieve both goals.")
    if 'regional_anova' in regional:
        anova = regional['regional_anova']
        if 'ghg_efficiency' in anova and anova['ghg_efficiency'].get('significant', False):
            conclusions_regional.append("There are statistically significant differences in GHG efficiency across regions, indicating that geographic context matters for cost-effectiveness.")
        if 'dac_benefit' in anova and anova['dac_benefit'].get('significant', False):
            conclusions_regional.append("There are statistically significant differences in DAC benefit across regions, suggesting variation in equity focus or capabilities.")
    for conclusion in conclusions_regional:
        f.write(f"- {conclusion}\n")
    f.write("\n")
# Policy Implications
f.write("### Policy Implications\n\n")
implications = []
# Generate some implications based on findings
if 'rq1' in findings:
    rq1 = findings['rq1']
    if 'ghg_efficiency' in rq1 and 'dac_benefit' in rq1:
        eff_diff = rq1['ghg_efficiency'].get('difference', 0)
        dac_diff = rq1['dac_benefit'].get('difference', 0)
        if eff_diff < 0 and dac_diff > 0:
            implications.append("Policymakers face a tradeoff between efficiency and equity when designing collaborative structures. Multi-agency programs may prioritize equity at some cost to efficiency.")
        elif eff_diff < 0 and dac_diff < 0:
            implications.append("Coordination challenges in multi-agency programs may need to be addressed through improved governance structures and clearer role definitions.")
        elif eff_diff > 0 and dac_diff > 0:
            implications.append("Further investment in multi-agency collaborative structures is warranted given their superior performance in both efficiency and equity dimensions.")
if 'rq2' in findings:
    rq2 = findings['rq2']
    if 'period_comparison' in rq2 and 'avg_agencies_change' in rq2['period_comparison']:
        if rq2['period_comparison'].get('avg_agencies_change', 0) > 0:
            implications.append("The trend toward increased collaboration after 2020 suggests policy support for collaborative approaches is having an effect and could be further strengthened.")
if 'regional_patterns' in findings:
    regional = findings['regional_patterns']
    if 'efficiency_equity_tradeoff' in regional:
        tradeoff = regional['efficiency_equity_tradeoff']
        if 'top_regions' in tradeoff and len(tradeoff['top_regions']) > 0:
            top_region = tradeoff['top_regions'][0]
            implications.append(f"The successful balance achieved in the {top_region} region could serve as a model for other regions seeking to improve both efficiency and equity outcomes.")
# Add some general implications
implications.extend([
    "Careful design of collaboration structures is needed to manage potential tradeoffs between efficiency and equity goals.",
    "Regional context matters for program implementation and outcomes, suggesting that customized approaches may be more effective than one-size-fits-all policies.",
    "Regular monitoring and evaluation of collaboration patterns and outcomes can help identify best practices and areas for improvement.",
])
for implication in implications:
    f.write(f"- {implication}\n")
f.write("\n")
# Future Research
f.write("### Future Research Directions\n\n")
future_research = [
    "Investigate the specific mechanisms through which multi-agency collaboration affects program outcomes, including both benefits and potential coordination challenges.",
    "Explore the role of different types of agencies (state, local, non-profit) in collaborative structures and their impact on program effectiveness.",
    "Conduct case studies of high-performing regions to identify transferable best practices for balancing efficiency and equity goals.",
    "Analyze the impact of policy changes around 2020 on program implementation and outcomes.",
    "Examine the long-term sustainability of climate investments and the durability of their greenhouse gas reduction benefits.",
]
for research in future_research:
    f.write(f"- {research}\n")
logger.info(f"Research report generated and saved to {report_file}")
if __name__ == "__main__":
    # Command-line entry point for the research analysis pipeline.
    import argparse

    cli = argparse.ArgumentParser(description='California Climate Investments Collaboration Analysis')
    cli.add_argument('--data_path', type=str, required=True, help='Path to the cleaned CCI data CSV file')
    cli.add_argument('--output_path', type=str, default='./output/research', help='Path to save outputs')
    args = cli.parse_args()

    # Run the research analysis
    findings = analyze_research_questions(args.data_path, args.output_path)
    # analyze_research_questions returns a falsy value on failure.
    message = (f"\nAnalysis complete! Results saved to {args.output_path}"
               if findings else "\nAnalysis failed to complete successfully")
    print(message)
# The above code is creating a scatter plot visualization to compare efficiency vs. equity by region.
# Here is a breakdown of the code:
# 3. Regional ANOVA analysis
if ('ca_region' in analysis_df.columns and
'ghg_efficiency' in analysis_df.columns and
'dac_benefit_percentage' in analysis_df.columns):
regional_anova = {}
# ANOVA for GHG efficiency by region
try:
valid_df = analysis_df[analysis_df['ghg_efficiency'].notna() & (analysis_df['ghg_efficiency'] > 0)].copy()
if len(valid_df) > 0:
# Log transform efficiency for better normality
valid_df['log_efficiency'] = np.log(valid_df['ghg_efficiency'])
# Create ANOVA model
model = smf.ols('log_efficiency ~ C(ca_region)', data=valid_df)
results = model.fit()
# Perform ANOVA
anova_table = sm.stats.anova_lm(results, typ=2)
regional_anova['ghg_efficiency'] = {
'f_value': anova_table['F'][0],
'p_value': anova_table['PR(>F)'][0],
'significant': anova_table['PR(>F)'][0] < 0.05,
'r_squared': results.rsquared
}
# Post-hoc pairwise t-tests if ANOVA is significant
if regional_anova['ghg_efficiency']['significant']:
# Get regions with at least 10 projects
region_counts = valid_df['ca_region'].value_counts()
valid_regions = region_counts[region_counts >= 10].index
# Filter to valid regions
posthoc_df = valid_df[valid_df['ca_region'].isin(valid_regions)]
if len(valid_regions) > 1:
# Calculate pairwise t-tests
pairwise_results = {}
for i, region1 in enumerate(valid_regions):
for region2 in valid_regions[i+1:]:
group1 = posthoc_df[posthoc_df['ca_region'] == region1]['log_efficiency']
group2 = posthoc_df[posthoc_df['ca_region'] == region2]['log_efficiency']
try:
t_stat, p_value = stats.ttest_ind(
group1, group2, equal_var=False # Use Welch's t-test
)
# Store results with Bonferroni correction
n_comparisons = len(valid_regions) * (len(valid_regions) - 1) / 2
corrected_p = min(p_value * n_comparisons, 1.0)
pairwise_results[f"{region1} vs {region2}"] = {
't_statistic': t_stat,
'p_value': p_value,
'corrected_p': corrected_p,
'significant': corrected_p < 0.05
}
except Exception:
continue
regional_anova['ghg_efficiency']['pairwise_tests'] = pairwise_results
except Exception as e:
logger.warning(f"Could not perform ANOVA for GHG efficiency by region: {e}")
# ANOVA for DAC benefit by region
try:
# Create ANOVA model
model = smf.ols('dac_benefit_percentage ~ C(ca_region)', data=analysis_df)
results = model.fit()
# Perform ANOVA
anova_table = sm.stats.anova_lm(results, typ=2)
regional_anova['dac_benefit'] = {
'f_value': anova_table['F'][0],
'p_value': anova_table['PR(>F)'][0],
'significant': anova_table['PR(>F)'][0] < 0.05,
'r_squared': results.rsquared
}
# Post-hoc pairwise t-tests if ANOVA is significant
if regional_anova['dac_benefit']['significant']:
# Get regions with at least 10 projects
region_counts = analysis_df['ca_region'].value_counts()
valid_regions = region_counts[region_counts >= 10].index
# Filter to valid regions
posthoc_df = analysis_df[analysis_df['ca_region'].isin(valid_regions)]
if len(valid_regions) > 1:
# Calculate pairwise t-tests
pairwise_results = {}
for i, region1 in enumerate(valid_regions):
for region2 in valid_regions[i+1:]:
group1 = posthoc_df[posthoc_df['ca_region'] == region1]['dac_benefit_percentage']
group2 = posthoc_df[posthoc_df['ca_region'] == region2]['dac_benefit_percentage']
try:
t_stat, p_value = stats.ttest_ind(
group1, group2, equal_var=False # Use Welch's t-test
)
# Store results with Bonferroni correction
n_comparisons = len(valid_regions) * (len(valid_regions) - 1) / 2
corrected_p = min(p_value * n_comparisons, 1.0)
pairwise_results[f"{region1} vs {region2}"] = {
't_statistic': t_stat,
'p_value': p_value,
'corrected_p': corrected_p,
'significant': corrected_p < 0.05
}
except Exception:
continue
regional_anova['dac_benefit']['pairwise_tests'] = pairwise_results
except Exception as e:
logger.warning(f"Could not perform ANOVA for DAC benefit by region: {e}")
findings['regional_anova'] = regional_anova
return findings
def generate_research_report(findings, output_dir):
    """
    Generate a comprehensive research report based on the analysis findings.

    Parameters:
    findings (dict): Dictionary containing research findings
    output_dir (Path): Directory to save the report
    """
    logger.info("Generating research report")
    # Create report file
    report_file = output_dir / "research_report.md"
    with open(report_file, 'w') as f:
        # Title and introduction
        f.write("# California Climate Investments Collaboration Analysis\n\n")
        f.write("## Research Report\n\n")
        f.write("This report presents findings from an analysis of collaboration patterns in California's Climate Investments (CCI) program from 2015 to 2024, focusing on how inter-agency collaboration affects greenhouse gas (GHG) reduction efficiencies and equity outcomes.\n\n")
        # Executive summary
        f.write("## Executive Summary\n\n")
        exec_summary = []
        # RQ1: Collaboration Structure Impact
        if 'rq1' in findings:
            rq1 = findings['rq1']
            if 'ghg_efficiency' in rq1:
                eff = rq1['ghg_efficiency']
                comparative = 'less' if eff.get('difference', 0) < 0 else 'more'
                exec_summary.append(f"Multi-agency programs are {comparative} cost-efficient than single-agency programs (${eff.get('multi_agency_median', 0):,.2f} vs. ${eff.get('single_agency_median', 0):,.2f} per ton CO2e).")
            if 'dac_benefit' in rq1:
                dac = rq1['dac_benefit']
                comparative = 'higher' if dac.get('difference', 0) > 0 else 'lower'
                exec_summary.append(f"Multi-agency programs achieve {comparative} equity outcomes than single-agency programs ({dac.get('multi_agency_mean', 0):.2f}% vs. {dac.get('single_agency_mean', 0):.2f}% DAC benefit).")
        # RQ2: Temporal Trends
        if 'rq2' in findings:
            rq2 = findings['rq2']
            if 'period_comparison' in rq2:
                period = rq2['period_comparison']
                if 'avg_agencies_change' in period:
                    change = period.get('avg_agencies_change', 0)
                    direction = 'increased' if change > 0 else 'decreased'
                    exec_summary.append(f"Collaboration intensity {direction} by {abs(change):.1f}% from pre-2020 to post-2020 periods.")
                if 'avg_funding_change' in period:
                    change = period.get('avg_funding_change', 0)
                    direction = 'increased' if change > 0 else 'decreased'
                    exec_summary.append(f"Average project funding {direction} by {abs(change):.1f}% after 2020.")
                if 'dac_benefit_change' in period:
                    change = period.get('dac_benefit_change', 0)
                    direction = 'increased' if change > 0 else 'decreased'
                    exec_summary.append(f"DAC benefit percentage {direction} by {abs(change):.1f}% after 2020.")
        # Regional variations
        if 'regional_patterns' in findings:
            regional = findings['regional_patterns']
            if 'efficiency_equity_tradeoff' in regional:
                tradeoff = regional['efficiency_equity_tradeoff']
                if 'top_regions' in tradeoff and len(tradeoff['top_regions']) > 0:
                    top_region = tradeoff['top_regions'][0]
                    exec_summary.append(f"The {top_region} region demonstrates the best balance between GHG efficiency and equity outcomes.")
            if 'efficiency_equity_correlation' in regional:
                corr_val = regional['efficiency_equity_correlation'].get('correlation', 0)
                # Only highlight the correlation when |r| > 0.3.
                if abs(corr_val) > 0.3:
                    if corr_val > 0:
                        exec_summary.append(f"There is a positive correlation ({corr_val:.2f}) between GHG efficiency cost and equity outcomes, suggesting a tradeoff between these goals.")
                    else:
                        exec_summary.append(f"There is a negative correlation ({corr_val:.2f}) between GHG efficiency cost and equity outcomes, suggesting synergy between these goals.")
        # Write executive summary bullets
        for point in exec_summary:
            f.write(f"- {point}\n")
        f.write("\n## Research Questions\n\n")
        f.write("1. How does the structure and scale of inter-agency collaboration within the California Climate Investments (CCI) program shape outputs related to greenhouse gas (GHG) reduction efficiencies and equity?\n")
        f.write("2. What temporal trends relative to funding allocation, agency collaboration, and collaborative outputs related to efficiency and equity outcomes have occurred from 2015 to 2024?\n\n")
# Research Question 1: Collaboration Structure Impact
f.write("## Research Question 1: Impact of Collaboration Structure and Scale\n\n")
if 'rq1' in findings:
    rq1 = findings['rq1']
    f.write("### Multi-Agency vs. Single-Agency Program Comparison\n\n")
    if 'project_counts' in rq1:
        counts = rq1['project_counts']
        f.write(f"The dataset includes {counts.get('multi_agency', 0):,} projects in multi-agency programs ({counts.get('multi_agency_percentage', 0):.1f}% of non-EV projects) and {counts.get('single_agency', 0):,} projects in single-agency programs.\n\n")
    if 'ghg_efficiency' in rq1:
        eff = rq1['ghg_efficiency']
        f.write("#### GHG Efficiency Comparison\n\n")
        f.write(f"The median GHG efficiency (cost per ton of CO2e reduced) for multi-agency programs is ${eff.get('multi_agency_median', 0):,.2f}, compared to ${eff.get('single_agency_median', 0):,.2f} for single-agency programs. This represents a difference of ${abs(eff.get('difference', 0)):,.2f} per ton ")
        # Negative difference means single-agency programs are cheaper per ton.
        winner = 'single-agency' if eff.get('difference', 0) < 0 else 'multi-agency'
        f.write(f"in favor of {winner} programs.\n\n")
        if 'statistical_test' in eff:
            test = eff['statistical_test']
            negation = '' if test.get('significant', False) else 'not '
            f.write(f"This difference is {negation}statistically significant (p={test.get('p_value', 1):.4f}).\n\n")
        f.write("![GHG Efficiency Comparison](ghg_efficiency_comparison.png)\n\n")
    if 'dac_benefit' in rq1:
        dac = rq1['dac_benefit']
        f.write("#### Disadvantaged Community (DAC) Benefit Comparison\n\n")
        f.write(f"The mean DAC benefit percentage for multi-agency programs is {dac.get('multi_agency_mean', 0):.2f}%, compared to {dac.get('single_agency_mean', 0):.2f}% for single-agency programs. This represents a difference of {abs(dac.get('difference', 0)):.2f}% ")
        winner = 'multi-agency' if dac.get('difference', 0) > 0 else 'single-agency'
        f.write(f"in favor of {winner} programs.\n\n")
        if 'statistical_test' in dac:
            test = dac['statistical_test']
            negation = '' if test.get('significant', False) else 'not '
            f.write(f"This difference is {negation}statistically significant (p={test.get('p_value', 1):.4f}).\n\n")
        f.write("![DAC Benefit Comparison](dac_benefit_comparison.png)\n\n")
    f.write("### Impact of Number of Agencies\n\n")
    if 'agency_count_impact' in rq1:
        agency_impact = rq1['agency_count_impact']
        f.write("The analysis examined how the number of collaborating agencies affects program outcomes.\n\n")
        if 'efficiency_correlation' in agency_impact:
            corr = agency_impact['efficiency_correlation']
            f.write(f"The correlation between number of agencies and GHG efficiency is {corr.get('correlation', 0):.2f} ")
            significance = 'statistically significant' if corr.get('significant', False) else 'not statistically significant'
            f.write(f"({significance}, p={corr.get('p_value', 1):.4f}). ")
            if corr.get('correlation', 0) > 0:
                f.write("This suggests that as the number of agencies increases, the cost per ton of GHG reduction tends to increase (less efficient).\n\n")
            else:
                f.write("This suggests that as the number of agencies increases, the cost per ton of GHG reduction tends to decrease (more efficient).\n\n")
        if 'dac_correlation' in agency_impact:
            corr = agency_impact['dac_correlation']
            f.write(f"The correlation between number of agencies and DAC benefit percentage is {corr.get('correlation', 0):.2f} ")
            significance = 'statistically significant' if corr.get('significant', False) else 'not statistically significant'
            f.write(f"({significance}, p={corr.get('p_value', 1):.4f}). ")
            tendency = 'increase' if corr.get('correlation', 0) > 0 else 'decrease'
            f.write(f"This suggests that as the number of agencies increases, the DAC benefit percentage tends to {tendency}.\n\n")
        f.write("![Agency Count Impact](agency_count_impact.png)\n\n")
# Regional scope: do geographically wider projects collaborate more, and
# how do efficiency/equity outcomes vary by scope?
f.write("### Impact of Regional Scope\n\n")
if 'regional_scope_impact' in rq1:
    scope_impact = rq1['regional_scope_impact']
    f.write("The analysis examined how the regional scope of projects affects collaboration intensity and outcomes.\n\n")
    if 'avg_agencies' in scope_impact:
        f.write("#### Collaboration Intensity by Regional Scope\n\n")
        # Check if Multi-Regional has the highest average
        if 'Multi-Regional' in scope_impact['avg_agencies']:
            multi_regional_avg = scope_impact['avg_agencies']['Multi-Regional']
            single_county_avg = scope_impact['avg_agencies'].get('Single County', 0)
            f.write(f"Multi-Regional projects involve an average of {multi_regional_avg:.2f} agencies, compared to {single_county_avg:.2f} for Single County projects. ")
            f.write("This indicates that projects with wider geographic scope tend to involve more collaborative partners.\n\n")
    if 'ghg_efficiency' in scope_impact and 'dac_benefit' in scope_impact:
        f.write("#### Efficiency and Equity by Regional Scope\n\n")
        # Identify best and worst scopes for efficiency (lower $/ton is better)
        efficiency_by_scope = scope_impact['ghg_efficiency']
        if len(efficiency_by_scope) > 0:
            best_eff_scope = min(efficiency_by_scope, key=efficiency_by_scope.get)
            worst_eff_scope = max(efficiency_by_scope, key=efficiency_by_scope.get)
            f.write(f"The {best_eff_scope} scope achieves the best GHG efficiency (${efficiency_by_scope[best_eff_scope]:,.2f} per ton), while the {worst_eff_scope} scope has the least efficient outcomes (${efficiency_by_scope[worst_eff_scope]:,.2f} per ton).\n\n")
        # Identify best and worst scopes for DAC benefit (higher % is better)
        dac_by_scope = scope_impact['dac_benefit']
        if len(dac_by_scope) > 0:
            best_dac_scope = max(dac_by_scope, key=dac_by_scope.get)
            worst_dac_scope = min(dac_by_scope, key=dac_by_scope.get)
            f.write(f"The {best_dac_scope} scope achieves the highest DAC benefit ({dac_by_scope[best_dac_scope]:.2f}%), while the {worst_dac_scope} scope has the lowest equity outcomes ({dac_by_scope[worst_dac_scope]:.2f}%).\n\n")
    f.write("![Regional Scope Impact](regional_scope_impact.png)\n\n")
if 'regression_analysis' in rq1:
regression = rq1['regression_analysis']
f.write("### Regression Analysis\n\n")
if 'ghg_efficiency' in regression:
ghg_model = regression['ghg_efficiency']
if 'base_model' in ghg_model:
base = ghg_model['base_model']
f.write("#### GHG Efficiency Model\n\n")
f.write(f"A regression model predicting GHG efficiency from the number of agencies explains {base.get('r_squared', 0)*100:.1f}% of the variance (R²). ")
if base.get('p_value', 1) < 0.05:
f.write(f"The effect of agency count is statistically significant (p={base.get('p_value', 1):.4f}). ")
if base.get('coefficient', 0) > 0:
f.write("Each additional collaborating agency is associated with higher costs per ton of GHG reduction (less efficient).\n\n")
else:
f.write("Each additional collaborating agency is associated with lower costs per ton of GHG reduction (more efficient).\n\n")
else:
f.write(f"The effect of agency count is not statistically significant (p={base.get('p_value', 1):.4f}).\n\n")
if 'dac_benefit' in regression:
dac_model = regression['dac_benefit']
if 'base_model' in dac_model:
base = dac_model['base_model']
f.write("#### DAC Benefit Model\n\n")
f.write(f"A regression model predicting DAC benefit percentage from the number of agencies explains {base.get('r_squared', 0)*100:.1f}% of the variance (R²). ")
if base.get('p_value', 1) < 0.05:
f.write(f"The effect of agency count is statistically significant (p={base.get('p_value', 1):.4f}). ")
if base.get('coefficient', 0) > 0:
f.write("Each additional collaborating agency is associated with a higher DAC benefit percentage.\n\n")
else:
f.write("Each additional collaborating agency is associated with a lower DAC benefit percentage.\n\n")
else:
f.write(f"The effect of agency count is not statistically significant (p={base.get('p_value', 1):.4f}).\n\n")
# Research Question 2: Temporal Trends
f.write("## Research Question 2: Temporal Trends (2015-2024)\n\n")
if 'rq2' in findings:
rq2 = findings['rq2']
f.write("### Yearly Trends in Collaboration and Outcomes\n\n")
if 'yearly' in rq2:
yearly = rq2['yearly']
f.write("The analysis tracked changes in collaboration patterns and program outcomes from 2015 to 2024.\n\n")
f.write("![Yearly Trends](yearly_trends.png)\n\n")
f.write("### Pre-2020 vs. Post-2020 Comparison\n\n")
if 'period_comparison' in rq2:
period = rq2['period_comparison']
f.write("A comparison of the pre-2020 and post-2020 periods reveals significant changes in the CCI program.\n\n")
if 'project_count_change' in period:
change = period.get('project_count_change', 0)
pre_count = period['project_counts'].get('Pre-2020', 0)
post_count = period['project_counts'].get('Post-2020', 0)
f.write(f"The number of projects changed from {pre_count:,} (pre-2020) to {post_count:,} (post-2020), a {change:+.1f}% change.\n\n")
if 'avg_agencies_change' in period:
change = period.get('avg_agencies_change', 0)
pre_agencies = period['avg_agencies'].get('Pre-2020', 0)
post_agencies = period['avg_agencies'].get('Post-2020', 0)
f.write(f"The average number of agencies per program increased from {pre_agencies:.2f} to {post_agencies:.2f}, a {change:+.1f}% change. ")
if change > 0:
f.write("This indicates an increase in collaboration intensity after 2020.\n\n")
else:
f.write("This indicates a decrease in collaboration intensity after 2020.\n\n")
if 'avg_funding_change' in period:
change = period.get('avg_funding_change', 0)
pre_funding = period['avg_funding'].get('Pre-2020', 0)
post_funding = period['avg_funding'].get('Post-2020', 0)
f.write(f"The average project funding changed from ${pre_funding:,.2f} to ${post_funding:,.2f}, a {change:+.1f}% change.\n\n")
if 'ghg_efficiency_change' in period:
change = period.get('ghg_efficiency_change', 0)
pre_eff = period['median_ghg_efficiency'].get('Pre-2020', 0)
post_eff = period['median_ghg_efficiency'].get('Post-2020', 0)
f.write(f"The median GHG efficiency changed from ${pre_eff:,.2f} to ${post_eff:,.2f} per ton CO2e, a {change:+.1f}% change. ")
if change > 0:
f.write("This indicates a decrease in cost-effectiveness after 2020.\n\n")
else:
f.write("This indicates an increase in cost-effectiveness after 2020.\n\n")
if 'ghg_efficiency_test' in period and period['ghg_efficiency_test'].get('significant', False):
f.write("This change is statistically significant.\n\n")
if 'dac_benefit_change' in period:
change = period.get('dac_benefit_change', 0)
pre_dac = period['avg_dac_benefit'].get('Pre-2020', 0)
post_dac = period['avg_dac_benefit'].get('Post-2020', 0)
f.write(f"The average DAC benefit percentage changed from {pre_dac:.2f}% to {post_dac:.2f}%, a {change:+.1f}% change. ")
if change > 0:
f.write("This indicates improved equity outcomes after 2020.\n\n")
else:
f.write("This indicates decreased equity outcomes after 2020.\n\n")
if 'dac_benefit_test' in period and period['dac_benefit_test'].get('significant', False):
f.write("This change is statistically significant.\n\n")
f.write("![Pre/Post 2020 Comparison](pre_post_2020_comparison.png)\n\n")
if 'collaboration_trends' in rq2:
# Some logic here, or even just pass
pass # or continue handling 'collaboration_trends'
# Now define the function properly
def analyze_regional_variations(df, output_dir):
    """
    Analyze regional variations in collaboration and outcomes.

    Computes per-region project counts, funding, collaboration intensity,
    GHG efficiency and DAC benefit; visualizes them in a 2x2 panel; then
    analyzes the efficiency-equity tradeoff across regions.

    Parameters:
        df (pd.DataFrame): The CCI data
        output_dir (Path): Directory to save outputs
    Returns:
        dict: Findings related to regional variations
    """
    logger.info("Analyzing regional variations")
    findings = {}
    # Exclude EV vouchers for this analysis if they're marked
    if 'is_ev_voucher' in df.columns:
        analysis_df = df[~df['is_ev_voucher']].copy()
        logger.info(f"Excluded {len(df) - len(analysis_df)} EV vouchers from analysis")
    else:
        analysis_df = df.copy()
    # Check if we have regional data
    if 'ca_region' not in analysis_df.columns:
        logger.error("California region data not available for regional analysis")
        return findings
    # 1. Calculate metrics by region
    regional_metrics = {}
    # Group by region
    region_groups = analysis_df.groupby('ca_region')
    # Project counts by region
    regional_metrics['project_counts'] = region_groups.size().to_dict()
    regional_metrics['project_percentages'] = (region_groups.size() / len(analysis_df) * 100).to_dict()
    # Number of agencies by region
    if 'num_agencies_in_program' in analysis_df.columns:
        regional_metrics['avg_agencies'] = region_groups['num_agencies_in_program'].mean().to_dict()
        regional_metrics['multi_agency_percentage'] = (region_groups['is_multi_agency'].mean() * 100).to_dict()
    # Funding metrics by region (the funding column name varies, so search for it)
    funding_col = [col for col in analysis_df.columns if 'total_program' in col.lower() and 'funding' in col.lower()]
    if funding_col:
        funding_col = funding_col[0]
        regional_metrics['total_funding'] = region_groups[funding_col].sum().to_dict()
        regional_metrics['avg_funding'] = region_groups[funding_col].mean().to_dict()
        # Calculate each region's share of total funding
        total_funding = analysis_df[funding_col].sum()
        regional_metrics['funding_percentages'] = {region: funding / total_funding * 100
                                                   for region, funding in regional_metrics['total_funding'].items()}
    # GHG efficiency by region
    if 'ghg_efficiency' in analysis_df.columns:
        # Use valid values only and median for robustness to outliers
        valid_df = analysis_df[analysis_df['ghg_efficiency'].notna() & (analysis_df['ghg_efficiency'] > 0)]
        if len(valid_df) > 0:
            region_efficiency = valid_df.groupby('ca_region')['ghg_efficiency'].median()
            regional_metrics['median_ghg_efficiency'] = region_efficiency.to_dict()
    # DAC benefit by region
    if 'dac_benefit_percentage' in analysis_df.columns:
        regional_metrics['avg_dac_benefit'] = region_groups['dac_benefit_percentage'].mean().to_dict()
    findings['regional_metrics'] = regional_metrics
    # Create visualization for regional metrics (2x2 panel)
    plt.figure(figsize=(15, 12))
    # 1. Project distribution by region
    plt.subplot(2, 2, 1)
    if 'project_counts' in regional_metrics:
        regions = list(regional_metrics['project_counts'].keys())
        counts = list(regional_metrics['project_counts'].values())
        # Sort by count
        sorted_data = sorted(zip(regions, counts), key=lambda x: x[1], reverse=True)
        regions, counts = zip(*sorted_data)
        plt.barh(regions, counts)
        plt.xlabel('Number of Projects')
        plt.title('Project Distribution by Region')
        plt.grid(True, axis='x', alpha=0.3)
    # 2. Average number of agencies by region
    plt.subplot(2, 2, 2)
    if 'avg_agencies' in regional_metrics:
        regions = list(regional_metrics['avg_agencies'].keys())
        values = list(regional_metrics['avg_agencies'].values())
        # Sort by average agencies
        sorted_data = sorted(zip(regions, values), key=lambda x: x[1], reverse=True)
        regions, values = zip(*sorted_data)
        plt.barh(regions, values)
        plt.xlabel('Average Number of Agencies')
        plt.title('Collaboration Intensity by Region')
        plt.grid(True, axis='x', alpha=0.3)
    # 3. GHG efficiency by region
    plt.subplot(2, 2, 3)
    if 'median_ghg_efficiency' in regional_metrics:
        regions = list(regional_metrics['median_ghg_efficiency'].keys())
        values = list(regional_metrics['median_ghg_efficiency'].values())
        # Sort by efficiency (lower is better)
        sorted_data = sorted(zip(regions, values), key=lambda x: x[1])
        regions, values = zip(*sorted_data)
        plt.barh(regions, values)
        plt.xlabel('GHG Efficiency ($ per ton CO2e)')
        plt.title('GHG Efficiency by Region')
        plt.grid(True, axis='x', alpha=0.3)
    # 4. DAC benefit by region
    plt.subplot(2, 2, 4)
    if 'avg_dac_benefit' in regional_metrics:
        regions = list(regional_metrics['avg_dac_benefit'].keys())
        values = list(regional_metrics['avg_dac_benefit'].values())
        # Sort by DAC benefit (higher is better)
        sorted_data = sorted(zip(regions, values), key=lambda x: x[1], reverse=True)
        regions, values = zip(*sorted_data)
        plt.barh(regions, values)
        plt.xlabel('DAC Benefit Percentage')
        plt.title('DAC Benefit by Region')
        plt.grid(True, axis='x', alpha=0.3)
    plt.tight_layout()
    plt.savefig(output_dir / "regional_metrics.png", dpi=300, bbox_inches='tight')
    plt.close()
    # 2. Analyze efficiency-equity tradeoffs by region
    if ('median_ghg_efficiency' in regional_metrics and
        'avg_dac_benefit' in regional_metrics and
        'avg_agencies' in regional_metrics):
        # BUGFIX: build the frame only from regions present in all three metric
        # dicts so columns stay aligned. 'median_ghg_efficiency' is computed
        # from a filtered frame and can be missing regions that the other two
        # dicts contain; zipping raw .values() would misalign or raise.
        regions = [region for region in regional_metrics['median_ghg_efficiency']
                   if region in regional_metrics['avg_dac_benefit']
                   and region in regional_metrics['avg_agencies']]
        tradeoff_df = pd.DataFrame({
            'region': regions,
            'efficiency': [regional_metrics['median_ghg_efficiency'][r] for r in regions],
            'dac_benefit': [regional_metrics['avg_dac_benefit'][r] for r in regions],
            'avg_agencies': [regional_metrics['avg_agencies'][r] for r in regions]
        })
        # Calculate composite score (combines efficiency and equity)
        # First, normalize the metrics to [0, 1]; guard the zero-range case
        # (all regions identical) which would otherwise divide by zero
        eff_range = tradeoff_df['efficiency'].max() - tradeoff_df['efficiency'].min()
        dac_range = tradeoff_df['dac_benefit'].max() - tradeoff_df['dac_benefit'].min()
        tradeoff_df['efficiency_norm'] = ((tradeoff_df['efficiency'] - tradeoff_df['efficiency'].min()) / eff_range
                                          if eff_range > 0 else 0.0)
        tradeoff_df['dac_norm'] = ((tradeoff_df['dac_benefit'] - tradeoff_df['dac_benefit'].min()) / dac_range
                                   if dac_range > 0 else 0.0)
        # Since lower efficiency (cost per ton) is better, invert the normalized value
        tradeoff_df['efficiency_inv'] = 1 - tradeoff_df['efficiency_norm']
        # Calculate composite score (equal weight to efficiency and equity)
        tradeoff_df['composite_score'] = (tradeoff_df['efficiency_inv'] + tradeoff_df['dac_norm']) / 2
        # Store the results
        findings['efficiency_equity_tradeoff'] = {
            'metrics': tradeoff_df.to_dict(orient='records'),
            'top_regions': tradeoff_df.sort_values('composite_score', ascending=False)['region'].tolist()
        }
        # Check for correlation between efficiency and equity
        # (pearsonr needs at least two observations)
        correlation = p_value = None
        if len(tradeoff_df) >= 2:
            correlation, p_value = stats.pearsonr(tradeoff_df['efficiency'], tradeoff_df['dac_benefit'])
            findings['efficiency_equity_correlation'] = {
                'correlation_type': 'Pearson',
                'correlation': correlation,
                'p_value': p_value,
                'significant': p_value < 0.05
            }
        # Create visualization for efficiency-equity tradeoff
        plt.figure(figsize=(10, 8))
        # Scatter plot with bubble size representing average number of agencies
        scatter = plt.scatter(
            tradeoff_df['efficiency'],
            tradeoff_df['dac_benefit'],
            s=tradeoff_df['avg_agencies'] * 50,  # Size scaled by avg agencies
            alpha=0.7,
            c=tradeoff_df['composite_score'],  # Color by composite score
            cmap='viridis'
        )
        # Add labels for each region
        for i, row in tradeoff_df.iterrows():
            plt.annotate(
                row['region'],
                (row['efficiency'], row['dac_benefit']),
                textcoords="offset points",
                xytext=(5, 5),
                ha='left'
            )
        # Add correlation information
        if correlation is not None:
            plt.text(0.05, 0.95, f"Correlation: {correlation:.2f} (p={p_value:.4f})",
                     transform=plt.gca().transAxes, fontsize=10)
        # BUGFIX: the original ended abruptly here, leaking the open figure
        # and implicitly returning None to the caller
        plt.colorbar(scatter, label='Composite Score')
        plt.xlabel('GHG Efficiency ($ per ton CO2e)')
        plt.ylabel('DAC Benefit Percentage')
        plt.title('Efficiency-Equity Tradeoff by Region')
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.savefig(output_dir / "efficiency_equity_tradeoff.png", dpi=300, bbox_inches='tight')
        plt.close()
    return findings
def analyze_temporal_trends(df, output_dir):
    """
    Analyze temporal trends in collaboration and outcomes from 2015-2024.

    Computes yearly metrics, compares the pre-2020 and post-2020 periods,
    tracks collaboration-pattern changes over time, and fits simple time
    series regressions for agencies, GHG efficiency and DAC benefit.

    Parameters:
        df (pd.DataFrame): The CCI data
        output_dir (Path): Directory to save outputs
    Returns:
        dict: Findings related to temporal trends
    """
    # NOTE: a stray `...` placeholder used to precede this docstring, turning
    # it into a dead string expression (no __doc__); it has been removed.
    logger.info("Analyzing temporal trends (2015-2024)")
    findings = {}
    # Exclude EV vouchers for this analysis if they're marked
    if 'is_ev_voucher' in df.columns:
        analysis_df = df[~df['is_ev_voucher']].copy()
        logger.info(f"Excluded {len(df) - len(analysis_df)} EV vouchers from analysis")
    else:
        analysis_df = df.copy()
    # Check if we have the required temporal data
    if 'funding_year' not in analysis_df.columns:
        logger.error("Funding year data not available for temporal analysis")
        return findings
    # 1. Calculate yearly metrics
    yearly_metrics = {}
    # Group by funding year
    yearly_groups = analysis_df.groupby('funding_year')
    # Project counts by year
    yearly_metrics['project_counts'] = yearly_groups.size().to_dict()
    # Number of agencies by year
    if 'num_agencies_in_program' in analysis_df.columns:
        yearly_metrics['avg_agencies'] = yearly_groups['num_agencies_in_program'].mean().to_dict()
        yearly_metrics['multi_agency_percentage'] = (yearly_groups['is_multi_agency'].mean() * 100).to_dict()
    # Funding metrics by year (the funding column name varies, so search for it)
    funding_col = [col for col in analysis_df.columns if 'total_program' in col.lower() and 'funding' in col.lower()]
    if funding_col and len(funding_col) > 0:
        funding_col = funding_col[0]
        yearly_metrics['total_funding'] = yearly_groups[funding_col].sum().to_dict()
        yearly_metrics['avg_funding'] = yearly_groups[funding_col].mean().to_dict()
    # GHG efficiency by year
    if 'ghg_efficiency' in analysis_df.columns:
        # Use valid values only and median for robustness to outliers
        yearly_valid = analysis_df[analysis_df['ghg_efficiency'].notna() & (analysis_df['ghg_efficiency'] > 0)]
        yearly_efficiency = yearly_valid.groupby('funding_year')['ghg_efficiency'].median()
        yearly_metrics['median_ghg_efficiency'] = yearly_efficiency.to_dict()
    # DAC benefit by year
    if 'dac_benefit_percentage' in analysis_df.columns:
        yearly_metrics['avg_dac_benefit'] = yearly_groups['dac_benefit_percentage'].mean().to_dict()
    findings['yearly'] = yearly_metrics
    # Create visualization for yearly trends (2x2 panel)
    plt.figure(figsize=(15, 12))
    # 1. Number of agencies over time
    plt.subplot(2, 2, 1)
    if 'avg_agencies' in yearly_metrics:
        years = sorted(yearly_metrics['avg_agencies'].keys())
        values = [yearly_metrics['avg_agencies'][year] for year in years]
        plt.plot(years, values, marker='o', linestyle='-', linewidth=2)
        plt.axvline(x=2020, color='r', linestyle='--', alpha=0.7, label='2020')
        plt.xlabel('Funding Year')
        plt.ylabel('Average Number of Agencies')
        plt.title('Collaboration Intensity Over Time')
        plt.grid(True, alpha=0.3)
        plt.legend()
    # 2. Average project funding over time
    plt.subplot(2, 2, 2)
    if 'avg_funding' in yearly_metrics:
        years = sorted(yearly_metrics['avg_funding'].keys())
        values = [yearly_metrics['avg_funding'][year] for year in years]
        plt.plot(years, values, marker='o', linestyle='-', linewidth=2)
        plt.axvline(x=2020, color='r', linestyle='--', alpha=0.7, label='2020')
        plt.xlabel('Funding Year')
        plt.ylabel('Average Project Funding ($)')
        plt.title('Project Funding Over Time')
        plt.grid(True, alpha=0.3)
        plt.legend()
    # 3. GHG efficiency over time
    plt.subplot(2, 2, 3)
    if 'median_ghg_efficiency' in yearly_metrics:
        years = sorted(yearly_metrics['median_ghg_efficiency'].keys())
        values = [yearly_metrics['median_ghg_efficiency'][year] for year in years]
        plt.plot(years, values, marker='o', linestyle='-', linewidth=2)
        plt.axvline(x=2020, color='r', linestyle='--', alpha=0.7, label='2020')
        plt.xlabel('Funding Year')
        plt.ylabel('GHG Efficiency ($ per ton CO2e)')
        plt.title('GHG Efficiency Over Time')
        plt.grid(True, alpha=0.3)
        plt.legend()
    # 4. DAC benefit over time
    plt.subplot(2, 2, 4)
    if 'avg_dac_benefit' in yearly_metrics:
        years = sorted(yearly_metrics['avg_dac_benefit'].keys())
        values = [yearly_metrics['avg_dac_benefit'][year] for year in years]
        plt.plot(years, values, marker='o', linestyle='-', linewidth=2)
        plt.axvline(x=2020, color='r', linestyle='--', alpha=0.7, label='2020')
        plt.xlabel('Funding Year')
        plt.ylabel('DAC Benefit Percentage')
        plt.title('DAC Benefit Over Time')
        plt.grid(True, alpha=0.3)
        plt.legend()
    plt.tight_layout()
    plt.savefig(output_dir / "yearly_trends.png", dpi=300, bbox_inches='tight')
    plt.close()
    # 2. Compare pre-2020 and post-2020 periods
    if 'period' in analysis_df.columns:
        period_metrics = {}
        # Group by period
        period_groups = analysis_df.groupby('period')
        # Project counts by period
        period_metrics['project_counts'] = period_groups.size().to_dict()
        # Calculate percentage change from pre-2020 to post-2020
        pre_count = period_metrics['project_counts'].get('Pre-2020', 0)
        post_count = period_metrics['project_counts'].get('Post-2020', 0)
        period_metrics['project_count_change'] = ((post_count - pre_count) / pre_count * 100) if pre_count > 0 else np.nan
        # Number of agencies by period
        if 'num_agencies_in_program' in analysis_df.columns:
            period_metrics['avg_agencies'] = period_groups['num_agencies_in_program'].mean().to_dict()
            period_metrics['multi_agency_percentage'] = (period_groups['is_multi_agency'].mean() * 100).to_dict()
            # Calculate percentage change
            pre_agencies = period_metrics['avg_agencies'].get('Pre-2020', 0)
            post_agencies = period_metrics['avg_agencies'].get('Post-2020', 0)
            period_metrics['avg_agencies_change'] = ((post_agencies - pre_agencies) / pre_agencies * 100) if pre_agencies > 0 else np.nan
        # Funding metrics by period (funding_col is a column-name string when found)
        if funding_col:
            period_metrics['total_funding'] = period_groups[funding_col].sum().to_dict()
            period_metrics['avg_funding'] = period_groups[funding_col].mean().to_dict()
            # Calculate percentage change
            pre_funding = period_metrics['avg_funding'].get('Pre-2020', 0)
            post_funding = period_metrics['avg_funding'].get('Post-2020', 0)
            period_metrics['avg_funding_change'] = ((post_funding - pre_funding) / pre_funding * 100) if pre_funding > 0 else np.nan
        # GHG efficiency by period
        if 'ghg_efficiency' in analysis_df.columns:
            valid_df = analysis_df[analysis_df['ghg_efficiency'].notna() & (analysis_df['ghg_efficiency'] > 0)]
            if len(valid_df) > 0:
                period_efficiency = valid_df.groupby('period')['ghg_efficiency'].median()
                period_metrics['median_ghg_efficiency'] = period_efficiency.to_dict()
                # Calculate percentage change
                pre_efficiency = period_metrics['median_ghg_efficiency'].get('Pre-2020', 0)
                post_efficiency = period_metrics['median_ghg_efficiency'].get('Post-2020', 0)
                period_metrics['ghg_efficiency_change'] = ((post_efficiency - pre_efficiency) / pre_efficiency * 100) if pre_efficiency > 0 else np.nan
                # Statistical test: non-parametric since efficiency is skewed
                try:
                    pre_group = valid_df[valid_df['period'] == 'Pre-2020']['ghg_efficiency']
                    post_group = valid_df[valid_df['period'] == 'Post-2020']['ghg_efficiency']
                    if len(pre_group) > 0 and len(post_group) > 0:
                        u_stat, p_value = stats.mannwhitneyu(
                            pre_group,
                            post_group,
                            alternative='two-sided'
                        )
                        period_metrics['ghg_efficiency_test'] = {
                            'test': 'Mann-Whitney U',
                            'u_statistic': u_stat,
                            'p_value': p_value,
                            'significant': p_value < 0.05
                        }
                except Exception as e:
                    logger.warning(f"Could not perform statistical test on GHG efficiency by period: {e}")
        # DAC benefit by period
        if 'dac_benefit_percentage' in analysis_df.columns:
            period_metrics['avg_dac_benefit'] = period_groups['dac_benefit_percentage'].mean().to_dict()
            # Calculate percentage change
            pre_dac = period_metrics['avg_dac_benefit'].get('Pre-2020', 0)
            post_dac = period_metrics['avg_dac_benefit'].get('Post-2020', 0)
            period_metrics['dac_benefit_change'] = ((post_dac - pre_dac) / pre_dac * 100) if pre_dac > 0 else np.nan
            # Statistical test
            try:
                pre_group = analysis_df[analysis_df['period'] == 'Pre-2020']['dac_benefit_percentage']
                post_group = analysis_df[analysis_df['period'] == 'Post-2020']['dac_benefit_percentage']
                if len(pre_group) > 0 and len(post_group) > 0:
                    t_stat, p_value = stats.ttest_ind(
                        pre_group,
                        post_group,
                        equal_var=False  # Use Welch's t-test to handle unequal variances
                    )
                    period_metrics['dac_benefit_test'] = {
                        'test': "Welch's t-test",
                        't_statistic': t_stat,
                        'p_value': p_value,
                        'significant': p_value < 0.05
                    }
            except Exception as e:
                logger.warning(f"Could not perform statistical test on DAC benefit by period: {e}")
        findings['period_comparison'] = period_metrics
        # Create visualization for pre/post-2020 comparison (2x2 panel)
        plt.figure(figsize=(12, 10))
        # 1. Project counts by period
        plt.subplot(2, 2, 1)
        if 'project_counts' in period_metrics:
            periods = ['Pre-2020', 'Post-2020']
            counts = [period_metrics['project_counts'].get(period, 0) for period in periods]
            plt.bar(periods, counts)
            plt.xlabel('Period')
            plt.ylabel('Number of Projects')
            plt.title('Project Counts: Pre-2020 vs. Post-2020')
            # Add percentage change above the post-2020 bar
            if 'project_count_change' in period_metrics:
                change = period_metrics['project_count_change']
                plt.text(1, counts[1] * 1.05, f"{change:+.1f}%", ha='center')
            plt.grid(True, axis='y', alpha=0.3)
        # 2. Average number of agencies by period
        plt.subplot(2, 2, 2)
        if 'avg_agencies' in period_metrics:
            periods = ['Pre-2020', 'Post-2020']
            values = [period_metrics['avg_agencies'].get(period, 0) for period in periods]
            plt.bar(periods, values)
            plt.xlabel('Period')
            plt.ylabel('Average Number of Agencies')
            plt.title('Collaboration Intensity: Pre-2020 vs. Post-2020')
            # Add percentage change
            if 'avg_agencies_change' in period_metrics:
                change = period_metrics['avg_agencies_change']
                plt.text(1, values[1] * 1.05, f"{change:+.1f}%", ha='center')
            plt.grid(True, axis='y', alpha=0.3)
        # 3. GHG efficiency by period
        plt.subplot(2, 2, 3)
        if 'median_ghg_efficiency' in period_metrics:
            periods = ['Pre-2020', 'Post-2020']
            values = [period_metrics['median_ghg_efficiency'].get(period, 0) for period in periods]
            plt.bar(periods, values)
            plt.xlabel('Period')
            plt.ylabel('GHG Efficiency ($ per ton CO2e)')
            plt.title('GHG Efficiency: Pre-2020 vs. Post-2020')
            # Add percentage change
            if 'ghg_efficiency_change' in period_metrics:
                change = period_metrics['ghg_efficiency_change']
                plt.text(1, values[1] * 1.05, f"{change:+.1f}%", ha='center')
            # Add statistical significance
            if 'ghg_efficiency_test' in period_metrics and period_metrics['ghg_efficiency_test'].get('significant', False):
                plt.text(0.5, max(values) * 1.1, "Statistically significant difference (p<0.05)",
                         ha='center', fontsize=10)
            plt.grid(True, axis='y', alpha=0.3)
        # 4. DAC benefit by period
        plt.subplot(2, 2, 4)
        if 'avg_dac_benefit' in period_metrics:
            periods = ['Pre-2020', 'Post-2020']
            values = [period_metrics['avg_dac_benefit'].get(period, 0) for period in periods]
            plt.bar(periods, values)
            plt.xlabel('Period')
            plt.ylabel('DAC Benefit Percentage')
            plt.title('DAC Benefit: Pre-2020 vs. Post-2020')
            # Add percentage change
            if 'dac_benefit_change' in period_metrics:
                change = period_metrics['dac_benefit_change']
                plt.text(1, values[1] * 1.05, f"{change:+.1f}%", ha='center')
            # Add statistical significance
            if 'dac_benefit_test' in period_metrics and period_metrics['dac_benefit_test'].get('significant', False):
                plt.text(0.5, max(values) * 1.1, "Statistically significant difference (p<0.05)",
                         ha='center', fontsize=10)
            plt.grid(True, axis='y', alpha=0.3)
        plt.tight_layout()
        plt.savefig(output_dir / "pre_post_2020_comparison.png", dpi=300, bbox_inches='tight')
        plt.close()
    # 3. Analyze changes in collaboration patterns over time
    if 'num_agencies_in_program' in analysis_df.columns and 'funding_year' in analysis_df.columns:
        # Create a time series of collaboration indicators
        collab_trends = {}
        # Average agencies over time
        collab_trends['avg_agencies'] = yearly_metrics.get('avg_agencies', {})
        # Percentage of multi-agency projects over time
        collab_trends['multi_agency_percentage'] = yearly_metrics.get('multi_agency_percentage', {})
        # Analyze correlation between time and collaboration intensity
        try:
            years = sorted(collab_trends['avg_agencies'].keys())
            values = [collab_trends['avg_agencies'][year] for year in years]
            # Calculate correlation
            correlation, p_value = stats.pearsonr(years, values)
            collab_trends['time_correlation'] = {
                'correlation_type': 'Pearson',
                'correlation': correlation,
                'p_value': p_value,
                'significant': p_value < 0.05
            }
        except Exception as e:
            logger.warning(f"Could not calculate correlation between time and collaboration: {e}")
        # Regional distribution over time (if available)
        if 'regional_scope' in analysis_df.columns:
            # Create a cross-tabulation of funding year and regional scope,
            # row-normalized to percentages
            try:
                crosstab = pd.crosstab(
                    analysis_df['funding_year'],
                    analysis_df['regional_scope'],
                    normalize='index'
                ).multiply(100)
                collab_trends['regional_distribution'] = {}
                for year in crosstab.index:
                    collab_trends['regional_distribution'][int(year)] = {}
                    for scope in crosstab.columns:
                        collab_trends['regional_distribution'][int(year)][scope] = crosstab.loc[year, scope]
            except Exception as e:
                logger.warning(f"Error calculating regional distribution over time: {e}")
        findings['collaboration_trends'] = collab_trends
        # 1. Average agencies and multi-agency percentage over time
        # BUGFIX: the original opened an extra plt.figure/plt.subplot here that
        # was never drawn on, saved, or closed (the dual-axis chart is created
        # by plt.subplots below) — the orphan figure leaked and is removed.
        if 'avg_agencies' in collab_trends and 'multi_agency_percentage' in collab_trends:
            years = sorted(collab_trends['avg_agencies'].keys())
            avg_values = [collab_trends['avg_agencies'][year] for year in years]
            pct_values = [collab_trends['multi_agency_percentage'][year] for year in years]
            # Create figure with two y-axes
            fig, ax1 = plt.subplots(figsize=(10, 6))
            # Plot average agencies
            color = 'tab:blue'
            ax1.set_xlabel('Funding Year')
            ax1.set_ylabel('Average Number of Agencies', color=color)
            ax1.plot(years, avg_values, color=color, marker='o', linestyle='-', linewidth=2)
            ax1.tick_params(axis='y', labelcolor=color)
            # Create second y-axis for percentage
            ax2 = ax1.twinx()
            color = 'tab:red'
            ax2.set_ylabel('Multi-Agency Projects (%)', color=color)
            ax2.plot(years, pct_values, color=color, marker='s', linestyle='-', linewidth=2)
            ax2.tick_params(axis='y', labelcolor=color)
            # Add vertical line at 2020
            plt.axvline(x=2020, color='black', linestyle='--', alpha=0.7)
            # Add correlation information
            if 'time_correlation' in collab_trends:
                corr = collab_trends['time_correlation']['correlation']
                p_val = collab_trends['time_correlation']['p_value']
                plt.figtext(0.15, 0.85, f"Time-Collaboration Correlation: {corr:.2f} (p={p_val:.4f})",
                            fontsize=10, bbox=dict(facecolor='white', alpha=0.8))
            plt.title('Collaboration Intensity Over Time')
            fig.tight_layout()
            plt.savefig(output_dir / "collaboration_trends.png", dpi=300, bbox_inches='tight')
            plt.close()
        # 2. Regional distribution over time (if available)
        if 'regional_distribution' in collab_trends:
            try:
                # Convert the nested dict to a DataFrame for easier plotting
                region_years = []
                region_scopes = []
                region_values = []
                for year, scopes in collab_trends['regional_distribution'].items():
                    for scope, value in scopes.items():
                        region_years.append(year)
                        region_scopes.append(scope)
                        region_values.append(value)
                region_df = pd.DataFrame({
                    'Year': region_years,
                    'Scope': region_scopes,
                    'Percentage': region_values
                })
                # Pivot for stacked area chart
                pivot_df = region_df.pivot(index='Year', columns='Scope', values='Percentage')
                # Define the order for regional scope
                scope_order = ['Single County', 'Limited Regional', 'Regional', 'Multi-Regional']
                # Reorder columns if possible, keeping any unexpected scopes at the end
                ordered_columns = [col for col in scope_order if col in pivot_df.columns]
                missing_columns = [col for col in pivot_df.columns if col not in ordered_columns]
                pivot_df = pivot_df[ordered_columns + missing_columns]
                # Create stacked area chart
                plt.figure(figsize=(10, 6))
                plt.stackplot(pivot_df.index, pivot_df.values.T, labels=pivot_df.columns, alpha=0.7)
                plt.xlabel('Funding Year')
                plt.ylabel('Percentage of Projects')
                plt.title('Regional Scope Distribution Over Time')
                plt.legend(loc='upper left')
                plt.grid(True, alpha=0.3)
                # Add vertical line at 2020
                plt.axvline(x=2020, color='black', linestyle='--', alpha=0.7)
                plt.savefig(output_dir / "regional_trends.png", dpi=300, bbox_inches='tight')
                plt.close()
            except Exception as e:
                logger.warning(f"Error creating regional distribution visualization: {e}")
    # 4. Time series regression analysis
    time_regression = {}
    if 'num_agencies_in_program' in analysis_df.columns and 'funding_year' in analysis_df.columns:
        try:
            # Create time variable and center it at 2020 for easier interpretation
            analysis_df['year_centered'] = analysis_df['funding_year'] - 2020
            # Regression model predicting number of agencies from year
            model1 = sm.OLS.from_formula('num_agencies_in_program ~ year_centered', data=analysis_df)
            results1 = model1.fit()
            time_regression['agencies_model'] = {
                'r_squared': results1.rsquared,
                'coefficient': results1.params['year_centered'],
                'p_value': results1.pvalues['year_centered'],
                'significant': results1.pvalues['year_centered'] < 0.05
            }
            # Interpret the coefficient
            coef = results1.params['year_centered']
            if coef > 0:
                time_regression['agencies_model']['interpretation'] = f"The number of agencies per program increases by {coef:.3f} each year."
            else:
                time_regression['agencies_model']['interpretation'] = f"The number of agencies per program decreases by {abs(coef):.3f} each year."
        except Exception as e:
            logger.warning(f"Could not perform time series regression for number of agencies: {e}")
    # Regression model predicting GHG efficiency from year
    if 'ghg_efficiency' in analysis_df.columns and 'funding_year' in analysis_df.columns:
        try:
            # Use log transformation for efficiency (right-skewed cost data)
            valid_df = analysis_df[analysis_df['ghg_efficiency'].notna() & (analysis_df['ghg_efficiency'] > 0)].copy()
            if len(valid_df) > 0:
                valid_df['log_efficiency'] = np.log(valid_df['ghg_efficiency'])
                valid_df['year_centered'] = valid_df['funding_year'] - 2020
                model2 = sm.OLS.from_formula('log_efficiency ~ year_centered', data=valid_df)
                results2 = model2.fit()
                time_regression['efficiency_model'] = {
                    'r_squared': results2.rsquared,
                    'coefficient': results2.params['year_centered'],
                    'p_value': results2.pvalues['year_centered'],
                    'significant': results2.pvalues['year_centered'] < 0.05
                }
                # Interpret the coefficient (exponentiate for percentage change)
                coef = results2.params['year_centered']
                pct_change = (np.exp(coef) - 1) * 100
                if pct_change > 0:
                    time_regression['efficiency_model']['interpretation'] = f"GHG efficiency worsens by approximately {pct_change:.1f}% each year (higher $ per ton)."
                else:
                    time_regression['efficiency_model']['interpretation'] = f"GHG efficiency improves by approximately {abs(pct_change):.1f}% each year (lower $ per ton)."
        except Exception as e:
            logger.warning(f"Could not perform time series regression for GHG efficiency: {e}")
    # Regression model predicting DAC benefit from year
    if 'dac_benefit_percentage' in analysis_df.columns and 'funding_year' in analysis_df.columns:
        try:
            # Create time variable
            analysis_df['year_centered'] = analysis_df['funding_year'] - 2020
            model3 = sm.OLS.from_formula('dac_benefit_percentage ~ year_centered', data=analysis_df)
            results3 = model3.fit()
            time_regression['dac_model'] = {
                'r_squared': results3.rsquared,
                'coefficient': results3.params['year_centered'],
                'p_value': results3.pvalues['year_centered'],
                'significant': results3.pvalues['year_centered'] < 0.05
            }
            # Interpret the coefficient
            coef = results3.params['year_centered']
            if coef > 0:
                time_regression['dac_model']['interpretation'] = f"DAC benefit percentage increases by {coef:.3f} percentage points each year."
            else:
                time_regression['dac_model']['interpretation'] = f"DAC benefit percentage decreases by {abs(coef):.3f} percentage points each year."
        except Exception as e:
            logger.warning(f"Could not perform time series regression for DAC benefit: {e}")
    findings['time_regression'] = time_regression
    return findings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import logging
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.anova import anova_lm
from scipy import stats
# Configure logging
# Module-level setup: INFO threshold with timestamped, name-tagged records.
# All functions in this module log through the shared "cci_research_analysis" logger.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger("cci_research_analysis")
def analyze_research_questions(data_path, output_path="./output/research"):
    """
    Analyze the key research questions about collaboration in CCI.

    Parameters:
        data_path (str): Path to the cleaned CCI data file
        output_path (str): Path to save outputs (created if missing)

    Returns:
        dict: Dictionary containing research findings, or None when loading
        or analysis fails. Failures are logged with their full traceback.
    """
    logger.info(f"Loading data from {data_path}")
    # Load the data
    try:
        df = pd.read_csv(data_path)
    except Exception:
        # logger.exception records the traceback; logger.error(f"...{e}") did not.
        logger.exception("Error loading data")
        return None
    logger.info(f"Successfully loaded {len(df)} rows")
    # Create output directory
    output_dir = Path(output_path)
    output_dir.mkdir(parents=True, exist_ok=True)
    # Initialize findings dictionary
    findings = {}
    # Analyze each research question
    try:
        # Research Question 1: How does structure and scale of collaboration affect outcomes?
        findings['rq1'] = analyze_collaboration_structure_impact(df, output_dir)
        # Research Question 2: What temporal trends have occurred from 2015-2024?
        findings['rq2'] = analyze_temporal_trends(df, output_dir)
        # Additional analysis of regional patterns and variations
        findings['regional_patterns'] = analyze_regional_variations(df, output_dir)
        # Generate comprehensive report
        generate_research_report(findings, output_dir)
        logger.info(f"Analysis complete, results saved to {output_dir}")
        return findings
    except Exception:
        logger.exception("Error in research analysis")
        return None
def analyze_collaboration_structure_impact(df, output_dir):
    """
    Analyze how collaboration structure and scale affect GHG efficiency and equity.

    Four analyses are run, each gated on the presence of its required columns:
      1. Multi-agency vs. single-agency comparison (efficiency + DAC benefit)
      2. Relationship between number of agencies and outcomes
      3. Impact of regional scope
      4. OLS regressions of outcomes on collaboration factors

    Parameters:
        df (pd.DataFrame): The CCI data
        output_dir (Path): Directory to save outputs (figures are written here)

    Returns:
        dict: Findings related to collaboration structure impact
    """
    logger.info("Analyzing collaboration structure impact")
    findings = {}
    # Exclude EV vouchers for this analysis if they're marked: they are numerous,
    # small, single-agency records that would otherwise dominate the comparisons.
    if 'is_ev_voucher' in df.columns:
        analysis_df = df[~df['is_ev_voucher']].copy()
        logger.info(f"Excluded {len(df) - len(analysis_df)} EV vouchers from analysis")
    else:
        analysis_df = df.copy()
    # 1. Compare multi-agency vs single-agency projects
    if 'is_multi_agency' in analysis_df.columns:
        _compare_multi_vs_single_agency(analysis_df, findings, output_dir)
    # 2. Analyze relationship between number of agencies and outcomes
    if 'num_agencies_in_program' in analysis_df.columns:
        _analyze_agency_count_impact(analysis_df, findings, output_dir)
    # 3. Analyze impact of regional scope
    if 'regional_scope' in analysis_df.columns:
        _analyze_regional_scope_impact(analysis_df, findings, output_dir)
    # 4. Regression analysis of collaboration factors on outcomes
    findings['regression_analysis'] = _run_collaboration_regressions(analysis_df)
    return findings


def _compare_multi_vs_single_agency(analysis_df, findings, output_dir):
    """Compare multi- vs. single-agency programs; writes results into `findings`."""
    multi_df = analysis_df[analysis_df['is_multi_agency']]
    single_df = analysis_df[~analysis_df['is_multi_agency']]
    findings['project_counts'] = {
        'multi_agency': len(multi_df),
        'single_agency': len(single_df),
        'multi_agency_percentage': len(multi_df) / len(analysis_df) * 100 if len(analysis_df) > 0 else 0
    }
    # --- GHG efficiency comparison ($ per ton CO2e; lower is better) ---
    if 'ghg_efficiency' in analysis_df.columns:
        # Use valid efficiency data only (positive, non-missing)
        valid_multi = multi_df[multi_df['ghg_efficiency'].notna() & (multi_df['ghg_efficiency'] > 0)]
        valid_single = single_df[single_df['ghg_efficiency'].notna() & (single_df['ghg_efficiency'] > 0)]
        if len(valid_multi) > 0 and len(valid_single) > 0:
            # Median rather than mean: efficiency is highly right-skewed
            multi_efficiency = valid_multi['ghg_efficiency'].median()
            single_efficiency = valid_single['ghg_efficiency'].median()
            findings['ghg_efficiency'] = {
                'multi_agency_median': multi_efficiency,
                'single_agency_median': single_efficiency,
                'difference': single_efficiency - multi_efficiency,
                'difference_percentage': ((single_efficiency - multi_efficiency) / single_efficiency * 100) if single_efficiency > 0 else 0
            }
            # Mann-Whitney U: non-parametric, robust to the skewed distribution
            try:
                u_stat, p_value = stats.mannwhitneyu(
                    valid_multi['ghg_efficiency'],
                    valid_single['ghg_efficiency'],
                    alternative='two-sided'
                )
                findings['ghg_efficiency']['statistical_test'] = {
                    'test': 'Mann-Whitney U',
                    'u_statistic': u_stat,
                    'p_value': p_value,
                    'significant': p_value < 0.05
                }
            except Exception as e:
                logger.warning(f"Could not perform statistical test on efficiency comparison: {e}")
            _plot_efficiency_histograms(valid_multi, valid_single,
                                        multi_efficiency, single_efficiency, output_dir)
    # --- DAC benefit comparison ---
    if 'dac_benefit_percentage' in analysis_df.columns:
        findings['dac_benefit'] = {
            'multi_agency_mean': multi_df['dac_benefit_percentage'].mean(),
            'single_agency_mean': single_df['dac_benefit_percentage'].mean(),
            'difference': multi_df['dac_benefit_percentage'].mean() - single_df['dac_benefit_percentage'].mean()
        }
        try:
            # Drop NaNs explicitly: passing them to ttest_ind yields nan statistics,
            # which silently made 'significant' evaluate to False.
            t_stat, p_value = stats.ttest_ind(
                multi_df['dac_benefit_percentage'].dropna(),
                single_df['dac_benefit_percentage'].dropna(),
                equal_var=False  # Welch's t-test to handle unequal variances
            )
            findings['dac_benefit']['statistical_test'] = {
                'test': "Welch's t-test",
                't_statistic': t_stat,
                'p_value': p_value,
                'significant': p_value < 0.05
            }
        except Exception as e:
            logger.warning(f"Could not perform statistical test on DAC benefit comparison: {e}")
        _plot_dac_benefit_boxplot(analysis_df, output_dir)


def _plot_efficiency_histograms(valid_multi, valid_single, multi_efficiency, single_efficiency, output_dir):
    """Overlaid log-scale histograms of GHG efficiency for the two program groups."""
    plt.figure(figsize=(10, 6))
    # Log-spaced bins: efficiency spans orders of magnitude. The upper edge uses
    # the 95th percentile so a handful of extreme projects don't flatten the plot.
    bins = np.logspace(
        np.log10(min(valid_multi['ghg_efficiency'].min(), valid_single['ghg_efficiency'].min())),
        np.log10(max(valid_multi['ghg_efficiency'].quantile(0.95), valid_single['ghg_efficiency'].quantile(0.95))),
        30
    )
    plt.hist(valid_single['ghg_efficiency'], bins=bins, alpha=0.5, label=f'Single-Agency (Median: ${single_efficiency:.2f})')
    plt.hist(valid_multi['ghg_efficiency'], bins=bins, alpha=0.5, label=f'Multi-Agency (Median: ${multi_efficiency:.2f})')
    plt.xscale('log')
    plt.xlabel('GHG Efficiency ($ per ton CO2e) - Log Scale')
    plt.ylabel('Number of Projects')
    plt.title('GHG Efficiency: Multi-Agency vs. Single-Agency Programs')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.savefig(output_dir / "ghg_efficiency_comparison.png", dpi=300, bbox_inches='tight')
    plt.close()


def _plot_dac_benefit_boxplot(analysis_df, output_dir):
    """Box + swarm plot of DAC benefit percentage split by multi-agency status."""
    plt.figure(figsize=(10, 6))
    sns.boxplot(x='is_multi_agency', y='dac_benefit_percentage', data=analysis_df)
    # Overlay the individual projects to show the underlying distribution
    sns.swarmplot(x='is_multi_agency', y='dac_benefit_percentage', data=analysis_df,
                  alpha=0.5, size=4, edgecolor='black', linewidth=0.5)
    plt.xlabel('Multi-Agency Program')
    plt.ylabel('DAC Benefit Percentage')
    plt.title('DAC Benefit: Multi-Agency vs. Single-Agency Programs')
    plt.xticks([0, 1], ['Single-Agency', 'Multi-Agency'])
    plt.grid(True, axis='y', alpha=0.3)
    plt.savefig(output_dir / "dac_benefit_comparison.png", dpi=300, bbox_inches='tight')
    plt.close()


def _analyze_agency_count_impact(analysis_df, findings, output_dir):
    """Relate the number of collaborating agencies to efficiency and DAC benefit."""
    agency_groups = analysis_df.groupby('num_agencies_in_program')
    agency_metrics = {}
    agency_counts = agency_groups.size()
    agency_metrics['project_counts'] = agency_counts.to_dict()
    # Report per-group medians/means only for groups with >= 5 projects;
    # the correlations below still use every valid row.
    valid_groups = agency_counts[agency_counts >= 5].index
    if 'ghg_efficiency' in analysis_df.columns:
        efficiency_by_agency = agency_groups['ghg_efficiency'].median()[valid_groups]
        agency_metrics['ghg_efficiency'] = efficiency_by_agency.to_dict()
        valid_df = analysis_df[analysis_df['ghg_efficiency'].notna() & (analysis_df['ghg_efficiency'] > 0)]
        if len(valid_df) > 0:
            # Spearman rank correlation: robust to outliers and monotone nonlinearity
            spearman_corr, p_value = stats.spearmanr(
                valid_df['num_agencies_in_program'],
                valid_df['ghg_efficiency']
            )
            agency_metrics['efficiency_correlation'] = {
                'correlation_type': 'Spearman rank',
                'correlation': spearman_corr,
                'p_value': p_value,
                'significant': p_value < 0.05
            }
    if 'dac_benefit_percentage' in analysis_df.columns:
        dac_by_agency = agency_groups['dac_benefit_percentage'].mean()[valid_groups]
        agency_metrics['dac_benefit'] = dac_by_agency.to_dict()
        valid_df = analysis_df[analysis_df['dac_benefit_percentage'].notna()]
        if len(valid_df) > 0:
            spearman_corr, p_value = stats.spearmanr(
                valid_df['num_agencies_in_program'],
                valid_df['dac_benefit_percentage']
            )
            agency_metrics['dac_correlation'] = {
                'correlation_type': 'Spearman rank',
                'correlation': spearman_corr,
                'p_value': p_value,
                'significant': p_value < 0.05
            }
    findings['agency_count_impact'] = agency_metrics
    _plot_agency_count_impact(agency_metrics, output_dir)


def _plot_agency_count_impact(agency_metrics, output_dir):
    """Two stacked bar charts: efficiency and DAC benefit by number of agencies."""
    plt.figure(figsize=(12, 10))
    # (subplot position, metric key, value column, y-label, title, correlation key)
    panels = [
        (1, 'ghg_efficiency', 'efficiency', 'GHG Efficiency ($ per ton CO2e)',
         'GHG Efficiency by Number of Agencies', 'efficiency_correlation'),
        (2, 'dac_benefit', 'dac_benefit', 'DAC Benefit Percentage',
         'DAC Benefit by Number of Agencies', 'dac_correlation'),
    ]
    for position, metric_key, value_col, ylabel, title, corr_key in panels:
        plt.subplot(2, 1, position)
        if metric_key in agency_metrics:
            panel_df = pd.DataFrame(agency_metrics[metric_key].items(),
                                    columns=['num_agencies', value_col])
            panel_df = panel_df.sort_values('num_agencies')
            plt.bar(panel_df['num_agencies'], panel_df[value_col])
            plt.xlabel('Number of Agencies in Program')
            plt.ylabel(ylabel)
            plt.title(title)
            # Annotate the panel with the matching Spearman correlation, if computed
            if corr_key in agency_metrics:
                corr = agency_metrics[corr_key]['correlation']
                p_val = agency_metrics[corr_key]['p_value']
                plt.text(0.05, 0.95, f"Correlation: {corr:.2f} (p={p_val:.4f})",
                         transform=plt.gca().transAxes, fontsize=10,
                         bbox=dict(facecolor='white', alpha=0.8))
            plt.grid(True, axis='y', alpha=0.3)
    plt.tight_layout()
    plt.savefig(output_dir / "agency_count_impact.png", dpi=300, bbox_inches='tight')
    plt.close()


# Canonical ordering of regional-scope categories, narrowest to broadest.
_SCOPE_ORDER = ['Single County', 'Limited Regional', 'Regional', 'Multi-Regional']


def _ordered_scope_frame(metric_dict, value_col):
    """Build a (regional_scope, value) frame sorted by the canonical scope order.

    Unknown scope labels sort first (order index -1) rather than raising.
    """
    frame = pd.DataFrame(metric_dict.items(), columns=['regional_scope', value_col])
    frame['scope_order'] = frame['regional_scope'].apply(
        lambda s: _SCOPE_ORDER.index(s) if s in _SCOPE_ORDER else -1)
    return frame.sort_values('scope_order')


def _analyze_regional_scope_impact(analysis_df, findings, output_dir):
    """Summarize collaboration, efficiency, and equity by regional scope."""
    scope_groups = analysis_df.groupby('regional_scope')
    scope_metrics = {}
    scope_metrics['project_counts'] = scope_groups.size().to_dict()
    if 'num_agencies_in_program' in analysis_df.columns:
        scope_metrics['avg_agencies'] = scope_groups['num_agencies_in_program'].mean().to_dict()
    if 'ghg_efficiency' in analysis_df.columns:
        # Median efficiency over valid (positive, non-missing) values only
        valid_df = analysis_df[analysis_df['ghg_efficiency'].notna() & (analysis_df['ghg_efficiency'] > 0)]
        if len(valid_df) > 0:
            scope_metrics['ghg_efficiency'] = valid_df.groupby('regional_scope')['ghg_efficiency'].median().to_dict()
    if 'dac_benefit_percentage' in analysis_df.columns:
        scope_metrics['dac_benefit'] = scope_groups['dac_benefit_percentage'].mean().to_dict()
    findings['regional_scope_impact'] = scope_metrics
    # 2x2 figure: three bar charts plus an efficiency-vs-equity scatter
    plt.figure(figsize=(12, 10))
    bar_panels = [
        (1, 'avg_agencies', 'Average Number of Agencies', 'Agency Collaboration by Regional Scope'),
        (2, 'ghg_efficiency', 'GHG Efficiency ($ per ton CO2e)', 'GHG Efficiency by Regional Scope'),
        (3, 'dac_benefit', 'DAC Benefit Percentage', 'DAC Benefit by Regional Scope'),
    ]
    for position, metric_key, ylabel, title in bar_panels:
        plt.subplot(2, 2, position)
        if metric_key in scope_metrics:
            scope_df = _ordered_scope_frame(scope_metrics[metric_key], metric_key)
            plt.bar(scope_df['regional_scope'], scope_df[metric_key])
            plt.xlabel('Regional Scope')
            plt.ylabel(ylabel)
            plt.title(title)
            plt.xticks(rotation=45)
            plt.grid(True, axis='y', alpha=0.3)
    # 4. Efficiency vs. Equity by Regional Scope (scatter plot)
    plt.subplot(2, 2, 4)
    if 'ghg_efficiency' in scope_metrics and 'dac_benefit' in scope_metrics and 'avg_agencies' in scope_metrics:
        # Align the three metric dicts on their shared scope keys. The previous
        # implementation zipped dict values positionally, which misaligned (or
        # raised) whenever a scope lacked valid efficiency data.
        common_scopes = [s for s in scope_metrics['ghg_efficiency']
                         if s in scope_metrics['dac_benefit'] and s in scope_metrics['avg_agencies']]
        combined_df = pd.DataFrame({
            'regional_scope': common_scopes,
            'efficiency': [scope_metrics['ghg_efficiency'][s] for s in common_scopes],
            'dac_benefit': [scope_metrics['dac_benefit'][s] for s in common_scopes],
            'avg_agencies': [scope_metrics['avg_agencies'][s] for s in common_scopes]
        })
        plt.scatter(
            combined_df['efficiency'],
            combined_df['dac_benefit'],
            s=combined_df['avg_agencies'] * 50,  # marker size encodes collaboration intensity
            alpha=0.7
        )
        # Label each point with its regional scope
        for _, row in combined_df.iterrows():
            plt.annotate(
                row['regional_scope'],
                (row['efficiency'], row['dac_benefit']),
                textcoords="offset points",
                xytext=(5, 5),
                ha='left'
            )
        plt.xlabel('GHG Efficiency ($ per ton CO2e)')
        plt.ylabel('DAC Benefit Percentage')
        plt.title('Efficiency vs. Equity by Regional Scope')
        plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.savefig(output_dir / "regional_scope_impact.png", dpi=300, bbox_inches='tight')
    plt.close()


def _summarize_ols(results):
    """Extract the headline statistics for the num_agencies_in_program coefficient."""
    return {
        'r_squared': results.rsquared,
        'r_squared_adj': results.rsquared_adj,
        'p_value': results.f_pvalue,
        'coefficient': results.params['num_agencies_in_program'],
        'coefficient_p_value': results.pvalues['num_agencies_in_program']
    }


def _fit_agency_models(model_df, response):
    """Fit OLS of `response` on agency count; add regional-scope dummies when available.

    Returns a dict with 'base_model' and, when regional_scope is present and
    yields at least one dummy column, 'full_model'.
    """
    # Base model with just number of agencies
    results1 = sm.OLS.from_formula(f'{response} ~ num_agencies_in_program', data=model_df).fit()
    models = {'base_model': _summarize_ols(results1)}
    if 'regional_scope' in model_df.columns:
        # dtype=float keeps the dummies numeric for statsmodels regardless of the
        # pandas version's default (bool since pandas 2.x).
        dummied = pd.get_dummies(model_df, columns=['regional_scope'], drop_first=True, dtype=float)
        scope_cols = [col for col in dummied.columns if col.startswith('regional_scope_')]
        if scope_cols:
            # Q("...") quoting is required: dummy names contain spaces
            # (e.g. 'regional_scope_Single County'), which would otherwise
            # break the patsy formula parser and abort the whole analysis.
            terms = ' + '.join(f'Q("{col}")' for col in scope_cols)
            results2 = sm.OLS.from_formula(
                f'{response} ~ num_agencies_in_program + {terms}', data=dummied).fit()
            models['full_model'] = _summarize_ols(results2)
    return models


def _run_collaboration_regressions(analysis_df):
    """OLS models predicting GHG efficiency (log scale) and DAC benefit from collaboration."""
    regression_results = {}
    if 'ghg_efficiency' in analysis_df.columns:
        # Keep positive, non-missing values and trim the top 5% as extreme outliers
        ghg_model_df = analysis_df[
            analysis_df['ghg_efficiency'].notna() &
            (analysis_df['ghg_efficiency'] > 0) &
            (analysis_df['ghg_efficiency'] < analysis_df['ghg_efficiency'].quantile(0.95))
        ].copy()
        if 'num_agencies_in_program' in ghg_model_df.columns:
            # Log transform the GHG efficiency for better model fit on the skewed scale
            ghg_model_df['log_efficiency'] = np.log(ghg_model_df['ghg_efficiency'])
            try:
                regression_results['ghg_efficiency'] = _fit_agency_models(ghg_model_df, 'log_efficiency')
            except Exception as e:
                logger.warning(f"Could not perform regression analysis for GHG efficiency: {e}")
    if 'dac_benefit_percentage' in analysis_df.columns:
        dac_model_df = analysis_df[
            analysis_df['dac_benefit_percentage'].notna()
        ].copy()
        if 'num_agencies_in_program' in dac_model_df.columns:
            try:
                regression_results['dac_benefit'] = _fit_agency_models(dac_model_df, 'dac_benefit_percentage')
            except Exception as e:
                logger.warning(f"Could not perform regression analysis for DAC benefit: {e}")
    return regression_results