# --- RQ2 report sections: collaboration trends + time-series regression ---
# Fragment of the markdown-report writer (its enclosing function begins before
# this chunk): `rq2` is the RQ2 findings dict and `f` the open report file.
# All lookups use dict-membership guards / .get() defaults so the report still
# renders when an analysis step produced only partial findings.
# NOTE(review): indentation reconstructed from a whitespace-mangled source;
# nesting of the inner guards is inferred — confirm against version control.
if 'collaboration_trends' in rq2:
    collab_trends = rq2['collaboration_trends']
    f.write("### Changes in Collaboration Patterns\n\n")
    if 'time_correlation' in collab_trends:
        # Correlation of year vs. average number of agencies per project.
        corr = collab_trends['time_correlation']
        f.write(f"The correlation between time (year) and collaboration intensity (average number of agencies) is {corr.get('correlation', 0):.2f} ")
        if corr.get('significant', False):
            f.write(f"(statistically significant, p={corr.get('p_value', 1):.4f}). ")
        else:
            f.write(f"(not statistically significant, p={corr.get('p_value', 1):.4f}). ")
        # Sign of the correlation drives the narrative sentence.
        if corr.get('correlation', 0) > 0:
            f.write("This indicates a general trend of increasing collaboration over the study period.\n\n")
        else:
            f.write("This indicates a general trend of decreasing collaboration over the study period.\n\n")
    if 'regional_distribution' in collab_trends:
        f.write("The distribution of projects across different regional scopes has changed over time.\n\n")
        # Figure file is produced elsewhere; the report only embeds a reference.
        f.write("![Regional Trends](regional_trends.png)\n\n")
if 'time_regression' in rq2:
    regression = rq2['time_regression']
    f.write("### Time Series Regression Analysis\n\n")
    # The three sub-sections below are intentionally parallel: trends in
    # collaboration intensity, GHG efficiency, and DAC benefit.
    if 'agencies_model' in regression:
        model = regression['agencies_model']
        f.write("#### Trend in Collaboration Intensity\n\n")
        if 'interpretation' in model:
            f.write(f"{model['interpretation']} ")
        if model.get('significant', False):
            f.write(f"This trend is statistically significant (p={model.get('p_value', 1):.4f}).\n\n")
        else:
            f.write(f"This trend is not statistically significant (p={model.get('p_value', 1):.4f}).\n\n")
    if 'efficiency_model' in regression:
        model = regression['efficiency_model']
        f.write("#### Trend in GHG Efficiency\n\n")
        if 'interpretation' in model:
            f.write(f"{model['interpretation']} ")
        if model.get('significant', False):
            f.write(f"This trend is statistically significant (p={model.get('p_value', 1):.4f}).\n\n")
        else:
            f.write(f"This trend is not statistically significant (p={model.get('p_value', 1):.4f}).\n\n")
    if 'dac_model' in regression:
        model = regression['dac_model']
        # (DAC-benefit trend sub-section continues on the next source line.)
f.write("#### Trend in DAC Benefit\n\n") if 'interpretation' in model: f.write(f"{model['interpretation']} ") if model.get('significant', False): f.write(f"This trend is statistically significant (p={model.get('p_value', 1):.4f}).\n\n") else: f.write(f"This trend is not statistically significant (p={model.get('p_value', 1):.4f}).\n\n") # Regional Variations f.write("## Regional Variations in Collaboration Effectiveness\n\n") if 'regional_patterns' in findings: regional = findings['regional_patterns'] if 'regional_metrics' in regional: metrics = regional['regional_metrics'] f.write("### Regional Distribution and Characteristics\n\n") f.write("The analysis identified significant variations in collaboration patterns and outcomes across different regions of California.\n\n") f.write("![Regional Metrics](regional_metrics.png)\n\n") f.write("### Efficiency-Equity Tradeoffs by Region\n\n") if 'efficiency_equity_tradeoff' in regional: tradeoff = regional['efficiency_equity_tradeoff'] f.write("The analysis examined the tradeoff between GHG efficiency and equity outcomes across regions.\n\n") if 'top_regions' in tradeoff and len(tradeoff['top_regions']) > 0: top_region = tradeoff['top_regions'][0] f.write(f"The {top_region} region achieves the best overall balance between efficiency and equity outcomes. 
") # Include top 3 regions if available if len(tradeoff['top_regions']) >= 3: top_3 = ", ".join(tradeoff['top_regions'][:3]) f.write(f"The top three regions in terms of combined efficiency and equity performance are: {top_3}.\n\n") else: f.write("\n\n") f.write("![Efficiency-Equity Tradeoff](efficiency_equity_tradeoff.png)\n\n") if 'efficiency_equity_correlation' in regional: corr = regional['efficiency_equity_correlation'] corr_val = corr.get('correlation', 0) f.write(f"The correlation between GHG efficiency cost and DAC benefit percentage across regions is {corr_val:.2f} ") if corr.get('significant', False): f.write(f"(statistically significant, p={corr.get('p_value', 1):.4f}). ") else: f.write(f"(not statistically significant, p={corr.get('p_value', 1):.4f}). ") if corr_val > 0.3: f.write("This positive correlation suggests a tradeoff between efficiency and equity, where regions with better efficiency (lower cost per ton) tend to have lower DAC benefits.\n\n") elif corr_val < -0.3: f.write("This negative correlation suggests synergy between efficiency and equity, where regions with better efficiency (lower cost per ton) tend to have higher DAC benefits.\n\n") else: f.write("This weak correlation suggests that efficiency and equity outcomes may be largely independent of each other across regions.\n\n") if 'regional_anova' in regional: anova = regional['regional_anova'] f.write("### Statistical Comparison of Regions\n\n") if 'ghg_efficiency' in anova: efficiency = anova['ghg_efficiency'] f.write("#### Regional Differences in GHG Efficiency\n\n") if efficiency.get('significant', False): f.write(f"There are statistically significant differences in GHG efficiency across regions (F={efficiency.get('f_value', 0):.2f}, p={efficiency.get('p_value', 1):.4f}).\n\n") if 'pairwise_tests' in efficiency: sig_pairs = [pair for pair, result in efficiency['pairwise_tests'].items() if result.get('significant', False)] if len(sig_pairs) > 0: f.write("The significant regional 
differences include:\n\n") for pair in sig_pairs[:3]: # Show up to 3 significant pairs f.write(f"- {pair}\n") if len(sig_pairs) > 3: f.write(f"- And {len(sig_pairs) - 3} other significant regional differences\n") f.write("\n") else: f.write(f"There are no statistically significant differences in GHG efficiency across regions (F={efficiency.get('f_value', 0):.2f}, p={efficiency.get('p_value', 1):.4f}).\n\n") if 'dac_benefit' in anova: dac = anova['dac_benefit'] f.write("#### Regional Differences in DAC Benefit\n\n") if dac.get('significant', False): f.write(f"There are statistically significant differences in DAC benefit percentage across regions (F={dac.get('f_value', 0):.2f}, p={dac.get('p_value', 1):.4f}).\n\n") if 'pairwise_tests' in dac: sig_pairs = [pair for pair, result in dac['pairwise_tests'].items() if result.get('significant', False)] if len(sig_pairs) > 0: f.write("The significant regional differences include:\n\n") for pair in sig_pairs[:3]: # Show up to 3 significant pairs f.write(f"- {pair}\n") if len(sig_pairs) > 3: f.write(f"- And {len(sig_pairs) - 3} other significant regional differences\n") f.write("\n") else: f.write(f"There are no statistically significant differences in DAC benefit percentage across regions (F={dac.get('f_value', 0):.2f}, p={dac.get('p_value', 1):.4f}).\n\n") # Conclusions and Implications f.write("## Conclusions and Implications\n\n") f.write("### Key Findings\n\n") # Summarize key findings from RQ1 f.write("#### Research Question 1: Collaboration Structure and Scale\n\n") if 'rq1' in findings: rq1 = findings['rq1'] conclusions_rq1 = [] # Multi-agency vs single-agency comparison if 'ghg_efficiency' in rq1 and 'dac_benefit' in rq1: eff = rq1['ghg_efficiency'] dac = rq1['dac_benefit'] eff_diff = eff.get('difference', 0) dac_diff = dac.get('difference', 0) if eff_diff < 0 and dac_diff > 0: conclusions_rq1.append("Multi-agency programs demonstrate a clear tradeoff: they are less cost-efficient but achieve higher equity outcomes 
compared to single-agency programs.") elif eff_diff < 0 and dac_diff < 0: conclusions_rq1.append("Multi-agency programs underperform single-agency programs in both cost efficiency and equity outcomes, suggesting potential coordination challenges.") elif eff_diff > 0 and dac_diff > 0: conclusions_rq1.append("Multi-agency programs outperform single-agency programs in both cost efficiency and equity outcomes, demonstrating synergistic benefits of collaboration.") elif eff_diff > 0 and dac_diff < 0: conclusions_rq1.append("Multi-agency programs show better cost efficiency but lower equity outcomes compared to single-agency programs.") # Agency count correlation findings if 'agency_count_impact' in rq1: agency_impact = rq1['agency_count_impact'] if 'efficiency_correlation' in agency_impact and 'dac_correlation' in agency_impact: eff_corr = agency_impact['efficiency_correlation'].get('correlation', 0) dac_corr = agency_impact['dac_correlation'].get('correlation', 0) if abs(eff_corr) > 0.1: if eff_corr > 0: conclusions_rq1.append("Increasing the number of collaborating agencies tends to reduce cost efficiency (higher $ per ton).") else: conclusions_rq1.append("Increasing the number of collaborating agencies tends to improve cost efficiency (lower $ per ton).") if abs(dac_corr) > 0.1: if dac_corr > 0: conclusions_rq1.append("Increasing the number of collaborating agencies tends to improve equity outcomes (higher DAC benefit).") else: conclusions_rq1.append("Increasing the number of collaborating agencies tends to reduce equity outcomes (lower DAC benefit).") # Regional scope findings if 'regional_scope_impact' in rq1: scope_impact = rq1['regional_scope_impact'] if 'avg_agencies' in scope_impact and 'Multi-Regional' in scope_impact['avg_agencies']: multi_regional_avg = scope_impact['avg_agencies']['Multi-Regional'] single_county_avg = scope_impact['avg_agencies'].get('Single County', 0) if multi_regional_avg > single_county_avg: conclusions_rq1.append(f"Projects with wider 
geographic scope involve significantly more collaborative partners (Multi-Regional: {multi_regional_avg:.1f} agencies vs. Single County: {single_county_avg:.1f} agencies).") for conclusion in conclusions_rq1: f.write(f"- {conclusion}\n") f.write("\n") # Summarize key findings from RQ2 f.write("#### Research Question 2: Temporal Trends\n\n") if 'rq2' in findings: rq2 = findings['rq2'] conclusions_rq2 = [] # Pre/post 2020 comparison if 'period_comparison' in rq2: period = rq2['period_comparison'] if 'avg_agencies_change' in period: change = period.get('avg_agencies_change', 0) if abs(change) > 5: # Only include if the change is substantial if change > 0: conclusions_rq2.append(f"Collaboration intensity increased significantly ({change:.1f}%) after 2020, indicating a shift toward more collaborative approaches.") else: conclusions_rq2.append(f"Collaboration intensity decreased significantly ({abs(change):.1f}%) after 2020, indicating a shift toward more independent approaches.") if 'avg_funding_change' in period: change = period.get('avg_funding_change', 0) if abs(change) > 10: # Only include if the change is substantial if change > 0: conclusions_rq2.append(f"Average project funding increased substantially ({change:.1f}%) after 2020, suggesting a shift toward larger-scale initiatives.") else: conclusions_rq2.append(f"Average project funding decreased substantially ({abs(change):.1f}%) after 2020, suggesting a shift toward smaller-scale initiatives.") if 'dac_benefit_change' in period: change = period.get('dac_benefit_change', 0) if abs(change) > 5: # Only include if the change is substantial if change > 0: conclusions_rq2.append(f"Equity outcomes improved ({change:.1f}% increase in DAC benefit) after 2020, reflecting increased emphasis on disadvantaged communities.") else: conclusions_rq2.append(f"Equity outcomes declined ({abs(change):.1f}% decrease in DAC benefit) after 2020, suggesting potential challenges in maintaining equity focus.") # Time series regression 
findings if 'time_regression' in rq2: regression = rq2['time_regression'] if 'agencies_model' in regression and regression['agencies_model'].get('significant', False): conclusions_rq2.append(regression['agencies_model'].get('interpretation', '')) if 'efficiency_model' in regression and regression['efficiency_model'].get('significant', False): conclusions_rq2.append(regression['efficiency_model'].get('interpretation', '')) if 'dac_model' in regression and regression['dac_model'].get('significant', False): conclusions_rq2.append(regression['dac_model'].get('interpretation', '')) for conclusion in conclusions_rq2: f.write(f"- {conclusion}\n") f.write("\n") # Summarize regional variations f.write("#### Regional Variations\n\n") if 'regional_patterns' in findings: regional = findings['regional_patterns'] conclusions_regional = [] if 'efficiency_equity_tradeoff' in regional: tradeoff = regional['efficiency_equity_tradeoff'] if 'top_regions' in tradeoff and len(tradeoff['top_regions']) > 0: top_region = tradeoff['top_regions'][0] conclusions_regional.append(f"The {top_region} region achieves the most balanced performance between efficiency and equity outcomes.") if 'efficiency_equity_correlation' in regional: corr = regional['efficiency_equity_correlation'] corr_val = corr.get('correlation', 0) if abs(corr_val) > 0.3: if corr_val > 0: conclusions_regional.append("There is evidence of a tradeoff between efficiency and equity across regions, suggesting potential policy tensions.") else: conclusions_regional.append("There is evidence of synergy between efficiency and equity across regions, suggesting potential for policies that achieve both goals.") if 'regional_anova' in regional: anova = regional['regional_anova'] if 'ghg_efficiency' in anova and anova['ghg_efficiency'].get('significant', False): conclusions_regional.append("There are statistically significant differences in GHG efficiency across regions, indicating that geographic context matters for cost-effectiveness.") 
# Fragment continues from the previous source line: `anova` is the
# regional_anova findings dict inside the regional_patterns guard, and
# `conclusions_regional` was initialised earlier in that guard.
# NOTE(review): indentation reconstructed from a whitespace-mangled source —
# confirm nesting depths against version control.
                if 'dac_benefit' in anova and anova['dac_benefit'].get('significant', False):
                    conclusions_regional.append("There are statistically significant differences in DAC benefit across regions, suggesting variation in equity focus or capabilities.")
            # Emit the collected regional conclusions as a bullet list.
            for conclusion in conclusions_regional:
                f.write(f"- {conclusion}\n")
            f.write("\n")
        # --- Policy Implications ---
        f.write("### Policy Implications\n\n")
        implications = []
        # Derive implications from the RQ1 efficiency/equity differences.
        # difference > 0 / < 0 encodes the multi-agency vs. single-agency gap
        # as computed upstream (defaults of 0 fall through to no implication).
        if 'rq1' in findings:
            rq1 = findings['rq1']
            if 'ghg_efficiency' in rq1 and 'dac_benefit' in rq1:
                eff = rq1['ghg_efficiency']
                dac = rq1['dac_benefit']
                eff_diff = eff.get('difference', 0)
                dac_diff = dac.get('difference', 0)
                if eff_diff < 0 and dac_diff > 0:
                    implications.append("Policymakers face a tradeoff between efficiency and equity when designing collaborative structures. Multi-agency programs may prioritize equity at some cost to efficiency.")
                elif eff_diff < 0 and dac_diff < 0:
                    implications.append("Coordination challenges in multi-agency programs may need to be addressed through improved governance structures and clearer role definitions.")
                elif eff_diff > 0 and dac_diff > 0:
                    implications.append("Further investment in multi-agency collaborative structures is warranted given their superior performance in both efficiency and equity dimensions.")
        # RQ2: only a post-2020 *increase* in collaboration yields an implication.
        if 'rq2' in findings:
            rq2 = findings['rq2']
            if 'period_comparison' in rq2:
                period = rq2['period_comparison']
                if 'avg_agencies_change' in period:
                    change = period.get('avg_agencies_change', 0)
                    if change > 0:
                        implications.append("The trend toward increased collaboration after 2020 suggests policy support for collaborative approaches is having an effect and could be further strengthened.")
        # Regional patterns: highlight the best-balanced region as a model.
        if 'regional_patterns' in findings:
            regional = findings['regional_patterns']
            if 'efficiency_equity_tradeoff' in regional:
                tradeoff = regional['efficiency_equity_tradeoff']
                if 'top_regions' in tradeoff and len(tradeoff['top_regions']) > 0:
                    top_region = tradeoff['top_regions'][0]
                    # (implication sentence continues on the next source line.)
implications.append(f"The successful balance achieved in the {top_region} region could serve as a model for other regions seeking to improve both efficiency and equity outcomes.") # Add some general implications implications.append("Careful design of collaboration structures is needed to manage potential tradeoffs between efficiency and equity goals.") implications.append("Regional context matters for program implementation and outcomes, suggesting that customized approaches may be more effective than one-size-fits-all policies.") implications.append("Regular monitoring and evaluation of collaboration patterns and outcomes can help identify best practices and areas for improvement.") for implication in implications: f.write(f"- {implication}\n") f.write("\n") # Future Research f.write("### Future Research Directions\n\n") future_research = [] future_research.append("Investigate the specific mechanisms through which multi-agency collaboration affects program outcomes, including both benefits and potential coordination challenges.") future_research.append("Explore the role of different types of agencies (state, local, non-profit) in collaborative structures and their impact on program effectiveness.") future_research.append("Conduct case studies of high-performing regions to identify transferable best practices for balancing efficiency and equity goals.") future_research.append("Analyze the impact of policy changes around 2020 on program implementation and outcomes.") future_research.append("Examine the long-term sustainability of climate investments and the durability of their greenhouse gas reduction benefits.") for research in future_research: f.write(f"- {research}\n") logger.info(f"Research report generated and saved to {report_file}") if __name__ == "__main__": import argparse parser = argparse.ArgumentParser(description='California Climate Investments Collaboration Analysis') parser.add_argument('--data_path', type=str, required=True, help='Path to the cleaned 
CCI data CSV file') parser.add_argument('--output_path', type=str, default='./output/research', help='Path to save outputs') args = parser.parse_args() # Run the research analysis findings = analyze_research_questions(args.data_path, args.output_path) if findings: print(f"\nAnalysis complete! Results saved to {args.output_path}") else: print("\nAnalysis failed to complete successfully") # The above code is creating a scatter plot visualization to compare efficiency vs. equity by region. # Here is a breakdown of the code: # 3. Regional ANOVA analysis if ('ca_region' in analysis_df.columns and 'ghg_efficiency' in analysis_df.columns and 'dac_benefit_percentage' in analysis_df.columns): regional_anova = {} # ANOVA for GHG efficiency by region try: valid_df = analysis_df[analysis_df['ghg_efficiency'].notna() & (analysis_df['ghg_efficiency'] > 0)].copy() if len(valid_df) > 0: # Log transform efficiency for better normality valid_df['log_efficiency'] = np.log(valid_df['ghg_efficiency']) # Create ANOVA model model = smf.ols('log_efficiency ~ C(ca_region)', data=valid_df) results = model.fit() # Perform ANOVA anova_table = sm.stats.anova_lm(results, typ=2) regional_anova['ghg_efficiency'] = { 'f_value': anova_table['F'][0], 'p_value': anova_table['PR(>F)'][0], 'significant': anova_table['PR(>F)'][0] < 0.05, 'r_squared': results.rsquared } # Post-hoc pairwise t-tests if ANOVA is significant if regional_anova['ghg_efficiency']['significant']: # Get regions with at least 10 projects region_counts = valid_df['ca_region'].value_counts() valid_regions = region_counts[region_counts >= 10].index # Filter to valid regions posthoc_df = valid_df[valid_df['ca_region'].isin(valid_regions)] if len(valid_regions) > 1: # Calculate pairwise t-tests pairwise_results = {} for i, region1 in enumerate(valid_regions): for region2 in valid_regions[i+1:]: group1 = posthoc_df[posthoc_df['ca_region'] == region1]['log_efficiency'] group2 = posthoc_df[posthoc_df['ca_region'] == 
region2]['log_efficiency'] try: t_stat, p_value = stats.ttest_ind( group1, group2, equal_var=False # Use Welch's t-test ) # Store results with Bonferroni correction n_comparisons = len(valid_regions) * (len(valid_regions) - 1) / 2 corrected_p = min(p_value * n_comparisons, 1.0) pairwise_results[f"{region1} vs {region2}"] = { 't_statistic': t_stat, 'p_value': p_value, 'corrected_p': corrected_p, 'significant': corrected_p < 0.05 } except Exception: continue regional_anova['ghg_efficiency']['pairwise_tests'] = pairwise_results except Exception as e: logger.warning(f"Could not perform ANOVA for GHG efficiency by region: {e}") # ANOVA for DAC benefit by region try: # Create ANOVA model model = smf.ols('dac_benefit_percentage ~ C(ca_region)', data=analysis_df) results = model.fit() # Perform ANOVA anova_table = sm.stats.anova_lm(results, typ=2) regional_anova['dac_benefit'] = { 'f_value': anova_table['F'][0], 'p_value': anova_table['PR(>F)'][0], 'significant': anova_table['PR(>F)'][0] < 0.05, 'r_squared': results.rsquared } # Post-hoc pairwise t-tests if ANOVA is significant if regional_anova['dac_benefit']['significant']: # Get regions with at least 10 projects region_counts = analysis_df['ca_region'].value_counts() valid_regions = region_counts[region_counts >= 10].index # Filter to valid regions posthoc_df = analysis_df[analysis_df['ca_region'].isin(valid_regions)] if len(valid_regions) > 1: # Calculate pairwise t-tests pairwise_results = {} for i, region1 in enumerate(valid_regions): for region2 in valid_regions[i+1:]: group1 = posthoc_df[posthoc_df['ca_region'] == region1]['dac_benefit_percentage'] group2 = posthoc_df[posthoc_df['ca_region'] == region2]['dac_benefit_percentage'] try: t_stat, p_value = stats.ttest_ind( group1, group2, equal_var=False # Use Welch's t-test ) # Store results with Bonferroni correction n_comparisons = len(valid_regions) * (len(valid_regions) - 1) / 2 corrected_p = min(p_value * n_comparisons, 1.0) pairwise_results[f"{region1} vs 
{region2}"] = { 't_statistic': t_stat, 'p_value': p_value, 'corrected_p': corrected_p, 'significant': corrected_p < 0.05 } except Exception: continue regional_anova['dac_benefit']['pairwise_tests'] = pairwise_results except Exception as e: logger.warning(f"Could not perform ANOVA for DAC benefit by region: {e}") findings['regional_anova'] = regional_anova return findings def generate_research_report(findings, output_dir): """ Generate a comprehensive research report based on the analysis findings. Parameters: findings (dict): Dictionary containing research findings output_dir (Path): Directory to save the report """ logger.info("Generating research report") # Create report file report_file = output_dir / "research_report.md" with open(report_file, 'w') as f: # Title and introduction f.write("# California Climate Investments Collaboration Analysis\n\n") f.write("## Research Report\n\n") f.write("This report presents findings from an analysis of collaboration patterns in California's Climate Investments (CCI) program from 2015 to 2024, focusing on how inter-agency collaboration affects greenhouse gas (GHG) reduction efficiencies and equity outcomes.\n\n") # Executive summary f.write("## Executive Summary\n\n") # Highlight key findings exec_summary = [] # RQ1: Collaboration Structure Impact if 'rq1' in findings: rq1 = findings['rq1'] if 'ghg_efficiency' in rq1: eff = rq1['ghg_efficiency'] diff = eff.get('difference', 0) if diff < 0: exec_summary.append(f"Multi-agency programs are less cost-efficient than single-agency programs (${eff.get('multi_agency_median', 0):,.2f} vs. ${eff.get('single_agency_median', 0):,.2f} per ton CO2e).") else: exec_summary.append(f"Multi-agency programs are more cost-efficient than single-agency programs (${eff.get('multi_agency_median', 0):,.2f} vs. 
${eff.get('single_agency_median', 0):,.2f} per ton CO2e).") if 'dac_benefit' in rq1: dac = rq1['dac_benefit'] diff = dac.get('difference', 0) if diff > 0: exec_summary.append(f"Multi-agency programs achieve higher equity outcomes than single-agency programs ({dac.get('multi_agency_mean', 0):.2f}% vs. {dac.get('single_agency_mean', 0):.2f}% DAC benefit).") else: exec_summary.append(f"Multi-agency programs achieve lower equity outcomes than single-agency programs ({dac.get('multi_agency_mean', 0):.2f}% vs. {dac.get('single_agency_mean', 0):.2f}% DAC benefit).") # RQ2: Temporal Trends if 'rq2' in findings: rq2 = findings['rq2'] if 'period_comparison' in rq2: period = rq2['period_comparison'] if 'avg_agencies_change' in period: change = period.get('avg_agencies_change', 0) if change > 0: exec_summary.append(f"Collaboration intensity increased by {change:.1f}% from pre-2020 to post-2020 periods.") else: exec_summary.append(f"Collaboration intensity decreased by {abs(change):.1f}% from pre-2020 to post-2020 periods.") if 'avg_funding_change' in period: change = period.get('avg_funding_change', 0) if change > 0: exec_summary.append(f"Average project funding increased by {change:.1f}% after 2020.") else: exec_summary.append(f"Average project funding decreased by {abs(change):.1f}% after 2020.") if 'dac_benefit_change' in period: change = period.get('dac_benefit_change', 0) if change > 0: exec_summary.append(f"DAC benefit percentage increased by {change:.1f}% after 2020.") else: exec_summary.append(f"DAC benefit percentage decreased by {abs(change):.1f}% after 2020.") # Regional variations if 'regional_patterns' in findings: regional = findings['regional_patterns'] if 'efficiency_equity_tradeoff' in regional: tradeoff = regional['efficiency_equity_tradeoff'] if 'top_regions' in tradeoff and len(tradeoff['top_regions']) > 0: top_region = tradeoff['top_regions'][0] exec_summary.append(f"The {top_region} region demonstrates the best balance between GHG efficiency and equity 
outcomes.") if 'efficiency_equity_correlation' in regional: corr = regional['efficiency_equity_correlation'] corr_val = corr.get('correlation', 0) if abs(corr_val) > 0.3: if corr_val > 0: exec_summary.append(f"There is a positive correlation ({corr_val:.2f}) between GHG efficiency cost and equity outcomes, suggesting a tradeoff between these goals.") else: exec_summary.append(f"There is a negative correlation ({corr_val:.2f}) between GHG efficiency cost and equity outcomes, suggesting synergy between these goals.") # Write executive summary bullets for point in exec_summary: f.write(f"- {point}\n") f.write("\n## Research Questions\n\n") f.write("1. How does the structure and scale of inter-agency collaboration within the California Climate Investments (CCI) program shape outputs related to greenhouse gas (GHG) reduction efficiencies and equity?\n") f.write("2. What temporal trends relative to funding allocation, agency collaboration, and collaborative outputs related to efficiency and equity outcomes have occurred from 2015 to 2024?\n\n") # Research Question 1: Collaboration Structure Impact f.write("## Research Question 1: Impact of Collaboration Structure and Scale\n\n") if 'rq1' in findings: rq1 = findings['rq1'] f.write("### Multi-Agency vs. Single-Agency Program Comparison\n\n") if 'project_counts' in rq1: counts = rq1['project_counts'] f.write(f"The dataset includes {counts.get('multi_agency', 0):,} projects in multi-agency programs ({counts.get('multi_agency_percentage', 0):.1f}% of non-EV projects) and {counts.get('single_agency', 0):,} projects in single-agency programs.\n\n") if 'ghg_efficiency' in rq1: eff = rq1['ghg_efficiency'] f.write("#### GHG Efficiency Comparison\n\n") f.write(f"The median GHG efficiency (cost per ton of CO2e reduced) for multi-agency programs is ${eff.get('multi_agency_median', 0):,.2f}, compared to ${eff.get('single_agency_median', 0):,.2f} for single-agency programs. 
This represents a difference of ${abs(eff.get('difference', 0)):,.2f} per ton ") if eff.get('difference', 0) < 0: f.write("in favor of single-agency programs.\n\n") else: f.write("in favor of multi-agency programs.\n\n") if 'statistical_test' in eff: test = eff['statistical_test'] if test.get('significant', False): f.write(f"This difference is statistically significant (p={test.get('p_value', 1):.4f}).\n\n") else: f.write(f"This difference is not statistically significant (p={test.get('p_value', 1):.4f}).\n\n") f.write("![GHG Efficiency Comparison](ghg_efficiency_comparison.png)\n\n") if 'dac_benefit' in rq1: dac = rq1['dac_benefit'] f.write("#### Disadvantaged Community (DAC) Benefit Comparison\n\n") f.write(f"The mean DAC benefit percentage for multi-agency programs is {dac.get('multi_agency_mean', 0):.2f}%, compared to {dac.get('single_agency_mean', 0):.2f}% for single-agency programs. This represents a difference of {abs(dac.get('difference', 0)):.2f}% ") if dac.get('difference', 0) > 0: f.write("in favor of multi-agency programs.\n\n") else: f.write("in favor of single-agency programs.\n\n") if 'statistical_test' in dac: test = dac['statistical_test'] if test.get('significant', False): f.write(f"This difference is statistically significant (p={test.get('p_value', 1):.4f}).\n\n") else: f.write(f"This difference is not statistically significant (p={test.get('p_value', 1):.4f}).\n\n") f.write("![DAC Benefit Comparison](dac_benefit_comparison.png)\n\n") f.write("### Impact of Number of Agencies\n\n") if 'agency_count_impact' in rq1: agency_impact = rq1['agency_count_impact'] f.write("The analysis examined how the number of collaborating agencies affects program outcomes.\n\n") if 'efficiency_correlation' in agency_impact: corr = agency_impact['efficiency_correlation'] f.write(f"The correlation between number of agencies and GHG efficiency is {corr.get('correlation', 0):.2f} ") if corr.get('significant', False): f.write(f"(statistically significant, 
p={corr.get('p_value', 1):.4f}). ") else: f.write(f"(not statistically significant, p={corr.get('p_value', 1):.4f}). ") if corr.get('correlation', 0) > 0: f.write("This suggests that as the number of agencies increases, the cost per ton of GHG reduction tends to increase (less efficient).\n\n") else: f.write("This suggests that as the number of agencies increases, the cost per ton of GHG reduction tends to decrease (more efficient).\n\n") if 'dac_correlation' in agency_impact: corr = agency_impact['dac_correlation'] f.write(f"The correlation between number of agencies and DAC benefit percentage is {corr.get('correlation', 0):.2f} ") if corr.get('significant', False): f.write(f"(statistically significant, p={corr.get('p_value', 1):.4f}). ") else: f.write(f"(not statistically significant, p={corr.get('p_value', 1):.4f}). ") if corr.get('correlation', 0) > 0: f.write("This suggests that as the number of agencies increases, the DAC benefit percentage tends to increase.\n\n") else: f.write("This suggests that as the number of agencies increases, the DAC benefit percentage tends to decrease.\n\n") f.write("![Agency Count Impact](agency_count_impact.png)\n\n") f.write("### Impact of Regional Scope\n\n") if 'regional_scope_impact' in rq1: scope_impact = rq1['regional_scope_impact'] f.write("The analysis examined how the regional scope of projects affects collaboration intensity and outcomes.\n\n") if 'avg_agencies' in scope_impact: f.write("#### Collaboration Intensity by Regional Scope\n\n") # Check if Multi-Regional has the highest average if 'Multi-Regional' in scope_impact['avg_agencies']: multi_regional_avg = scope_impact['avg_agencies']['Multi-Regional'] single_county_avg = scope_impact['avg_agencies'].get('Single County', 0) f.write(f"Multi-Regional projects involve an average of {multi_regional_avg:.2f} agencies, compared to {single_county_avg:.2f} for Single County projects. 
") f.write("This indicates that projects with wider geographic scope tend to involve more collaborative partners.\n\n") if 'ghg_efficiency' in scope_impact and 'dac_benefit' in scope_impact: f.write("#### Efficiency and Equity by Regional Scope\n\n") # Identify best and worst scopes for efficiency if len(scope_impact['ghg_efficiency']) > 0: best_eff_scope = min(scope_impact['ghg_efficiency'].items(), key=lambda x: x[1])[0] worst_eff_scope = max(scope_impact['ghg_efficiency'].items(), key=lambda x: x[1])[0] f.write(f"The {best_eff_scope} scope achieves the best GHG efficiency (${scope_impact['ghg_efficiency'][best_eff_scope]:,.2f} per ton), while the {worst_eff_scope} scope has the least efficient outcomes (${scope_impact['ghg_efficiency'][worst_eff_scope]:,.2f} per ton).\n\n") # Identify best and worst scopes for DAC benefit if len(scope_impact['dac_benefit']) > 0: best_dac_scope = max(scope_impact['dac_benefit'].items(), key=lambda x: x[1])[0] worst_dac_scope = min(scope_impact['dac_benefit'].items(), key=lambda x: x[1])[0] f.write(f"The {best_dac_scope} scope achieves the highest DAC benefit ({scope_impact['dac_benefit'][best_dac_scope]:.2f}%), while the {worst_dac_scope} scope has the lowest equity outcomes ({scope_impact['dac_benefit'][worst_dac_scope]:.2f}%).\n\n") f.write("![Regional Scope Impact](regional_scope_impact.png)\n\n") if 'regression_analysis' in rq1: regression = rq1['regression_analysis'] f.write("### Regression Analysis\n\n") if 'ghg_efficiency' in regression: ghg_model = regression['ghg_efficiency'] if 'base_model' in ghg_model: base = ghg_model['base_model'] f.write("#### GHG Efficiency Model\n\n") f.write(f"A regression model predicting GHG efficiency from the number of agencies explains {base.get('r_squared', 0)*100:.1f}% of the variance (R²). ") if base.get('p_value', 1) < 0.05: f.write(f"The effect of agency count is statistically significant (p={base.get('p_value', 1):.4f}). 
") if base.get('coefficient', 0) > 0: f.write("Each additional collaborating agency is associated with higher costs per ton of GHG reduction (less efficient).\n\n") else: f.write("Each additional collaborating agency is associated with lower costs per ton of GHG reduction (more efficient).\n\n") else: f.write(f"The effect of agency count is not statistically significant (p={base.get('p_value', 1):.4f}).\n\n") if 'dac_benefit' in regression: dac_model = regression['dac_benefit'] if 'base_model' in dac_model: base = dac_model['base_model'] f.write("#### DAC Benefit Model\n\n") f.write(f"A regression model predicting DAC benefit percentage from the number of agencies explains {base.get('r_squared', 0)*100:.1f}% of the variance (R²). ") if base.get('p_value', 1) < 0.05: f.write(f"The effect of agency count is statistically significant (p={base.get('p_value', 1):.4f}). ") if base.get('coefficient', 0) > 0: f.write("Each additional collaborating agency is associated with a higher DAC benefit percentage.\n\n") else: f.write("Each additional collaborating agency is associated with a lower DAC benefit percentage.\n\n") else: f.write(f"The effect of agency count is not statistically significant (p={base.get('p_value', 1):.4f}).\n\n") # Research Question 2: Temporal Trends f.write("## Research Question 2: Temporal Trends (2015-2024)\n\n") if 'rq2' in findings: rq2 = findings['rq2'] f.write("### Yearly Trends in Collaboration and Outcomes\n\n") if 'yearly' in rq2: yearly = rq2['yearly'] f.write("The analysis tracked changes in collaboration patterns and program outcomes from 2015 to 2024.\n\n") f.write("![Yearly Trends](yearly_trends.png)\n\n") f.write("### Pre-2020 vs. 
Post-2020 Comparison\n\n") if 'period_comparison' in rq2: period = rq2['period_comparison'] f.write("A comparison of the pre-2020 and post-2020 periods reveals significant changes in the CCI program.\n\n") if 'project_count_change' in period: change = period.get('project_count_change', 0) pre_count = period['project_counts'].get('Pre-2020', 0) post_count = period['project_counts'].get('Post-2020', 0) f.write(f"The number of projects changed from {pre_count:,} (pre-2020) to {post_count:,} (post-2020), a {change:+.1f}% change.\n\n") if 'avg_agencies_change' in period: change = period.get('avg_agencies_change', 0) pre_agencies = period['avg_agencies'].get('Pre-2020', 0) post_agencies = period['avg_agencies'].get('Post-2020', 0) f.write(f"The average number of agencies per program increased from {pre_agencies:.2f} to {post_agencies:.2f}, a {change:+.1f}% change. ") if change > 0: f.write("This indicates an increase in collaboration intensity after 2020.\n\n") else: f.write("This indicates a decrease in collaboration intensity after 2020.\n\n") if 'avg_funding_change' in period: change = period.get('avg_funding_change', 0) pre_funding = period['avg_funding'].get('Pre-2020', 0) post_funding = period['avg_funding'].get('Post-2020', 0) f.write(f"The average project funding changed from ${pre_funding:,.2f} to ${post_funding:,.2f}, a {change:+.1f}% change.\n\n") if 'ghg_efficiency_change' in period: change = period.get('ghg_efficiency_change', 0) pre_eff = period['median_ghg_efficiency'].get('Pre-2020', 0) post_eff = period['median_ghg_efficiency'].get('Post-2020', 0) f.write(f"The median GHG efficiency changed from ${pre_eff:,.2f} to ${post_eff:,.2f} per ton CO2e, a {change:+.1f}% change. 
def analyze_regional_variations(df, output_dir):
    """
    Analyze regional variations in collaboration and outcomes.

    Computes per-region project counts, collaboration intensity, funding,
    GHG efficiency and DAC benefit, saves summary figures, and scores an
    efficiency-equity tradeoff per region.

    Parameters:
    df (pd.DataFrame): The CCI data
    output_dir (Path): Directory to save outputs

    Returns:
    dict: Findings related to regional variations
    """
    logger.info("Analyzing regional variations")

    findings = {}

    # Exclude EV vouchers for this analysis if they're marked
    if 'is_ev_voucher' in df.columns:
        analysis_df = df[~df['is_ev_voucher']].copy()
        logger.info(f"Excluded {len(df) - len(analysis_df)} EV vouchers from analysis")
    else:
        analysis_df = df.copy()

    # Check if we have regional data
    if 'ca_region' not in analysis_df.columns:
        logger.error("California region data not available for regional analysis")
        return findings

    # 1. Calculate metrics by region
    regional_metrics = {}

    # Group by region
    region_groups = analysis_df.groupby('ca_region')

    # Project counts by region
    regional_metrics['project_counts'] = region_groups.size().to_dict()
    regional_metrics['project_percentages'] = (region_groups.size() / len(analysis_df) * 100).to_dict()

    # Number of agencies by region
    if 'num_agencies_in_program' in analysis_df.columns:
        regional_metrics['avg_agencies'] = region_groups['num_agencies_in_program'].mean().to_dict()
        regional_metrics['multi_agency_percentage'] = (region_groups['is_multi_agency'].mean() * 100).to_dict()

    # Funding metrics by region
    funding_col = [col for col in analysis_df.columns
                   if 'total_program' in col.lower() and 'funding' in col.lower()]
    if funding_col and len(funding_col) > 0:
        funding_col = funding_col[0]
        regional_metrics['total_funding'] = region_groups[funding_col].sum().to_dict()
        regional_metrics['avg_funding'] = region_groups[funding_col].mean().to_dict()

        # Calculate funding share
        total_funding = analysis_df[funding_col].sum()
        regional_metrics['funding_percentages'] = {
            region: funding / total_funding * 100
            for region, funding in regional_metrics['total_funding'].items()
        }

    # GHG efficiency by region
    if 'ghg_efficiency' in analysis_df.columns:
        # Use valid values only and median for robustness to outliers
        valid_df = analysis_df[analysis_df['ghg_efficiency'].notna() & (analysis_df['ghg_efficiency'] > 0)]
        if len(valid_df) > 0:
            region_efficiency = valid_df.groupby('ca_region')['ghg_efficiency'].median()
            regional_metrics['median_ghg_efficiency'] = region_efficiency.to_dict()

    # DAC benefit by region
    if 'dac_benefit_percentage' in analysis_df.columns:
        regional_metrics['avg_dac_benefit'] = region_groups['dac_benefit_percentage'].mean().to_dict()

    findings['regional_metrics'] = regional_metrics

    # Create visualization for regional metrics
    plt.figure(figsize=(15, 12))

    # 1. Project distribution by region
    plt.subplot(2, 2, 1)
    if 'project_counts' in regional_metrics:
        regions = list(regional_metrics['project_counts'].keys())
        counts = list(regional_metrics['project_counts'].values())
        # Sort by count
        sorted_data = sorted(zip(regions, counts), key=lambda x: x[1], reverse=True)
        regions, counts = zip(*sorted_data)
        plt.barh(regions, counts)
        plt.xlabel('Number of Projects')
        plt.title('Project Distribution by Region')
        plt.grid(True, axis='x', alpha=0.3)

    # 2. Average number of agencies by region
    plt.subplot(2, 2, 2)
    if 'avg_agencies' in regional_metrics:
        regions = list(regional_metrics['avg_agencies'].keys())
        values = list(regional_metrics['avg_agencies'].values())
        # Sort by average agencies
        sorted_data = sorted(zip(regions, values), key=lambda x: x[1], reverse=True)
        regions, values = zip(*sorted_data)
        plt.barh(regions, values)
        plt.xlabel('Average Number of Agencies')
        plt.title('Collaboration Intensity by Region')
        plt.grid(True, axis='x', alpha=0.3)

    # 3. GHG efficiency by region
    plt.subplot(2, 2, 3)
    if 'median_ghg_efficiency' in regional_metrics:
        regions = list(regional_metrics['median_ghg_efficiency'].keys())
        values = list(regional_metrics['median_ghg_efficiency'].values())
        # Sort by efficiency (lower is better)
        sorted_data = sorted(zip(regions, values), key=lambda x: x[1])
        regions, values = zip(*sorted_data)
        plt.barh(regions, values)
        plt.xlabel('GHG Efficiency ($ per ton CO2e)')
        plt.title('GHG Efficiency by Region')
        plt.grid(True, axis='x', alpha=0.3)

    # 4. DAC benefit by region
    plt.subplot(2, 2, 4)
    if 'avg_dac_benefit' in regional_metrics:
        regions = list(regional_metrics['avg_dac_benefit'].keys())
        values = list(regional_metrics['avg_dac_benefit'].values())
        # Sort by DAC benefit (higher is better)
        sorted_data = sorted(zip(regions, values), key=lambda x: x[1], reverse=True)
        regions, values = zip(*sorted_data)
        plt.barh(regions, values)
        plt.xlabel('DAC Benefit Percentage')
        plt.title('DAC Benefit by Region')
        plt.grid(True, axis='x', alpha=0.3)

    plt.tight_layout()
    plt.savefig(output_dir / "regional_metrics.png", dpi=300, bbox_inches='tight')
    plt.close()

    # 2. Analyze efficiency-equity tradeoffs by region
    if ('median_ghg_efficiency' in regional_metrics and
            'avg_dac_benefit' in regional_metrics and
            'avg_agencies' in regional_metrics):
        # BUG FIX: the three metric dicts can cover different region sets
        # (median_ghg_efficiency is computed on the valid-efficiency subset
        # only), so zipping their raw value lists could misalign regions or
        # produce unequal-length columns. Build rows only for regions that
        # appear in all three dicts.
        shared_regions = [region for region in regional_metrics['median_ghg_efficiency']
                          if region in regional_metrics['avg_dac_benefit']
                          and region in regional_metrics['avg_agencies']]
        tradeoff_df = pd.DataFrame({
            'region': shared_regions,
            'efficiency': [regional_metrics['median_ghg_efficiency'][r] for r in shared_regions],
            'dac_benefit': [regional_metrics['avg_dac_benefit'][r] for r in shared_regions],
            'avg_agencies': [regional_metrics['avg_agencies'][r] for r in shared_regions],
        })

        if len(tradeoff_df) > 0:
            # Calculate composite score (combines efficiency and equity).
            # First normalize the metrics to [0, 1]; guard against a zero
            # range, which would otherwise divide by zero.
            eff_range = tradeoff_df['efficiency'].max() - tradeoff_df['efficiency'].min()
            dac_range = tradeoff_df['dac_benefit'].max() - tradeoff_df['dac_benefit'].min()
            tradeoff_df['efficiency_norm'] = (
                (tradeoff_df['efficiency'] - tradeoff_df['efficiency'].min()) / eff_range
                if eff_range > 0 else 0.0
            )
            tradeoff_df['dac_norm'] = (
                (tradeoff_df['dac_benefit'] - tradeoff_df['dac_benefit'].min()) / dac_range
                if dac_range > 0 else 0.0
            )

            # Since lower efficiency ($/ton) is better, invert the normalized value
            tradeoff_df['efficiency_inv'] = 1 - tradeoff_df['efficiency_norm']

            # Calculate composite score (equal weight to efficiency and equity)
            tradeoff_df['composite_score'] = (tradeoff_df['efficiency_inv'] + tradeoff_df['dac_norm']) / 2

            # Store the results
            findings['efficiency_equity_tradeoff'] = {
                'metrics': tradeoff_df.to_dict(orient='records'),
                'top_regions': tradeoff_df.sort_values('composite_score', ascending=False)['region'].tolist()
            }

            # Check for correlation between efficiency and equity
            # (pearsonr needs at least two observations)
            correlation = p_value = None
            if len(tradeoff_df) >= 2:
                correlation, p_value = stats.pearsonr(tradeoff_df['efficiency'], tradeoff_df['dac_benefit'])
                findings['efficiency_equity_correlation'] = {
                    'correlation_type': 'Pearson',
                    'correlation': correlation,
                    'p_value': p_value,
                    'significant': p_value < 0.05
                }

            # Create visualization for efficiency-equity tradeoff
            plt.figure(figsize=(10, 8))

            # Scatter plot with bubble size representing average number of agencies
            plt.scatter(
                tradeoff_df['efficiency'],
                tradeoff_df['dac_benefit'],
                s=tradeoff_df['avg_agencies'] * 50,  # Size scaled by avg agencies
                alpha=0.7,
                c=tradeoff_df['composite_score'],  # Color by composite score
                cmap='viridis'
            )

            # Add labels for each region
            for i, row in tradeoff_df.iterrows():
                plt.annotate(
                    row['region'],
                    (row['efficiency'], row['dac_benefit']),
                    textcoords="offset points",
                    xytext=(5, 5),
                    ha='left'
                )

            # Add correlation information
            if correlation is not None:
                plt.text(0.05, 0.95, f"Correlation: {correlation:.2f} (p={p_value:.4f})",
                         transform=plt.gca().transAxes, fontsize=10)

            plt.xlabel('GHG Efficiency ($ per ton CO2e)')
            plt.ylabel('DAC Benefit Percentage')
            plt.title('Efficiency-Equity Tradeoff by Region')

            # BUG FIX: the tradeoff figure was never saved or closed, and the
            # function fell through without returning — callers that store the
            # result (findings['regional_patterns']) received None.
            plt.savefig(output_dir / "efficiency_equity_tradeoff.png", dpi=300, bbox_inches='tight')
            plt.close()

    return findings
def analyze_temporal_trends(df, output_dir):
    # BUG FIX: a stray `...` (Ellipsis statement) preceded the triple-quoted
    # string below, which turned the intended docstring into a dead string
    # statement (function __doc__ stayed None). The Ellipsis is removed.
    """
    Analyze temporal trends in collaboration and outcomes from 2015-2024.

    Parameters:
    df (pd.DataFrame): The CCI data
    output_dir (Path): Directory to save outputs

    Returns:
    dict: Findings related to temporal trends
    """
    logger.info("Analyzing temporal trends (2015-2024)")

    findings = {}

    # Exclude EV vouchers for this analysis if they're marked
    if 'is_ev_voucher' in df.columns:
        analysis_df = df[~df['is_ev_voucher']].copy()
        logger.info(f"Excluded {len(df) - len(analysis_df)} EV vouchers from analysis")
    else:
        analysis_df = df.copy()

    # Check if we have the required temporal data
    if 'funding_year' not in analysis_df.columns:
        logger.error("Funding year data not available for temporal analysis")
        return findings

    # 1. Calculate yearly metrics
    yearly_metrics = {}

    # Group by funding year
    yearly_groups = analysis_df.groupby('funding_year')

    # Project counts by year
    yearly_metrics['project_counts'] = yearly_groups.size().to_dict()

    # Number of agencies by year
    if 'num_agencies_in_program' in analysis_df.columns:
        yearly_metrics['avg_agencies'] = yearly_groups['num_agencies_in_program'].mean().to_dict()
        yearly_metrics['multi_agency_percentage'] = (yearly_groups['is_multi_agency'].mean() * 100).to_dict()

    # Funding metrics by year
    funding_col = [col for col in analysis_df.columns
                   if 'total_program' in col.lower() and 'funding' in col.lower()]
    if funding_col and len(funding_col) > 0:
        funding_col = funding_col[0]
        yearly_metrics['total_funding'] = yearly_groups[funding_col].sum().to_dict()
        yearly_metrics['avg_funding'] = yearly_groups[funding_col].mean().to_dict()

    # GHG efficiency by year
    if 'ghg_efficiency' in analysis_df.columns:
        # Use valid values only and median for robustness to outliers
        yearly_valid = analysis_df[analysis_df['ghg_efficiency'].notna() & (analysis_df['ghg_efficiency'] > 0)]
        yearly_efficiency = yearly_valid.groupby('funding_year')['ghg_efficiency'].median()
        yearly_metrics['median_ghg_efficiency'] = yearly_efficiency.to_dict()

    # DAC benefit by year
    if 'dac_benefit_percentage' in analysis_df.columns:
        yearly_metrics['avg_dac_benefit'] = yearly_groups['dac_benefit_percentage'].mean().to_dict()

    findings['yearly'] = yearly_metrics

    # Create visualization for yearly trends
    plt.figure(figsize=(15, 12))

    # 1. Number of agencies over time
    plt.subplot(2, 2, 1)
    if 'avg_agencies' in yearly_metrics:
        years = sorted(yearly_metrics['avg_agencies'].keys())
        values = [yearly_metrics['avg_agencies'][year] for year in years]
        plt.plot(years, values, marker='o', linestyle='-', linewidth=2)
        plt.axvline(x=2020, color='r', linestyle='--', alpha=0.7, label='2020')
        plt.xlabel('Funding Year')
        plt.ylabel('Average Number of Agencies')
        plt.title('Collaboration Intensity Over Time')
        plt.grid(True, alpha=0.3)
        plt.legend()

    # 2. Average project funding over time
    plt.subplot(2, 2, 2)
    if 'avg_funding' in yearly_metrics:
        years = sorted(yearly_metrics['avg_funding'].keys())
        values = [yearly_metrics['avg_funding'][year] for year in years]
        plt.plot(years, values, marker='o', linestyle='-', linewidth=2)
        plt.axvline(x=2020, color='r', linestyle='--', alpha=0.7, label='2020')
        plt.xlabel('Funding Year')
        plt.ylabel('Average Project Funding ($)')
        plt.title('Project Funding Over Time')
        plt.grid(True, alpha=0.3)
        plt.legend()

    # 3. GHG efficiency over time
    plt.subplot(2, 2, 3)
    if 'median_ghg_efficiency' in yearly_metrics:
        years = sorted(yearly_metrics['median_ghg_efficiency'].keys())
        values = [yearly_metrics['median_ghg_efficiency'][year] for year in years]
        plt.plot(years, values, marker='o', linestyle='-', linewidth=2)
        plt.axvline(x=2020, color='r', linestyle='--', alpha=0.7, label='2020')
        plt.xlabel('Funding Year')
        plt.ylabel('GHG Efficiency ($ per ton CO2e)')
        plt.title('GHG Efficiency Over Time')
        plt.grid(True, alpha=0.3)
        plt.legend()

    # 4. DAC benefit over time
    plt.subplot(2, 2, 4)
    if 'avg_dac_benefit' in yearly_metrics:
        years = sorted(yearly_metrics['avg_dac_benefit'].keys())
        values = [yearly_metrics['avg_dac_benefit'][year] for year in years]
        plt.plot(years, values, marker='o', linestyle='-', linewidth=2)
        plt.axvline(x=2020, color='r', linestyle='--', alpha=0.7, label='2020')
        plt.xlabel('Funding Year')
        plt.ylabel('DAC Benefit Percentage')
        plt.title('DAC Benefit Over Time')
        plt.grid(True, alpha=0.3)
        plt.legend()

    plt.tight_layout()
    plt.savefig(output_dir / "yearly_trends.png", dpi=300, bbox_inches='tight')
    plt.close()

    # 2. Compare pre-2020 and post-2020 periods
    if 'period' in analysis_df.columns:
        period_metrics = {}

        # Group by period
        period_groups = analysis_df.groupby('period')

        # Project counts by period
        period_metrics['project_counts'] = period_groups.size().to_dict()

        # Calculate percentage change from pre-2020 to post-2020
        pre_count = period_metrics['project_counts'].get('Pre-2020', 0)
        post_count = period_metrics['project_counts'].get('Post-2020', 0)
        period_metrics['project_count_change'] = ((post_count - pre_count) / pre_count * 100) if pre_count > 0 else np.nan

        # Number of agencies by period
        if 'num_agencies_in_program' in analysis_df.columns:
            period_metrics['avg_agencies'] = period_groups['num_agencies_in_program'].mean().to_dict()
            period_metrics['multi_agency_percentage'] = (period_groups['is_multi_agency'].mean() * 100).to_dict()

            # Calculate percentage change
            pre_agencies = period_metrics['avg_agencies'].get('Pre-2020', 0)
            post_agencies = period_metrics['avg_agencies'].get('Post-2020', 0)
            period_metrics['avg_agencies_change'] = ((post_agencies - pre_agencies) / pre_agencies * 100) if pre_agencies > 0 else np.nan

        # Funding metrics by period (funding_col resolved above; empty list if absent)
        if funding_col:
            period_metrics['total_funding'] = period_groups[funding_col].sum().to_dict()
            period_metrics['avg_funding'] = period_groups[funding_col].mean().to_dict()

            # Calculate percentage change
            pre_funding = period_metrics['avg_funding'].get('Pre-2020', 0)
            post_funding = period_metrics['avg_funding'].get('Post-2020', 0)
            period_metrics['avg_funding_change'] = ((post_funding - pre_funding) / pre_funding * 100) if pre_funding > 0 else np.nan

        # GHG efficiency by period
        if 'ghg_efficiency' in analysis_df.columns:
            valid_df = analysis_df[analysis_df['ghg_efficiency'].notna() & (analysis_df['ghg_efficiency'] > 0)]
            if len(valid_df) > 0:
                period_efficiency = valid_df.groupby('period')['ghg_efficiency'].median()
                period_metrics['median_ghg_efficiency'] = period_efficiency.to_dict()

                # Calculate percentage change
                pre_efficiency = period_metrics['median_ghg_efficiency'].get('Pre-2020', 0)
                post_efficiency = period_metrics['median_ghg_efficiency'].get('Post-2020', 0)
                period_metrics['ghg_efficiency_change'] = ((post_efficiency - pre_efficiency) / pre_efficiency * 100) if pre_efficiency > 0 else np.nan

                # Statistical test
                try:
                    pre_group = valid_df[valid_df['period'] == 'Pre-2020']['ghg_efficiency']
                    post_group = valid_df[valid_df['period'] == 'Post-2020']['ghg_efficiency']
                    if len(pre_group) > 0 and len(post_group) > 0:
                        u_stat, p_value = stats.mannwhitneyu(
                            pre_group,
                            post_group,
                            alternative='two-sided'
                        )
                        period_metrics['ghg_efficiency_test'] = {
                            'test': 'Mann-Whitney U',
                            'u_statistic': u_stat,
                            'p_value': p_value,
                            'significant': p_value < 0.05
                        }
                except Exception as e:
                    logger.warning(f"Could not perform statistical test on GHG efficiency by period: {e}")

        # DAC benefit by period
        if 'dac_benefit_percentage' in analysis_df.columns:
            period_metrics['avg_dac_benefit'] = period_groups['dac_benefit_percentage'].mean().to_dict()

            # Calculate percentage change
            pre_dac = period_metrics['avg_dac_benefit'].get('Pre-2020', 0)
            post_dac = period_metrics['avg_dac_benefit'].get('Post-2020', 0)
            period_metrics['dac_benefit_change'] = ((post_dac - pre_dac) / pre_dac * 100) if pre_dac > 0 else np.nan

            # Statistical test
            try:
                pre_group = analysis_df[analysis_df['period'] == 'Pre-2020']['dac_benefit_percentage']
                post_group = analysis_df[analysis_df['period'] == 'Post-2020']['dac_benefit_percentage']
                if len(pre_group) > 0 and len(post_group) > 0:
                    t_stat, p_value = stats.ttest_ind(
                        pre_group,
                        post_group,
                        equal_var=False  # Use Welch's t-test to handle unequal variances
                    )
                    period_metrics['dac_benefit_test'] = {
                        'test': "Welch's t-test",
                        't_statistic': t_stat,
                        'p_value': p_value,
                        'significant': p_value < 0.05
                    }
            except Exception as e:
                logger.warning(f"Could not perform statistical test on DAC benefit by period: {e}")

        findings['period_comparison'] = period_metrics

        # Create visualization for pre/post-2020 comparison
        plt.figure(figsize=(12, 10))

        # 1. Project counts by period
        plt.subplot(2, 2, 1)
        if 'project_counts' in period_metrics:
            periods = ['Pre-2020', 'Post-2020']
            counts = [period_metrics['project_counts'].get(period, 0) for period in periods]
            plt.bar(periods, counts)
            plt.xlabel('Period')
            plt.ylabel('Number of Projects')
            plt.title('Project Counts: Pre-2020 vs. Post-2020')

            # Add percentage change
            if 'project_count_change' in period_metrics:
                change = period_metrics['project_count_change']
                plt.text(1, counts[1] * 1.05, f"{change:+.1f}%", ha='center')
            plt.grid(True, axis='y', alpha=0.3)

        # 2. Average number of agencies by period
        plt.subplot(2, 2, 2)
        if 'avg_agencies' in period_metrics:
            periods = ['Pre-2020', 'Post-2020']
            values = [period_metrics['avg_agencies'].get(period, 0) for period in periods]
            plt.bar(periods, values)
            plt.xlabel('Period')
            plt.ylabel('Average Number of Agencies')
            plt.title('Collaboration Intensity: Pre-2020 vs. Post-2020')

            # Add percentage change
            if 'avg_agencies_change' in period_metrics:
                change = period_metrics['avg_agencies_change']
                plt.text(1, values[1] * 1.05, f"{change:+.1f}%", ha='center')
            plt.grid(True, axis='y', alpha=0.3)

        # 3. GHG efficiency by period
        plt.subplot(2, 2, 3)
        if 'median_ghg_efficiency' in period_metrics:
            periods = ['Pre-2020', 'Post-2020']
            values = [period_metrics['median_ghg_efficiency'].get(period, 0) for period in periods]
            plt.bar(periods, values)
            plt.xlabel('Period')
            plt.ylabel('GHG Efficiency ($ per ton CO2e)')
            plt.title('GHG Efficiency: Pre-2020 vs. Post-2020')

            # Add percentage change
            if 'ghg_efficiency_change' in period_metrics:
                change = period_metrics['ghg_efficiency_change']
                plt.text(1, values[1] * 1.05, f"{change:+.1f}%", ha='center')

            # Add statistical significance
            if 'ghg_efficiency_test' in period_metrics and period_metrics['ghg_efficiency_test'].get('significant', False):
                plt.text(0.5, max(values) * 1.1, "Statistically significant difference (p<0.05)", ha='center', fontsize=10)
            plt.grid(True, axis='y', alpha=0.3)

        # 4. DAC benefit by period
        plt.subplot(2, 2, 4)
        if 'avg_dac_benefit' in period_metrics:
            periods = ['Pre-2020', 'Post-2020']
            values = [period_metrics['avg_dac_benefit'].get(period, 0) for period in periods]
            plt.bar(periods, values)
            plt.xlabel('Period')
            plt.ylabel('DAC Benefit Percentage')
            plt.title('DAC Benefit: Pre-2020 vs. Post-2020')

            # Add percentage change
            if 'dac_benefit_change' in period_metrics:
                change = period_metrics['dac_benefit_change']
                plt.text(1, values[1] * 1.05, f"{change:+.1f}%", ha='center')

            # Add statistical significance
            if 'dac_benefit_test' in period_metrics and period_metrics['dac_benefit_test'].get('significant', False):
                plt.text(0.5, max(values) * 1.1, "Statistically significant difference (p<0.05)", ha='center', fontsize=10)
            plt.grid(True, axis='y', alpha=0.3)

        plt.tight_layout()
        plt.savefig(output_dir / "pre_post_2020_comparison.png", dpi=300, bbox_inches='tight')
        plt.close()

    # 3. Analyze changes in collaboration patterns over time
    if 'num_agencies_in_program' in analysis_df.columns and 'funding_year' in analysis_df.columns:
        # Create a time series of collaboration indicators
        collab_trends = {}

        # Average agencies over time
        collab_trends['avg_agencies'] = yearly_metrics.get('avg_agencies', {})

        # Percentage of multi-agency projects over time
        collab_trends['multi_agency_percentage'] = yearly_metrics.get('multi_agency_percentage', {})

        # Analyze correlation between time and collaboration intensity
        try:
            years = sorted(collab_trends['avg_agencies'].keys())
            values = [collab_trends['avg_agencies'][year] for year in years]

            # Calculate correlation
            correlation, p_value = stats.pearsonr(years, values)
            collab_trends['time_correlation'] = {
                'correlation_type': 'Pearson',
                'correlation': correlation,
                'p_value': p_value,
                'significant': p_value < 0.05
            }
        except Exception as e:
            logger.warning(f"Could not calculate correlation between time and collaboration: {e}")

        # Regional distribution over time (if available)
        if 'regional_scope' in analysis_df.columns:
            # Create a cross-tabulation of funding year and regional scope
            try:
                crosstab = pd.crosstab(
                    analysis_df['funding_year'],
                    analysis_df['regional_scope'],
                    normalize='index'
                ).multiply(100)

                collab_trends['regional_distribution'] = {}
                for year in crosstab.index:
                    collab_trends['regional_distribution'][int(year)] = {}
                    for scope in crosstab.columns:
                        collab_trends['regional_distribution'][int(year)][scope] = crosstab.loc[year, scope]
            except Exception as e:
                logger.warning(f"Error calculating regional distribution over time: {e}")

        findings['collaboration_trends'] = collab_trends

        # Create visualization for collaboration trends
        plt.figure(figsize=(12, 10))

        # 1. Average agencies and multi-agency percentage over time
        plt.subplot(2, 1, 1)
        if 'avg_agencies' in collab_trends and 'multi_agency_percentage' in collab_trends:
            years = sorted(collab_trends['avg_agencies'].keys())
            avg_values = [collab_trends['avg_agencies'][year] for year in years]
            pct_values = [collab_trends['multi_agency_percentage'][year] for year in years]

            # Create figure with two y-axes
            fig, ax1 = plt.subplots(figsize=(10, 6))

            # Plot average agencies
            color = 'tab:blue'
            ax1.set_xlabel('Funding Year')
            ax1.set_ylabel('Average Number of Agencies', color=color)
            ax1.plot(years, avg_values, color=color, marker='o', linestyle='-', linewidth=2)
            ax1.tick_params(axis='y', labelcolor=color)

            # Create second y-axis for percentage
            ax2 = ax1.twinx()
            color = 'tab:red'
            ax2.set_ylabel('Multi-Agency Projects (%)', color=color)
            ax2.plot(years, pct_values, color=color, marker='s', linestyle='-', linewidth=2)
            ax2.tick_params(axis='y', labelcolor=color)

            # Add vertical line at 2020
            plt.axvline(x=2020, color='black', linestyle='--', alpha=0.7)

            # Add correlation information
            if 'time_correlation' in collab_trends:
                corr = collab_trends['time_correlation']['correlation']
                p_val = collab_trends['time_correlation']['p_value']
                plt.figtext(0.15, 0.85, f"Time-Collaboration Correlation: {corr:.2f} (p={p_val:.4f})",
                            fontsize=10, bbox=dict(facecolor='white', alpha=0.8))

            plt.title('Collaboration Intensity Over Time')
            fig.tight_layout()
            plt.savefig(output_dir / "collaboration_trends.png", dpi=300, bbox_inches='tight')
            plt.close()

        # 2. Regional distribution over time (if available)
        if 'regional_distribution' in collab_trends:
            try:
                # Convert the nested dict to a DataFrame for easier plotting
                region_years = []
                region_scopes = []
                region_values = []
                for year, scopes in collab_trends['regional_distribution'].items():
                    for scope, value in scopes.items():
                        region_years.append(year)
                        region_scopes.append(scope)
                        region_values.append(value)
                region_df = pd.DataFrame({
                    'Year': region_years,
                    'Scope': region_scopes,
                    'Percentage': region_values
                })

                # Pivot for stacked area chart
                pivot_df = region_df.pivot(index='Year', columns='Scope', values='Percentage')

                # Define the order for regional scope
                scope_order = ['Single County', 'Limited Regional', 'Regional', 'Multi-Regional']

                # Reorder columns if possible
                ordered_columns = [col for col in scope_order if col in pivot_df.columns]
                missing_columns = [col for col in pivot_df.columns if col not in ordered_columns]
                pivot_df = pivot_df[ordered_columns + missing_columns]

                # Create stacked area chart
                plt.figure(figsize=(10, 6))
                plt.stackplot(pivot_df.index, pivot_df.values.T, labels=pivot_df.columns, alpha=0.7)
                plt.xlabel('Funding Year')
                plt.ylabel('Percentage of Projects')
                plt.title('Regional Scope Distribution Over Time')
                plt.legend(loc='upper left')
                plt.grid(True, alpha=0.3)

                # Add vertical line at 2020
                plt.axvline(x=2020, color='black', linestyle='--', alpha=0.7)
                plt.savefig(output_dir / "regional_trends.png", dpi=300, bbox_inches='tight')
                plt.close()
            except Exception as e:
                logger.warning(f"Error creating regional distribution visualization: {e}")

    # 4. Time series regression analysis
    time_regression = {}

    if 'num_agencies_in_program' in analysis_df.columns and 'funding_year' in analysis_df.columns:
        try:
            # Create time variable and center it at 2020 for easier interpretation
            analysis_df['year_centered'] = analysis_df['funding_year'] - 2020

            # Regression model predicting number of agencies from year
            model1 = sm.OLS.from_formula('num_agencies_in_program ~ year_centered', data=analysis_df)
            results1 = model1.fit()
            time_regression['agencies_model'] = {
                'r_squared': results1.rsquared,
                'coefficient': results1.params['year_centered'],
                'p_value': results1.pvalues['year_centered'],
                'significant': results1.pvalues['year_centered'] < 0.05
            }

            # Interpret the coefficient
            coef = results1.params['year_centered']
            if coef > 0:
                time_regression['agencies_model']['interpretation'] = f"The number of agencies per program increases by {coef:.3f} each year."
            else:
                time_regression['agencies_model']['interpretation'] = f"The number of agencies per program decreases by {abs(coef):.3f} each year."
        except Exception as e:
            logger.warning(f"Could not perform time series regression for number of agencies: {e}")

    # Regression model predicting GHG efficiency from year
    if 'ghg_efficiency' in analysis_df.columns and 'funding_year' in analysis_df.columns:
        try:
            # Use log transformation for efficiency
            valid_df = analysis_df[analysis_df['ghg_efficiency'].notna() & (analysis_df['ghg_efficiency'] > 0)].copy()
            if len(valid_df) > 0:
                valid_df['log_efficiency'] = np.log(valid_df['ghg_efficiency'])
                valid_df['year_centered'] = valid_df['funding_year'] - 2020
                model2 = sm.OLS.from_formula('log_efficiency ~ year_centered', data=valid_df)
                results2 = model2.fit()
                time_regression['efficiency_model'] = {
                    'r_squared': results2.rsquared,
                    'coefficient': results2.params['year_centered'],
                    'p_value': results2.pvalues['year_centered'],
                    'significant': results2.pvalues['year_centered'] < 0.05
                }

                # Interpret the coefficient (exponentiate for percentage change)
                coef = results2.params['year_centered']
                pct_change = (np.exp(coef) - 1) * 100
                if pct_change > 0:
                    time_regression['efficiency_model']['interpretation'] = f"GHG efficiency worsens by approximately {pct_change:.1f}% each year (higher $ per ton)."
                else:
                    time_regression['efficiency_model']['interpretation'] = f"GHG efficiency improves by approximately {abs(pct_change):.1f}% each year (lower $ per ton)."
        except Exception as e:
            logger.warning(f"Could not perform time series regression for GHG efficiency: {e}")

    # Regression model predicting DAC benefit from year
    if 'dac_benefit_percentage' in analysis_df.columns and 'funding_year' in analysis_df.columns:
        try:
            # Create time variable
            analysis_df['year_centered'] = analysis_df['funding_year'] - 2020
            model3 = sm.OLS.from_formula('dac_benefit_percentage ~ year_centered', data=analysis_df)
            results3 = model3.fit()
            time_regression['dac_model'] = {
                'r_squared': results3.rsquared,
                'coefficient': results3.params['year_centered'],
                'p_value': results3.pvalues['year_centered'],
                'significant': results3.pvalues['year_centered'] < 0.05
            }

            # Interpret the coefficient
            coef = results3.params['year_centered']
            if coef > 0:
                time_regression['dac_model']['interpretation'] = f"DAC benefit percentage increases by {coef:.3f} percentage points each year."
            else:
                time_regression['dac_model']['interpretation'] = f"DAC benefit percentage decreases by {abs(coef):.3f} percentage points each year."
        except Exception as e:
            logger.warning(f"Could not perform time series regression for DAC benefit: {e}")

    findings['time_regression'] = time_regression

    return findings
def analyze_research_questions(data_path, output_path="./output/research"):
    """
    Analyze the key research questions about collaboration in CCI.

    Loads the cleaned dataset, runs each research-question analysis in
    turn, and writes the combined report and figures under ``output_path``.

    Parameters:
    data_path (str): Path to the cleaned CCI data file
    output_path (str): Path to save outputs

    Returns:
    dict: Dictionary containing research findings
    """
    logger.info(f"Loading data from {data_path}")

    # Load the data; bail out early if the file cannot be read
    try:
        df = pd.read_csv(data_path)
    except Exception as e:
        logger.error(f"Error loading data: {e}")
        return None

    logger.info(f"Successfully loaded {len(df)} rows")

    # Create output directory
    output_dir = Path(output_path)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Each entry pairs a findings key with the analysis that produces it:
    #   rq1               - structure/scale of collaboration vs outcomes
    #   rq2               - temporal trends 2015-2024
    #   regional_patterns - supplementary regional variation analysis
    analyses = (
        ('rq1', analyze_collaboration_structure_impact),
        ('rq2', analyze_temporal_trends),
        ('regional_patterns', analyze_regional_variations),
    )

    findings = {}
    try:
        for key, run_analysis in analyses:
            findings[key] = run_analysis(df, output_dir)

        # Generate comprehensive report
        generate_research_report(findings, output_dir)

        logger.info(f"Analysis complete, results saved to {output_dir}")
        return findings
    except Exception as e:
        logger.error(f"Error in research analysis: {e}")
        return None
Compare multi-agency vs single-agency projects if 'is_multi_agency' in analysis_df.columns: multi_df = analysis_df[analysis_df['is_multi_agency']] single_df = analysis_df[~analysis_df['is_multi_agency']] findings['project_counts'] = { 'multi_agency': len(multi_df), 'single_agency': len(single_df), 'multi_agency_percentage': len(multi_df) / len(analysis_df) * 100 if len(analysis_df) > 0 else 0 } # GHG efficiency comparison if 'ghg_efficiency' in analysis_df.columns: # Use valid efficiency data only valid_multi = multi_df[multi_df['ghg_efficiency'].notna() & (multi_df['ghg_efficiency'] > 0)] valid_single = single_df[single_df['ghg_efficiency'].notna() & (single_df['ghg_efficiency'] > 0)] if len(valid_multi) > 0 and len(valid_single) > 0: # Calculate median efficiency ($ per ton CO2e) multi_efficiency = valid_multi['ghg_efficiency'].median() single_efficiency = valid_single['ghg_efficiency'].median() findings['ghg_efficiency'] = { 'multi_agency_median': multi_efficiency, 'single_agency_median': single_efficiency, 'difference': single_efficiency - multi_efficiency, 'difference_percentage': ((single_efficiency - multi_efficiency) / single_efficiency * 100) if single_efficiency > 0 else 0 } # Statistical test (Mann-Whitney U test for non-parametric comparison) try: u_stat, p_value = stats.mannwhitneyu( valid_multi['ghg_efficiency'], valid_single['ghg_efficiency'], alternative='two-sided' ) findings['ghg_efficiency']['statistical_test'] = { 'test': 'Mann-Whitney U', 'u_statistic': u_stat, 'p_value': p_value, 'significant': p_value < 0.05 } except Exception as e: logger.warning(f"Could not perform statistical test on efficiency comparison: {e}") # Create visualization for efficiency comparison plt.figure(figsize=(10, 6)) # Use log scale for better visualization since efficiency can be highly skewed bins = np.logspace( np.log10(min(valid_multi['ghg_efficiency'].min(), valid_single['ghg_efficiency'].min())), np.log10(max(valid_multi['ghg_efficiency'].quantile(0.95), 
valid_single['ghg_efficiency'].quantile(0.95))), 30 ) plt.hist(valid_single['ghg_efficiency'], bins=bins, alpha=0.5, label=f'Single-Agency (Median: ${single_efficiency:.2f})') plt.hist(valid_multi['ghg_efficiency'], bins=bins, alpha=0.5, label=f'Multi-Agency (Median: ${multi_efficiency:.2f})') plt.xscale('log') plt.xlabel('GHG Efficiency ($ per ton CO2e) - Log Scale') plt.ylabel('Number of Projects') plt.title('GHG Efficiency: Multi-Agency vs. Single-Agency Programs') plt.legend() plt.grid(True, alpha=0.3) # Save the figure plt.savefig(output_dir / "ghg_efficiency_comparison.png", dpi=300, bbox_inches='tight') plt.close() # DAC benefit comparison if 'dac_benefit_percentage' in analysis_df.columns: findings['dac_benefit'] = { 'multi_agency_mean': multi_df['dac_benefit_percentage'].mean(), 'single_agency_mean': single_df['dac_benefit_percentage'].mean(), 'difference': multi_df['dac_benefit_percentage'].mean() - single_df['dac_benefit_percentage'].mean() } # Statistical test try: t_stat, p_value = stats.ttest_ind( multi_df['dac_benefit_percentage'], single_df['dac_benefit_percentage'], equal_var=False # Use Welch's t-test to handle unequal variances ) findings['dac_benefit']['statistical_test'] = { 'test': "Welch's t-test", 't_statistic': t_stat, 'p_value': p_value, 'significant': p_value < 0.05 } except Exception as e: logger.warning(f"Could not perform statistical test on DAC benefit comparison: {e}") # Create visualization for DAC benefit comparison plt.figure(figsize=(10, 6)) # Create a boxplot with outlier points sns.boxplot(x='is_multi_agency', y='dac_benefit_percentage', data=analysis_df) # Add a swarm plot of the data points sns.swarmplot(x='is_multi_agency', y='dac_benefit_percentage', data=analysis_df, alpha=0.5, size=4, edgecolor='black', linewidth=0.5) # Set labels and title plt.xlabel('Multi-Agency Program') plt.ylabel('DAC Benefit Percentage') plt.title('DAC Benefit: Multi-Agency vs. 
Single-Agency Programs') plt.xticks([0, 1], ['Single-Agency', 'Multi-Agency']) plt.grid(True, axis='y', alpha=0.3) # Save the figure plt.savefig(output_dir / "dac_benefit_comparison.png", dpi=300, bbox_inches='tight') plt.close() # 2. Analyze relationship between number of agencies and outcomes if 'num_agencies_in_program' in analysis_df.columns: # Group by number of agencies agency_groups = analysis_df.groupby('num_agencies_in_program') # Calculate metrics for each group agency_metrics = {} # Project counts by number of agencies agency_counts = agency_groups.size() agency_metrics['project_counts'] = agency_counts.to_dict() # Filter to groups with at least 5 projects for more reliable metrics valid_groups = agency_counts[agency_counts >= 5].index # Calculate GHG efficiency by number of agencies if 'ghg_efficiency' in analysis_df.columns: efficiency_by_agency = agency_groups['ghg_efficiency'].median() efficiency_by_agency = efficiency_by_agency[valid_groups] agency_metrics['ghg_efficiency'] = efficiency_by_agency.to_dict() # Calculate correlation using Spearman rank correlation # (more robust to non-linear relationships and outliers) valid_df = analysis_df[analysis_df['ghg_efficiency'].notna() & (analysis_df['ghg_efficiency'] > 0)] if len(valid_df) > 0: spearman_corr, p_value = stats.spearmanr( valid_df['num_agencies_in_program'], valid_df['ghg_efficiency'] ) agency_metrics['efficiency_correlation'] = { 'correlation_type': 'Spearman rank', 'correlation': spearman_corr, 'p_value': p_value, 'significant': p_value < 0.05 } # Calculate DAC benefit by number of agencies if 'dac_benefit_percentage' in analysis_df.columns: dac_by_agency = agency_groups['dac_benefit_percentage'].mean() dac_by_agency = dac_by_agency[valid_groups] agency_metrics['dac_benefit'] = dac_by_agency.to_dict() # Calculate correlation valid_df = analysis_df[analysis_df['dac_benefit_percentage'].notna()] if len(valid_df) > 0: spearman_corr, p_value = stats.spearmanr( 
valid_df['num_agencies_in_program'], valid_df['dac_benefit_percentage'] ) agency_metrics['dac_correlation'] = { 'correlation_type': 'Spearman rank', 'correlation': spearman_corr, 'p_value': p_value, 'significant': p_value < 0.05 } findings['agency_count_impact'] = agency_metrics # Create visualization for agency count impact plt.figure(figsize=(12, 10)) # 1. GHG Efficiency by number of agencies plt.subplot(2, 1, 1) if 'ghg_efficiency' in agency_metrics: efficiency_df = pd.DataFrame(agency_metrics['ghg_efficiency'].items(), columns=['num_agencies', 'efficiency']) efficiency_df = efficiency_df.sort_values('num_agencies') plt.bar(efficiency_df['num_agencies'], efficiency_df['efficiency']) plt.xlabel('Number of Agencies in Program') plt.ylabel('GHG Efficiency ($ per ton CO2e)') plt.title('GHG Efficiency by Number of Agencies') # Add correlation information if 'efficiency_correlation' in agency_metrics: corr = agency_metrics['efficiency_correlation']['correlation'] p_val = agency_metrics['efficiency_correlation']['p_value'] plt.text(0.05, 0.95, f"Correlation: {corr:.2f} (p={p_val:.4f})", transform=plt.gca().transAxes, fontsize=10, bbox=dict(facecolor='white', alpha=0.8)) plt.grid(True, axis='y', alpha=0.3) # 2. 
DAC Benefit by number of agencies plt.subplot(2, 1, 2) if 'dac_benefit' in agency_metrics: dac_df = pd.DataFrame(agency_metrics['dac_benefit'].items(), columns=['num_agencies', 'dac_benefit']) dac_df = dac_df.sort_values('num_agencies') plt.bar(dac_df['num_agencies'], dac_df['dac_benefit']) plt.xlabel('Number of Agencies in Program') plt.ylabel('DAC Benefit Percentage') plt.title('DAC Benefit by Number of Agencies') # Add correlation information if 'dac_correlation' in agency_metrics: corr = agency_metrics['dac_correlation']['correlation'] p_val = agency_metrics['dac_correlation']['p_value'] plt.text(0.05, 0.95, f"Correlation: {corr:.2f} (p={p_val:.4f})", transform=plt.gca().transAxes, fontsize=10, bbox=dict(facecolor='white', alpha=0.8)) plt.grid(True, axis='y', alpha=0.3) plt.tight_layout() plt.savefig(output_dir / "agency_count_impact.png", dpi=300, bbox_inches='tight') plt.close() # 3. Analyze impact of regional scope if 'regional_scope' in analysis_df.columns: # Group by regional scope scope_groups = analysis_df.groupby('regional_scope') # Calculate metrics for each scope scope_metrics = {} # Project counts by regional scope scope_counts = scope_groups.size() scope_metrics['project_counts'] = scope_counts.to_dict() # Average number of agencies by regional scope if 'num_agencies_in_program' in analysis_df.columns: scope_agencies = scope_groups['num_agencies_in_program'].mean() scope_metrics['avg_agencies'] = scope_agencies.to_dict() # Calculate GHG efficiency by regional scope if 'ghg_efficiency' in analysis_df.columns: valid_df = analysis_df[analysis_df['ghg_efficiency'].notna() & (analysis_df['ghg_efficiency'] > 0)] if len(valid_df) > 0: scope_efficiency = valid_df.groupby('regional_scope')['ghg_efficiency'].median() scope_metrics['ghg_efficiency'] = scope_efficiency.to_dict() # Calculate DAC benefit by regional scope if 'dac_benefit_percentage' in analysis_df.columns: scope_dac = scope_groups['dac_benefit_percentage'].mean() scope_metrics['dac_benefit'] = 
scope_dac.to_dict() findings['regional_scope_impact'] = scope_metrics # Create visualization for regional scope impact plt.figure(figsize=(12, 10)) # 1. Average number of agencies by regional scope plt.subplot(2, 2, 1) if 'avg_agencies' in scope_metrics: scope_df = pd.DataFrame(scope_metrics['avg_agencies'].items(), columns=['regional_scope', 'avg_agencies']) # Define the order for regional scope scope_order = ['Single County', 'Limited Regional', 'Regional', 'Multi-Regional'] scope_df['scope_order'] = scope_df['regional_scope'].apply(lambda x: scope_order.index(x) if x in scope_order else -1) scope_df = scope_df.sort_values('scope_order') plt.bar(scope_df['regional_scope'], scope_df['avg_agencies']) plt.xlabel('Regional Scope') plt.ylabel('Average Number of Agencies') plt.title('Agency Collaboration by Regional Scope') plt.xticks(rotation=45) plt.grid(True, axis='y', alpha=0.3) # 2. GHG Efficiency by regional scope plt.subplot(2, 2, 2) if 'ghg_efficiency' in scope_metrics: scope_df = pd.DataFrame(scope_metrics['ghg_efficiency'].items(), columns=['regional_scope', 'efficiency']) # Define the order for regional scope scope_order = ['Single County', 'Limited Regional', 'Regional', 'Multi-Regional'] scope_df['scope_order'] = scope_df['regional_scope'].apply(lambda x: scope_order.index(x) if x in scope_order else -1) scope_df = scope_df.sort_values('scope_order') plt.bar(scope_df['regional_scope'], scope_df['efficiency']) plt.xlabel('Regional Scope') plt.ylabel('GHG Efficiency ($ per ton CO2e)') plt.title('GHG Efficiency by Regional Scope') plt.xticks(rotation=45) plt.grid(True, axis='y', alpha=0.3) # 3. 
DAC Benefit by regional scope plt.subplot(2, 2, 3) if 'dac_benefit' in scope_metrics: scope_df = pd.DataFrame(scope_metrics['dac_benefit'].items(), columns=['regional_scope', 'dac_benefit']) # Define the order for regional scope scope_order = ['Single County', 'Limited Regional', 'Regional', 'Multi-Regional'] scope_df['scope_order'] = scope_df['regional_scope'].apply(lambda x: scope_order.index(x) if x in scope_order else -1) scope_df = scope_df.sort_values('scope_order') plt.bar(scope_df['regional_scope'], scope_df['dac_benefit']) plt.xlabel('Regional Scope') plt.ylabel('DAC Benefit Percentage') plt.title('DAC Benefit by Regional Scope') plt.xticks(rotation=45) plt.grid(True, axis='y', alpha=0.3) # 4. Efficiency vs. Equity by Regional Scope (scatter plot) plt.subplot(2, 2, 4) if 'ghg_efficiency' in scope_metrics and 'dac_benefit' in scope_metrics and 'avg_agencies' in scope_metrics: # Combine the metrics combined_df = pd.DataFrame({ 'regional_scope': list(scope_metrics['ghg_efficiency'].keys()), 'efficiency': list(scope_metrics['ghg_efficiency'].values()), 'dac_benefit': list(scope_metrics['dac_benefit'].values()), 'avg_agencies': list(scope_metrics['avg_agencies'].values()) }) # Create scatter plot plt.scatter( combined_df['efficiency'], combined_df['dac_benefit'], s=combined_df['avg_agencies'] * 50, # Size based on number of agencies alpha=0.7 ) # Add labels for each point for i, row in combined_df.iterrows(): plt.annotate( row['regional_scope'], (row['efficiency'], row['dac_benefit']), textcoords="offset points", xytext=(5, 5), ha='left' ) plt.xlabel('GHG Efficiency ($ per ton CO2e)') plt.ylabel('DAC Benefit Percentage') plt.title('Efficiency vs. Equity by Regional Scope') plt.grid(True, linestyle='--', alpha=0.7) plt.tight_layout() plt.savefig(output_dir / "regional_scope_impact.png", dpi=300, bbox_inches='tight') plt.close() # 4. 
Regression analysis of collaboration factors on outcomes # Create models to predict GHG efficiency and DAC benefit regression_results = {} # Filter out missing values and extreme outliers if 'ghg_efficiency' in analysis_df.columns: ghg_model_df = analysis_df[ analysis_df['ghg_efficiency'].notna() & (analysis_df['ghg_efficiency'] > 0) & (analysis_df['ghg_efficiency'] < analysis_df['ghg_efficiency'].quantile(0.95)) ].copy() if 'num_agencies_in_program' in ghg_model_df.columns: # Log transform the GHG efficiency for better model fit ghg_model_df['log_efficiency'] = np.log(ghg_model_df['ghg_efficiency']) # Create model predicting log efficiency try: # Base model with just number of agencies model1 = sm.OLS.from_formula('log_efficiency ~ num_agencies_in_program', data=ghg_model_df) results1 = model1.fit() # Model with number of agencies and regional scope (if available) if 'regional_scope' in ghg_model_df.columns: # Convert categorical to dummy variables ghg_model_df = pd.get_dummies(ghg_model_df, columns=['regional_scope'], drop_first=True) # Get dummy variable columns scope_cols = [col for col in ghg_model_df.columns if col.startswith('regional_scope_')] # Create formula with interaction terms formula2 = 'log_efficiency ~ num_agencies_in_program + ' + ' + '.join(scope_cols) model2 = sm.OLS.from_formula(formula2, data=ghg_model_df) results2 = model2.fit() # Store model results regression_results['ghg_efficiency'] = { 'base_model': { 'r_squared': results1.rsquared, 'r_squared_adj': results1.rsquared_adj, 'p_value': results1.f_pvalue, 'coefficient': results1.params['num_agencies_in_program'], 'coefficient_p_value': results1.pvalues['num_agencies_in_program'] }, 'full_model': { 'r_squared': results2.rsquared, 'r_squared_adj': results2.rsquared_adj, 'p_value': results2.f_pvalue, 'coefficient': results2.params['num_agencies_in_program'], 'coefficient_p_value': results2.pvalues['num_agencies_in_program'] } } else: # Store just the base model results 
regression_results['ghg_efficiency'] = { 'base_model': { 'r_squared': results1.rsquared, 'r_squared_adj': results1.rsquared_adj, 'p_value': results1.f_pvalue, 'coefficient': results1.params['num_agencies_in_program'], 'coefficient_p_value': results1.pvalues['num_agencies_in_program'] } } except Exception as e: logger.warning(f"Could not perform regression analysis for GHG efficiency: {e}") # Regression for DAC benefit if 'dac_benefit_percentage' in analysis_df.columns: dac_model_df = analysis_df[ analysis_df['dac_benefit_percentage'].notna() ].copy() if 'num_agencies_in_program' in dac_model_df.columns: try: # Base model with just number of agencies model1 = sm.OLS.from_formula('dac_benefit_percentage ~ num_agencies_in_program', data=dac_model_df) results1 = model1.fit() # Model with number of agencies and regional scope (if available) if 'regional_scope' in dac_model_df.columns: # Convert categorical to dummy variables dac_model_df = pd.get_dummies(dac_model_df, columns=['regional_scope'], drop_first=True) # Get dummy variable columns scope_cols = [col for col in dac_model_df.columns if col.startswith('regional_scope_')] # Create formula with interaction terms formula2 = 'dac_benefit_percentage ~ num_agencies_in_program + ' + ' + '.join(scope_cols) model2 = sm.OLS.from_formula(formula2, data=dac_model_df) results2 = model2.fit() # Store model results regression_results['dac_benefit'] = { 'base_model': { 'r_squared': results1.rsquared, 'r_squared_adj': results1.rsquared_adj, 'p_value': results1.f_pvalue, 'coefficient': results1.params['num_agencies_in_program'], 'coefficient_p_value': results1.pvalues['num_agencies_in_program'] }, 'full_model': { 'r_squared': results2.rsquared, 'r_squared_adj': results2.rsquared_adj, 'p_value': results2.f_pvalue, 'coefficient': results2.params['num_agencies_in_program'], 'coefficient_p_value': results2.pvalues['num_agencies_in_program'] } } else: # Store just the base model results regression_results['dac_benefit'] = { 
'base_model': { 'r_squared': results1.rsquared, 'r_squared_adj': results1.rsquared_adj, 'p_value': results1.f_pvalue, 'coefficient': results1.params['num_agencies_in_program'], 'coefficient_p_value': results1.pvalues['num_agencies_in_program'] } } except Exception as e: logger.warning(f"Could not perform regression analysis for DAC benefit: {e}") findings['regression_analysis'] = regression_results return findings