# California Climate Investments (CCI) collaboration analysis module
# (Python source, approx. 2209 lines / 111 KiB in the full file).
if 'collaboration_trends' in rq2:
|
|
collab_trends = rq2['collaboration_trends']
|
|
|
|
f.write("### Changes in Collaboration Patterns\n\n")
|
|
|
|
if 'time_correlation' in collab_trends:
|
|
corr = collab_trends['time_correlation']
|
|
f.write(f"The correlation between time (year) and collaboration intensity (average number of agencies) is {corr.get('correlation', 0):.2f} ")
|
|
|
|
if corr.get('significant', False):
|
|
f.write(f"(statistically significant, p={corr.get('p_value', 1):.4f}). ")
|
|
else:
|
|
f.write(f"(not statistically significant, p={corr.get('p_value', 1):.4f}). ")
|
|
|
|
if corr.get('correlation', 0) > 0:
|
|
f.write("This indicates a general trend of increasing collaboration over the study period.\n\n")
|
|
else:
|
|
f.write("This indicates a general trend of decreasing collaboration over the study period.\n\n")
|
|
|
|
if 'regional_distribution' in collab_trends:
|
|
f.write("The distribution of projects across different regional scopes has changed over time.\n\n")
|
|
f.write("\n\n")
|
|
|
|
if 'time_regression' in rq2:
|
|
regression = rq2['time_regression']
|
|
|
|
f.write("### Time Series Regression Analysis\n\n")
|
|
|
|
if 'agencies_model' in regression:
|
|
model = regression['agencies_model']
|
|
f.write("#### Trend in Collaboration Intensity\n\n")
|
|
|
|
if 'interpretation' in model:
|
|
f.write(f"{model['interpretation']} ")
|
|
|
|
if model.get('significant', False):
|
|
f.write(f"This trend is statistically significant (p={model.get('p_value', 1):.4f}).\n\n")
|
|
else:
|
|
f.write(f"This trend is not statistically significant (p={model.get('p_value', 1):.4f}).\n\n")
|
|
|
|
if 'efficiency_model' in regression:
|
|
model = regression['efficiency_model']
|
|
f.write("#### Trend in GHG Efficiency\n\n")
|
|
|
|
if 'interpretation' in model:
|
|
f.write(f"{model['interpretation']} ")
|
|
|
|
if model.get('significant', False):
|
|
f.write(f"This trend is statistically significant (p={model.get('p_value', 1):.4f}).\n\n")
|
|
else:
|
|
f.write(f"This trend is not statistically significant (p={model.get('p_value', 1):.4f}).\n\n")
|
|
|
|
if 'dac_model' in regression:
|
|
model = regression['dac_model']
|
|
f.write("#### Trend in DAC Benefit\n\n")
|
|
|
|
if 'interpretation' in model:
|
|
f.write(f"{model['interpretation']} ")
|
|
|
|
if model.get('significant', False):
|
|
f.write(f"This trend is statistically significant (p={model.get('p_value', 1):.4f}).\n\n")
|
|
else:
|
|
f.write(f"This trend is not statistically significant (p={model.get('p_value', 1):.4f}).\n\n")
|
|
|
|
# Regional Variations
|
|
f.write("## Regional Variations in Collaboration Effectiveness\n\n")
|
|
|
|
if 'regional_patterns' in findings:
|
|
regional = findings['regional_patterns']
|
|
|
|
if 'regional_metrics' in regional:
|
|
metrics = regional['regional_metrics']
|
|
|
|
f.write("### Regional Distribution and Characteristics\n\n")
|
|
|
|
f.write("The analysis identified significant variations in collaboration patterns and outcomes across different regions of California.\n\n")
|
|
|
|
f.write("\n\n")
|
|
|
|
f.write("### Efficiency-Equity Tradeoffs by Region\n\n")
|
|
|
|
if 'efficiency_equity_tradeoff' in regional:
|
|
tradeoff = regional['efficiency_equity_tradeoff']
|
|
|
|
f.write("The analysis examined the tradeoff between GHG efficiency and equity outcomes across regions.\n\n")
|
|
|
|
if 'top_regions' in tradeoff and len(tradeoff['top_regions']) > 0:
|
|
top_region = tradeoff['top_regions'][0]
|
|
f.write(f"The {top_region} region achieves the best overall balance between efficiency and equity outcomes. ")
|
|
|
|
# Include top 3 regions if available
|
|
if len(tradeoff['top_regions']) >= 3:
|
|
top_3 = ", ".join(tradeoff['top_regions'][:3])
|
|
f.write(f"The top three regions in terms of combined efficiency and equity performance are: {top_3}.\n\n")
|
|
else:
|
|
f.write("\n\n")
|
|
|
|
f.write("\n\n")
|
|
|
|
if 'efficiency_equity_correlation' in regional:
|
|
corr = regional['efficiency_equity_correlation']
|
|
corr_val = corr.get('correlation', 0)
|
|
|
|
f.write(f"The correlation between GHG efficiency cost and DAC benefit percentage across regions is {corr_val:.2f} ")
|
|
|
|
if corr.get('significant', False):
|
|
f.write(f"(statistically significant, p={corr.get('p_value', 1):.4f}). ")
|
|
else:
|
|
f.write(f"(not statistically significant, p={corr.get('p_value', 1):.4f}). ")
|
|
|
|
if corr_val > 0.3:
|
|
f.write("This positive correlation suggests a tradeoff between efficiency and equity, where regions with better efficiency (lower cost per ton) tend to have lower DAC benefits.\n\n")
|
|
elif corr_val < -0.3:
|
|
f.write("This negative correlation suggests synergy between efficiency and equity, where regions with better efficiency (lower cost per ton) tend to have higher DAC benefits.\n\n")
|
|
else:
|
|
f.write("This weak correlation suggests that efficiency and equity outcomes may be largely independent of each other across regions.\n\n")
|
|
|
|
if 'regional_anova' in regional:
|
|
anova = regional['regional_anova']
|
|
|
|
f.write("### Statistical Comparison of Regions\n\n")
|
|
|
|
if 'ghg_efficiency' in anova:
|
|
efficiency = anova['ghg_efficiency']
|
|
f.write("#### Regional Differences in GHG Efficiency\n\n")
|
|
|
|
if efficiency.get('significant', False):
|
|
f.write(f"There are statistically significant differences in GHG efficiency across regions (F={efficiency.get('f_value', 0):.2f}, p={efficiency.get('p_value', 1):.4f}).\n\n")
|
|
|
|
if 'pairwise_tests' in efficiency:
|
|
sig_pairs = [pair for pair, result in efficiency['pairwise_tests'].items() if result.get('significant', False)]
|
|
|
|
if len(sig_pairs) > 0:
|
|
f.write("The significant regional differences include:\n\n")
|
|
|
|
for pair in sig_pairs[:3]: # Show up to 3 significant pairs
|
|
f.write(f"- {pair}\n")
|
|
|
|
if len(sig_pairs) > 3:
|
|
f.write(f"- And {len(sig_pairs) - 3} other significant regional differences\n")
|
|
|
|
f.write("\n")
|
|
else:
|
|
f.write(f"There are no statistically significant differences in GHG efficiency across regions (F={efficiency.get('f_value', 0):.2f}, p={efficiency.get('p_value', 1):.4f}).\n\n")
|
|
|
|
if 'dac_benefit' in anova:
|
|
dac = anova['dac_benefit']
|
|
f.write("#### Regional Differences in DAC Benefit\n\n")
|
|
|
|
if dac.get('significant', False):
|
|
f.write(f"There are statistically significant differences in DAC benefit percentage across regions (F={dac.get('f_value', 0):.2f}, p={dac.get('p_value', 1):.4f}).\n\n")
|
|
|
|
if 'pairwise_tests' in dac:
|
|
sig_pairs = [pair for pair, result in dac['pairwise_tests'].items() if result.get('significant', False)]
|
|
|
|
if len(sig_pairs) > 0:
|
|
f.write("The significant regional differences include:\n\n")
|
|
|
|
for pair in sig_pairs[:3]: # Show up to 3 significant pairs
|
|
f.write(f"- {pair}\n")
|
|
|
|
if len(sig_pairs) > 3:
|
|
f.write(f"- And {len(sig_pairs) - 3} other significant regional differences\n")
|
|
|
|
f.write("\n")
|
|
else:
|
|
f.write(f"There are no statistically significant differences in DAC benefit percentage across regions (F={dac.get('f_value', 0):.2f}, p={dac.get('p_value', 1):.4f}).\n\n")
|
|
|
|
# Conclusions and Implications
|
|
f.write("## Conclusions and Implications\n\n")
|
|
|
|
f.write("### Key Findings\n\n")
|
|
|
|
# Summarize key findings from RQ1
|
|
f.write("#### Research Question 1: Collaboration Structure and Scale\n\n")
|
|
|
|
if 'rq1' in findings:
|
|
rq1 = findings['rq1']
|
|
|
|
conclusions_rq1 = []
|
|
|
|
# Multi-agency vs single-agency comparison
|
|
if 'ghg_efficiency' in rq1 and 'dac_benefit' in rq1:
|
|
eff = rq1['ghg_efficiency']
|
|
dac = rq1['dac_benefit']
|
|
|
|
eff_diff = eff.get('difference', 0)
|
|
dac_diff = dac.get('difference', 0)
|
|
|
|
if eff_diff < 0 and dac_diff > 0:
|
|
conclusions_rq1.append("Multi-agency programs demonstrate a clear tradeoff: they are less cost-efficient but achieve higher equity outcomes compared to single-agency programs.")
|
|
elif eff_diff < 0 and dac_diff < 0:
|
|
conclusions_rq1.append("Multi-agency programs underperform single-agency programs in both cost efficiency and equity outcomes, suggesting potential coordination challenges.")
|
|
elif eff_diff > 0 and dac_diff > 0:
|
|
conclusions_rq1.append("Multi-agency programs outperform single-agency programs in both cost efficiency and equity outcomes, demonstrating synergistic benefits of collaboration.")
|
|
elif eff_diff > 0 and dac_diff < 0:
|
|
conclusions_rq1.append("Multi-agency programs show better cost efficiency but lower equity outcomes compared to single-agency programs.")
|
|
|
|
# Agency count correlation findings
|
|
if 'agency_count_impact' in rq1:
|
|
agency_impact = rq1['agency_count_impact']
|
|
|
|
if 'efficiency_correlation' in agency_impact and 'dac_correlation' in agency_impact:
|
|
eff_corr = agency_impact['efficiency_correlation'].get('correlation', 0)
|
|
dac_corr = agency_impact['dac_correlation'].get('correlation', 0)
|
|
|
|
if abs(eff_corr) > 0.1:
|
|
if eff_corr > 0:
|
|
conclusions_rq1.append("Increasing the number of collaborating agencies tends to reduce cost efficiency (higher $ per ton).")
|
|
else:
|
|
conclusions_rq1.append("Increasing the number of collaborating agencies tends to improve cost efficiency (lower $ per ton).")
|
|
|
|
if abs(dac_corr) > 0.1:
|
|
if dac_corr > 0:
|
|
conclusions_rq1.append("Increasing the number of collaborating agencies tends to improve equity outcomes (higher DAC benefit).")
|
|
else:
|
|
conclusions_rq1.append("Increasing the number of collaborating agencies tends to reduce equity outcomes (lower DAC benefit).")
|
|
|
|
# Regional scope findings
|
|
if 'regional_scope_impact' in rq1:
|
|
scope_impact = rq1['regional_scope_impact']
|
|
|
|
if 'avg_agencies' in scope_impact and 'Multi-Regional' in scope_impact['avg_agencies']:
|
|
multi_regional_avg = scope_impact['avg_agencies']['Multi-Regional']
|
|
single_county_avg = scope_impact['avg_agencies'].get('Single County', 0)
|
|
|
|
if multi_regional_avg > single_county_avg:
|
|
conclusions_rq1.append(f"Projects with wider geographic scope involve significantly more collaborative partners (Multi-Regional: {multi_regional_avg:.1f} agencies vs. Single County: {single_county_avg:.1f} agencies).")
|
|
|
|
for conclusion in conclusions_rq1:
|
|
f.write(f"- {conclusion}\n")
|
|
|
|
f.write("\n")
|
|
|
|
# Summarize key findings from RQ2
|
|
f.write("#### Research Question 2: Temporal Trends\n\n")
|
|
|
|
if 'rq2' in findings:
|
|
rq2 = findings['rq2']
|
|
|
|
conclusions_rq2 = []
|
|
|
|
# Pre/post 2020 comparison
|
|
if 'period_comparison' in rq2:
|
|
period = rq2['period_comparison']
|
|
|
|
if 'avg_agencies_change' in period:
|
|
change = period.get('avg_agencies_change', 0)
|
|
|
|
if abs(change) > 5: # Only include if the change is substantial
|
|
if change > 0:
|
|
conclusions_rq2.append(f"Collaboration intensity increased significantly ({change:.1f}%) after 2020, indicating a shift toward more collaborative approaches.")
|
|
else:
|
|
conclusions_rq2.append(f"Collaboration intensity decreased significantly ({abs(change):.1f}%) after 2020, indicating a shift toward more independent approaches.")
|
|
|
|
if 'avg_funding_change' in period:
|
|
change = period.get('avg_funding_change', 0)
|
|
|
|
if abs(change) > 10: # Only include if the change is substantial
|
|
if change > 0:
|
|
conclusions_rq2.append(f"Average project funding increased substantially ({change:.1f}%) after 2020, suggesting a shift toward larger-scale initiatives.")
|
|
else:
|
|
conclusions_rq2.append(f"Average project funding decreased substantially ({abs(change):.1f}%) after 2020, suggesting a shift toward smaller-scale initiatives.")
|
|
|
|
if 'dac_benefit_change' in period:
|
|
change = period.get('dac_benefit_change', 0)
|
|
|
|
if abs(change) > 5: # Only include if the change is substantial
|
|
if change > 0:
|
|
conclusions_rq2.append(f"Equity outcomes improved ({change:.1f}% increase in DAC benefit) after 2020, reflecting increased emphasis on disadvantaged communities.")
|
|
else:
|
|
conclusions_rq2.append(f"Equity outcomes declined ({abs(change):.1f}% decrease in DAC benefit) after 2020, suggesting potential challenges in maintaining equity focus.")
|
|
|
|
# Time series regression findings
|
|
if 'time_regression' in rq2:
|
|
regression = rq2['time_regression']
|
|
|
|
if 'agencies_model' in regression and regression['agencies_model'].get('significant', False):
|
|
conclusions_rq2.append(regression['agencies_model'].get('interpretation', ''))
|
|
|
|
if 'efficiency_model' in regression and regression['efficiency_model'].get('significant', False):
|
|
conclusions_rq2.append(regression['efficiency_model'].get('interpretation', ''))
|
|
|
|
if 'dac_model' in regression and regression['dac_model'].get('significant', False):
|
|
conclusions_rq2.append(regression['dac_model'].get('interpretation', ''))
|
|
|
|
for conclusion in conclusions_rq2:
|
|
f.write(f"- {conclusion}\n")
|
|
|
|
f.write("\n")
|
|
|
|
# Summarize regional variations
|
|
f.write("#### Regional Variations\n\n")
|
|
|
|
if 'regional_patterns' in findings:
|
|
regional = findings['regional_patterns']
|
|
|
|
conclusions_regional = []
|
|
|
|
if 'efficiency_equity_tradeoff' in regional:
|
|
tradeoff = regional['efficiency_equity_tradeoff']
|
|
|
|
if 'top_regions' in tradeoff and len(tradeoff['top_regions']) > 0:
|
|
top_region = tradeoff['top_regions'][0]
|
|
conclusions_regional.append(f"The {top_region} region achieves the most balanced performance between efficiency and equity outcomes.")
|
|
|
|
if 'efficiency_equity_correlation' in regional:
|
|
corr = regional['efficiency_equity_correlation']
|
|
corr_val = corr.get('correlation', 0)
|
|
|
|
if abs(corr_val) > 0.3:
|
|
if corr_val > 0:
|
|
conclusions_regional.append("There is evidence of a tradeoff between efficiency and equity across regions, suggesting potential policy tensions.")
|
|
else:
|
|
conclusions_regional.append("There is evidence of synergy between efficiency and equity across regions, suggesting potential for policies that achieve both goals.")
|
|
|
|
if 'regional_anova' in regional:
|
|
anova = regional['regional_anova']
|
|
|
|
if 'ghg_efficiency' in anova and anova['ghg_efficiency'].get('significant', False):
|
|
conclusions_regional.append("There are statistically significant differences in GHG efficiency across regions, indicating that geographic context matters for cost-effectiveness.")
|
|
|
|
if 'dac_benefit' in anova and anova['dac_benefit'].get('significant', False):
|
|
conclusions_regional.append("There are statistically significant differences in DAC benefit across regions, suggesting variation in equity focus or capabilities.")
|
|
|
|
for conclusion in conclusions_regional:
|
|
f.write(f"- {conclusion}\n")
|
|
|
|
f.write("\n")
|
|
|
|
# Policy Implications
|
|
f.write("### Policy Implications\n\n")
|
|
|
|
implications = []
|
|
|
|
# Generate some implications based on findings
|
|
if 'rq1' in findings:
|
|
rq1 = findings['rq1']
|
|
|
|
if 'ghg_efficiency' in rq1 and 'dac_benefit' in rq1:
|
|
eff = rq1['ghg_efficiency']
|
|
dac = rq1['dac_benefit']
|
|
|
|
eff_diff = eff.get('difference', 0)
|
|
dac_diff = dac.get('difference', 0)
|
|
|
|
if eff_diff < 0 and dac_diff > 0:
|
|
implications.append("Policymakers face a tradeoff between efficiency and equity when designing collaborative structures. Multi-agency programs may prioritize equity at some cost to efficiency.")
|
|
elif eff_diff < 0 and dac_diff < 0:
|
|
implications.append("Coordination challenges in multi-agency programs may need to be addressed through improved governance structures and clearer role definitions.")
|
|
elif eff_diff > 0 and dac_diff > 0:
|
|
implications.append("Further investment in multi-agency collaborative structures is warranted given their superior performance in both efficiency and equity dimensions.")
|
|
|
|
if 'rq2' in findings:
|
|
rq2 = findings['rq2']
|
|
|
|
if 'period_comparison' in rq2:
|
|
period = rq2['period_comparison']
|
|
|
|
if 'avg_agencies_change' in period:
|
|
change = period.get('avg_agencies_change', 0)
|
|
|
|
if change > 0:
|
|
implications.append("The trend toward increased collaboration after 2020 suggests policy support for collaborative approaches is having an effect and could be further strengthened.")
|
|
|
|
if 'regional_patterns' in findings:
|
|
regional = findings['regional_patterns']
|
|
|
|
if 'efficiency_equity_tradeoff' in regional:
|
|
tradeoff = regional['efficiency_equity_tradeoff']
|
|
|
|
if 'top_regions' in tradeoff and len(tradeoff['top_regions']) > 0:
|
|
top_region = tradeoff['top_regions'][0]
|
|
implications.append(f"The successful balance achieved in the {top_region} region could serve as a model for other regions seeking to improve both efficiency and equity outcomes.")
|
|
|
|
# Add some general implications
|
|
implications.append("Careful design of collaboration structures is needed to manage potential tradeoffs between efficiency and equity goals.")
|
|
implications.append("Regional context matters for program implementation and outcomes, suggesting that customized approaches may be more effective than one-size-fits-all policies.")
|
|
implications.append("Regular monitoring and evaluation of collaboration patterns and outcomes can help identify best practices and areas for improvement.")
|
|
|
|
for implication in implications:
|
|
f.write(f"- {implication}\n")
|
|
|
|
f.write("\n")
|
|
|
|
# Future Research
|
|
f.write("### Future Research Directions\n\n")
|
|
|
|
future_research = []
|
|
|
|
future_research.append("Investigate the specific mechanisms through which multi-agency collaboration affects program outcomes, including both benefits and potential coordination challenges.")
|
|
future_research.append("Explore the role of different types of agencies (state, local, non-profit) in collaborative structures and their impact on program effectiveness.")
|
|
future_research.append("Conduct case studies of high-performing regions to identify transferable best practices for balancing efficiency and equity goals.")
|
|
future_research.append("Analyze the impact of policy changes around 2020 on program implementation and outcomes.")
|
|
future_research.append("Examine the long-term sustainability of climate investments and the durability of their greenhouse gas reduction benefits.")
|
|
|
|
for research in future_research:
|
|
f.write(f"- {research}\n")
|
|
|
|
logger.info(f"Research report generated and saved to {report_file}")
|
|
|
|
|
|
if __name__ == "__main__":
    import argparse
    import sys

    # Command-line entry point: parse paths, run the analysis, report status.
    parser = argparse.ArgumentParser(description='California Climate Investments Collaboration Analysis')
    parser.add_argument('--data_path', type=str, required=True, help='Path to the cleaned CCI data CSV file')
    parser.add_argument('--output_path', type=str, default='./output/research', help='Path to save outputs')

    args = parser.parse_args()

    # Run the research analysis
    findings = analyze_research_questions(args.data_path, args.output_path)

    if findings:
        print(f"\nAnalysis complete! Results saved to {args.output_path}")
    else:
        print("\nAnalysis failed to complete successfully")
        # Exit non-zero on failure so shell pipelines / CI can detect it;
        # previously the script reported failure but still exited 0.
        sys.exit(1)
|
|
|
|
# The section below performs the regional ANOVA analysis: it tests whether
# GHG efficiency and DAC benefit percentage differ significantly across
# California regions, with post-hoc pairwise comparisons where warranted.
|
|
# 3. Regional ANOVA analysis
|
|
def _pairwise_welch_tests(df, group_col, value_col, regions):
    """Run Bonferroni-corrected pairwise Welch t-tests between region groups.

    Parameters:
        df (DataFrame): observations restricted to the regions of interest
        group_col (str): column holding the region label
        value_col (str): numeric outcome column to compare
        regions: iterable of region labels to compare pairwise

    Returns:
        dict: keyed by "A vs B", each value holding the t-statistic, raw and
        Bonferroni-corrected p-values, and a significance flag (alpha=0.05).
    """
    results = {}
    # Bonferroni correction factor: number of unordered region pairs.
    n_comparisons = len(regions) * (len(regions) - 1) / 2
    for i, region1 in enumerate(regions):
        for region2 in regions[i + 1:]:
            group1 = df[df[group_col] == region1][value_col]
            group2 = df[df[group_col] == region2][value_col]
            try:
                # Welch's t-test: does not assume equal group variances.
                t_stat, p_value = stats.ttest_ind(group1, group2, equal_var=False)
            except Exception:
                continue
            corrected_p = min(p_value * n_comparisons, 1.0)
            results[f"{region1} vs {region2}"] = {
                't_statistic': t_stat,
                'p_value': p_value,
                'corrected_p': corrected_p,
                'significant': corrected_p < 0.05,
            }
    return results

# 3. Regional ANOVA analysis: do GHG efficiency and DAC benefit differ by region?
if ('ca_region' in analysis_df.columns and
        'ghg_efficiency' in analysis_df.columns and
        'dac_benefit_percentage' in analysis_df.columns):

    regional_anova = {}

    # ANOVA for GHG efficiency by region
    try:
        # Efficiency must be positive for the log transform below.
        valid_df = analysis_df[analysis_df['ghg_efficiency'].notna() & (analysis_df['ghg_efficiency'] > 0)].copy()

        if len(valid_df) > 0:
            # Log transform efficiency for better normality
            valid_df['log_efficiency'] = np.log(valid_df['ghg_efficiency'])

            # One-way ANOVA via OLS with region as a categorical factor.
            model = smf.ols('log_efficiency ~ C(ca_region)', data=valid_df)
            results = model.fit()
            anova_table = sm.stats.anova_lm(results, typ=2)

            # NOTE: anova_lm returns a label-indexed DataFrame, so positional
            # access must use .iloc — chained ['F'][0] label-then-position
            # indexing is removed in pandas 2.x for non-integer indexes.
            regional_anova['ghg_efficiency'] = {
                'f_value': anova_table['F'].iloc[0],
                'p_value': anova_table['PR(>F)'].iloc[0],
                'significant': anova_table['PR(>F)'].iloc[0] < 0.05,
                'r_squared': results.rsquared,
            }

            # Post-hoc pairwise t-tests only if the omnibus ANOVA is significant
            if regional_anova['ghg_efficiency']['significant']:
                # Restrict post-hoc comparisons to regions with >= 10 projects
                region_counts = valid_df['ca_region'].value_counts()
                valid_regions = region_counts[region_counts >= 10].index
                posthoc_df = valid_df[valid_df['ca_region'].isin(valid_regions)]

                if len(valid_regions) > 1:
                    regional_anova['ghg_efficiency']['pairwise_tests'] = _pairwise_welch_tests(
                        posthoc_df, 'ca_region', 'log_efficiency', valid_regions)
    except Exception as e:
        logger.warning(f"Could not perform ANOVA for GHG efficiency by region: {e}")

    # ANOVA for DAC benefit by region (same procedure, untransformed outcome;
    # smf.ols drops rows with missing values automatically).
    try:
        model = smf.ols('dac_benefit_percentage ~ C(ca_region)', data=analysis_df)
        results = model.fit()
        anova_table = sm.stats.anova_lm(results, typ=2)

        regional_anova['dac_benefit'] = {
            'f_value': anova_table['F'].iloc[0],
            'p_value': anova_table['PR(>F)'].iloc[0],
            'significant': anova_table['PR(>F)'].iloc[0] < 0.05,
            'r_squared': results.rsquared,
        }

        # Post-hoc pairwise t-tests only if the omnibus ANOVA is significant
        if regional_anova['dac_benefit']['significant']:
            region_counts = analysis_df['ca_region'].value_counts()
            valid_regions = region_counts[region_counts >= 10].index
            posthoc_df = analysis_df[analysis_df['ca_region'].isin(valid_regions)]

            if len(valid_regions) > 1:
                regional_anova['dac_benefit']['pairwise_tests'] = _pairwise_welch_tests(
                    posthoc_df, 'ca_region', 'dac_benefit_percentage', valid_regions)
    except Exception as e:
        logger.warning(f"Could not perform ANOVA for DAC benefit by region: {e}")

    # Store results inside the guard: previously this assignment ran even when
    # the required columns were absent, raising NameError on regional_anova.
    findings['regional_anova'] = regional_anova
|
|
|
|
return findings
|
|
|
|
|
|
def generate_research_report(findings, output_dir):
|
|
"""
|
|
Generate a comprehensive research report based on the analysis findings.
|
|
|
|
Parameters:
|
|
findings (dict): Dictionary containing research findings
|
|
output_dir (Path): Directory to save the report
|
|
"""
|
|
logger.info("Generating research report")
|
|
|
|
# Create report file
|
|
report_file = output_dir / "research_report.md"
|
|
|
|
with open(report_file, 'w') as f:
|
|
# Title and introduction
|
|
f.write("# California Climate Investments Collaboration Analysis\n\n")
|
|
f.write("## Research Report\n\n")
|
|
f.write("This report presents findings from an analysis of collaboration patterns in California's Climate Investments (CCI) program from 2015 to 2024, focusing on how inter-agency collaboration affects greenhouse gas (GHG) reduction efficiencies and equity outcomes.\n\n")
|
|
|
|
# Executive summary
|
|
f.write("## Executive Summary\n\n")
|
|
|
|
# Highlight key findings
|
|
exec_summary = []
|
|
|
|
# RQ1: Collaboration Structure Impact
|
|
if 'rq1' in findings:
|
|
rq1 = findings['rq1']
|
|
|
|
if 'ghg_efficiency' in rq1:
|
|
eff = rq1['ghg_efficiency']
|
|
diff = eff.get('difference', 0)
|
|
if diff < 0:
|
|
exec_summary.append(f"Multi-agency programs are less cost-efficient than single-agency programs (${eff.get('multi_agency_median', 0):,.2f} vs. ${eff.get('single_agency_median', 0):,.2f} per ton CO2e).")
|
|
else:
|
|
exec_summary.append(f"Multi-agency programs are more cost-efficient than single-agency programs (${eff.get('multi_agency_median', 0):,.2f} vs. ${eff.get('single_agency_median', 0):,.2f} per ton CO2e).")
|
|
|
|
if 'dac_benefit' in rq1:
|
|
dac = rq1['dac_benefit']
|
|
diff = dac.get('difference', 0)
|
|
if diff > 0:
|
|
exec_summary.append(f"Multi-agency programs achieve higher equity outcomes than single-agency programs ({dac.get('multi_agency_mean', 0):.2f}% vs. {dac.get('single_agency_mean', 0):.2f}% DAC benefit).")
|
|
else:
|
|
exec_summary.append(f"Multi-agency programs achieve lower equity outcomes than single-agency programs ({dac.get('multi_agency_mean', 0):.2f}% vs. {dac.get('single_agency_mean', 0):.2f}% DAC benefit).")
|
|
|
|
# RQ2: Temporal Trends
|
|
if 'rq2' in findings:
|
|
rq2 = findings['rq2']
|
|
|
|
if 'period_comparison' in rq2:
|
|
period = rq2['period_comparison']
|
|
|
|
if 'avg_agencies_change' in period:
|
|
change = period.get('avg_agencies_change', 0)
|
|
if change > 0:
|
|
exec_summary.append(f"Collaboration intensity increased by {change:.1f}% from pre-2020 to post-2020 periods.")
|
|
else:
|
|
exec_summary.append(f"Collaboration intensity decreased by {abs(change):.1f}% from pre-2020 to post-2020 periods.")
|
|
|
|
if 'avg_funding_change' in period:
|
|
change = period.get('avg_funding_change', 0)
|
|
if change > 0:
|
|
exec_summary.append(f"Average project funding increased by {change:.1f}% after 2020.")
|
|
else:
|
|
exec_summary.append(f"Average project funding decreased by {abs(change):.1f}% after 2020.")
|
|
|
|
if 'dac_benefit_change' in period:
|
|
change = period.get('dac_benefit_change', 0)
|
|
if change > 0:
|
|
exec_summary.append(f"DAC benefit percentage increased by {change:.1f}% after 2020.")
|
|
else:
|
|
exec_summary.append(f"DAC benefit percentage decreased by {abs(change):.1f}% after 2020.")
|
|
|
|
# Regional variations
|
|
if 'regional_patterns' in findings:
|
|
regional = findings['regional_patterns']
|
|
|
|
if 'efficiency_equity_tradeoff' in regional:
|
|
tradeoff = regional['efficiency_equity_tradeoff']
|
|
if 'top_regions' in tradeoff and len(tradeoff['top_regions']) > 0:
|
|
top_region = tradeoff['top_regions'][0]
|
|
exec_summary.append(f"The {top_region} region demonstrates the best balance between GHG efficiency and equity outcomes.")
|
|
|
|
if 'efficiency_equity_correlation' in regional:
|
|
corr = regional['efficiency_equity_correlation']
|
|
corr_val = corr.get('correlation', 0)
|
|
if abs(corr_val) > 0.3:
|
|
if corr_val > 0:
|
|
exec_summary.append(f"There is a positive correlation ({corr_val:.2f}) between GHG efficiency cost and equity outcomes, suggesting a tradeoff between these goals.")
|
|
else:
|
|
exec_summary.append(f"There is a negative correlation ({corr_val:.2f}) between GHG efficiency cost and equity outcomes, suggesting synergy between these goals.")
|
|
|
|
# Write executive summary bullets
|
|
for point in exec_summary:
|
|
f.write(f"- {point}\n")
|
|
|
|
f.write("\n## Research Questions\n\n")
|
|
f.write("1. How does the structure and scale of inter-agency collaboration within the California Climate Investments (CCI) program shape outputs related to greenhouse gas (GHG) reduction efficiencies and equity?\n")
|
|
f.write("2. What temporal trends relative to funding allocation, agency collaboration, and collaborative outputs related to efficiency and equity outcomes have occurred from 2015 to 2024?\n\n")
|
|
|
|
# Research Question 1: Collaboration Structure Impact
|
|
f.write("## Research Question 1: Impact of Collaboration Structure and Scale\n\n")
|
|
|
|
if 'rq1' in findings:
|
|
rq1 = findings['rq1']
|
|
|
|
f.write("### Multi-Agency vs. Single-Agency Program Comparison\n\n")
|
|
|
|
if 'project_counts' in rq1:
|
|
counts = rq1['project_counts']
|
|
f.write(f"The dataset includes {counts.get('multi_agency', 0):,} projects in multi-agency programs ({counts.get('multi_agency_percentage', 0):.1f}% of non-EV projects) and {counts.get('single_agency', 0):,} projects in single-agency programs.\n\n")
|
|
|
|
if 'ghg_efficiency' in rq1:
|
|
eff = rq1['ghg_efficiency']
|
|
f.write("#### GHG Efficiency Comparison\n\n")
|
|
f.write(f"The median GHG efficiency (cost per ton of CO2e reduced) for multi-agency programs is ${eff.get('multi_agency_median', 0):,.2f}, compared to ${eff.get('single_agency_median', 0):,.2f} for single-agency programs. This represents a difference of ${abs(eff.get('difference', 0)):,.2f} per ton ")
|
|
|
|
if eff.get('difference', 0) < 0:
|
|
f.write("in favor of single-agency programs.\n\n")
|
|
else:
|
|
f.write("in favor of multi-agency programs.\n\n")
|
|
|
|
if 'statistical_test' in eff:
|
|
test = eff['statistical_test']
|
|
if test.get('significant', False):
|
|
f.write(f"This difference is statistically significant (p={test.get('p_value', 1):.4f}).\n\n")
|
|
else:
|
|
f.write(f"This difference is not statistically significant (p={test.get('p_value', 1):.4f}).\n\n")
|
|
|
|
f.write("\n\n")
|
|
|
|
if 'dac_benefit' in rq1:
|
|
dac = rq1['dac_benefit']
|
|
f.write("#### Disadvantaged Community (DAC) Benefit Comparison\n\n")
|
|
f.write(f"The mean DAC benefit percentage for multi-agency programs is {dac.get('multi_agency_mean', 0):.2f}%, compared to {dac.get('single_agency_mean', 0):.2f}% for single-agency programs. This represents a difference of {abs(dac.get('difference', 0)):.2f}% ")
|
|
|
|
if dac.get('difference', 0) > 0:
|
|
f.write("in favor of multi-agency programs.\n\n")
|
|
else:
|
|
f.write("in favor of single-agency programs.\n\n")
|
|
|
|
if 'statistical_test' in dac:
|
|
test = dac['statistical_test']
|
|
if test.get('significant', False):
|
|
f.write(f"This difference is statistically significant (p={test.get('p_value', 1):.4f}).\n\n")
|
|
else:
|
|
f.write(f"This difference is not statistically significant (p={test.get('p_value', 1):.4f}).\n\n")
|
|
|
|
f.write("\n\n")
|
|
|
|
f.write("### Impact of Number of Agencies\n\n")
|
|
|
|
if 'agency_count_impact' in rq1:
|
|
agency_impact = rq1['agency_count_impact']
|
|
|
|
f.write("The analysis examined how the number of collaborating agencies affects program outcomes.\n\n")
|
|
|
|
if 'efficiency_correlation' in agency_impact:
|
|
corr = agency_impact['efficiency_correlation']
|
|
f.write(f"The correlation between number of agencies and GHG efficiency is {corr.get('correlation', 0):.2f} ")
|
|
|
|
if corr.get('significant', False):
|
|
f.write(f"(statistically significant, p={corr.get('p_value', 1):.4f}). ")
|
|
else:
|
|
f.write(f"(not statistically significant, p={corr.get('p_value', 1):.4f}). ")
|
|
|
|
if corr.get('correlation', 0) > 0:
|
|
f.write("This suggests that as the number of agencies increases, the cost per ton of GHG reduction tends to increase (less efficient).\n\n")
|
|
else:
|
|
f.write("This suggests that as the number of agencies increases, the cost per ton of GHG reduction tends to decrease (more efficient).\n\n")
|
|
|
|
if 'dac_correlation' in agency_impact:
|
|
corr = agency_impact['dac_correlation']
|
|
f.write(f"The correlation between number of agencies and DAC benefit percentage is {corr.get('correlation', 0):.2f} ")
|
|
|
|
if corr.get('significant', False):
|
|
f.write(f"(statistically significant, p={corr.get('p_value', 1):.4f}). ")
|
|
else:
|
|
f.write(f"(not statistically significant, p={corr.get('p_value', 1):.4f}). ")
|
|
|
|
if corr.get('correlation', 0) > 0:
|
|
f.write("This suggests that as the number of agencies increases, the DAC benefit percentage tends to increase.\n\n")
|
|
else:
|
|
f.write("This suggests that as the number of agencies increases, the DAC benefit percentage tends to decrease.\n\n")
|
|
|
|
f.write("\n\n")
|
|
|
|
f.write("### Impact of Regional Scope\n\n")
|
|
|
|
if 'regional_scope_impact' in rq1:
|
|
scope_impact = rq1['regional_scope_impact']
|
|
|
|
f.write("The analysis examined how the regional scope of projects affects collaboration intensity and outcomes.\n\n")
|
|
|
|
if 'avg_agencies' in scope_impact:
|
|
f.write("#### Collaboration Intensity by Regional Scope\n\n")
|
|
|
|
# Check if Multi-Regional has the highest average
|
|
if 'Multi-Regional' in scope_impact['avg_agencies']:
|
|
multi_regional_avg = scope_impact['avg_agencies']['Multi-Regional']
|
|
single_county_avg = scope_impact['avg_agencies'].get('Single County', 0)
|
|
|
|
f.write(f"Multi-Regional projects involve an average of {multi_regional_avg:.2f} agencies, compared to {single_county_avg:.2f} for Single County projects. ")
|
|
f.write("This indicates that projects with wider geographic scope tend to involve more collaborative partners.\n\n")
|
|
|
|
if 'ghg_efficiency' in scope_impact and 'dac_benefit' in scope_impact:
|
|
f.write("#### Efficiency and Equity by Regional Scope\n\n")
|
|
|
|
# Identify best and worst scopes for efficiency
|
|
if len(scope_impact['ghg_efficiency']) > 0:
|
|
best_eff_scope = min(scope_impact['ghg_efficiency'].items(), key=lambda x: x[1])[0]
|
|
worst_eff_scope = max(scope_impact['ghg_efficiency'].items(), key=lambda x: x[1])[0]
|
|
|
|
f.write(f"The {best_eff_scope} scope achieves the best GHG efficiency (${scope_impact['ghg_efficiency'][best_eff_scope]:,.2f} per ton), while the {worst_eff_scope} scope has the least efficient outcomes (${scope_impact['ghg_efficiency'][worst_eff_scope]:,.2f} per ton).\n\n")
|
|
|
|
# Identify best and worst scopes for DAC benefit
|
|
if len(scope_impact['dac_benefit']) > 0:
|
|
best_dac_scope = max(scope_impact['dac_benefit'].items(), key=lambda x: x[1])[0]
|
|
worst_dac_scope = min(scope_impact['dac_benefit'].items(), key=lambda x: x[1])[0]
|
|
|
|
f.write(f"The {best_dac_scope} scope achieves the highest DAC benefit ({scope_impact['dac_benefit'][best_dac_scope]:.2f}%), while the {worst_dac_scope} scope has the lowest equity outcomes ({scope_impact['dac_benefit'][worst_dac_scope]:.2f}%).\n\n")
|
|
|
|
f.write("\n\n")
|
|
|
|
if 'regression_analysis' in rq1:
|
|
regression = rq1['regression_analysis']
|
|
|
|
f.write("### Regression Analysis\n\n")
|
|
|
|
if 'ghg_efficiency' in regression:
|
|
ghg_model = regression['ghg_efficiency']
|
|
|
|
if 'base_model' in ghg_model:
|
|
base = ghg_model['base_model']
|
|
f.write("#### GHG Efficiency Model\n\n")
|
|
f.write(f"A regression model predicting GHG efficiency from the number of agencies explains {base.get('r_squared', 0)*100:.1f}% of the variance (R²). ")
|
|
|
|
if base.get('p_value', 1) < 0.05:
|
|
f.write(f"The effect of agency count is statistically significant (p={base.get('p_value', 1):.4f}). ")
|
|
|
|
if base.get('coefficient', 0) > 0:
|
|
f.write("Each additional collaborating agency is associated with higher costs per ton of GHG reduction (less efficient).\n\n")
|
|
else:
|
|
f.write("Each additional collaborating agency is associated with lower costs per ton of GHG reduction (more efficient).\n\n")
|
|
else:
|
|
f.write(f"The effect of agency count is not statistically significant (p={base.get('p_value', 1):.4f}).\n\n")
|
|
|
|
if 'dac_benefit' in regression:
|
|
dac_model = regression['dac_benefit']
|
|
|
|
if 'base_model' in dac_model:
|
|
base = dac_model['base_model']
|
|
f.write("#### DAC Benefit Model\n\n")
|
|
f.write(f"A regression model predicting DAC benefit percentage from the number of agencies explains {base.get('r_squared', 0)*100:.1f}% of the variance (R²). ")
|
|
|
|
if base.get('p_value', 1) < 0.05:
|
|
f.write(f"The effect of agency count is statistically significant (p={base.get('p_value', 1):.4f}). ")
|
|
|
|
if base.get('coefficient', 0) > 0:
|
|
f.write("Each additional collaborating agency is associated with a higher DAC benefit percentage.\n\n")
|
|
else:
|
|
f.write("Each additional collaborating agency is associated with a lower DAC benefit percentage.\n\n")
|
|
else:
|
|
f.write(f"The effect of agency count is not statistically significant (p={base.get('p_value', 1):.4f}).\n\n")
|
|
|
|
# Research Question 2: Temporal Trends
|
|
f.write("## Research Question 2: Temporal Trends (2015-2024)\n\n")
|
|
|
|
if 'rq2' in findings:
|
|
rq2 = findings['rq2']
|
|
|
|
f.write("### Yearly Trends in Collaboration and Outcomes\n\n")
|
|
|
|
if 'yearly' in rq2:
|
|
yearly = rq2['yearly']
|
|
|
|
f.write("The analysis tracked changes in collaboration patterns and program outcomes from 2015 to 2024.\n\n")
|
|
|
|
f.write("\n\n")
|
|
|
|
f.write("### Pre-2020 vs. Post-2020 Comparison\n\n")
|
|
|
|
if 'period_comparison' in rq2:
|
|
period = rq2['period_comparison']
|
|
|
|
f.write("A comparison of the pre-2020 and post-2020 periods reveals significant changes in the CCI program.\n\n")
|
|
|
|
if 'project_count_change' in period:
|
|
change = period.get('project_count_change', 0)
|
|
pre_count = period['project_counts'].get('Pre-2020', 0)
|
|
post_count = period['project_counts'].get('Post-2020', 0)
|
|
|
|
f.write(f"The number of projects changed from {pre_count:,} (pre-2020) to {post_count:,} (post-2020), a {change:+.1f}% change.\n\n")
|
|
|
|
if 'avg_agencies_change' in period:
|
|
change = period.get('avg_agencies_change', 0)
|
|
pre_agencies = period['avg_agencies'].get('Pre-2020', 0)
|
|
post_agencies = period['avg_agencies'].get('Post-2020', 0)
|
|
|
|
f.write(f"The average number of agencies per program increased from {pre_agencies:.2f} to {post_agencies:.2f}, a {change:+.1f}% change. ")
|
|
|
|
if change > 0:
|
|
f.write("This indicates an increase in collaboration intensity after 2020.\n\n")
|
|
else:
|
|
f.write("This indicates a decrease in collaboration intensity after 2020.\n\n")
|
|
|
|
if 'avg_funding_change' in period:
|
|
change = period.get('avg_funding_change', 0)
|
|
pre_funding = period['avg_funding'].get('Pre-2020', 0)
|
|
post_funding = period['avg_funding'].get('Post-2020', 0)
|
|
|
|
f.write(f"The average project funding changed from ${pre_funding:,.2f} to ${post_funding:,.2f}, a {change:+.1f}% change.\n\n")
|
|
|
|
if 'ghg_efficiency_change' in period:
|
|
change = period.get('ghg_efficiency_change', 0)
|
|
pre_eff = period['median_ghg_efficiency'].get('Pre-2020', 0)
|
|
post_eff = period['median_ghg_efficiency'].get('Post-2020', 0)
|
|
|
|
f.write(f"The median GHG efficiency changed from ${pre_eff:,.2f} to ${post_eff:,.2f} per ton CO2e, a {change:+.1f}% change. ")
|
|
|
|
if change > 0:
|
|
f.write("This indicates a decrease in cost-effectiveness after 2020.\n\n")
|
|
else:
|
|
f.write("This indicates an increase in cost-effectiveness after 2020.\n\n")
|
|
|
|
if 'ghg_efficiency_test' in period and period['ghg_efficiency_test'].get('significant', False):
|
|
f.write("This change is statistically significant.\n\n")
|
|
|
|
if 'dac_benefit_change' in period:
|
|
change = period.get('dac_benefit_change', 0)
|
|
pre_dac = period['avg_dac_benefit'].get('Pre-2020', 0)
|
|
post_dac = period['avg_dac_benefit'].get('Post-2020', 0)
|
|
|
|
f.write(f"The average DAC benefit percentage changed from {pre_dac:.2f}% to {post_dac:.2f}%, a {change:+.1f}% change. ")
|
|
|
|
if change > 0:
|
|
f.write("This indicates improved equity outcomes after 2020.\n\n")
|
|
else:
|
|
f.write("This indicates decreased equity outcomes after 2020.\n\n")
|
|
|
|
if 'dac_benefit_test' in period and period['dac_benefit_test'].get('significant', False):
|
|
f.write("This change is statistically significant.\n\n")
|
|
|
|
f.write("\n\n")
|
|
|
|
if 'collaboration_trends' in rq2:
|
|
# Some logic here, or even just pass
|
|
pass # or continue handling 'collaboration_trends'
|
|
|
|
# Now define the function properly
|
|
def _plot_region_barh(metric_dict, xlabel, title, ascending=False):
    """
    Draw a horizontal bar chart of a per-region metric on the current axes.

    Bars are sorted by value (descending by default; pass ascending=True for
    metrics where lower is better, e.g. cost per ton).
    """
    items = sorted(metric_dict.items(), key=lambda kv: kv[1], reverse=not ascending)
    regions, values = zip(*items)
    plt.barh(regions, values)
    plt.xlabel(xlabel)
    plt.title(title)
    plt.grid(True, axis='x', alpha=0.3)


def analyze_regional_variations(df, output_dir):
    """
    Analyze regional variations in collaboration and outcomes.

    Computes per-region descriptive metrics (project counts, collaboration
    intensity, funding, GHG efficiency, DAC benefit), saves a 2x2 panel of
    regional bar charts, and — when efficiency, equity, and collaboration
    metrics are all available — analyzes the efficiency-equity tradeoff
    across regions, including a composite score and a scatter plot.

    Parameters:
        df (pd.DataFrame): The CCI data
        output_dir (Path): Directory to save outputs

    Returns:
        dict: Findings related to regional variations
    """
    logger.info("Analyzing regional variations")

    findings = {}

    # Exclude EV vouchers for this analysis if they're marked
    if 'is_ev_voucher' in df.columns:
        analysis_df = df[~df['is_ev_voucher']].copy()
        logger.info(f"Excluded {len(df) - len(analysis_df)} EV vouchers from analysis")
    else:
        analysis_df = df.copy()

    # Check if we have regional data
    if 'ca_region' not in analysis_df.columns:
        logger.error("California region data not available for regional analysis")
        return findings

    # 1. Calculate metrics by region
    regional_metrics = {}
    region_groups = analysis_df.groupby('ca_region')

    # Project counts by region
    regional_metrics['project_counts'] = region_groups.size().to_dict()
    regional_metrics['project_percentages'] = (region_groups.size() / len(analysis_df) * 100).to_dict()

    # Number of agencies by region
    if 'num_agencies_in_program' in analysis_df.columns:
        regional_metrics['avg_agencies'] = region_groups['num_agencies_in_program'].mean().to_dict()
        regional_metrics['multi_agency_percentage'] = (region_groups['is_multi_agency'].mean() * 100).to_dict()

    # Funding metrics by region
    funding_cols = [col for col in analysis_df.columns
                    if 'total_program' in col.lower() and 'funding' in col.lower()]
    if funding_cols:
        funding_col = funding_cols[0]
        regional_metrics['total_funding'] = region_groups[funding_col].sum().to_dict()
        regional_metrics['avg_funding'] = region_groups[funding_col].mean().to_dict()

        # Funding share — guard against a zero overall total (division by zero)
        total_funding = analysis_df[funding_col].sum()
        if total_funding > 0:
            regional_metrics['funding_percentages'] = {
                region: funding / total_funding * 100
                for region, funding in regional_metrics['total_funding'].items()
            }

    # GHG efficiency by region: valid positive values only, and median for
    # robustness to outliers
    if 'ghg_efficiency' in analysis_df.columns:
        valid_df = analysis_df[analysis_df['ghg_efficiency'].notna() & (analysis_df['ghg_efficiency'] > 0)]
        if len(valid_df) > 0:
            regional_metrics['median_ghg_efficiency'] = (
                valid_df.groupby('ca_region')['ghg_efficiency'].median().to_dict()
            )

    # DAC benefit by region
    if 'dac_benefit_percentage' in analysis_df.columns:
        regional_metrics['avg_dac_benefit'] = region_groups['dac_benefit_percentage'].mean().to_dict()

    findings['regional_metrics'] = regional_metrics

    # Create visualization for regional metrics (2x2 panel of bar charts)
    plt.figure(figsize=(15, 12))

    # 1. Project distribution by region
    plt.subplot(2, 2, 1)
    if 'project_counts' in regional_metrics:
        _plot_region_barh(regional_metrics['project_counts'],
                          'Number of Projects', 'Project Distribution by Region')

    # 2. Average number of agencies by region
    plt.subplot(2, 2, 2)
    if 'avg_agencies' in regional_metrics:
        _plot_region_barh(regional_metrics['avg_agencies'],
                          'Average Number of Agencies', 'Collaboration Intensity by Region')

    # 3. GHG efficiency by region (lower $/ton is better, so sort ascending)
    plt.subplot(2, 2, 3)
    if 'median_ghg_efficiency' in regional_metrics:
        _plot_region_barh(regional_metrics['median_ghg_efficiency'],
                          'GHG Efficiency ($ per ton CO2e)', 'GHG Efficiency by Region',
                          ascending=True)

    # 4. DAC benefit by region
    plt.subplot(2, 2, 4)
    if 'avg_dac_benefit' in regional_metrics:
        _plot_region_barh(regional_metrics['avg_dac_benefit'],
                          'DAC Benefit Percentage', 'DAC Benefit by Region')

    plt.tight_layout()
    plt.savefig(output_dir / "regional_metrics.png", dpi=300, bbox_inches='tight')
    plt.close()

    # 2. Analyze efficiency-equity tradeoffs by region
    if ('median_ghg_efficiency' in regional_metrics and
            'avg_dac_benefit' in regional_metrics and
            'avg_agencies' in regional_metrics):

        # BUGFIX: align all three metrics on the regions present in every dict.
        # Previously the lists were built independently from each dict, so a
        # region missing from one metric (median_ghg_efficiency is computed on
        # a filtered subset) silently misaligned the rows.
        common_regions = [r for r in regional_metrics['median_ghg_efficiency']
                          if r in regional_metrics['avg_dac_benefit']
                          and r in regional_metrics['avg_agencies']]

        tradeoff_df = pd.DataFrame({
            'region': common_regions,
            'efficiency': [regional_metrics['median_ghg_efficiency'][r] for r in common_regions],
            'dac_benefit': [regional_metrics['avg_dac_benefit'][r] for r in common_regions],
            'avg_agencies': [regional_metrics['avg_agencies'][r] for r in common_regions],
        })

        if len(tradeoff_df) > 0:
            # Composite score: min-max normalize each metric, invert efficiency
            # (lower cost per ton is better), then average with equal weights.
            # Guard the zero-range case (all regions tied) which previously
            # divided by zero; ties contribute a neutral 0.5.
            eff_range = tradeoff_df['efficiency'].max() - tradeoff_df['efficiency'].min()
            dac_range = tradeoff_df['dac_benefit'].max() - tradeoff_df['dac_benefit'].min()

            if eff_range > 0:
                tradeoff_df['efficiency_norm'] = (tradeoff_df['efficiency'] - tradeoff_df['efficiency'].min()) / eff_range
            else:
                tradeoff_df['efficiency_norm'] = 0.5

            if dac_range > 0:
                tradeoff_df['dac_norm'] = (tradeoff_df['dac_benefit'] - tradeoff_df['dac_benefit'].min()) / dac_range
            else:
                tradeoff_df['dac_norm'] = 0.5

            tradeoff_df['efficiency_inv'] = 1 - tradeoff_df['efficiency_norm']
            tradeoff_df['composite_score'] = (tradeoff_df['efficiency_inv'] + tradeoff_df['dac_norm']) / 2

            findings['efficiency_equity_tradeoff'] = {
                'metrics': tradeoff_df.to_dict(orient='records'),
                'top_regions': tradeoff_df.sort_values('composite_score', ascending=False)['region'].tolist()
            }

            # Correlation between efficiency and equity across regions.
            # pearsonr requires at least two observations.
            if len(tradeoff_df) >= 2:
                correlation, p_value = stats.pearsonr(tradeoff_df['efficiency'], tradeoff_df['dac_benefit'])
                findings['efficiency_equity_correlation'] = {
                    'correlation_type': 'Pearson',
                    'correlation': correlation,
                    'p_value': p_value,
                    'significant': p_value < 0.05
                }

            # Create visualization for efficiency-equity tradeoff
            plt.figure(figsize=(10, 8))

            # Scatter: bubble size encodes average number of agencies,
            # color encodes the composite score
            plt.scatter(
                tradeoff_df['efficiency'],
                tradeoff_df['dac_benefit'],
                s=tradeoff_df['avg_agencies'] * 50,
                alpha=0.7,
                c=tradeoff_df['composite_score'],
                cmap='viridis'
            )

            # Label each region next to its point
            for _, row in tradeoff_df.iterrows():
                plt.annotate(
                    row['region'],
                    (row['efficiency'], row['dac_benefit']),
                    textcoords="offset points",
                    xytext=(5, 5),
                    ha='left'
                )

            # Annotate the plot with the correlation, when computed
            if 'efficiency_equity_correlation' in findings:
                corr_info = findings['efficiency_equity_correlation']
                plt.text(0.05, 0.95,
                         f"Correlation: {corr_info['correlation']:.2f} (p={corr_info['p_value']:.4f})",
                         transform=plt.gca().transAxes, fontsize=10)

            plt.xlabel('GHG Efficiency ($ per ton CO2e)')
            plt.ylabel('DAC Benefit Percentage')
            plt.title('Efficiency-Equity Tradeoff by Region')
            plt.tight_layout()
            # BUGFIX: this figure was previously never saved or closed (leaked).
            # NOTE(review): filename chosen to match the other figures' naming
            # convention — confirm against downstream report references.
            plt.savefig(output_dir / "efficiency_equity_tradeoff.png", dpi=300, bbox_inches='tight')
            plt.close()

    # BUGFIX: the function previously fell off the end, implicitly returning
    # None and contradicting the documented dict return (the early-exit path
    # above already returns findings).
    return findings
|
|
|
|
def analyze_temporal_trends(df, output_dir):
|
|
...
|
|
|
|
"""
|
|
Analyze temporal trends in collaboration and outcomes from 2015-2024.
|
|
|
|
Parameters:
|
|
df (pd.DataFrame): The CCI data
|
|
output_dir (Path): Directory to save outputs
|
|
|
|
Returns:
|
|
dict: Findings related to temporal trends
|
|
"""
|
|
logger.info("Analyzing temporal trends (2015-2024)")
|
|
|
|
findings = {}
|
|
|
|
# Exclude EV vouchers for this analysis if they're marked
|
|
if 'is_ev_voucher' in df.columns:
|
|
analysis_df = df[~df['is_ev_voucher']].copy()
|
|
logger.info(f"Excluded {len(df) - len(analysis_df)} EV vouchers from analysis")
|
|
else:
|
|
analysis_df = df.copy()
|
|
|
|
# Check if we have the required temporal data
|
|
if 'funding_year' not in analysis_df.columns:
|
|
logger.error("Funding year data not available for temporal analysis")
|
|
return findings
|
|
|
|
# 1. Calculate yearly metrics
|
|
yearly_metrics = {}
|
|
|
|
# Group by funding year
|
|
yearly_groups = analysis_df.groupby('funding_year')
|
|
|
|
# Project counts by year
|
|
yearly_metrics['project_counts'] = yearly_groups.size().to_dict()
|
|
|
|
# Number of agencies by year
|
|
if 'num_agencies_in_program' in analysis_df.columns:
|
|
yearly_metrics['avg_agencies'] = yearly_groups['num_agencies_in_program'].mean().to_dict()
|
|
yearly_metrics['multi_agency_percentage'] = (yearly_groups['is_multi_agency'].mean() * 100).to_dict()
|
|
|
|
# Funding metrics by year
|
|
funding_col = [col for col in analysis_df.columns if 'total_program' in col.lower() and 'funding' in col.lower()]
|
|
if funding_col and len(funding_col) > 0:
|
|
funding_col = funding_col[0]
|
|
yearly_metrics['total_funding'] = yearly_groups[funding_col].sum().to_dict()
|
|
yearly_metrics['avg_funding'] = yearly_groups[funding_col].mean().to_dict()
|
|
|
|
# GHG efficiency by year
|
|
if 'ghg_efficiency' in analysis_df.columns:
|
|
# Use valid values only and median for robustness to outliers
|
|
yearly_valid = analysis_df[analysis_df['ghg_efficiency'].notna() & (analysis_df['ghg_efficiency'] > 0)]
|
|
yearly_efficiency = yearly_valid.groupby('funding_year')['ghg_efficiency'].median()
|
|
yearly_metrics['median_ghg_efficiency'] = yearly_efficiency.to_dict()
|
|
|
|
# DAC benefit by year
|
|
if 'dac_benefit_percentage' in analysis_df.columns:
|
|
yearly_metrics['avg_dac_benefit'] = yearly_groups['dac_benefit_percentage'].mean().to_dict()
|
|
|
|
findings['yearly'] = yearly_metrics
|
|
|
|
# Create visualization for yearly trends
|
|
plt.figure(figsize=(15, 12))
|
|
|
|
# 1. Number of agencies over time
|
|
plt.subplot(2, 2, 1)
|
|
if 'avg_agencies' in yearly_metrics:
|
|
years = sorted(yearly_metrics['avg_agencies'].keys())
|
|
values = [yearly_metrics['avg_agencies'][year] for year in years]
|
|
|
|
plt.plot(years, values, marker='o', linestyle='-', linewidth=2)
|
|
plt.axvline(x=2020, color='r', linestyle='--', alpha=0.7, label='2020')
|
|
plt.xlabel('Funding Year')
|
|
plt.ylabel('Average Number of Agencies')
|
|
plt.title('Collaboration Intensity Over Time')
|
|
plt.grid(True, alpha=0.3)
|
|
plt.legend()
|
|
|
|
# 2. Average project funding over time
|
|
plt.subplot(2, 2, 2)
|
|
if 'avg_funding' in yearly_metrics:
|
|
years = sorted(yearly_metrics['avg_funding'].keys())
|
|
values = [yearly_metrics['avg_funding'][year] for year in years]
|
|
|
|
plt.plot(years, values, marker='o', linestyle='-', linewidth=2)
|
|
plt.axvline(x=2020, color='r', linestyle='--', alpha=0.7, label='2020')
|
|
plt.xlabel('Funding Year')
|
|
plt.ylabel('Average Project Funding ($)')
|
|
plt.title('Project Funding Over Time')
|
|
plt.grid(True, alpha=0.3)
|
|
plt.legend()
|
|
|
|
# 3. GHG efficiency over time
|
|
plt.subplot(2, 2, 3)
|
|
if 'median_ghg_efficiency' in yearly_metrics:
|
|
years = sorted(yearly_metrics['median_ghg_efficiency'].keys())
|
|
values = [yearly_metrics['median_ghg_efficiency'][year] for year in years]
|
|
|
|
plt.plot(years, values, marker='o', linestyle='-', linewidth=2)
|
|
plt.axvline(x=2020, color='r', linestyle='--', alpha=0.7, label='2020')
|
|
plt.xlabel('Funding Year')
|
|
plt.ylabel('GHG Efficiency ($ per ton CO2e)')
|
|
plt.title('GHG Efficiency Over Time')
|
|
plt.grid(True, alpha=0.3)
|
|
plt.legend()
|
|
|
|
# 4. DAC benefit over time
|
|
plt.subplot(2, 2, 4)
|
|
if 'avg_dac_benefit' in yearly_metrics:
|
|
years = sorted(yearly_metrics['avg_dac_benefit'].keys())
|
|
values = [yearly_metrics['avg_dac_benefit'][year] for year in years]
|
|
|
|
plt.plot(years, values, marker='o', linestyle='-', linewidth=2)
|
|
plt.axvline(x=2020, color='r', linestyle='--', alpha=0.7, label='2020')
|
|
plt.xlabel('Funding Year')
|
|
plt.ylabel('DAC Benefit Percentage')
|
|
plt.title('DAC Benefit Over Time')
|
|
plt.grid(True, alpha=0.3)
|
|
plt.legend()
|
|
|
|
plt.tight_layout()
|
|
plt.savefig(output_dir / "yearly_trends.png", dpi=300, bbox_inches='tight')
|
|
plt.close()
|
|
|
|
# 2. Compare pre-2020 and post-2020 periods
|
|
if 'period' in analysis_df.columns:
|
|
period_metrics = {}
|
|
|
|
# Group by period
|
|
period_groups = analysis_df.groupby('period')
|
|
|
|
# Project counts by period
|
|
period_metrics['project_counts'] = period_groups.size().to_dict()
|
|
|
|
# Calculate percentage change from pre-2020 to post-2020
|
|
pre_count = period_metrics['project_counts'].get('Pre-2020', 0)
|
|
post_count = period_metrics['project_counts'].get('Post-2020', 0)
|
|
period_metrics['project_count_change'] = ((post_count - pre_count) / pre_count * 100) if pre_count > 0 else np.nan
|
|
|
|
# Number of agencies by period
|
|
if 'num_agencies_in_program' in analysis_df.columns:
|
|
period_metrics['avg_agencies'] = period_groups['num_agencies_in_program'].mean().to_dict()
|
|
period_metrics['multi_agency_percentage'] = (period_groups['is_multi_agency'].mean() * 100).to_dict()
|
|
|
|
# Calculate percentage change
|
|
pre_agencies = period_metrics['avg_agencies'].get('Pre-2020', 0)
|
|
post_agencies = period_metrics['avg_agencies'].get('Post-2020', 0)
|
|
period_metrics['avg_agencies_change'] = ((post_agencies - pre_agencies) / pre_agencies * 100) if pre_agencies > 0 else np.nan
|
|
|
|
# Funding metrics by period
|
|
if funding_col:
|
|
period_metrics['total_funding'] = period_groups[funding_col].sum().to_dict()
|
|
period_metrics['avg_funding'] = period_groups[funding_col].mean().to_dict()
|
|
|
|
# Calculate percentage change
|
|
pre_funding = period_metrics['avg_funding'].get('Pre-2020', 0)
|
|
post_funding = period_metrics['avg_funding'].get('Post-2020', 0)
|
|
period_metrics['avg_funding_change'] = ((post_funding - pre_funding) / pre_funding * 100) if pre_funding > 0 else np.nan
|
|
|
|
# GHG efficiency by period
|
|
if 'ghg_efficiency' in analysis_df.columns:
|
|
valid_df = analysis_df[analysis_df['ghg_efficiency'].notna() & (analysis_df['ghg_efficiency'] > 0)]
|
|
if len(valid_df) > 0:
|
|
period_efficiency = valid_df.groupby('period')['ghg_efficiency'].median()
|
|
period_metrics['median_ghg_efficiency'] = period_efficiency.to_dict()
|
|
|
|
# Calculate percentage change
|
|
pre_efficiency = period_metrics['median_ghg_efficiency'].get('Pre-2020', 0)
|
|
post_efficiency = period_metrics['median_ghg_efficiency'].get('Post-2020', 0)
|
|
period_metrics['ghg_efficiency_change'] = ((post_efficiency - pre_efficiency) / pre_efficiency * 100) if pre_efficiency > 0 else np.nan
|
|
|
|
# Statistical test
|
|
try:
|
|
pre_group = valid_df[valid_df['period'] == 'Pre-2020']['ghg_efficiency']
|
|
post_group = valid_df[valid_df['period'] == 'Post-2020']['ghg_efficiency']
|
|
|
|
if len(pre_group) > 0 and len(post_group) > 0:
|
|
u_stat, p_value = stats.mannwhitneyu(
|
|
pre_group,
|
|
post_group,
|
|
alternative='two-sided'
|
|
)
|
|
period_metrics['ghg_efficiency_test'] = {
|
|
'test': 'Mann-Whitney U',
|
|
'u_statistic': u_stat,
|
|
'p_value': p_value,
|
|
'significant': p_value < 0.05
|
|
}
|
|
except Exception as e:
|
|
logger.warning(f"Could not perform statistical test on GHG efficiency by period: {e}")
|
|
|
|
# DAC benefit by period
|
|
if 'dac_benefit_percentage' in analysis_df.columns:
|
|
period_metrics['avg_dac_benefit'] = period_groups['dac_benefit_percentage'].mean().to_dict()
|
|
|
|
# Calculate percentage change
|
|
pre_dac = period_metrics['avg_dac_benefit'].get('Pre-2020', 0)
|
|
post_dac = period_metrics['avg_dac_benefit'].get('Post-2020', 0)
|
|
period_metrics['dac_benefit_change'] = ((post_dac - pre_dac) / pre_dac * 100) if pre_dac > 0 else np.nan
|
|
|
|
# Statistical test
|
|
try:
|
|
pre_group = analysis_df[analysis_df['period'] == 'Pre-2020']['dac_benefit_percentage']
|
|
post_group = analysis_df[analysis_df['period'] == 'Post-2020']['dac_benefit_percentage']
|
|
|
|
if len(pre_group) > 0 and len(post_group) > 0:
|
|
t_stat, p_value = stats.ttest_ind(
|
|
pre_group,
|
|
post_group,
|
|
equal_var=False # Use Welch's t-test to handle unequal variances
|
|
)
|
|
period_metrics['dac_benefit_test'] = {
|
|
'test': "Welch's t-test",
|
|
't_statistic': t_stat,
|
|
'p_value': p_value,
|
|
'significant': p_value < 0.05
|
|
}
|
|
except Exception as e:
|
|
logger.warning(f"Could not perform statistical test on DAC benefit by period: {e}")
|
|
|
|
findings['period_comparison'] = period_metrics
|
|
|
|
# Create visualization for pre/post-2020 comparison
|
|
plt.figure(figsize=(12, 10))
|
|
|
|
# 1. Project counts by period
|
|
plt.subplot(2, 2, 1)
|
|
if 'project_counts' in period_metrics:
|
|
periods = ['Pre-2020', 'Post-2020']
|
|
counts = [period_metrics['project_counts'].get(period, 0) for period in periods]
|
|
|
|
plt.bar(periods, counts)
|
|
plt.xlabel('Period')
|
|
plt.ylabel('Number of Projects')
|
|
plt.title('Project Counts: Pre-2020 vs. Post-2020')
|
|
|
|
# Add percentage change
|
|
if 'project_count_change' in period_metrics:
|
|
change = period_metrics['project_count_change']
|
|
plt.text(1, counts[1] * 1.05, f"{change:+.1f}%", ha='center')
|
|
|
|
plt.grid(True, axis='y', alpha=0.3)
|
|
|
|
# 2. Average number of agencies by period
|
|
plt.subplot(2, 2, 2)
|
|
if 'avg_agencies' in period_metrics:
|
|
periods = ['Pre-2020', 'Post-2020']
|
|
values = [period_metrics['avg_agencies'].get(period, 0) for period in periods]
|
|
|
|
plt.bar(periods, values)
|
|
plt.xlabel('Period')
|
|
plt.ylabel('Average Number of Agencies')
|
|
plt.title('Collaboration Intensity: Pre-2020 vs. Post-2020')
|
|
|
|
# Add percentage change
|
|
if 'avg_agencies_change' in period_metrics:
|
|
change = period_metrics['avg_agencies_change']
|
|
plt.text(1, values[1] * 1.05, f"{change:+.1f}%", ha='center')
|
|
|
|
plt.grid(True, axis='y', alpha=0.3)
|
|
|
|
# 3. GHG efficiency by period
|
|
plt.subplot(2, 2, 3)
|
|
if 'median_ghg_efficiency' in period_metrics:
|
|
periods = ['Pre-2020', 'Post-2020']
|
|
values = [period_metrics['median_ghg_efficiency'].get(period, 0) for period in periods]
|
|
|
|
plt.bar(periods, values)
|
|
plt.xlabel('Period')
|
|
plt.ylabel('GHG Efficiency ($ per ton CO2e)')
|
|
plt.title('GHG Efficiency: Pre-2020 vs. Post-2020')
|
|
|
|
# Add percentage change
|
|
if 'ghg_efficiency_change' in period_metrics:
|
|
change = period_metrics['ghg_efficiency_change']
|
|
plt.text(1, values[1] * 1.05, f"{change:+.1f}%", ha='center')
|
|
|
|
# Add statistical significance
|
|
if 'ghg_efficiency_test' in period_metrics and period_metrics['ghg_efficiency_test'].get('significant', False):
|
|
plt.text(0.5, max(values) * 1.1, "Statistically significant difference (p<0.05)",
|
|
ha='center', fontsize=10)
|
|
|
|
plt.grid(True, axis='y', alpha=0.3)
|
|
|
|
# 4. DAC benefit by period
|
|
plt.subplot(2, 2, 4)
|
|
if 'avg_dac_benefit' in period_metrics:
|
|
periods = ['Pre-2020', 'Post-2020']
|
|
values = [period_metrics['avg_dac_benefit'].get(period, 0) for period in periods]
|
|
|
|
plt.bar(periods, values)
|
|
plt.xlabel('Period')
|
|
plt.ylabel('DAC Benefit Percentage')
|
|
plt.title('DAC Benefit: Pre-2020 vs. Post-2020')
|
|
|
|
# Add percentage change
|
|
if 'dac_benefit_change' in period_metrics:
|
|
change = period_metrics['dac_benefit_change']
|
|
plt.text(1, values[1] * 1.05, f"{change:+.1f}%", ha='center')
|
|
|
|
# Add statistical significance
|
|
if 'dac_benefit_test' in period_metrics and period_metrics['dac_benefit_test'].get('significant', False):
|
|
plt.text(0.5, max(values) * 1.1, "Statistically significant difference (p<0.05)",
|
|
ha='center', fontsize=10)
|
|
|
|
plt.grid(True, axis='y', alpha=0.3)
|
|
|
|
plt.tight_layout()
|
|
plt.savefig(output_dir / "pre_post_2020_comparison.png", dpi=300, bbox_inches='tight')
|
|
plt.close()
|
|
|
|
# 3. Analyze changes in collaboration patterns over time
|
|
if 'num_agencies_in_program' in analysis_df.columns and 'funding_year' in analysis_df.columns:
|
|
# Create a time series of collaboration indicators
|
|
collab_trends = {}
|
|
|
|
# Average agencies over time
|
|
collab_trends['avg_agencies'] = yearly_metrics.get('avg_agencies', {})
|
|
|
|
# Percentage of multi-agency projects over time
|
|
collab_trends['multi_agency_percentage'] = yearly_metrics.get('multi_agency_percentage', {})
|
|
|
|
# Analyze correlation between time and collaboration intensity
|
|
try:
|
|
years = sorted(collab_trends['avg_agencies'].keys())
|
|
values = [collab_trends['avg_agencies'][year] for year in years]
|
|
|
|
# Calculate correlation
|
|
correlation, p_value = stats.pearsonr(years, values)
|
|
|
|
collab_trends['time_correlation'] = {
|
|
'correlation_type': 'Pearson',
|
|
'correlation': correlation,
|
|
'p_value': p_value,
|
|
'significant': p_value < 0.05
|
|
}
|
|
except Exception as e:
|
|
logger.warning(f"Could not calculate correlation between time and collaboration: {e}")
|
|
|
|
# Regional distribution over time (if available)
|
|
if 'regional_scope' in analysis_df.columns:
|
|
# Create a cross-tabulation of funding year and regional scope
|
|
try:
|
|
crosstab = pd.crosstab(
|
|
analysis_df['funding_year'],
|
|
analysis_df['regional_scope'],
|
|
normalize='index'
|
|
).multiply(100)
|
|
|
|
collab_trends['regional_distribution'] = {}
|
|
|
|
for year in crosstab.index:
|
|
collab_trends['regional_distribution'][int(year)] = {}
|
|
for scope in crosstab.columns:
|
|
collab_trends['regional_distribution'][int(year)][scope] = crosstab.loc[year, scope]
|
|
except Exception as e:
|
|
logger.warning(f"Error calculating regional distribution over time: {e}")
|
|
|
|
findings['collaboration_trends'] = collab_trends
|
|
|
|
# Create visualization for collaboration trends
|
|
plt.figure(figsize=(12, 10))
|
|
|
|
# 1. Average agencies and multi-agency percentage over time
|
|
plt.subplot(2, 1, 1)
|
|
if 'avg_agencies' in collab_trends and 'multi_agency_percentage' in collab_trends:
|
|
years = sorted(collab_trends['avg_agencies'].keys())
|
|
avg_values = [collab_trends['avg_agencies'][year] for year in years]
|
|
pct_values = [collab_trends['multi_agency_percentage'][year] for year in years]
|
|
|
|
# Create figure with two y-axes
|
|
fig, ax1 = plt.subplots(figsize=(10, 6))
|
|
|
|
# Plot average agencies
|
|
color = 'tab:blue'
|
|
ax1.set_xlabel('Funding Year')
|
|
ax1.set_ylabel('Average Number of Agencies', color=color)
|
|
ax1.plot(years, avg_values, color=color, marker='o', linestyle='-', linewidth=2)
|
|
ax1.tick_params(axis='y', labelcolor=color)
|
|
|
|
# Create second y-axis for percentage
|
|
ax2 = ax1.twinx()
|
|
color = 'tab:red'
|
|
ax2.set_ylabel('Multi-Agency Projects (%)', color=color)
|
|
ax2.plot(years, pct_values, color=color, marker='s', linestyle='-', linewidth=2)
|
|
ax2.tick_params(axis='y', labelcolor=color)
|
|
|
|
# Add vertical line at 2020
|
|
plt.axvline(x=2020, color='black', linestyle='--', alpha=0.7)
|
|
|
|
# Add correlation information
|
|
if 'time_correlation' in collab_trends:
|
|
corr = collab_trends['time_correlation']['correlation']
|
|
p_val = collab_trends['time_correlation']['p_value']
|
|
plt.figtext(0.15, 0.85, f"Time-Collaboration Correlation: {corr:.2f} (p={p_val:.4f})",
|
|
fontsize=10, bbox=dict(facecolor='white', alpha=0.8))
|
|
|
|
plt.title('Collaboration Intensity Over Time')
|
|
fig.tight_layout()
|
|
plt.savefig(output_dir / "collaboration_trends.png", dpi=300, bbox_inches='tight')
|
|
plt.close()
|
|
|
|
# 2. Regional distribution over time (if available)
|
|
if 'regional_distribution' in collab_trends:
|
|
try:
|
|
# Convert the nested dict to a DataFrame for easier plotting
|
|
region_years = []
|
|
region_scopes = []
|
|
region_values = []
|
|
|
|
for year, scopes in collab_trends['regional_distribution'].items():
|
|
for scope, value in scopes.items():
|
|
region_years.append(year)
|
|
region_scopes.append(scope)
|
|
region_values.append(value)
|
|
|
|
region_df = pd.DataFrame({
|
|
'Year': region_years,
|
|
'Scope': region_scopes,
|
|
'Percentage': region_values
|
|
})
|
|
|
|
# Pivot for stacked area chart
|
|
pivot_df = region_df.pivot(index='Year', columns='Scope', values='Percentage')
|
|
|
|
# Define the order for regional scope
|
|
scope_order = ['Single County', 'Limited Regional', 'Regional', 'Multi-Regional']
|
|
|
|
# Reorder columns if possible
|
|
ordered_columns = [col for col in scope_order if col in pivot_df.columns]
|
|
missing_columns = [col for col in pivot_df.columns if col not in ordered_columns]
|
|
pivot_df = pivot_df[ordered_columns + missing_columns]
|
|
|
|
# Create stacked area chart
|
|
plt.figure(figsize=(10, 6))
|
|
plt.stackplot(pivot_df.index, pivot_df.values.T, labels=pivot_df.columns, alpha=0.7)
|
|
|
|
plt.xlabel('Funding Year')
|
|
plt.ylabel('Percentage of Projects')
|
|
plt.title('Regional Scope Distribution Over Time')
|
|
plt.legend(loc='upper left')
|
|
plt.grid(True, alpha=0.3)
|
|
|
|
# Add vertical line at 2020
|
|
plt.axvline(x=2020, color='black', linestyle='--', alpha=0.7)
|
|
|
|
plt.savefig(output_dir / "regional_trends.png", dpi=300, bbox_inches='tight')
|
|
plt.close()
|
|
except Exception as e:
|
|
logger.warning(f"Error creating regional distribution visualization: {e}")
|
|
|
|
# 4. Time series regression analysis
|
|
time_regression = {}
|
|
|
|
if 'num_agencies_in_program' in analysis_df.columns and 'funding_year' in analysis_df.columns:
|
|
try:
|
|
# Create time variable and center it at 2020 for easier interpretation
|
|
analysis_df['year_centered'] = analysis_df['funding_year'] - 2020
|
|
|
|
# Regression model predicting number of agencies from year
|
|
model1 = sm.OLS.from_formula('num_agencies_in_program ~ year_centered', data=analysis_df)
|
|
results1 = model1.fit()
|
|
|
|
time_regression['agencies_model'] = {
|
|
'r_squared': results1.rsquared,
|
|
'coefficient': results1.params['year_centered'],
|
|
'p_value': results1.pvalues['year_centered'],
|
|
'significant': results1.pvalues['year_centered'] < 0.05
|
|
}
|
|
|
|
# Interpret the coefficient
|
|
coef = results1.params['year_centered']
|
|
if coef > 0:
|
|
time_regression['agencies_model']['interpretation'] = f"The number of agencies per program increases by {coef:.3f} each year."
|
|
else:
|
|
time_regression['agencies_model']['interpretation'] = f"The number of agencies per program decreases by {abs(coef):.3f} each year."
|
|
except Exception as e:
|
|
logger.warning(f"Could not perform time series regression for number of agencies: {e}")
|
|
|
|
# Regression model predicting GHG efficiency from year
|
|
if 'ghg_efficiency' in analysis_df.columns and 'funding_year' in analysis_df.columns:
|
|
try:
|
|
# Use log transformation for efficiency
|
|
valid_df = analysis_df[analysis_df['ghg_efficiency'].notna() & (analysis_df['ghg_efficiency'] > 0)].copy()
|
|
|
|
if len(valid_df) > 0:
|
|
valid_df['log_efficiency'] = np.log(valid_df['ghg_efficiency'])
|
|
valid_df['year_centered'] = valid_df['funding_year'] - 2020
|
|
|
|
model2 = sm.OLS.from_formula('log_efficiency ~ year_centered', data=valid_df)
|
|
results2 = model2.fit()
|
|
|
|
time_regression['efficiency_model'] = {
|
|
'r_squared': results2.rsquared,
|
|
'coefficient': results2.params['year_centered'],
|
|
'p_value': results2.pvalues['year_centered'],
|
|
'significant': results2.pvalues['year_centered'] < 0.05
|
|
}
|
|
|
|
# Interpret the coefficient (exponentiate for percentage change)
|
|
coef = results2.params['year_centered']
|
|
pct_change = (np.exp(coef) - 1) * 100
|
|
|
|
if pct_change > 0:
|
|
time_regression['efficiency_model']['interpretation'] = f"GHG efficiency worsens by approximately {pct_change:.1f}% each year (higher $ per ton)."
|
|
else:
|
|
time_regression['efficiency_model']['interpretation'] = f"GHG efficiency improves by approximately {abs(pct_change):.1f}% each year (lower $ per ton)."
|
|
except Exception as e:
|
|
logger.warning(f"Could not perform time series regression for GHG efficiency: {e}")
|
|
|
|
# Regression model predicting DAC benefit from year
|
|
if 'dac_benefit_percentage' in analysis_df.columns and 'funding_year' in analysis_df.columns:
|
|
try:
|
|
# Create time variable
|
|
analysis_df['year_centered'] = analysis_df['funding_year'] - 2020
|
|
|
|
model3 = sm.OLS.from_formula('dac_benefit_percentage ~ year_centered', data=analysis_df)
|
|
results3 = model3.fit()
|
|
|
|
time_regression['dac_model'] = {
|
|
'r_squared': results3.rsquared,
|
|
'coefficient': results3.params['year_centered'],
|
|
'p_value': results3.pvalues['year_centered'],
|
|
'significant': results3.pvalues['year_centered'] < 0.05
|
|
}
|
|
|
|
# Interpret the coefficient
|
|
coef = results3.params['year_centered']
|
|
if coef > 0:
|
|
time_regression['dac_model']['interpretation'] = f"DAC benefit percentage increases by {coef:.3f} percentage points each year."
|
|
else:
|
|
time_regression['dac_model']['interpretation'] = f"DAC benefit percentage decreases by {abs(coef):.3f} percentage points each year."
|
|
except Exception as e:
|
|
logger.warning(f"Could not perform time series regression for DAC benefit: {e}")
|
|
|
|
findings['time_regression'] = time_regression
|
|
|
|
return findings
|
|
|
|
import pandas as pd
|
|
import numpy as np
|
|
import matplotlib.pyplot as plt
|
|
import seaborn as sns
|
|
from pathlib import Path
|
|
import logging
|
|
import statsmodels.api as sm
|
|
import statsmodels.formula.api as smf
|
|
from statsmodels.stats.anova import anova_lm
|
|
from scipy import stats
|
|
|
|
# Module-wide logging setup: timestamped records at INFO level, plus a
# dedicated logger for this analysis module.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger("cci_research_analysis")
|
|
|
|
def analyze_research_questions(data_path, output_path="./output/research"):
    """
    Analyze the key research questions about collaboration in CCI.

    Loads the cleaned CCI dataset, runs each research-question analysis in
    turn, and writes figures plus a consolidated report under ``output_path``.

    Parameters:
    data_path (str): Path to the cleaned CCI data file
    output_path (str): Path to save outputs

    Returns:
    dict: Dictionary containing research findings, or None if loading or
        analysis fails
    """
    logger.info(f"Loading data from {data_path}")

    # Bail out early if the dataset cannot be read at all.
    try:
        frame = pd.read_csv(data_path)
    except Exception as e:
        logger.error(f"Error loading data: {e}")
        return None

    logger.info(f"Successfully loaded {len(frame)} rows")

    # Ensure the destination directory exists before any figure is written.
    output_dir = Path(output_path)
    output_dir.mkdir(parents=True, exist_ok=True)

    results = {}

    try:
        # Research Question 1: How does structure and scale of collaboration
        # affect outcomes?
        results['rq1'] = analyze_collaboration_structure_impact(frame, output_dir)

        # Research Question 2: What temporal trends have occurred 2015-2024?
        results['rq2'] = analyze_temporal_trends(frame, output_dir)

        # Additional analysis of regional patterns and variations.
        results['regional_patterns'] = analyze_regional_variations(frame, output_dir)

        # Consolidate all findings into a written report.
        generate_research_report(results, output_dir)

        logger.info(f"Analysis complete, results saved to {output_dir}")

        return results
    except Exception as e:
        # Top-level boundary: log the failure and signal it with None.
        logger.error(f"Error in research analysis: {e}")
        return None
|
|
|
|
|
|
def analyze_collaboration_structure_impact(df, output_dir):
    """
    Analyze how collaboration structure and scale affect GHG efficiency and equity.

    Compares multi-agency vs. single-agency programs, relates the number of
    agencies and the regional scope of a program to GHG efficiency ($/ton
    CO2e) and DAC benefit percentage, and fits OLS regressions of those
    outcomes on collaboration factors.

    Side effects: saves comparison figures (PNG) into ``output_dir``.

    Parameters:
    df (pd.DataFrame): The CCI data
    output_dir (Path): Directory to save outputs

    Returns:
    dict: Findings related to collaboration structure impact
    """
    logger.info("Analyzing collaboration structure impact")

    findings = {}

    # Exclude EV vouchers for this analysis if they're marked
    if 'is_ev_voucher' in df.columns:
        analysis_df = df[~df['is_ev_voucher']].copy()
        logger.info(f"Excluded {len(df) - len(analysis_df)} EV vouchers from analysis")
    else:
        analysis_df = df.copy()

    # 1. Compare multi-agency vs single-agency projects
    if 'is_multi_agency' in analysis_df.columns:
        multi_df = analysis_df[analysis_df['is_multi_agency']]
        single_df = analysis_df[~analysis_df['is_multi_agency']]

        findings['project_counts'] = {
            'multi_agency': len(multi_df),
            'single_agency': len(single_df),
            'multi_agency_percentage': len(multi_df) / len(analysis_df) * 100 if len(analysis_df) > 0 else 0
        }

        # GHG efficiency comparison
        if 'ghg_efficiency' in analysis_df.columns:
            # Use valid efficiency data only (strictly positive, non-null,
            # so log-scale plotting below is well defined)
            valid_multi = multi_df[multi_df['ghg_efficiency'].notna() & (multi_df['ghg_efficiency'] > 0)]
            valid_single = single_df[single_df['ghg_efficiency'].notna() & (single_df['ghg_efficiency'] > 0)]

            if len(valid_multi) > 0 and len(valid_single) > 0:
                # Calculate median efficiency ($ per ton CO2e); medians are
                # preferred because efficiency is highly right-skewed
                multi_efficiency = valid_multi['ghg_efficiency'].median()
                single_efficiency = valid_single['ghg_efficiency'].median()

                findings['ghg_efficiency'] = {
                    'multi_agency_median': multi_efficiency,
                    'single_agency_median': single_efficiency,
                    'difference': single_efficiency - multi_efficiency,
                    'difference_percentage': ((single_efficiency - multi_efficiency) / single_efficiency * 100) if single_efficiency > 0 else 0
                }

                # Statistical test (Mann-Whitney U test for non-parametric comparison)
                try:
                    u_stat, p_value = stats.mannwhitneyu(
                        valid_multi['ghg_efficiency'],
                        valid_single['ghg_efficiency'],
                        alternative='two-sided'
                    )
                    findings['ghg_efficiency']['statistical_test'] = {
                        'test': 'Mann-Whitney U',
                        'u_statistic': u_stat,
                        'p_value': p_value,
                        'significant': p_value < 0.05
                    }
                except Exception as e:
                    logger.warning(f"Could not perform statistical test on efficiency comparison: {e}")

                # Create visualization for efficiency comparison
                plt.figure(figsize=(10, 6))

                # Use log scale for better visualization since efficiency can
                # be highly skewed; cap the bin range at the 95th percentile
                bins = np.logspace(
                    np.log10(min(valid_multi['ghg_efficiency'].min(), valid_single['ghg_efficiency'].min())),
                    np.log10(max(valid_multi['ghg_efficiency'].quantile(0.95), valid_single['ghg_efficiency'].quantile(0.95))),
                    30
                )

                plt.hist(valid_single['ghg_efficiency'], bins=bins, alpha=0.5, label=f'Single-Agency (Median: ${single_efficiency:.2f})')
                plt.hist(valid_multi['ghg_efficiency'], bins=bins, alpha=0.5, label=f'Multi-Agency (Median: ${multi_efficiency:.2f})')

                plt.xscale('log')
                plt.xlabel('GHG Efficiency ($ per ton CO2e) - Log Scale')
                plt.ylabel('Number of Projects')
                plt.title('GHG Efficiency: Multi-Agency vs. Single-Agency Programs')
                plt.legend()
                plt.grid(True, alpha=0.3)

                # Save the figure
                plt.savefig(output_dir / "ghg_efficiency_comparison.png", dpi=300, bbox_inches='tight')
                plt.close()

        # DAC benefit comparison
        if 'dac_benefit_percentage' in analysis_df.columns:
            # Compute each group mean once (pandas mean() already skips NaN)
            multi_dac_mean = multi_df['dac_benefit_percentage'].mean()
            single_dac_mean = single_df['dac_benefit_percentage'].mean()
            findings['dac_benefit'] = {
                'multi_agency_mean': multi_dac_mean,
                'single_agency_mean': single_dac_mean,
                'difference': multi_dac_mean - single_dac_mean
            }

            # Statistical test
            try:
                # FIX: drop NaNs first — ttest_ind propagates NaN by default,
                # which made the test statistic/p-value NaN whenever any
                # project lacked a DAC benefit value.
                t_stat, p_value = stats.ttest_ind(
                    multi_df['dac_benefit_percentage'].dropna(),
                    single_df['dac_benefit_percentage'].dropna(),
                    equal_var=False  # Use Welch's t-test to handle unequal variances
                )
                findings['dac_benefit']['statistical_test'] = {
                    'test': "Welch's t-test",
                    't_statistic': t_stat,
                    'p_value': p_value,
                    'significant': p_value < 0.05
                }
            except Exception as e:
                logger.warning(f"Could not perform statistical test on DAC benefit comparison: {e}")

            # Create visualization for DAC benefit comparison
            plt.figure(figsize=(10, 6))

            # Create a boxplot with outlier points
            sns.boxplot(x='is_multi_agency', y='dac_benefit_percentage', data=analysis_df)

            # Add a swarm plot of the data points
            sns.swarmplot(x='is_multi_agency', y='dac_benefit_percentage', data=analysis_df,
                          alpha=0.5, size=4, edgecolor='black', linewidth=0.5)

            # Set labels and title
            plt.xlabel('Multi-Agency Program')
            plt.ylabel('DAC Benefit Percentage')
            plt.title('DAC Benefit: Multi-Agency vs. Single-Agency Programs')
            plt.xticks([0, 1], ['Single-Agency', 'Multi-Agency'])
            plt.grid(True, axis='y', alpha=0.3)

            # Save the figure
            plt.savefig(output_dir / "dac_benefit_comparison.png", dpi=300, bbox_inches='tight')
            plt.close()

    # 2. Analyze relationship between number of agencies and outcomes
    if 'num_agencies_in_program' in analysis_df.columns:
        # Group by number of agencies
        agency_groups = analysis_df.groupby('num_agencies_in_program')

        # Calculate metrics for each group
        agency_metrics = {}

        # Project counts by number of agencies
        agency_counts = agency_groups.size()
        agency_metrics['project_counts'] = agency_counts.to_dict()

        # Filter to groups with at least 5 projects for more reliable metrics
        valid_groups = agency_counts[agency_counts >= 5].index

        # Calculate GHG efficiency by number of agencies
        if 'ghg_efficiency' in analysis_df.columns:
            efficiency_by_agency = agency_groups['ghg_efficiency'].median()
            efficiency_by_agency = efficiency_by_agency[valid_groups]
            agency_metrics['ghg_efficiency'] = efficiency_by_agency.to_dict()

            # Calculate correlation using Spearman rank correlation
            # (more robust to non-linear relationships and outliers)
            valid_df = analysis_df[analysis_df['ghg_efficiency'].notna() & (analysis_df['ghg_efficiency'] > 0)]
            if len(valid_df) > 0:
                spearman_corr, p_value = stats.spearmanr(
                    valid_df['num_agencies_in_program'],
                    valid_df['ghg_efficiency']
                )
                agency_metrics['efficiency_correlation'] = {
                    'correlation_type': 'Spearman rank',
                    'correlation': spearman_corr,
                    'p_value': p_value,
                    'significant': p_value < 0.05
                }

        # Calculate DAC benefit by number of agencies
        if 'dac_benefit_percentage' in analysis_df.columns:
            dac_by_agency = agency_groups['dac_benefit_percentage'].mean()
            dac_by_agency = dac_by_agency[valid_groups]
            agency_metrics['dac_benefit'] = dac_by_agency.to_dict()

            # Calculate correlation
            valid_df = analysis_df[analysis_df['dac_benefit_percentage'].notna()]
            if len(valid_df) > 0:
                spearman_corr, p_value = stats.spearmanr(
                    valid_df['num_agencies_in_program'],
                    valid_df['dac_benefit_percentage']
                )
                agency_metrics['dac_correlation'] = {
                    'correlation_type': 'Spearman rank',
                    'correlation': spearman_corr,
                    'p_value': p_value,
                    'significant': p_value < 0.05
                }

        findings['agency_count_impact'] = agency_metrics

        # Create visualization for agency count impact
        plt.figure(figsize=(12, 10))

        # 1. GHG Efficiency by number of agencies
        plt.subplot(2, 1, 1)
        if 'ghg_efficiency' in agency_metrics:
            efficiency_df = pd.DataFrame(agency_metrics['ghg_efficiency'].items(), columns=['num_agencies', 'efficiency'])
            efficiency_df = efficiency_df.sort_values('num_agencies')

            plt.bar(efficiency_df['num_agencies'], efficiency_df['efficiency'])
            plt.xlabel('Number of Agencies in Program')
            plt.ylabel('GHG Efficiency ($ per ton CO2e)')
            plt.title('GHG Efficiency by Number of Agencies')

            # Add correlation information
            if 'efficiency_correlation' in agency_metrics:
                corr = agency_metrics['efficiency_correlation']['correlation']
                p_val = agency_metrics['efficiency_correlation']['p_value']
                plt.text(0.05, 0.95, f"Correlation: {corr:.2f} (p={p_val:.4f})",
                         transform=plt.gca().transAxes, fontsize=10,
                         bbox=dict(facecolor='white', alpha=0.8))

            plt.grid(True, axis='y', alpha=0.3)

        # 2. DAC Benefit by number of agencies
        plt.subplot(2, 1, 2)
        if 'dac_benefit' in agency_metrics:
            dac_df = pd.DataFrame(agency_metrics['dac_benefit'].items(), columns=['num_agencies', 'dac_benefit'])
            dac_df = dac_df.sort_values('num_agencies')

            plt.bar(dac_df['num_agencies'], dac_df['dac_benefit'])
            plt.xlabel('Number of Agencies in Program')
            plt.ylabel('DAC Benefit Percentage')
            plt.title('DAC Benefit by Number of Agencies')

            # Add correlation information
            if 'dac_correlation' in agency_metrics:
                corr = agency_metrics['dac_correlation']['correlation']
                p_val = agency_metrics['dac_correlation']['p_value']
                plt.text(0.05, 0.95, f"Correlation: {corr:.2f} (p={p_val:.4f})",
                         transform=plt.gca().transAxes, fontsize=10,
                         bbox=dict(facecolor='white', alpha=0.8))

            plt.grid(True, axis='y', alpha=0.3)

        plt.tight_layout()
        plt.savefig(output_dir / "agency_count_impact.png", dpi=300, bbox_inches='tight')
        plt.close()

    # 3. Analyze impact of regional scope
    if 'regional_scope' in analysis_df.columns:
        # Group by regional scope
        scope_groups = analysis_df.groupby('regional_scope')

        # Calculate metrics for each scope
        scope_metrics = {}

        # Project counts by regional scope
        scope_counts = scope_groups.size()
        scope_metrics['project_counts'] = scope_counts.to_dict()

        # Average number of agencies by regional scope
        if 'num_agencies_in_program' in analysis_df.columns:
            scope_agencies = scope_groups['num_agencies_in_program'].mean()
            scope_metrics['avg_agencies'] = scope_agencies.to_dict()

        # Calculate GHG efficiency by regional scope
        if 'ghg_efficiency' in analysis_df.columns:
            valid_df = analysis_df[analysis_df['ghg_efficiency'].notna() & (analysis_df['ghg_efficiency'] > 0)]
            if len(valid_df) > 0:
                scope_efficiency = valid_df.groupby('regional_scope')['ghg_efficiency'].median()
                scope_metrics['ghg_efficiency'] = scope_efficiency.to_dict()

        # Calculate DAC benefit by regional scope
        if 'dac_benefit_percentage' in analysis_df.columns:
            scope_dac = scope_groups['dac_benefit_percentage'].mean()
            scope_metrics['dac_benefit'] = scope_dac.to_dict()

        findings['regional_scope_impact'] = scope_metrics

        # Create visualization for regional scope impact
        plt.figure(figsize=(12, 10))

        # 1. Average number of agencies by regional scope
        plt.subplot(2, 2, 1)
        if 'avg_agencies' in scope_metrics:
            scope_df = pd.DataFrame(scope_metrics['avg_agencies'].items(),
                                    columns=['regional_scope', 'avg_agencies'])

            # Define the order for regional scope (unknown scopes sort first)
            scope_order = ['Single County', 'Limited Regional', 'Regional', 'Multi-Regional']
            scope_df['scope_order'] = scope_df['regional_scope'].apply(lambda x: scope_order.index(x) if x in scope_order else -1)
            scope_df = scope_df.sort_values('scope_order')

            plt.bar(scope_df['regional_scope'], scope_df['avg_agencies'])
            plt.xlabel('Regional Scope')
            plt.ylabel('Average Number of Agencies')
            plt.title('Agency Collaboration by Regional Scope')
            plt.xticks(rotation=45)
            plt.grid(True, axis='y', alpha=0.3)

        # 2. GHG Efficiency by regional scope
        plt.subplot(2, 2, 2)
        if 'ghg_efficiency' in scope_metrics:
            scope_df = pd.DataFrame(scope_metrics['ghg_efficiency'].items(),
                                    columns=['regional_scope', 'efficiency'])

            # Define the order for regional scope
            scope_order = ['Single County', 'Limited Regional', 'Regional', 'Multi-Regional']
            scope_df['scope_order'] = scope_df['regional_scope'].apply(lambda x: scope_order.index(x) if x in scope_order else -1)
            scope_df = scope_df.sort_values('scope_order')

            plt.bar(scope_df['regional_scope'], scope_df['efficiency'])
            plt.xlabel('Regional Scope')
            plt.ylabel('GHG Efficiency ($ per ton CO2e)')
            plt.title('GHG Efficiency by Regional Scope')
            plt.xticks(rotation=45)
            plt.grid(True, axis='y', alpha=0.3)

        # 3. DAC Benefit by regional scope
        plt.subplot(2, 2, 3)
        if 'dac_benefit' in scope_metrics:
            scope_df = pd.DataFrame(scope_metrics['dac_benefit'].items(),
                                    columns=['regional_scope', 'dac_benefit'])

            # Define the order for regional scope
            scope_order = ['Single County', 'Limited Regional', 'Regional', 'Multi-Regional']
            scope_df['scope_order'] = scope_df['regional_scope'].apply(lambda x: scope_order.index(x) if x in scope_order else -1)
            scope_df = scope_df.sort_values('scope_order')

            plt.bar(scope_df['regional_scope'], scope_df['dac_benefit'])
            plt.xlabel('Regional Scope')
            plt.ylabel('DAC Benefit Percentage')
            plt.title('DAC Benefit by Regional Scope')
            plt.xticks(rotation=45)
            plt.grid(True, axis='y', alpha=0.3)

        # 4. Efficiency vs. Equity by Regional Scope (scatter plot)
        plt.subplot(2, 2, 4)
        if 'ghg_efficiency' in scope_metrics and 'dac_benefit' in scope_metrics and 'avg_agencies' in scope_metrics:
            # Combine the metrics; dict iteration order is shared because all
            # three dicts were built from the same groupby keys
            combined_df = pd.DataFrame({
                'regional_scope': list(scope_metrics['ghg_efficiency'].keys()),
                'efficiency': list(scope_metrics['ghg_efficiency'].values()),
                'dac_benefit': list(scope_metrics['dac_benefit'].values()),
                'avg_agencies': list(scope_metrics['avg_agencies'].values())
            })

            # Create scatter plot
            plt.scatter(
                combined_df['efficiency'],
                combined_df['dac_benefit'],
                s=combined_df['avg_agencies'] * 50,  # Size based on number of agencies
                alpha=0.7
            )

            # Add labels for each point
            for i, row in combined_df.iterrows():
                plt.annotate(
                    row['regional_scope'],
                    (row['efficiency'], row['dac_benefit']),
                    textcoords="offset points",
                    xytext=(5, 5),
                    ha='left'
                )

            plt.xlabel('GHG Efficiency ($ per ton CO2e)')
            plt.ylabel('DAC Benefit Percentage')
            plt.title('Efficiency vs. Equity by Regional Scope')
            plt.grid(True, linestyle='--', alpha=0.7)

        plt.tight_layout()
        plt.savefig(output_dir / "regional_scope_impact.png", dpi=300, bbox_inches='tight')
        plt.close()

    # 4. Regression analysis of collaboration factors on outcomes
    # Create models to predict GHG efficiency and DAC benefit
    regression_results = {}

    # Filter out missing values and extreme outliers
    if 'ghg_efficiency' in analysis_df.columns:
        ghg_model_df = analysis_df[
            analysis_df['ghg_efficiency'].notna() &
            (analysis_df['ghg_efficiency'] > 0) &
            (analysis_df['ghg_efficiency'] < analysis_df['ghg_efficiency'].quantile(0.95))
        ].copy()

        if 'num_agencies_in_program' in ghg_model_df.columns:
            # Log transform the GHG efficiency for better model fit
            ghg_model_df['log_efficiency'] = np.log(ghg_model_df['ghg_efficiency'])

            # Create model predicting log efficiency
            try:
                # Base model with just number of agencies
                model1 = sm.OLS.from_formula('log_efficiency ~ num_agencies_in_program', data=ghg_model_df)
                results1 = model1.fit()

                # Model with number of agencies and regional scope (if available)
                if 'regional_scope' in ghg_model_df.columns:
                    # Convert categorical to dummy variables (numeric so the
                    # formula treats them as plain regressors)
                    ghg_model_df = pd.get_dummies(ghg_model_df, columns=['regional_scope'], drop_first=True, dtype=float)

                    # Get dummy variable columns
                    scope_cols = [col for col in ghg_model_df.columns if col.startswith('regional_scope_')]

                    # FIX: dummy names contain spaces/hyphens (e.g.
                    # 'regional_scope_Limited Regional'), which break the patsy
                    # formula parser — quote each column with Q("...").
                    formula2 = 'log_efficiency ~ num_agencies_in_program + ' + ' + '.join(f'Q("{col}")' for col in scope_cols)
                    model2 = sm.OLS.from_formula(formula2, data=ghg_model_df)
                    results2 = model2.fit()

                    # Store model results
                    regression_results['ghg_efficiency'] = {
                        'base_model': {
                            'r_squared': results1.rsquared,
                            'r_squared_adj': results1.rsquared_adj,
                            'p_value': results1.f_pvalue,
                            'coefficient': results1.params['num_agencies_in_program'],
                            'coefficient_p_value': results1.pvalues['num_agencies_in_program']
                        },
                        'full_model': {
                            'r_squared': results2.rsquared,
                            'r_squared_adj': results2.rsquared_adj,
                            'p_value': results2.f_pvalue,
                            'coefficient': results2.params['num_agencies_in_program'],
                            'coefficient_p_value': results2.pvalues['num_agencies_in_program']
                        }
                    }
                else:
                    # Store just the base model results
                    regression_results['ghg_efficiency'] = {
                        'base_model': {
                            'r_squared': results1.rsquared,
                            'r_squared_adj': results1.rsquared_adj,
                            'p_value': results1.f_pvalue,
                            'coefficient': results1.params['num_agencies_in_program'],
                            'coefficient_p_value': results1.pvalues['num_agencies_in_program']
                        }
                    }
            except Exception as e:
                logger.warning(f"Could not perform regression analysis for GHG efficiency: {e}")

    # Regression for DAC benefit
    if 'dac_benefit_percentage' in analysis_df.columns:
        dac_model_df = analysis_df[
            analysis_df['dac_benefit_percentage'].notna()
        ].copy()

        if 'num_agencies_in_program' in dac_model_df.columns:
            try:
                # Base model with just number of agencies
                model1 = sm.OLS.from_formula('dac_benefit_percentage ~ num_agencies_in_program', data=dac_model_df)
                results1 = model1.fit()

                # Model with number of agencies and regional scope (if available)
                if 'regional_scope' in dac_model_df.columns:
                    # Convert categorical to dummy variables (numeric so the
                    # formula treats them as plain regressors)
                    dac_model_df = pd.get_dummies(dac_model_df, columns=['regional_scope'], drop_first=True, dtype=float)

                    # Get dummy variable columns
                    scope_cols = [col for col in dac_model_df.columns if col.startswith('regional_scope_')]

                    # FIX: quote dummy column names with Q("...") — see the GHG
                    # regression above for why unquoted names break patsy.
                    formula2 = 'dac_benefit_percentage ~ num_agencies_in_program + ' + ' + '.join(f'Q("{col}")' for col in scope_cols)
                    model2 = sm.OLS.from_formula(formula2, data=dac_model_df)
                    results2 = model2.fit()

                    # Store model results
                    regression_results['dac_benefit'] = {
                        'base_model': {
                            'r_squared': results1.rsquared,
                            'r_squared_adj': results1.rsquared_adj,
                            'p_value': results1.f_pvalue,
                            'coefficient': results1.params['num_agencies_in_program'],
                            'coefficient_p_value': results1.pvalues['num_agencies_in_program']
                        },
                        'full_model': {
                            'r_squared': results2.rsquared,
                            'r_squared_adj': results2.rsquared_adj,
                            'p_value': results2.f_pvalue,
                            'coefficient': results2.params['num_agencies_in_program'],
                            'coefficient_p_value': results2.pvalues['num_agencies_in_program']
                        }
                    }
                else:
                    # Store just the base model results
                    regression_results['dac_benefit'] = {
                        'base_model': {
                            'r_squared': results1.rsquared,
                            'r_squared_adj': results1.rsquared_adj,
                            'p_value': results1.f_pvalue,
                            'coefficient': results1.params['num_agencies_in_program'],
                            'coefficient_p_value': results1.pvalues['num_agencies_in_program']
                        }
                    }
            except Exception as e:
                logger.warning(f"Could not perform regression analysis for DAC benefit: {e}")

    findings['regression_analysis'] = regression_results

    return findings