from __future__ import annotations from dataclasses import dataclass from typing import Dict, List, Optional, Any, Tuple, Union from datetime import datetime, timedelta import os import json import logging import warnings from pathlib import Path import numpy as np import pandas as pd import geopandas as gpd from sqlalchemy import create_engine, text, Engine import matplotlib.pyplot as plt import seaborn as sns from sklearn.preprocessing import StandardScaler from sklearn.metrics.pairwise import cosine_similarity from sklearn.linear_model import LinearRegression import statsmodels.api as sm import statsmodels.formula.api as smf from scipy.stats import pearsonr, spearmanr # Configure logging logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' ) logger = logging.getLogger(__name__) # Suppress pandas warnings warnings.filterwarnings('ignore') # Custom exceptions class CCIAnalyzerError(Exception): """Base exception class for CCIAnalyzer.""" pass class DataLoadError(CCIAnalyzerError): """Raised when data loading fails.""" pass class ConfigError(CCIAnalyzerError): """Raised when configuration is invalid.""" pass class AnalysisError(CCIAnalyzerError): """Raised when analysis fails.""" pass @dataclass class ProjectData: """Data class for CCI project information.""" project_id: str reporting_cycle: int agency_name: str program_name: str sub_program_name: Optional[str] record_type: str census_tract: Optional[int] lat_long: Optional[str] senate_district: Optional[int] assembly_district: Optional[int] county: str total_project_cost: float total_ggrf_funding: float project_life_years: int total_project_ghg_reductions: float annual_project_ghg_reductions: float fiscal_year_funding: str is_benefit_dac: bool total_ggrf_dac_funding: Optional[float] funding_benefiting_dac: Optional[float] date_operational: Optional[datetime] project_completion_date: Optional[datetime] is_ab1550_buffer_region: Optional[bool] is_benefit_dac1550: Optional[bool] is_low_income_communities: Optional[bool] @dataclass class Config: """Configuration settings for CCIAnalyzer.""" data_path: Path output_path: Path = Path("./output") shapefile_path: Optional[Path] = None ej_data_path: Optional[Path] = None cache_dir: Path = Path("./cache") chunk_size: int = 10000 start_year: int = 2015 end_year: int = 2024 temporal_breakpoint: int = 2020 # Year to use as pre/post analysis breakpoint class CCIAnalyzer: """ Analyzes California Climate Investments (CCI) program data with focus on: 1. Inter-agency collaboration and its impacts on GHG reduction and equity 2. Temporal trends in funding allocation and collaborative outputs 3. Spatial patterns in program implementation Attributes: config (Config): Configuration settings data (Dict): Dictionary containing loaded datasets """ def __init__(self, data_path: Union[str, Path], output_path: Optional[Union[str, Path]] = None, shapefile_path: Optional[Union[str, Path]] = None, ej_data_path: Optional[Union[str, Path]] = None, start_year: int = 2015, end_year: int = 2024, temporal_breakpoint: int = 2020, chunk_size: int = 10000): """ Initialize CCIAnalyzer with paths to data sources and configuration. Args: data_path: Path to the main CCI program data CSV file output_path: Directory to save analysis outputs shapefile_path: Path to geographic shapefiles (optional) ej_data_path: Path to environmental justice data (optional) start_year: Starting year for analysis end_year: Ending year for analysis temporal_breakpoint: Year to use as pre/post analysis breakpoint chunk_size: Size of chunks for reading large datasets """ try: # Convert string paths to Path objects if needed data_path = Path(data_path) if isinstance(data_path, str) else data_path if output_path: output_path = Path(output_path) if isinstance(output_path, str) else output_path else: output_path = Path("./output") if shapefile_path: shapefile_path = Path(shapefile_path) if isinstance(shapefile_path, str) else shapefile_path if ej_data_path: ej_data_path = Path(ej_data_path) if isinstance(ej_data_path, str) else ej_data_path # Initialize configuration self.config = Config( data_path=data_path, output_path=output_path, shapefile_path=shapefile_path, ej_data_path=ej_data_path, cache_dir=Path('./cache'), chunk_size=chunk_size, start_year=start_year, end_year=end_year, temporal_breakpoint=temporal_breakpoint ) # Create output and cache directories self.config.output_path.mkdir(parents=True, exist_ok=True) self.config.cache_dir.mkdir(parents=True, exist_ok=True) # Initialize data dictionary self.data: Dict[str, pd.DataFrame] = {} self.geo_data: Dict[str, gpd.GeoDataFrame] = {} logger.info("Initializing CCIAnalyzer...") self._initialize_data() except Exception as e: logger.error(f"Initialization error: {e}", exc_info=True) raise ConfigError(f"Failed to initialize CCIAnalyzer: {str(e)}") def _initialize_data(self): """Load and process required data.""" try: # Load core dataset self.data['cci_projects'] = self._load_cci_data() # Process the data to create derived datasets self._process_cci_data() # Load geographic data if available if self.config.shapefile_path: self._load_geographic_data() # Load environmental justice data if available if self.config.ej_data_path: self._load_ej_data() # Create analysis-ready datasets self._create_analysis_datasets() logger.info("Data initialization complete") except Exception as e: logger.error(f"Error during data initialization: {e}", exc_info=True) raise DataLoadError(f"Failed to initialize data: {str(e)}") def _load_cci_data(self) -> pd.DataFrame: """ Load CCI program data from CSV with proper handling of data types and chunking for large files. Returns: DataFrame containing CCI project data """ try: logger.info(f"Loading CCI data from {self.config.data_path}...") # Define data types for efficient loading dtypes = { 'Project IDNumber': str, 'Reporting Cycle Name': 'Int64', 'Agency Name': str, 'Program Name': str, 'Sub Program Name': str, 'Record Type': str, 'Census Tract': 'Int64', 'Lat Long': str, 'Senate District': 'Int64', 'Assembly District': 'Int64', 'County': str, 'Total Project Cost': 'float64', 'Total Program GGRFFunding': 'float64', 'Project Life Years': 'Int64', 'Total Project GHGReductions': 'float64', 'Annual Project GHGReductions': 'float64', 'Project Count': 'Int64', 'Fiscal Year Funding Project': str, 'Is Benefit Disadvantaged Communities': bool, 'Disadvantaged Community Criteria': str, 'Total GGRFDisadvantaged Community Funding': 'float64', 'Funding Benefiting Disadvantaged Communities': 'float64', 'Funding Within Disadvantage Communities': 'float64', 'Date Operational': str, 'Project Completion Date': str, 'Is AB1550Buffer Region': bool, 'Is Benefit DAC1550Communities': bool, 'Is Low Income Communities': bool } # Parse dates date_cols = ['Date Operational', 'Project Completion Date'] # Read in chunks for large files df_chunks = [] # Use chunking to handle large files chunk_iter = pd.read_csv( self.config.data_path, dtype=dtypes, parse_dates=date_cols, chunksize=self.config.chunk_size ) for chunk in chunk_iter: df_chunks.append(chunk) # Combine chunks df = pd.concat(df_chunks, ignore_index=True) # Basic data cleaning df = self._clean_cci_data(df) logger.info(f"Loaded {len(df)} CCI projects") return df except Exception as e: logger.error(f"Error loading CCI data: {e}", exc_info=True) raise DataLoadError(f"Failed to load CCI data: {str(e)}") def _clean_cci_data(self, df: pd.DataFrame) -> pd.DataFrame: """ Clean and prepare CCI data. Args: df: Raw CCI data DataFrame Returns: Cleaned DataFrame """ try: # Rename columns for consistency and readability column_mapping = { 'Project IDNumber': 'project_id', 'Reporting Cycle Name': 'reporting_cycle', 'Agency Name': 'agency_name', 'Program Name': 'program_name', 'Sub Program Name': 'sub_program_name', 'Record Type': 'record_type', 'Census Tract': 'census_tract', 'Lat Long': 'lat_long', 'Senate District': 'senate_district', 'Assembly District': 'assembly_district', 'County': 'county', 'Total Project Cost': 'total_project_cost', 'Total Program GGRFFunding': 'total_ggrf_funding', 'Project Life Years': 'project_life_years', 'Total Project GHGReductions': 'total_project_ghg_reductions', 'Annual Project GHGReductions': 'annual_project_ghg_reductions', 'Project Count': 'project_count', 'Fiscal Year Funding Project': 'fiscal_year_funding', 'Is Benefit Disadvantaged Communities': 'is_benefit_dac', 'Disadvantaged Community Criteria': 'dac_criteria', 'Total GGRFDisadvantaged Community Funding': 'total_ggrf_dac_funding', 'Funding Benefiting Disadvantaged Communities': 'funding_benefiting_dac', 'Funding Within Disadvantage Communities': 'funding_within_dac', 'Date Operational': 'date_operational', 'Project Completion Date': 'project_completion_date', 'Is AB1550Buffer Region': 'is_ab1550_buffer_region', 'Is Benefit DAC1550Communities': 'is_benefit_dac1550', 'Is Low Income Communities': 'is_low_income_communities' } # Rename columns df = df.rename(columns=column_mapping) # Extract latitude and longitude from combined field if 'lat_long' in df.columns: df['latitude'] = df['lat_long'].str.split(',').str[1].str.strip().astype(float) df['longitude'] = df['lat_long'].str.split(',').str[0].str.strip().astype(float) # Convert fiscal year to calendar year if 'fiscal_year_funding' in df.columns: # Extract the starting year from fiscal year strings like "2019-20" df['funding_year'] = df['fiscal_year_funding'].str.split('-').str[0].astype('Int64') # Calculate GHG efficiency ($ spent per ton of GHG reduced) df['ghg_efficiency'] = np.where( df['total_project_ghg_reductions'] > 0, df['total_ggrf_funding'] / df['total_project_ghg_reductions'], np.nan ) # Calculate DAC benefit percentage df['dac_benefit_percentage'] = np.where( df['total_ggrf_funding'] > 0, 100 * df['funding_benefiting_dac'] / df['total_ggrf_funding'], 0 ) # Create pre/post temporal breakpoint indicator if 'funding_year' in df.columns: df['post_breakpoint'] = df['funding_year'] >= self.config.temporal_breakpoint # Drop rows with missing critical values df = df.dropna(subset=['project_id', 'agency_name', 'program_name']) return df except Exception as e: logger.error(f"Error cleaning CCI data: {e}", exc_info=True) raise DataLoadError(f"Failed to clean CCI data: {str(e)}") def _process_cci_data(self): """Process the CCI data to create derived analytical datasets.""" try: if 'cci_projects' not in self.data or self.data['cci_projects'].empty: raise DataLoadError("CCI project data not loaded") df = self.data['cci_projects'] # Create agency collaboration dataset self._create_agency_collaboration_data(df) # Create regional dataset self._create_regional_data(df) # Create temporal dataset self._create_temporal_data(df) except Exception as e: logger.error(f"Error processing CCI data: {e}", exc_info=True) raise DataLoadError(f"Failed to process CCI data: {str(e)}") def _create_agency_collaboration_data(self, df: pd.DataFrame): """ Create agency collaboration metrics from CCI data. Args: df: CCI projects DataFrame """ try: # Group projects by program to identify unique agencies per program program_agencies = df.groupby('program_name')['agency_name'].unique().apply(list) # Calculate number of agencies per program program_agency_counts = program_agencies.apply(len) # Create program-level collaboration metrics program_collab = pd.DataFrame({ 'program_name': program_agency_counts.index, 'num_partner_agencies': program_agency_counts.values }) # Add collaboration size categories program_collab['collab_size_category'] = pd.cut( program_collab['num_partner_agencies'], bins=[0, 1, 3, 5, float('inf')], labels=['Single agency', 'Small collab (2-3)', 'Medium collab (4-5)', 'Large collab (6+)'] ) # Store the result self.data['agency_collaboration'] = program_collab # Add collaboration metrics back to the main dataset program_collab_dict = dict(zip(program_collab['program_name'], program_collab['num_partner_agencies'])) df['num_partner_agencies'] = df['program_name'].map(program_collab_dict) # Update the main dataset self.data['cci_projects'] = df logger.info(f"Created agency collaboration data for {len(program_collab)} programs") except Exception as e: logger.error(f"Error creating agency collaboration data: {e}", exc_info=True) raise DataLoadError(f"Failed to create agency collaboration data: {str(e)}") def _create_regional_data(self, df: pd.DataFrame): """ Create regional dataset for spatial analysis. Args: df: CCI projects DataFrame """ try: # Define regions based on counties # California regions mapping (simplified example) ca_regions = { 'Bay Area': ['Alameda', 'Contra Costa', 'Marin', 'Napa', 'San Francisco', 'San Mateo', 'Santa Clara', 'Solano', 'Sonoma'], 'Sacramento Region': ['Sacramento', 'Yolo', 'Placer', 'El Dorado', 'Sutter', 'Yuba'], 'San Joaquin Valley': ['Fresno', 'Kern', 'Kings', 'Madera', 'Merced', 'San Joaquin', 'Stanislaus', 'Tulare'], 'Southern California': ['Los Angeles', 'Orange', 'Riverside', 'San Bernardino', 'Ventura', 'Imperial', 'San Diego'], 'Central Coast': ['Monterey', 'San Benito', 'San Luis Obispo', 'Santa Barbara', 'Santa Cruz'], 'Northern California': ['Butte', 'Colusa', 'Del Norte', 'Glenn', 'Humboldt', 'Lake', 'Lassen', 'Mendocino', 'Modoc', 'Nevada', 'Plumas', 'Shasta', 'Sierra', 'Siskiyou', 'Tehama', 'Trinity'] } # Create reverse mapping: county -> region county_to_region = {} for region, counties in ca_regions.items(): for county in counties: county_to_region[county] = region # Apply region mapping df['region'] = df['county'].map(county_to_region) # Handle multi-county projects multi_county_mask = df['county'].str.contains(',', na=False) df.loc[multi_county_mask, 'region'] = 'Multi-Region' # Fill missing regions df['region'] = df['region'].fillna('Other/Unknown') # Create regional summary regional_summary = df.groupby('region').agg({ 'project_id': 'count', 'total_ggrf_funding': 'sum', 'total_project_ghg_reductions': 'sum', 'funding_benefiting_dac': 'sum', 'num_partner_agencies': 'mean', 'ghg_efficiency': 'mean', 'dac_benefit_percentage': 'mean' }).reset_index() regional_summary = regional_summary.rename(columns={ 'project_id': 'num_projects', 'num_partner_agencies': 'avg_partners' }) # Calculate total GHG reduction and DAC benefit percentages by region regional_summary['pct_total_funding'] = 100 * regional_summary['total_ggrf_funding'] / regional_summary['total_ggrf_funding'].sum() regional_summary['pct_total_ghg'] = 100 * regional_summary['total_project_ghg_reductions'] / regional_summary['total_project_ghg_reductions'].sum() regional_summary['pct_dac_funding'] = 100 * regional_summary['funding_benefiting_dac'] / regional_summary['total_ggrf_funding'] # Store the results self.data['cci_projects'] = df # Update with region info self.data['regional_summary'] = regional_summary logger.info(f"Created regional data with {len(regional_summary)} regions") except Exception as e: logger.error(f"Error creating regional data: {e}", exc_info=True) raise DataLoadError(f"Failed to create regional data: {str(e)}") def _create_temporal_data(self, df: pd.DataFrame): """ Create temporal dataset for time series analysis. Args: df: CCI projects DataFrame """ try: # Ensure we have a funding year column if 'funding_year' not in df.columns: raise DataLoadError("Funding year column not available for temporal analysis") # Create year-level summaries yearly_summary = df.groupby('funding_year').agg({ 'project_id': 'count', 'total_ggrf_funding': 'sum', 'total_project_ghg_reductions': 'sum', 'funding_benefiting_dac': 'sum', 'num_partner_agencies': 'mean', 'ghg_efficiency': 'mean', 'dac_benefit_percentage': 'mean' }).reset_index() yearly_summary = yearly_summary.rename(columns={ 'project_id': 'num_projects', 'num_partner_agencies': 'avg_partners' }) # Calculate percentages and rates yearly_summary['pct_dac_funding'] = 100 * yearly_summary['funding_benefiting_dac'] / yearly_summary['total_ggrf_funding'] yearly_summary['avg_project_funding'] = yearly_summary['total_ggrf_funding'] / yearly_summary['num_projects'] # Create pre/post breakpoint summaries breakpoint_summary = df.groupby('post_breakpoint').agg({ 'project_id': 'count', 'total_ggrf_funding': 'sum', 'total_project_ghg_reductions': 'sum', 'funding_benefiting_dac': 'sum', 'num_partner_agencies': 'mean', 'ghg_efficiency': 'mean', 'dac_benefit_percentage': 'mean', 'total_ggrf_funding': ['mean', 'median'] }).reset_index() # Flatten multi-level columns after aggregation breakpoint_summary.columns = ['_'.join(col).strip('_') for col in breakpoint_summary.columns.values] # Store the results self.data['yearly_summary'] = yearly_summary self.data['breakpoint_summary'] = breakpoint_summary logger.info(f"Created temporal data with {len(yearly_summary)} yearly records") except Exception as e: logger.error(f"Error creating temporal data: {e}", exc_info=True) raise DataLoadError(f"Failed to create temporal data: {str(e)}") def _load_geographic_data(self): """Load geographic shapefiles if available.""" try: if not self.config.shapefile_path or not self.config.shapefile_path.exists(): logger.warning("Shapefile path not provided or does not exist") return # Load county boundaries county_shapefile = self.config.shapefile_path / "counties.shp" if county_shapefile.exists(): self.geo_data['counties'] = gpd.read_file(county_shapefile) logger.info(f"Loaded {len(self.geo_data['counties'])} county boundaries") # Load census tract boundaries tract_shapefile = self.config.shapefile_path / "census_tracts.shp" if tract_shapefile.exists(): self.geo_data['census_tracts'] = gpd.read_file(tract_shapefile) logger.info(f"Loaded {len(self.geo_data['census_tracts'])} census tract boundaries") except Exception as e: logger.error(f"Error loading geographic data: {e}", exc_info=True) logger.warning("Continuing without geographic data") def _load_ej_data(self): """Load environmental justice data if available.""" try: if not self.config.ej_data_path or not self.config.ej_data_path.exists(): logger.warning("Environmental justice data path not provided or does not exist") return # Load EJ data by census tract ej_data_file = self.config.ej_data_path / "ej_indicators.csv" if ej_data_file.exists(): ej_data = pd.read_csv(ej_data_file) self.data['ej_indicators'] = ej_data logger.info(f"Loaded environmental justice data for {len(ej_data)} census tracts") except Exception as e: logger.error(f"Error loading environmental justice data: {e}", exc_info=True) logger.warning("Continuing without environmental justice data") def _create_analysis_datasets(self): """Create integrated datasets for analysis.""" try: # Create collaboration outcomes dataset self._create_collaboration_outcomes_dataset() # Create regional collaboration dataset self._create_regional_collaboration_dataset() # Create temporal trends dataset self._create_temporal_trends_dataset() except Exception as e: logger.error(f"Error creating analysis datasets: {e}", exc_info=True) raise DataLoadError(f"Failed to create analysis datasets: {str(e)}") def _create_collaboration_outcomes_dataset(self): """Create dataset linking collaboration metrics to outcomes.""" try: if 'cci_projects' not in self.data: raise DataLoadError("CCI project data not loaded") df = self.data['cci_projects'] # Group by program and calculate metrics collab_outcomes = df.groupby(['program_name', 'num_partner_agencies']).agg({ 'total_ggrf_funding': 'sum', 'total_project_ghg_reductions': 'sum', 'funding_benefiting_dac': 'sum', 'project_id': 'count', 'ghg_efficiency': 'mean', 'dac_benefit_percentage': 'mean' }).reset_index() collab_outcomes = collab_outcomes.rename(columns={ 'project_id': 'num_projects' }) # Calculate percentages collab_outcomes['pct_dac_funding'] = 100 * collab_outcomes['funding_benefiting_dac'] / collab_outcomes['total_ggrf_funding'] # Add collaboration size categories collab_outcomes['collab_size_category'] = pd.cut( collab_outcomes['num_partner_agencies'], bins=[0, 1, 3, 5, float('inf')], labels=['Single agency', 'Small collab (2-3)', 'Medium collab (4-5)', 'Large collab (6+)'] ) # Store the result self.data['collaboration_outcomes'] = collab_outcomes logger.info(f"Created collaboration outcomes dataset with {len(collab_outcomes)} records") except Exception as e: logger.error(f"Error creating collaboration outcomes dataset: {e}", exc_info=True) raise DataLoadError(f"Failed to create collaboration outcomes dataset: {str(e)}") def _create_regional_collaboration_dataset(self): """Create dataset linking regional and collaboration patterns.""" try: if 'cci_projects' not in self.data: raise DataLoadError("CCI project data not loaded") df = self.data['cci_projects'] # Group by region and collaboration size regional_collab = df.groupby(['region', 'num_partner_agencies']).agg({ 'total_ggrf_funding': 'sum', 'total_project_ghg_reductions': 'sum', 'funding_benefiting_dac': 'sum', 'project_id': 'count', 'ghg_efficiency': 'mean', 'dac_benefit_percentage': 'mean' }).reset_index() regional_collab = regional_collab.rename(columns={ 'project_id': 'num_projects' }) # Calculate percentages regional_collab['pct_dac_funding'] = 100 * regional_collab['funding_benefiting_dac'] / regional_collab['total_ggrf_funding'] # Add collaboration size categories regional_collab['collab_size_category'] = pd.cut( regional_collab['num_partner_agencies'], bins=[0, 1, 3, 5, float('inf')], labels=['Single agency', 'Small collab (2-3)', 'Medium collab (4-5)', 'Large collab (6+)'] ) # Store the result self.data['regional_collaboration'] = regional_collab logger.info(f"Created regional collaboration dataset with {len(regional_collab)} records") except Exception as e: logger.error(f"Error creating regional collaboration dataset: {e}", exc_info=True) raise DataLoadError(f"Failed to create regional collaboration dataset: {str(e)}") def _create_temporal_trends_dataset(self): """Create dataset for temporal trend analysis.""" try: if 'cci_projects' not in self.data: raise DataLoadError("CCI project data not loaded") df = self.data['cci_projects'] # Group by year and collaboration size temporal_collab = df.groupby(['funding_year', 'num_partner_agencies']).agg({ 'total_ggrf_funding': 'sum', 'total_project_ghg_reductions': 'sum', 'funding_benefiting_dac': 'sum', 'project_id': 'count', 'ghg_efficiency': 'mean', 'dac_benefit_percentage': 'mean' }).reset_index() temporal_collab = temporal_collab.rename(columns={ 'project_id': 'num_projects' }) # Calculate percentages temporal_collab['pct_dac_funding'] = 100 * temporal_collab['funding_benefiting_dac'] / temporal_collab['total_ggrf_funding'] # Add collaboration size categories temporal_collab['collab_size_category'] = pd.cut( temporal_collab['num_partner_agencies'], bins=[0, 1, 3, 5, float('inf')], labels=['Single agency', 'Small collab (2-3)', 'Medium collab (4-5)', 'Large collab (6+)'] ) # Store the result self.data['temporal_collaboration'] = temporal_collab logger.info(f"Created temporal collaboration dataset with {len(temporal_collab)} records") except Exception as e: logger.error(f"Error creating temporal trends dataset: {e}", exc_info=True) raise DataLoadError(f"Failed to create temporal trends dataset: {str(e)}") # Analysis methods for Research Question 1: Collaboration structure and scale def analyze_collaboration_structure(self) -> Dict[str, Any]: """ Analyze how the structure and scale of inter-agency collaboration shapes outcomes. Returns: Dictionary containing collaboration structure analysis results """ try: if 'collaboration_outcomes' not in self.data: raise AnalysisError("Collaboration outcomes dataset not available") df = self.data['collaboration_outcomes'] # Calculate correlations between collaboration size and outcomes corr_metrics = ['num_partner_agencies', 'ghg_efficiency', 'pct_dac_funding'] correlations = df[corr_metrics].corr(method='spearman') # Aggregate by collaboration size category collab_size_analysis = df.groupby('collab_size_category').agg({ 'total_ggrf_funding': 'sum', 'total_project_ghg_reductions': 'sum', 'funding_benefiting_dac': 'sum', 'num_projects': 'sum', 'ghg_efficiency': 'mean', 'dac_benefit_percentage': 'mean', 'program_name': 'nunique' }).reset_index() # Calculate percentages and metrics collab_size_analysis['pct_total_funding'] = 100 * collab_size_analysis['total_ggrf_funding'] / collab_size_analysis['total_ggrf_funding'].sum() collab_size_analysis['pct_total_ghg'] = 100 * collab_size_analysis['total_project_ghg_reductions'] / collab_size_analysis['total_project_ghg_reductions'].sum() collab_size_analysis['pct_dac_funding'] = 100 * collab_size_analysis['funding_benefiting_dac'] / collab_size_analysis['total_ggrf_funding'] # Regression analysis of collaboration size on outcomes X = sm.add_constant(df['num_partner_agencies']) # GHG efficiency regression y_ghg = df['ghg_efficiency'] model_ghg = sm.OLS(y_ghg, X).fit() # DAC benefit regression y_dac = df['pct_dac_funding'] model_dac = sm.OLS(y_dac, X).fit() # Prepare results results = { 'correlations': correlations.to_dict(), 'collaboration_size_summary': collab_size_analysis.to_dict('records'), 'ghg_efficiency_regression': { 'coefficients': model_ghg.params.to_dict(), 'p_values': model_ghg.pvalues.to_dict(), 'r_squared': model_ghg.rsquared, 'adj_r_squared': model_ghg.rsquared_adj }, 'dac_benefit_regression': { 'coefficients': model_dac.params.to_dict(), 'p_values': model_dac.pvalues.to_dict(), 'r_squared': model_dac.rsquared, 'adj_r_squared': model_dac.rsquared_adj } } logger.info("Completed collaboration structure analysis") return results except Exception as e: logger.error(f"Error analyzing collaboration structure: {e}", exc_info=True) raise AnalysisError(f"Failed to analyze collaboration structure: {str(e)}") def analyze_regional_patterns(self) -> Dict[str, Any]: """ Analyze regional patterns in collaboration and outcomes. Returns: Dictionary containing regional analysis results """ try: if 'regional_collaboration' not in self.data: raise AnalysisError("Regional collaboration dataset not available") df = self.data['regional_collaboration'] # Regional collaboration summary regional_summary = df.groupby('region').agg({ 'num_partner_agencies': 'mean', 'total_ggrf_funding': 'sum', 'total_project_ghg_reductions': 'sum', 'funding_benefiting_dac': 'sum', 'ghg_efficiency': 'mean', 'pct_dac_funding': 'mean', 'num_projects': 'sum' }).reset_index() # Calculate percentages regional_summary['pct_total_funding'] = 100 * regional_summary['total_ggrf_funding'] / regional_summary['total_ggrf_funding'].sum() regional_summary['pct_total_ghg'] = 100 * regional_summary['total_project_ghg_reductions'] / regional_summary['total_project_ghg_reductions'].sum() # Analyze Multi-Region projects separately multi_region = df[df['region'] == 'Multi-Region'] single_region = df[df['region'] != 'Multi-Region'] multi_vs_single = { 'multi_region': { 'avg_partners': multi_region['num_partner_agencies'].mean(), 'avg_ghg_efficiency': multi_region['ghg_efficiency'].mean(), 'avg_dac_benefit': multi_region['pct_dac_funding'].mean(), 'total_funding': multi_region['total_ggrf_funding'].sum(), 'total_ghg_reductions': multi_region['total_project_ghg_reductions'].sum(), 'num_projects': multi_region['num_projects'].sum() }, 'single_region': { 'avg_partners': single_region['num_partner_agencies'].mean(), 'avg_ghg_efficiency': single_region['ghg_efficiency'].mean(), 'avg_dac_benefit': single_region['pct_dac_funding'].mean(), 'total_funding': single_region['total_ggrf_funding'].sum(), 'total_ghg_reductions': single_region['total_project_ghg_reductions'].sum(), 'num_projects': single_region['num_projects'].sum() } } # Per-region regressions of collaboration size on outcomes region_models = {} for region in df['region'].unique(): region_df = df[df['region'] == region] if len(region_df) > 5: # Only run regression if enough data points try: X = sm.add_constant(region_df['num_partner_agencies']) # GHG efficiency regression y_ghg = region_df['ghg_efficiency'] if not y_ghg.isna().all(): model_ghg = sm.OLS(y_ghg, X).fit() # DAC benefit regression y_dac = region_df['pct_dac_funding'] model_dac = sm.OLS(y_dac, X).fit() region_models[region] = { 'ghg_efficiency': { 'coefficient': model_ghg.params['num_partner_agencies'], 'p_value': model_ghg.pvalues['num_partner_agencies'], 'r_squared': model_ghg.rsquared }, 'dac_benefit': { 'coefficient': model_dac.params['num_partner_agencies'], 'p_value': model_dac.pvalues['num_partner_agencies'], 'r_squared': model_dac.rsquared } } except Exception as e: logger.warning(f"Could not run regression for region {region}: {e}") # Prepare results results = { 'regional_summary': regional_summary.to_dict('records'), 'multi_vs_single_region': multi_vs_single, 'region_models': region_models } logger.info("Completed regional patterns analysis") return results except Exception as e: logger.error(f"Error analyzing regional patterns: {e}", exc_info=True) raise AnalysisError(f"Failed to analyze regional patterns: {str(e)}") # Analysis methods for Research Question 2: Temporal trends def analyze_temporal_trends(self) -> Dict[str, Any]: """ Analyze temporal trends in funding, collaboration, and outcomes. Returns: Dictionary containing temporal trends analysis results """ try: if 'temporal_collaboration' not in self.data or 'yearly_summary' not in self.data: raise AnalysisError("Temporal datasets not available") temp_df = self.data['temporal_collaboration'] yearly_df = self.data['yearly_summary'] # Analyze yearly trends yearly_trends = yearly_df.to_dict('records') # Calculate year-over-year changes yearly_df['funding_change'] = yearly_df['total_ggrf_funding'].pct_change() * 100 yearly_df['ghg_reduction_change'] = yearly_df['total_project_ghg_reductions'].pct_change() * 100 yearly_df['dac_funding_change'] = yearly_df['funding_benefiting_dac'].pct_change() * 100 yearly_df['partners_change'] = yearly_df['avg_partners'].pct_change() * 100 yoy_changes = yearly_df[['funding_year', 'funding_change', 'ghg_reduction_change', 'dac_funding_change', 'partners_change']].dropna() # Pre/post breakpoint analysis if 'breakpoint_summary' in self.data: breakpoint_df = self.data['breakpoint_summary'] breakpoint_analysis = breakpoint_df.to_dict('records') # Calculate percentage changes between pre and post periods pre_row = breakpoint_df[breakpoint_df['post_breakpoint'] == 0].iloc[0] if len(breakpoint_df[breakpoint_df['post_breakpoint'] == 0]) > 0 else None post_row = breakpoint_df[breakpoint_df['post_breakpoint'] == 1].iloc[0] if len(breakpoint_df[breakpoint_df['post_breakpoint'] == 1]) > 0 else None if pre_row is not None and post_row is not None: pct_changes = { 'avg_project_funding_change': ((post_row['total_ggrf_funding_mean'] / pre_row['total_ggrf_funding_mean']) - 1) * 100, 'avg_partners_change': ((post_row['num_partner_agencies_mean'] / pre_row['num_partner_agencies_mean']) - 1) * 100, 'ghg_efficiency_change': ((post_row['ghg_efficiency_mean'] / pre_row['ghg_efficiency_mean']) - 1) * 100, 'dac_benefit_change': ((post_row['dac_benefit_percentage_mean'] / pre_row['dac_benefit_percentage_mean']) - 1) * 100 } else: pct_changes = {} else: breakpoint_analysis = {} pct_changes = {} # Regression: collaboration trends over time yearly_collab_trends = temp_df.groupby('funding_year').agg({ 'num_partner_agencies': 'mean', 'ghg_efficiency': 'mean', 'pct_dac_funding': 'mean' }).reset_index() if len(yearly_collab_trends) > 3: # Only run regression if enough years X = sm.add_constant(yearly_collab_trends['funding_year']) # Partners over time regression y_partners = yearly_collab_trends['num_partner_agencies'] model_partners = sm.OLS(y_partners, X).fit() # GHG efficiency over time regression y_ghg = yearly_collab_trends['ghg_efficiency'] model_ghg = sm.OLS(y_ghg, X).fit() # DAC benefit over time regression y_dac = yearly_collab_trends['pct_dac_funding'] model_dac = sm.OLS(y_dac, X).fit() time_regressions = { 'partners_over_time': { 'coefficient': model_partners.params['funding_year'], 'p_value': model_partners.pvalues['funding_year'], 'r_squared': model_partners.rsquared }, 'ghg_efficiency_over_time': { 'coefficient': model_ghg.params['funding_year'], 'p_value': model_ghg.pvalues['funding_year'], 'r_squared': model_ghg.rsquared }, 'dac_benefit_over_time': { 'coefficient': model_dac.params['funding_year'], 'p_value': model_dac.pvalues['funding_year'], 'r_squared': model_dac.rsquared } } else: time_regressions = {} # Prepare results results = { 'yearly_trends': yearly_trends, 'year_over_year_changes': yoy_changes.to_dict('records'), 'breakpoint_analysis': breakpoint_analysis, 'breakpoint_pct_changes': pct_changes, 'time_regressions': time_regressions } logger.info("Completed temporal trends analysis") return results except Exception as e: logger.error(f"Error analyzing temporal trends: {e}", exc_info=True) raise AnalysisError(f"Failed to analyze temporal trends: {str(e)}") # Comprehensive analysis methods that integrate multiple analyses def get_comprehensive_analysis(self) -> Dict[str, Dict]: """ Get comprehensive analysis results addressing all research questions. Returns: Dictionary containing all analysis results """ try: # Run all analyses collab_structure = self.analyze_collaboration_structure() regional_patterns = self.analyze_regional_patterns() temporal_trends = self.analyze_temporal_trends() # Integrate the results comprehensive_results = { 'collaboration_structure': collab_structure, 'regional_patterns': regional_patterns, 'temporal_trends': temporal_trends, 'summary_findings': self._generate_summary_findings( collab_structure, regional_patterns, temporal_trends ) } return comprehensive_results except Exception as e: logger.error(f"Error getting comprehensive analysis: {e}", exc_info=True) raise AnalysisError("Failed to get comprehensive analysis") def _generate_summary_findings(self, collab_structure: Dict[str, Any], regional_patterns: Dict[str, Any], temporal_trends: Dict[str, Any]) -> Dict[str, Any]: """ Generate key summary findings from all analyses. Args: collab_structure: Collaboration structure analysis results regional_patterns: Regional patterns analysis results temporal_trends: Temporal trends analysis results Returns: Dictionary containing key integrated findings """ try: # Extract key metrics from collaboration structure analysis collab_corr_ghg = collab_structure['correlations'].get('num_partner_agencies', {}).get('ghg_efficiency', 0) collab_corr_dac = collab_structure['correlations'].get('num_partner_agencies', {}).get('pct_dac_funding', 0) # Extract key metrics from regional patterns multi_region_data = regional_patterns['multi_vs_single_region']['multi_region'] single_region_data = regional_patterns['multi_vs_single_region']['single_region'] # Extract key metrics from temporal trends if temporal_trends.get('breakpoint_pct_changes'): breakpoint_changes = temporal_trends['breakpoint_pct_changes'] post_2020_funding_change = breakpoint_changes.get('avg_project_funding_change', 0) post_2020_partners_change = breakpoint_changes.get('avg_partners_change', 0) post_2020_dac_change = breakpoint_changes.get('dac_benefit_change', 0) else: post_2020_funding_change = 0 post_2020_partners_change = 0 post_2020_dac_change = 0 # Compile key findings key_findings = { 'collaboration_impact': { 'correlation_partners_ghg_efficiency': collab_corr_ghg, 'correlation_partners_dac_benefit': collab_corr_dac, 'ghg_efficiency_by_collab_size': { 'single_agency': collab_structure['collaboration_size_summary'][0]['ghg_efficiency'] if len(collab_structure['collaboration_size_summary']) > 0 else None, 'large_collab': collab_structure['collaboration_size_summary'][-1]['ghg_efficiency'] if len(collab_structure['collaboration_size_summary']) > 0 else None } }, 'multi_vs_single_region': { 'multi_region_partners': multi_region_data['avg_partners'], 'single_region_partners': single_region_data['avg_partners'], 'partner_ratio': multi_region_data['avg_partners'] / single_region_data['avg_partners'] if single_region_data['avg_partners'] > 0 else float('inf'), 'multi_region_dac_benefit': multi_region_data['avg_dac_benefit'], 'single_region_dac_benefit': single_region_data['avg_dac_benefit'], 'dac_benefit_ratio': multi_region_data['avg_dac_benefit'] / single_region_data['avg_dac_benefit'] if single_region_data['avg_dac_benefit'] > 0 else float('inf') }, 'temporal_shifts': { 'post_2020_funding_change_pct': post_2020_funding_change, 'post_2020_partners_change_pct': post_2020_partners_change, 'post_2020_dac_benefit_change_pct': post_2020_dac_change } } # Get most efficient region if 'regional_summary' in regional_patterns and len(regional_patterns['regional_summary']) > 0: regional_ghg_df = pd.DataFrame(regional_patterns['regional_summary']) if 'ghg_efficiency' in regional_ghg_df.columns and not regional_ghg_df['ghg_efficiency'].isna().all(): most_efficient_region = regional_ghg_df.loc[regional_ghg_df['ghg_efficiency'].idxmin()] key_findings['regional_efficiency'] = { 'most_efficient_region': most_efficient_region['region'], 'efficiency_value': most_efficient_region['ghg_efficiency'], 'avg_partners': most_efficient_region['num_partner_agencies'] } return key_findings except Exception as e: logger.error(f"Error generating summary findings: {e}", exc_info=True) return {'error': str(e)} # Export and visualization methods def export_analysis(self, filename: str) -> None: """ Export comprehensive analysis results to a JSON file. Args: filename: Path to output file Raises: AnalysisError: If export fails """ try: analysis = self.get_comprehensive_analysis() # Ensure directory exists output_path = Path(filename) output_path.parent.mkdir(parents=True, exist_ok=True) # Convert numpy types to Python types for JSON serialization def json_serialize(obj): if isinstance(obj, (np.integer, np.int64)): return int(obj) elif isinstance(obj, (np.floating, np.float64)): return float(obj) elif isinstance(obj, np.ndarray): return obj.tolist() elif pd.isna(obj): return None else: return obj # Export with proper encoding and formatting with open(output_path, 'w', encoding='utf-8') as f: json.dump(analysis, f, indent=2, default=json_serialize) logger.info(f"Analysis exported to {filename}") except Exception as e: logger.error(f"Error exporting analysis: {e}", exc_info=True) raise AnalysisError(f"Failed to export analysis to {filename}") def print_summary_findings(self) -> None: """Print a formatted summary of key findings.""" try: # Get the comprehensive analysis analysis = self.get_comprehensive_analysis() # Extract key findings findings = analysis['summary_findings'] # Print summary header print("\n" + "="*80) print(" CALIFORNIA CLIMATE INVESTMENTS (CCI) PROGRAM ANALYSIS SUMMARY ") print("="*80 + "\n") # Print collaboration impact findings print("COLLABORATION STRUCTURE FINDINGS:") collab_impact = findings.get('collaboration_impact', {}) print(f"- Correlation between number of partners and GHG efficiency: {collab_impact.get('correlation_partners_ghg_efficiency', 'N/A'):.3f}") print(f"- Correlation between number of partners and DAC benefits: {collab_impact.get('correlation_partners_dac_benefit', 'N/A'):.3f}") ghg_by_collab = collab_impact.get('ghg_efficiency_by_collab_size', {}) single_agency_ghg = ghg_by_collab.get('single_agency', 'N/A') large_collab_ghg = ghg_by_collab.get('large_collab', 'N/A') if single_agency_ghg != 'N/A' and large_collab_ghg != 'N/A': print(f"- GHG Efficiency in single-agency projects: ${single_agency_ghg:.2f} per ton of GHG reduced") print(f"- GHG Efficiency in large collaborations (6+ partners): ${large_collab_ghg:.2f} per ton of GHG reduced") print(f"- Efficiency ratio (large collab / single agency): {large_collab_ghg/single_agency_ghg:.2f}x") # Print regional comparison findings print("\nMULTI-REGION VS SINGLE-REGION FINDINGS:") region_comp = findings.get('multi_vs_single_region', {}) print(f"- Multi-region projects average {region_comp.get('multi_region_partners', 'N/A'):.1f} partners vs {region_comp.get('single_region_partners', 'N/A'):.1f} partners in single-region projects") print(f"- Partner ratio (multi/single): {region_comp.get('partner_ratio', 'N/A'):.1f}x") print(f"- DAC benefit in multi-region projects: {region_comp.get('multi_region_dac_benefit', 'N/A'):.1f}%") print(f"- DAC benefit in single-region projects: {region_comp.get('single_region_dac_benefit', 'N/A'):.1f}%") print(f"- DAC benefit ratio (multi/single): {region_comp.get('dac_benefit_ratio', 'N/A'):.1f}x") # Print regional efficiency findings if 'regional_efficiency' in findings: reg_eff = findings['regional_efficiency'] print(f"\nMOST EFFICIENT REGION: {reg_eff.get('most_efficient_region', 'N/A')}") print(f"- GHG Efficiency: ${reg_eff.get('efficiency_value', 'N/A'):.3f} per ton of GHG reduced") print(f"- Average partners: {reg_eff.get('avg_partners', 'N/A'):.1f}") # Print temporal trend findings print("\nTEMPORAL TRENDS (PRE vs POST 2020):") temp_shifts = findings.get('temporal_shifts', {}) print(f"- Project funding size change: {temp_shifts.get('post_2020_funding_change_pct', 'N/A'):.1f}%") print(f"- Number of partners change: {temp_shifts.get('post_2020_partners_change_pct', 'N/A'):.1f}%") print(f"- DAC benefit change: {temp_shifts.get('post_2020_dac_benefit_change_pct', 'N/A'):.1f}%") print("\n" + "="*80) except Exception as e: logger.error(f"Error printing summary findings: {e}", exc_info=True) print(f"Error generating summary: {str(e)}") def plot_collaboration_outcomes(self, output_file: Optional[str] = None) -> None: """ Create visualizations of collaboration outcomes. Args: output_file: Path to save visualization (optional) """ try: if 'collaboration_outcomes' not in self.data: raise AnalysisError("Collaboration outcomes dataset not available") df = self.data['collaboration_outcomes'] # Set up the figure plt.figure(figsize=(15, 10)) # Plot 1: Collaboration Size vs. GHG Efficiency plt.subplot(2, 2, 1) plt.scatter(df['num_partner_agencies'], df['ghg_efficiency'], alpha=0.6) plt.xlabel('Number of Partner Agencies') plt.ylabel('GHG Efficiency ($ per ton of GHG reduced)') plt.title('Collaboration Size vs. GHG Efficiency') # Add trend line if len(df) > 1: z = np.polyfit(df['num_partner_agencies'], df['ghg_efficiency'], 1) p = np.poly1d(z) plt.plot(df['num_partner_agencies'], p(df['num_partner_agencies']), "r--", alpha=0.8) # Plot 2: Collaboration Size vs. DAC Benefit plt.subplot(2, 2, 2) plt.scatter(df['num_partner_agencies'], df['pct_dac_funding'], alpha=0.6) plt.xlabel('Number of Partner Agencies') plt.ylabel('DAC Benefit Percentage') plt.title('Collaboration Size vs. DAC Benefit') # Add trend line if len(df) > 1: z = np.polyfit(df['num_partner_agencies'], df['pct_dac_funding'], 1) p = np.poly1d(z) plt.plot(df['num_partner_agencies'], p(df['num_partner_agencies']), "r--", alpha=0.8) # Plot 3: Collaboration Size Categories - GHG Efficiency plt.subplot(2, 2, 3) category_means = df.groupby('collab_size_category')['ghg_efficiency'].mean().reindex([ 'Single agency', 'Small collab (2-3)', 'Medium collab (4-5)', 'Large collab (6+)' ]) category_means.plot(kind='bar') plt.xlabel('Collaboration Size Category') plt.ylabel('Average GHG Efficiency') plt.title('GHG Efficiency by Collaboration Size') plt.xticks(rotation=45) # Plot 4: Collaboration Size Categories - DAC Benefit plt.subplot(2, 2, 4) category_means = df.groupby('collab_size_category')['pct_dac_funding'].mean().reindex([ 'Single agency', 'Small collab (2-3)', 'Medium collab (4-5)', 'Large collab (6+)' ]) category_means.plot(kind='bar') plt.xlabel('Collaboration Size Category') plt.ylabel('Average DAC Benefit Percentage') plt.title('DAC Benefit by Collaboration Size') plt.xticks(rotation=45) plt.tight_layout() # Save or display if output_file: plt.savefig(output_file, dpi=300, bbox_inches='tight') logger.info(f"Collaboration outcomes visualization saved to {output_file}") else: plt.show() plt.close() except Exception as e: logger.error(f"Error plotting collaboration outcomes: {e}", exc_info=True) raise AnalysisError(f"Failed to create collaboration outcomes visualization: {str(e)}") def plot_temporal_trends(self, output_file: Optional[str] = None) -> None: """ Create visualizations of temporal trends. Args: output_file: Path to save visualization (optional) """ try: if 'yearly_summary' not in self.data: raise AnalysisError("Yearly summary dataset not available") df = self.data['yearly_summary'] # Set up the figure plt.figure(figsize=(15, 15)) # Plot 1: Funding Over Time plt.subplot(3, 2, 1) plt.bar(df['funding_year'], df['total_ggrf_funding'] / 1_000_000) plt.xlabel('Year') plt.ylabel('Total Funding ($ Millions)') plt.title('CCI Funding by Year') # Add vertical line at breakpoint year plt.axvline(x=self.config.temporal_breakpoint, color='r', linestyle='--', alpha=0.7) # Plot 2: Number of Partners Over Time plt.subplot(3, 2, 2) plt.plot(df['funding_year'], df['avg_partners'], marker='o') plt.xlabel('Year') plt.ylabel('Average Number of Partners') plt.title('Agency Collaboration Over Time') # Add vertical line at breakpoint year plt.axvline(x=self.config.temporal_breakpoint, color='r', linestyle='--', alpha=0.7) # Plot 3: GHG Reductions Over Time plt.subplot(3, 2, 3) plt.bar(df['funding_year'], df['total_project_ghg_reductions'] / 1_000) plt.xlabel('Year') plt.ylabel('Total GHG Reductions (Thousands of Tons)') plt.title('GHG Reductions by Year') # Add vertical line at breakpoint year plt.axvline(x=self.config.temporal_breakpoint, color='r', linestyle='--', alpha=0.7) # Plot 4: DAC Benefit Over Time plt.subplot(3, 2, 4) plt.plot(df['funding_year'], df['pct_dac_funding'], marker='o') plt.xlabel('Year') plt.ylabel('DAC Benefit Percentage') plt.title('DAC Benefits by Year') # Add vertical line at breakpoint year plt.axvline(x=self.config.temporal_breakpoint, color='r', linestyle='--', alpha=0.7) # Plot 5: GHG Efficiency Over Time plt.subplot(3, 2, 5) plt.plot(df['funding_year'], df['ghg_efficiency'], marker='o') plt.xlabel('Year') plt.ylabel('GHG Efficiency ($ per ton reduced)') plt.title('GHG Efficiency Over Time') # Add vertical line at breakpoint year plt.axvline(x=self.config.temporal_breakpoint, color='r', linestyle='--', alpha=0.7) # Plot 6: Pre/Post 2020 Comparison plt.subplot(3, 2, 6) if 'breakpoint_summary' in self.data: bp_df = self.data['breakpoint_summary'] pre = bp_df[bp_df['post_breakpoint'] == 0] post = bp_df[bp_df['post_breakpoint'] == 1] if not pre.empty and not post.empty: metrics = ['num_partner_agencies_mean', 'ghg_efficiency_mean', 'dac_benefit_percentage_mean'] labels = ['Avg Partners', 'GHG Efficiency', 'DAC Benefit %'] x = np.arange(len(metrics)) width = 0.35 # Normalize values for comparison pre_values = [pre[m].iloc[0] for m in metrics] post_values = [post[m].iloc[0] for m in metrics] # Calculate percentage change for display pct_changes = [(post_values[i] / pre_values[i] - 1) * 100 if pre_values[i] != 0 else 0 for i in range(len(metrics))] plt.bar(x - width/2, pre_values, width, label='Pre-2020') plt.bar(x + width/2, post_values, width, label='Post-2020') plt.xlabel('Metric') plt.ylabel('Value') plt.title('Pre vs Post 2020 Comparison') plt.xticks(x, labels) plt.legend() # Add percentage change labels for i, pct in enumerate(pct_changes): plt.annotate(f"{pct:.1f}%", xy=(i, max(pre_values[i], post_values[i])), ha='center', va='bottom') plt.tight_layout() # Save or display if output_file: plt.savefig(output_file, dpi=300, bbox_inches='tight') logger.info(f"Temporal trends visualization saved to {output_file}") else: plt.show() plt.close() except Exception as e: logger.error(f"Error plotting temporal trends: {e}", exc_info=True) raise AnalysisError(f"Failed to create temporal trends visualization: {str(e)}") def plot_regional_analysis(self, output_file: Optional[str] = None) -> None: """ Create visualizations of regional collaboration and outcomes. Args: output_file: Path to save visualization (optional) """ try: if 'regional_summary' not in self.data: raise AnalysisError("Regional summary dataset not available") df = self.data['regional_summary'] # Set up the figure plt.figure(figsize=(15, 15)) # Plot 1: Funding by Region plt.subplot(3, 2, 1) funding_by_region = df.sort_values('total_ggrf_funding', ascending=False) plt.bar(funding_by_region['region'], funding_by_region['total_ggrf_funding'] / 1_000_000) plt.xlabel('Region') plt.ylabel('Total Funding ($ Millions)') plt.title('CCI Funding by Region') plt.xticks(rotation=45, ha='right') # Plot 2: Average Partners by Region plt.subplot(3, 2, 2) partners_by_region = df.sort_values('avg_partners', ascending=False) plt.bar(partners_by_region['region'], partners_by_region['avg_partners']) plt.xlabel('Region') plt.ylabel('Average Number of Partners') plt.title('Agency Collaboration by Region') plt.xticks(rotation=45, ha='right') # Plot 3: GHG Efficiency by Region plt.subplot(3, 2, 3) ghg_eff_by_region = df.sort_values('ghg_efficiency') # Lower is better for efficiency plt.bar(ghg_eff_by_region['region'], ghg_eff_by_region['ghg_efficiency']) plt.xlabel('Region') plt.ylabel('GHG Efficiency ($ per ton reduced)') plt.title('GHG Efficiency by Region') plt.xticks(rotation=45, ha='right') # Plot 4: DAC Benefit by Region plt.subplot(3, 2, 4) dac_by_region = df.sort_values('dac_benefit_percentage', ascending=False) plt.bar(dac_by_region['region'], dac_by_region['dac_benefit_percentage']) plt.xlabel('Region') plt.ylabel('DAC Benefit Percentage') plt.title('DAC Benefits by Region') plt.xticks(rotation=45, ha='right') # Plot 5: Partners vs. GHG Efficiency Scatter plt.subplot(3, 2, 5) plt.scatter(df['avg_partners'], df['ghg_efficiency']) # Add region labels to points for i, row in df.iterrows(): plt.annotate(row['region'], (row['avg_partners'], row['ghg_efficiency']), fontsize=8, ha='right') plt.xlabel('Average Number of Partners') plt.ylabel('GHG Efficiency ($ per ton reduced)') plt.title('Partners vs. GHG Efficiency by Region') # Plot 6: Partners vs. DAC Benefit Scatter plt.subplot(3, 2, 6) plt.scatter(df['avg_partners'], df['dac_benefit_percentage']) # Add region labels to points for i, row in df.iterrows(): plt.annotate(row['region'], (row['avg_partners'], row['dac_benefit_percentage']), fontsize=8, ha='right') plt.xlabel('Average Number of Partners') plt.ylabel('DAC Benefit Percentage') plt.title('Partners vs. DAC Benefit by Region') plt.tight_layout() # Save or display if output_file: plt.savefig(output_file, dpi=300, bbox_inches='tight') logger.info(f"Regional analysis visualization saved to {output_file}") else: plt.show() plt.close() except Exception as e: logger.error(f"Error plotting regional analysis: {e}", exc_info=True) raise AnalysisError(f"Failed to create regional analysis visualization: {str(e)}") def generate_findings_for_paper(self) -> str: """ Generate key findings formatted for inclusion in a research paper. Returns: String containing formatted findings """ try: # Get the comprehensive analysis analysis = self.get_comprehensive_analysis() # Extract key findings findings = analysis['summary_findings'] # Format the findings as text text = """## Key Findings from California Climate Investments (CCI) Analysis ### Collaboration Structure and Scale Findings Our analysis of the structure and scale of inter-agency collaboration in the CCI program reveals several key patterns: """ # Add collaboration impact findings collab_impact = findings.get('collaboration_impact', {}) corr_ghg = collab_impact.get('correlation_partners_ghg_efficiency', 0) corr_dac = collab_impact.get('correlation_partners_dac_benefit', 0) text += f"* The correlation between number of partners and GHG efficiency is {corr_ghg:.3f}, " if corr_ghg > 0.1: text += "suggesting larger collaborations may be less efficient in reducing GHG emissions per dollar spent.\n" elif corr_ghg < -0.1: text += "suggesting larger collaborations may be more efficient in reducing GHG emissions per dollar spent.\n" else: text += "indicating minimal relationship between collaboration size and GHG reduction efficiency.\n" text += f"* The correlation between number of partners and DAC benefits is {corr_dac:.3f}, " if corr_dac > 0.1: text += "suggesting larger collaborations may be more effective at delivering benefits to disadvantaged communities.\n" elif corr_dac < -0.1: text += "suggesting smaller collaborations may be more effective at delivering benefits to disadvantaged communities.\n" else: text += "indicating minimal relationship between collaboration size and benefits to disadvantaged communities.\n" # Add regional comparison findings text += "\n### Regional Patterns\n\n" region_comp = findings.get('multi_vs_single_region', {}) multi_partners = region_comp.get('multi_region_partners', 0) single_partners = region_comp.get('single_region_partners', 0) partner_ratio = region_comp.get('partner_ratio', 0) multi_dac = region_comp.get('multi_region_dac_benefit', 0) single_dac = region_comp.get('single_region_dac_benefit', 0) dac_ratio = region_comp.get('dac_benefit_ratio', 0) text += f"* Multi-regional projects involve significantly more collaborating agencies ({multi_partners:.1f} partners on average) compared to single-region projects ({single_partners:.1f} partners), a ratio of {partner_ratio:.1f}x.\n" text += f"* Multi-regional projects demonstrate {multi_dac:.1f}% DAC benefit compared to {single_dac:.1f}% for single-region projects, a ratio of {dac_ratio:.1f}x. " if dac_ratio > 1.2: text += "This suggests that cross-regional collaborations may be more effective at addressing equity concerns.\n" elif dac_ratio < 0.8: text += "This suggests that more localized efforts may be more effective at addressing equity concerns.\n" else: text += "This indicates similar equity outcomes regardless of regional scope.\n" # Add regional efficiency findings if 'regional_efficiency' in findings: reg_eff = findings['regional_efficiency'] most_eff_region = reg_eff.get('most_efficient_region', '') eff_value = reg_eff.get('efficiency_value', 0) avg_partners = reg_eff.get('avg_partners', 0) text += f"* The {most_eff_region} region achieved the highest GHG efficiency (${eff_value:.3f} per ton of GHG reduced) despite having fewer collaborating partners ({avg_partners:.1f} on average), suggesting that smaller, more focused collaborations may achieve better climate outcomes in some contexts.\n" # Add temporal trend findings text += "\n### Temporal Trends (Pre vs Post 2020)\n\n" temp_shifts = findings.get('temporal_shifts', {}) funding_change = temp_shifts.get('post_2020_funding_change_pct', 0) partners_change = temp_shifts.get('post_2020_partners_change_pct', 0) dac_change = temp_shifts.get('post_2020_dac_benefit_change_pct', 0) text += f"* Post-2020 projects show a {funding_change:.1f}% increase in average project funding size and a {partners_change:.1f}% increase in the number of participating partners.\n" text += f"* Despite increased resources and collaboration, DAC benefits changed by {dac_change:.1f}% in the post-2020 period. " if dac_change < -5: text += "This substantial decline in equity outcomes despite increased collaboration suggests a potential shift in program priorities or implementation challenges.\n" elif dac_change > 5: text += "This substantial improvement in equity outcomes alongside increased collaboration suggests successful program maturation and better integration of equity goals.\n" else: text += "This relatively stable outcome suggests consistent attention to equity goals despite changing program structure.\n" # Add conclusion text += "\n### Summary\n\n" text += "Our analysis reveals complex tradeoffs between collaboration structure, GHG efficiency, and equity outcomes in California's Climate Investments program. While multi-agency partnerships can enhance program reach and equity outcomes, they may face efficiency tradeoffs that must be carefully managed. The significant temporal shifts observed post-2020 highlight the dynamic nature of collaborative climate governance and the challenges of balancing multiple policy objectives.\n" return text except Exception as e: logger.error(f"Error generating findings for paper: {e}", exc_info=True) raise AnalysisError(f"Failed to generate findings for paper: {str(e)}") def run_hierarchical_regression(self) -> Dict[str, Any]: """ Run hierarchical regression analysis as described in research methods. Returns: Dictionary containing hierarchical regression results """ try: if 'cci_projects' not in self.data: raise AnalysisError("CCI project data not available") df = self.data['cci_projects'] # 1. Base model: Control for project size, agency type, and regional fixed effects model1_ghg = smf.ols('ghg_efficiency ~ total_ggrf_funding + agency_name + region', data=df).fit() model1_dac = smf.ols('dac_benefit_percentage ~ total_ggrf_funding + agency_name + region', data=df).fit() # 2. Add partnership variables model2_ghg = smf.ols('ghg_efficiency ~ total_ggrf_funding + agency_name + region + num_partner_agencies', data=df).fit() model2_dac = smf.ols('dac_benefit_percentage ~ total_ggrf_funding + agency_name + region + num_partner_agencies', data=df).fit() # 3. Add interaction between partnerships and regional characteristics model3_formula_ghg = 'ghg_efficiency ~ total_ggrf_funding + agency_name + region + num_partner_agencies + num_partner_agencies:region' model3_formula_dac = 'dac_benefit_percentage ~ total_ggrf_funding + agency_name + region + num_partner_agencies + num_partner_agencies:region' # Try to fit the interaction models, but handle potential convergence issues try: model3_ghg = smf.ols(model3_formula_ghg, data=df).fit() except Exception as e: logger.warning(f"Could not fit interaction model for GHG efficiency: {e}") model3_ghg = None try: model3_dac = smf.ols(model3_formula_dac, data=df).fit() except Exception as e: logger.warning(f"Could not fit interaction model for DAC benefits: {e}") model3_dac = None # Prepare results results = { 'ghg_efficiency': { 'base_model': { 'r_squared': model1_ghg.rsquared, 'adj_r_squared': model1_ghg.rsquared_adj, 'aic': model1_ghg.aic, 'significant_predictors': [var for var, pval in model1_ghg.pvalues.items() if pval < 0.05] }, 'partnership_model': { 'r_squared': model2_ghg.rsquared, 'adj_r_squared': model2_ghg.rsquared_adj, 'aic': model2_ghg.aic, 'significant_predictors': [var for var, pval in model2_ghg.pvalues.items() if pval < 0.05], 'partnership_coefficient': model2_ghg.params.get('num_partner_agencies', None), 'partnership_p_value': model2_ghg.pvalues.get('num_partner_agencies', None) } }, 'dac_benefit': { 'base_model': { 'r_squared': model1_dac.rsquared, 'adj_r_squared': model1_dac.rsquared_adj, 'aic': model1_dac.aic, 'significant_predictors': [var for var, pval in model1_dac.pvalues.items() if pval < 0.05] }, 'partnership_model': { 'r_squared': model2_dac.rsquared, 'adj_r_squared': model2_dac.rsquared_adj, 'aic': model2_dac.aic, 'significant_predictors': [var for var, pval in model2_dac.pvalues.items() if pval < 0.05], 'partnership_coefficient': model2_dac.params.get('num_partner_agencies', None), 'partnership_p_value': model2_dac.pvalues.get('num_partner_agencies', None) } } } # Add interaction model results if available if model3_ghg is not None: results['ghg_efficiency']['interaction_model'] = { 'r_squared': model3_ghg.rsquared, 'adj_r_squared': model3_ghg.rsquared_adj, 'aic': model3_ghg.aic, 'significant_predictors': [var for var, pval in model3_ghg.pvalues.items() if pval < 0.05], 'significant_interactions': [var for var, pval in model3_ghg.pvalues.items() if pval < 0.05 and ':' in var] } if model3_dac is not None: results['dac_benefit']['interaction_model'] = { 'r_squared': model3_dac.rsquared, 'adj_r_squared': model3_dac.rsquared_adj, 'aic': model3_dac.aic, 'significant_predictors': [var for var, pval in model3_dac.pvalues.items() if pval < 0.05], 'significant_interactions': [var for var, pval in model3_dac.pvalues.items() if pval < 0.05 and ':' in var] } # Model comparison: R-squared improvement results['ghg_efficiency']['r_squared_improvement'] = model2_ghg.rsquared - model1_ghg.rsquared results['dac_benefit']['r_squared_improvement'] = model2_dac.rsquared - model1_dac.rsquared logger.info("Completed hierarchical regression analysis") return results except Exception as e: logger.error(f"Error running hierarchical regression: {e}", exc_info=True) raise AnalysisError(f"Failed to run hierarchical regression: {str(e)}") def prepare_propensity_score_matching(self) -> Dict[str, Any]: """ Perform propensity score matching to compare high vs. low collaboration projects. Returns: Dictionary containing propensity score matching results """ try: if 'cci_projects' not in self.data: raise AnalysisError("CCI project data not available") df = self.data['cci_projects'] # Define high collaboration intensity (over five partners) df['high_collab'] = (df['num_partner_agencies'] > 5).astype(int) # Variables to control for in matching control_vars = ['total_ggrf_funding', 'region', 'agency_name'] # Prepare data for matching model_data = pd.get_dummies(df[control_vars + ['high_collab']], columns=['region', 'agency_name'], drop_first=True) # Fit propensity score model ps_model = sm.Logit(model_data['high_collab'], sm.add_constant(model_data.drop('high_collab', axis=1))).fit(disp=0) # Calculate propensity scores df['propensity_score'] = ps_model.predict() # Create matched groups high_collab = df[df['high_collab'] == 1] low_collab = df[df['high_collab'] == 0] # Simple nearest neighbor matching matched_indices = [] for idx, high_row in high_collab.iterrows(): ps = high_row['propensity_score'] # Find nearest neighbor in propensity score nearest_idx = abs(low_collab['propensity_score'] - ps).idxmin() matched_indices.append((idx, nearest_idx)) # Create matched dataset matched_high = df.loc[[i[0] for i in matched_indices]] matched_low = df.loc[[i[1] for i in matched_indices]] # Compare outcomes between matched groups high_ghg_eff = matched_high['ghg_efficiency'].mean() low_ghg_eff = matched_low['ghg_efficiency'].mean() high_dac = matched_high['dac_benefit_percentage'].mean() low_dac = matched_low['dac_benefit_percentage'].mean() # T-tests on matched groups ghg_ttest = stats.ttest_ind( matched_high['ghg_efficiency'].dropna(), matched_low['ghg_efficiency'].dropna(), equal_var=False ) dac_ttest = stats.ttest_ind( matched_high['dac_benefit_percentage'].dropna(), matched_low['dac_benefit_percentage'].dropna(), equal_var=False ) # Results results = { 'matching_summary': { 'num_high_collab_projects': len(high_collab), 'num_low_collab_projects': len(low_collab), 'num_matched_pairs': len(matched_indices) }, 'outcome_comparison': { 'ghg_efficiency': { 'high_collab_mean': high_ghg_eff, 'low_collab_mean': low_ghg_eff, 'difference': high_ghg_eff - low_ghg_eff, 'pct_difference': 100 * (high_ghg_eff - low_ghg_eff) / low_ghg_eff if low_ghg_eff != 0 else 0, 't_stat': ghg_ttest.statistic, 'p_value': ghg_ttest.pvalue }, 'dac_benefit': { 'high_collab_mean': high_dac, 'low_collab_mean': low_dac, 'difference': high_dac - low_dac, 'pct_difference': 100 * (high_dac - low_dac) / low_dac if low_dac != 0 else 0, 't_stat': dac_ttest.statistic, 'p_value': dac_ttest.pvalue } } } logger.info(f"Completed propensity score matching with {len(matched_indices)} matched pairs") return results except Exception as e: logger.error(f"Error in propensity score matching: {e}", exc_info=True) raise AnalysisError(f"Failed to perform propensity score matching: {str(e)}") if __name__ == "__main__": # Example usage try: # Initialize the analyzer analyzer = CCIAnalyzer( data_path="data/cci_programs_data_reduced.csv", output_path="./output", start_year=2015, end_year=2024, temporal_breakpoint=2020 ) # Run analysis and print summary analyzer.print_summary_findings() # Generate visualizations analyzer.plot_collaboration_outcomes("./output/collaboration_outcomes.png") analyzer.plot_temporal_trends("./output/temporal_trends.png") analyzer.plot_regional_analysis("./output/regional_analysis.png") # Export full analysis results analyzer.export_analysis("./output/cci_analysis_results.json") # Generate findings for paper findings_text = analyzer.generate_findings_for_paper() with open("./output/paper_findings.md", "w") as f: f.write(findings_text) print("Analysis complete. Results saved to ./output directory.") except Exception as e: print(f"Analysis failed: {e}")