"""Load, clean, summarise and plot California Climate Investments (CCI) data."""

import logging
import warnings
from pathlib import Path

import numpy as np
import pandas as pd

# The plotting stack is only needed by the figure-producing methods.  Guarding
# the imports keeps load_data()/analyze_data() usable on hosts without it;
# when the packages are installed the module namespace is unchanged.
try:
    import matplotlib.pyplot as plt
except ImportError:
    plt = None

try:
    import seaborn as sns  # noqa: F401 - not used below, kept importable
except ImportError:
    sns = None

# Configure basic logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger("cci_analyzer")

# Suppress pandas warnings.
# NOTE: no category is given, so this silences every warning process-wide.
warnings.filterwarnings('ignore')


class CCIDataAnalyzer:
    """Simplified analyzer for California Climate Investments data.

    Datasets live in ``self.data`` under the keys ``cci_projects`` (all
    cleaned rows), ``carb_projects``, ``non_carb_projects`` and
    ``ev_projects`` (the subset of CARB rows that look EV related).
    """

    # Escape sequences found in the coordinate column and their plain form.
    _EXPORT_ESCAPES = {
        '+AC0-': '-',  # Minus sign
        '+ACI-': '"',  # Quote mark
    }
    # A column whose name contains one of these is a numeric candidate.
    _NUMERIC_NAME_HINTS = ('cost', 'funding', 'ghg', 'reductions', 'years')
    # "ev" is matched as a whole word so that it neither fires inside longer
    # words nor misses a name that ends in "EV".
    _EV_PATTERN = r'electric vehicle|\bev\b|rebate|voucher|clean vehicle'
    _DERIVED_KEYS = ('carb_projects', 'non_carb_projects', 'ev_projects')

    def __init__(self, data_path, output_path="./output"):
        """Remember the CSV location and create the output directory.

        Args:
            data_path: Path of the CSV file read by :meth:`load_data`.
            output_path: Directory for saved figures; created if missing.
        """
        self.data_path = Path(data_path)
        self.output_path = Path(output_path)
        self.output_path.mkdir(parents=True, exist_ok=True)
        self.data = {}
        logger.info("Initialized with data path: %s", self.data_path)

    # ------------------------------------------------------------------
    # Small shared helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _find_column(df, *needles):
        """Return the first column whose lower-cased name has every needle."""
        for col in df.columns:
            name = str(col).lower()
            if all(needle in name for needle in needles):
                return col
        return None

    @staticmethod
    def _safe_ratio(numerator, denominator, scale=1.0):
        """Return ``numerator / denominator * scale`` or NaN when undefined.

        The ratio is undefined when either value is not a number or when the
        denominator is zero, negative or NaN.
        """
        try:
            numerator = float(numerator)
            denominator = float(denominator)
        except (TypeError, ValueError):
            return float('nan')
        if not denominator > 0:  # also False for NaN
            return float('nan')
        return (numerator / denominator) * scale

    @classmethod
    def _percent(cls, part, whole):
        """Return ``part`` as a percentage of ``whole`` (NaN when undefined)."""
        return cls._safe_ratio(part, whole, scale=100.0)

    # ------------------------------------------------------------------
    # Loading and cleaning
    # ------------------------------------------------------------------
    def load_data(self):
        """Read the CSV, clean it and build the CARB / non-CARB / EV subsets.

        Every column is read as text so that odd values cannot abort parsing;
        typing happens in :meth:`_clean_data`.

        Returns:
            True on success, False if reading or processing raised.
        """
        try:
            logger.info("Loading data from %s", self.data_path)
            df = pd.read_csv(self.data_path, dtype=str)
            logger.info(
                "Successfully loaded %d rows with %d columns",
                len(df), len(df.columns),
            )
            self.data['cci_projects'] = self._clean_data(df)
            self._create_carb_datasets()
            return True
        except Exception as e:  # boundary: callers only get a success flag
            logger.error(f"Error loading data: {e}")
            return False

    def _clean_data(self, df):
        """Normalise column names, type the columns and add derived metrics.

        The frame is modified in place and returned.  Cleaning is best
        effort: if a step raises, the error is logged and the frame is
        returned in whatever state it reached.
        """
        try:
            # 1. Fix column names
            df.columns = [
                str(col).strip().lower().replace(' ', '_') for col in df.columns
            ]

            # 2. Coordinates
            if 'lat_long' in df.columns:
                logger.info("Processing coordinates with special encoding handling")
                self._add_coordinates(df)

            # 3. Numeric columns
            self._coerce_numeric_columns(df)

            # 4. Date columns
            for col in [c for c in df.columns if 'date' in c.lower()]:
                df[col] = pd.to_datetime(df[col], errors='coerce')

            # 5. Funding year
            self._add_funding_year(df)

            # 6. Derived metrics
            self._add_derived_metrics(df)

            logger.info("Data cleaning and processing complete")
            return df
        except Exception as e:
            logger.error(f"Error cleaning data: {e}")
            return df

    @classmethod
    def _decode_export_escapes(cls, text):
        """Replace the known escape sequences in ``text``; pass NaN through."""
        if pd.isna(text):
            return text
        cleaned = str(text)
        for code, char in cls._EXPORT_ESCAPES.items():
            cleaned = cleaned.replace(code, char)
        return cleaned

    @staticmethod
    def _parse_lat_long(coord_str):
        """Parse a ``"a, b"`` string into a ``(latitude, longitude)`` tuple.

        The first value is read as longitude and the second as latitude.
        NOTE(review): the column is called ``lat_long``, so confirm the order
        actually used in the file.  As a safeguard the two values are swapped
        when the latitude candidate lies outside [-90, 90] while the other
        value lies inside it.

        Returns ``(nan, nan)`` for missing or unparseable input.
        """
        missing = (np.nan, np.nan)
        if pd.isna(coord_str):
            return missing
        parts = str(coord_str).split(',')
        if len(parts) < 2:
            return missing
        wrappers = ' \t"\'()[]'  # decoded quote marks would break float()
        try:
            lon = float(parts[0].strip(wrappers))
            lat = float(parts[1].strip(wrappers))
        except ValueError:
            return missing
        if abs(lat) > 90 and abs(lon) <= 90:
            lat, lon = lon, lat
        return (lat, lon)

    def _add_coordinates(self, df):
        """Decode ``lat_long`` and add ``latitude`` / ``longitude`` columns."""
        df['lat_long'] = df['lat_long'].apply(self._decode_export_escapes)
        pairs = [self._parse_lat_long(value) for value in df['lat_long']]
        df['latitude'] = [pair[0] for pair in pairs]
        df['longitude'] = [pair[1] for pair in pairs]

    def _coerce_numeric_columns(self, df):
        """Convert columns whose name hints at a quantity to numbers.

        Fiscal-year and date columns are left for their own steps, and a
        column in which no value parses is kept as text: a matching name
        alone does not make a column numeric.
        """
        for col in list(df.columns):
            name = col.lower()
            if not any(hint in name for hint in self._NUMERIC_NAME_HINTS):
                continue
            if 'fiscal_year' in name or 'date' in name:
                continue
            raw = df[col]
            # Drop thousands separators and currency signs before parsing.
            prepared = raw.map(
                lambda v: v.replace(',', '').replace('$', '').strip()
                if isinstance(v, str) else v
            )
            converted = pd.to_numeric(prepared, errors='coerce')
            if raw.notna().any() and not converted.notna().any():
                logger.debug("Keeping %s as text: no numeric values found", col)
                continue
            df[col] = converted

    @staticmethod
    def _add_funding_year(df):
        """Add ``funding_year`` from the first ``fiscal_year`` column, if any.

        The first four-digit run is used, so "2019-20" and "2019" both give
        2019; values without one become missing.
        """
        year_col = next(
            (col for col in df.columns if 'fiscal_year' in col.lower()), None
        )
        if year_col is None:
            return
        try:
            first_year = df[year_col].astype(str).str.extract(
                r'(\d{4})', expand=False
            )
            df['funding_year'] = pd.to_numeric(
                first_year, errors='coerce'
            ).astype('Int64')
        except (TypeError, ValueError) as e:
            logger.error(f"Error extracting funding year: {e}")

    def _add_derived_metrics(self, df):
        """Add ``ghg_efficiency`` and ``dac_benefit_percentage`` if possible.

        Both are NaN where the denominator is not positive, so undefined rows
        do not pull averages towards zero.
        """
        funding_col = self._find_column(df, 'total_program', 'funding')
        ghg_col = self._find_column(df, 'total_project', 'ghg')
        dac_col = self._find_column(df, 'funding_benefiting')

        if funding_col is not None and ghg_col is not None:
            df['ghg_efficiency'] = (
                df[funding_col] / df[ghg_col]
            ).where(df[ghg_col] > 0)

        if funding_col is not None and dac_col is not None:
            df['dac_benefit_percentage'] = (
                100 * df[dac_col] / df[funding_col]
            ).where(df[funding_col] > 0)

    def _matches_ev(self, series):
        """Return a boolean mask of values matching the EV keyword pattern."""
        return series.astype(str).str.contains(
            self._EV_PATTERN, case=False, regex=True, na=False
        )

    def _create_carb_datasets(self):
        """Create separate datasets for CARB, non-CARB and EV projects.

        Datasets derived by an earlier call are dropped first so that a
        failed or partial run cannot leave stale subsets behind.
        """
        if 'cci_projects' not in self.data:
            logger.error("No data available to create CARB datasets")
            return

        for key in self._DERIVED_KEYS:
            self.data.pop(key, None)

        df = self.data['cci_projects']

        try:
            if 'agency_name' not in df.columns:
                logger.error("agency_name column not found")
                return

            carb_mask = df['agency_name'].str.contains(
                'Air Resources Board', case=False, na=False
            )
            self.data['carb_projects'] = df[carb_mask].copy()
            self.data['non_carb_projects'] = df[~carb_mask].copy()

            logger.info(
                f"Created CARB dataset with {len(self.data['carb_projects'])} projects"
            )
            logger.info(
                f"Created non-CARB dataset with {len(self.data['non_carb_projects'])} projects"
            )

            # Identify EV rebate/voucher projects within CARB
            carb_df = self.data['carb_projects']
            if carb_df.empty:
                return

            if 'program_name' in carb_df.columns:
                ev_mask = self._matches_ev(carb_df['program_name'])
            elif 'sub_program_name' in carb_df.columns:
                ev_mask = self._matches_ev(carb_df['sub_program_name'])
            else:
                # No program column: scan every text column instead.
                ev_mask = pd.Series(False, index=carb_df.index)
                for col in carb_df.columns:
                    if not pd.api.types.is_string_dtype(carb_df[col].dtype):
                        continue
                    try:
                        ev_mask = ev_mask | self._matches_ev(carb_df[col])
                    except (AttributeError, TypeError, ValueError) as e:
                        logger.debug("Skipping column %s in EV scan: %s", col, e)

            self.data['ev_projects'] = carb_df[ev_mask].copy()
            logger.info(
                f"Identified {len(self.data['ev_projects'])} potential EV rebate/voucher projects"
            )
        except Exception as e:
            logger.error(f"Error creating CARB datasets: {e}")

    # ------------------------------------------------------------------
    # Text summary
    # ------------------------------------------------------------------
    def analyze_data(self, include_carb_breakdown=True):
        """Print a summary of the data and return the headline figures.

        Args:
            include_carb_breakdown: Also print the CARB / non-CARB / EV
                comparison when those datasets exist.

        Returns:
            A dict with ``total_projects``, ``total_funding``,
            ``total_ghg_reductions``, ``carb_projects`` and ``ev_projects``
            (None for anything unavailable), or None if no data is loaded.
            Undefined ratios are printed as ``nan``.
        """
        if 'cci_projects' not in self.data:
            logger.error("No data available for analysis")
            return None

        df = self.data['cci_projects']

        if 'agency_name' in df.columns:
            agency_counts = df['agency_name'].value_counts()
            print("\nAgencies involved in CCI projects:")
            for agency, count in agency_counts.head(10).items():
                print(f" {agency}: {count} projects")

        funding_col = self._find_column(df, 'total_program', 'funding')
        ghg_col = self._find_column(df, 'total_project', 'ghg')
        total_funding = None
        total_ghg = None

        if funding_col is not None:
            total_funding = df[funding_col].sum()
            print(f"\nTotal CCI funding: ${total_funding:,.2f}")
            print(f"Average project funding: ${df[funding_col].mean():,.2f}")

        if ghg_col is not None:
            total_ghg = df[ghg_col].sum()
            print(f"\nTotal GHG reductions: {total_ghg:,.2f} tons")
            print(f"Average GHG reduction per project: {df[ghg_col].mean():,.2f} tons")

        if 'dac_benefit_percentage' in df.columns:
            avg_dac = df['dac_benefit_percentage'].mean()
            print(f"\nAverage DAC benefit percentage: {avg_dac:.2f}%")

        carb_df = self.data.get('carb_projects')
        non_carb_df = self.data.get('non_carb_projects')
        if include_carb_breakdown and carb_df is not None and non_carb_df is not None:
            self._print_carb_breakdown(
                df, carb_df, non_carb_df, funding_col, ghg_col,
                total_funding, total_ghg,
            )

        return {
            "total_projects": len(df),
            "total_funding": total_funding,
            "total_ghg_reductions": total_ghg,
            "carb_projects": len(carb_df) if carb_df is not None else None,
            "ev_projects": len(self.data['ev_projects']) if 'ev_projects' in self.data else None,
        }

    def _print_carb_breakdown(self, df, carb_df, non_carb_df, funding_col,
                              ghg_col, total_funding, total_ghg):
        """Print the CARB vs. non-CARB comparison and the EV sub-analysis."""
        pct = self._percent
        print("\n--- CARB vs. Non-CARB Analysis ---")
        print(f"CARB projects: {len(carb_df)} ({pct(len(carb_df), len(df)):.1f}% of total)")
        print(f"Non-CARB projects: {len(non_carb_df)} ({pct(len(non_carb_df), len(df)):.1f}% of total)")

        carb_funding = None
        carb_ghg = None

        if funding_col is not None:
            carb_funding = carb_df[funding_col].sum()
            non_carb_funding = non_carb_df[funding_col].sum()
            print(f"\nCARB funding: ${carb_funding:,.2f} ({pct(carb_funding, total_funding):.1f}% of total)")
            print(f"Non-CARB funding: ${non_carb_funding:,.2f} ({pct(non_carb_funding, total_funding):.1f}% of total)")
            print(f"Average CARB project: ${carb_df[funding_col].mean():,.2f}")
            print(f"Average non-CARB project: ${non_carb_df[funding_col].mean():,.2f}")

        if ghg_col is not None:
            carb_ghg = carb_df[ghg_col].sum()
            non_carb_ghg = non_carb_df[ghg_col].sum()
            print(f"\nCARB GHG reductions: {carb_ghg:,.2f} tons ({pct(carb_ghg, total_ghg):.1f}% of total)")
            print(f"Non-CARB GHG reductions: {non_carb_ghg:,.2f} tons ({pct(non_carb_ghg, total_ghg):.1f}% of total)")

            if funding_col is not None:
                carb_efficiency = self._safe_ratio(carb_funding, carb_ghg)
                non_carb_efficiency = self._safe_ratio(non_carb_funding, non_carb_ghg)
                print(f"\nCARB efficiency: ${carb_efficiency:.2f} per ton CO2e")
                print(f"Non-CARB efficiency: ${non_carb_efficiency:.2f} per ton CO2e")

        if 'ev_projects' not in self.data:
            return

        ev_df = self.data['ev_projects']
        print("\n--- Electric Vehicle Projects Analysis ---")
        print(f"EV projects: {len(ev_df)} ({pct(len(ev_df), len(carb_df)):.1f}% of CARB projects)")

        ev_funding = None
        if funding_col is not None:
            ev_funding = ev_df[funding_col].sum()
            print(f"EV funding: ${ev_funding:,.2f} ({pct(ev_funding, carb_funding):.1f}% of CARB funding)")
            print(f"Average EV project: ${ev_df[funding_col].mean():,.2f}")

        if ghg_col is not None:
            ev_ghg = ev_df[ghg_col].sum()
            print(f"EV GHG reductions: {ev_ghg:,.2f} tons ({pct(ev_ghg, carb_ghg):.1f}% of CARB reductions)")
            if funding_col is not None:
                ev_efficiency = self._safe_ratio(ev_funding, ev_ghg)
                print(f"EV efficiency: ${ev_efficiency:.2f} per ton CO2e")

    # ------------------------------------------------------------------
    # Plotting helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _plotting_available():
        """Return True if matplotlib was imported; log an error otherwise."""
        if plt is None:
            logger.error("matplotlib is not installed; skipping visualization")
            return False
        return True

    @staticmethod
    def _draw(series, ax, kind, **plot_kwargs):
        """Plot ``series`` on ``ax``, or write a note when it cannot be drawn.

        A series is not drawable when it holds no numeric value, or, for a
        pie, when it has a negative value or a non-positive total.

        Returns:
            True if the series was plotted, False if the note was written.
        """
        numeric = pd.to_numeric(series, errors='coerce')
        drawable = bool(numeric.notna().any())
        if drawable and kind == 'pie':
            drawable = bool((numeric.dropna() >= 0).all() and numeric.sum() > 0)
        if not drawable:
            ax.text(0.5, 0.5, 'No data available', ha='center', va='center',
                    transform=ax.transAxes)
            return False
        series.plot(kind=kind, ax=ax, **plot_kwargs)
        return True

    def _save_and_show(self, fig, filename, label):
        """Save ``fig`` into the output directory, show it and release it."""
        fig.tight_layout()
        output_file = self.output_path / filename
        fig.savefig(output_file, dpi=300, bbox_inches='tight')
        logger.info(f"{label} visualization saved to {output_file}")
        plt.show()
        plt.close(fig)  # otherwise figures pile up across repeated calls

    @staticmethod
    def _split_yearly(total, carb, ev=None):
        """Split yearly totals into stackable, year-aligned segments.

        Every series is aligned to the sorted years of ``total`` with missing
        years counted as zero *before* subtracting; subtracting first would
        leave NaN for any year in which CARB (or EV) has no rows.

        Args:
            total: Series of per-year totals, indexed by year.
            carb: Series of per-year CARB values.
            ev: Optional Series of per-year EV values (a subset of CARB).

        Returns:
            A list of ``(label, Series)`` pairs in stacking order.
        """
        years = sorted(total.index)

        def align(series):
            return series.reindex(years).fillna(0).astype(float)

        total_a = align(total)
        carb_a = align(carb)
        if ev is None:
            return [('Non-CARB', total_a - carb_a), ('CARB', carb_a)]
        ev_a = align(ev)
        return [
            ('Non-CARB', total_a - carb_a),
            ('CARB (non-EV)', carb_a - ev_a),
            ('CARB (EV Projects)', ev_a),
        ]

    def _plot_yearly_stack(self, ax, total, carb=None, ev=None):
        """Draw yearly values on ``ax``, stacked by segment when available."""
        if carb is None:
            self._draw(total, ax, 'bar')
            return
        segments = self._split_yearly(total, carb, ev)
        years = list(segments[0][1].index)
        bottom = np.zeros(len(years))
        for label, series in segments:
            heights = series.to_numpy(dtype=float)
            ax.bar(years, heights, label=label, bottom=bottom)
            bottom = bottom + heights
        ax.legend()

    # ------------------------------------------------------------------
    # Figures
    # ------------------------------------------------------------------
    def plot_agency_comparison(self):
        """Create visualizations comparing agencies.

        Saves ``agency_comparison.png`` with four panels (top 10 agencies
        each): project count, funding, GHG reductions and cost per ton.
        """
        if not self._plotting_available():
            return
        if 'cci_projects' not in self.data:
            logger.error("No data available for visualization")
            return

        df = self.data['cci_projects']

        if 'agency_name' not in df.columns:
            logger.error("agency_name column not found")
            return

        funding_col = self._find_column(df, 'total_program', 'funding')
        if funding_col is None:
            logger.error("Funding column not found")
            return

        ghg_col = self._find_column(df, 'total_project', 'ghg')
        if ghg_col is None:
            logger.error("GHG reduction column not found")
            return

        fig, axes = plt.subplots(2, 2, figsize=(15, 12))
        (ax_count, ax_funding), (ax_ghg, ax_eff) = axes
        by_agency = df.groupby('agency_name')[[funding_col, ghg_col]].sum()

        # 1. Project count by agency
        agency_counts = df['agency_name'].value_counts().head(10)
        self._draw(agency_counts, ax_count, 'barh')
        ax_count.set_title('Number of Projects by Agency (Top 10)')
        ax_count.set_xlabel('Number of Projects')

        # 2. Funding by agency
        agency_funding = (
            by_agency[funding_col].sort_values(ascending=False).head(10) / 1_000_000
        )
        self._draw(agency_funding, ax_funding, 'barh')
        ax_funding.set_title('Total Funding by Agency ($ Millions)')
        ax_funding.set_xlabel('Funding ($ Millions)')

        # 3. GHG reductions by agency
        agency_ghg = (
            by_agency[ghg_col].sort_values(ascending=False).head(10) / 1_000
        )
        self._draw(agency_ghg, ax_ghg, 'barh')
        ax_ghg.set_title('GHG Reductions by Agency (Thousand Tons)')
        ax_ghg.set_xlabel('GHG Reductions (Thousand Tons)')

        # 4. Efficiency by agency ($/ton); agencies without positive
        #    reductions have no defined cost per ton and are left out.
        ghg_totals = by_agency[ghg_col]
        agency_efficiency = (
            (by_agency[funding_col] / ghg_totals)
            .where(ghg_totals > 0)
            .dropna()
            .sort_values()
            .head(10)
        )
        self._draw(agency_efficiency, ax_eff, 'barh')
        ax_eff.set_title('Cost Efficiency by Agency ($ per Ton CO2e)')
        ax_eff.set_xlabel('Cost per Ton GHG Reduced ($)')

        self._save_and_show(fig, "agency_comparison.png", "Agency comparison")

    def plot_carb_analysis(self):
        """Create visualizations specifically for CARB vs non-CARB analysis.

        Saves ``carb_analysis.png`` with pies for project count, funding and
        GHG reductions and a bar chart of cost per ton, each split into CARB
        (non-EV), CARB (EV) and non-CARB.
        """
        if not self._plotting_available():
            return
        if 'carb_projects' not in self.data or 'non_carb_projects' not in self.data:
            logger.error("CARB datasets not available")
            return

        carb_df = self.data['carb_projects']
        non_carb_df = self.data['non_carb_projects']

        funding_col = self._find_column(carb_df, 'total_program', 'funding')
        ghg_col = self._find_column(carb_df, 'total_project', 'ghg')
        if funding_col is None or ghg_col is None:
            logger.error("Required columns not found")
            return

        ev_df = self.data.get('ev_projects')
        if ev_df is None:
            ev_df = carb_df.iloc[0:0]  # empty frame with the same columns
        carb_non_ev_df = carb_df[~carb_df.index.isin(ev_df.index)]

        segments = {
            'CARB (non-EV)': carb_non_ev_df,
            'CARB (EV Projects)': ev_df,
            'Non-CARB': non_carb_df,
        }
        counts = pd.Series({label: len(frame) for label, frame in segments.items()})
        funding = pd.Series(
            {label: frame[funding_col].sum() for label, frame in segments.items()}
        )
        ghg = pd.Series(
            {label: frame[ghg_col].sum() for label, frame in segments.items()}
        )
        efficiency = pd.Series(
            {label: self._safe_ratio(funding[label], ghg[label]) for label in segments}
        )

        fig, axes = plt.subplots(2, 2, figsize=(15, 12))
        pies = [
            (axes[0][0], counts, 'Distribution of Projects'),
            (axes[0][1], funding, 'Distribution of Funding'),
            (axes[1][0], ghg, 'Distribution of GHG Reductions'),
        ]
        for ax, values, title in pies:
            self._draw(values, ax, 'pie', autopct='%1.1f%%', startangle=90)
            ax.set_title(title)
            ax.set_ylabel('')  # Hide ylabel

        ax_eff = axes[1][1]
        self._draw(efficiency, ax_eff, 'bar')
        ax_eff.set_title('Cost Efficiency ($ per Ton CO2e)')
        ax_eff.set_ylabel('Cost per Ton GHG Reduced ($)')
        ax_eff.tick_params(axis='x', labelrotation=45)

        self._save_and_show(fig, "carb_analysis.png", "CARB analysis")

    def plot_temporal_analysis(self):
        """Create visualizations showing trends over time.

        Saves ``temporal_analysis.png`` with yearly funding, GHG reductions
        and project counts (stacked by CARB / EV when those datasets exist)
        and the yearly mean DAC benefit percentage.
        """
        if not self._plotting_available():
            return
        if 'cci_projects' not in self.data:
            logger.error("No data available for visualization")
            return

        df = self.data['cci_projects']

        if 'funding_year' not in df.columns:
            logger.error("funding_year column not found")
            return

        funding_col = self._find_column(df, 'total_program', 'funding')
        ghg_col = self._find_column(df, 'total_project', 'ghg')
        dac_col = self._find_column(df, 'dac_benefit_percentage')
        if funding_col is None or ghg_col is None:
            logger.error("Required columns not found")
            return

        if not df['funding_year'].notna().any():
            logger.error("funding_year column has no usable values")
            return

        carb_df = self.data.get('carb_projects')
        non_carb_df = self.data.get('non_carb_projects')
        ev_df = self.data.get('ev_projects')

        def by_year(frame, column, divisor):
            """Yearly sum of ``column`` (row count when column is None)."""
            if frame is None:
                return None
            grouped = frame.groupby('funding_year')
            if column is None:
                return grouped.size()
            return grouped[column].sum() / divisor

        fig, axes = plt.subplots(2, 2, figsize=(15, 12))
        panels = [
            (axes[0][0], funding_col, 1_000_000,
             'CCI Funding by Year', 'Funding ($ Millions)'),
            (axes[0][1], ghg_col, 1_000,
             'GHG Reductions by Year', 'GHG Reductions (Thousand Tons)'),
            (axes[1][0], None, 1,
             'Number of Projects by Year', 'Number of Projects'),
        ]
        for ax, column, divisor, title, ylabel in panels:
            self._plot_yearly_stack(
                ax,
                by_year(df, column, divisor),
                by_year(carb_df, column, divisor),
                by_year(ev_df, column, divisor),
            )
            ax.set_title(title)
            ax.set_xlabel('Funding Year')
            ax.set_ylabel(ylabel)
            ax.tick_params(axis='x', labelrotation=45)

        # 4. DAC benefit percentage by year
        ax_dac = axes[1][1]
        if dac_col is None:
            ax_dac.text(0.5, 0.5, 'No data available', ha='center',
                        va='center', transform=ax_dac.transAxes)
        else:
            yearly_dac = df.groupby('funding_year')[dac_col].mean()
            if carb_df is not None:
                years = sorted(yearly_dac.index)
                lines = [
                    ('Overall', df, 'k-', 2),
                    ('CARB', carb_df, 'b-', 1.5),
                    ('Non-CARB', non_carb_df, 'r-', 1.5),
                ]
                if ev_df is not None and not ev_df.empty:
                    lines.append(('EV Projects', ev_df, 'g-', 1.5))
                for label, frame, style, linewidth in lines:
                    if frame is None:
                        continue
                    yearly = frame.groupby('funding_year')[dac_col].mean()
                    ax_dac.plot(
                        years,
                        yearly.reindex(years).to_numpy(dtype=float),
                        style, label=label, linewidth=linewidth,
                    )
                ax_dac.legend()
            else:
                self._draw(yearly_dac, ax_dac, 'line', marker='o')
        ax_dac.set_title('DAC Benefit Percentage by Year')
        ax_dac.set_xlabel('Funding Year')
        ax_dac.set_ylabel('Average DAC Benefit (%)')
        ax_dac.grid(True, linestyle='--', alpha=0.7)
        ax_dac.tick_params(axis='x', labelrotation=45)

        self._save_and_show(fig, "temporal_analysis.png", "Temporal analysis")

    def identify_collaboration_patterns(self):
        """Compare programs run by several agencies with single-agency ones.

        A program counts as multi-agency when its ``program_name`` appears
        with more than one distinct ``agency_name``.  Prints the comparison,
        adds a boolean ``multi_agency_program`` column to the ``cci_projects``
        frame and, when matplotlib is available, saves
        ``collaboration_analysis.png``.
        """
        if 'cci_projects' not in self.data:
            logger.error("No data available for analysis")
            return

        df = self.data['cci_projects']

        # Both columns are needed to count agencies per program.
        if 'program_name' not in df.columns or 'agency_name' not in df.columns:
            logger.error("Could not identify columns for collaboration analysis")
            return

        print("\n--- Collaboration Analysis ---")
        pct = self._percent

        try:
            unique_programs = df['program_name'].nunique()
            print(f"Number of unique programs: {unique_programs}")

            program_agencies = (
                df.groupby('program_name')['agency_name']
                .nunique()
                .sort_values(ascending=False)
            )
            multi_agency_programs = program_agencies[program_agencies > 1]
            print(
                f"Programs with multiple agencies: {len(multi_agency_programs)} "
                f"({pct(len(multi_agency_programs), unique_programs):.1f}% of programs)"
            )

            if len(multi_agency_programs) > 0:
                print("\nTop multi-agency programs:")
                for program, count in multi_agency_programs.head(5).items():
                    print(f" {program}: {count} agencies")

            funding_col = self._find_column(df, 'total_program', 'funding')
            ghg_col = self._find_column(df, 'total_project', 'ghg')
            dac_col = self._find_column(df, 'dac_benefit_percentage')
            if funding_col is None or ghg_col is None:
                return

            # Rows with a missing program name map to NaN and are not "multi".
            df['multi_agency_program'] = df['program_name'].map(program_agencies).gt(1)
            multi_df = df[df['multi_agency_program']].copy()
            single_df = df[~df['multi_agency_program']].copy()

            print("\nComparison of Multi-agency vs Single-agency Programs:")
            print(f"Multi-agency projects: {len(multi_df)} ({pct(len(multi_df), len(df)):.1f}% of total)")
            print(f"Single-agency projects: {len(single_df)} ({pct(len(single_df), len(df)):.1f}% of total)")

            multi_funding = multi_df[funding_col].sum()
            single_funding = single_df[funding_col].sum()
            total_funding = df[funding_col].sum()
            print(f"\nMulti-agency funding: ${multi_funding:,.2f} ({pct(multi_funding, total_funding):.1f}% of total)")
            print(f"Single-agency funding: ${single_funding:,.2f} ({pct(single_funding, total_funding):.1f}% of total)")

            multi_ghg = multi_df[ghg_col].sum()
            single_ghg = single_df[ghg_col].sum()
            total_ghg = df[ghg_col].sum()
            print(f"\nMulti-agency GHG reductions: {multi_ghg:,.2f} tons ({pct(multi_ghg, total_ghg):.1f}% of total)")
            print(f"Single-agency GHG reductions: {single_ghg:,.2f} tons ({pct(single_ghg, total_ghg):.1f}% of total)")

            multi_efficiency = self._safe_ratio(multi_funding, multi_ghg)
            single_efficiency = self._safe_ratio(single_funding, single_ghg)
            print(f"\nMulti-agency efficiency: ${multi_efficiency:.2f} per ton CO2e")
            print(f"Single-agency efficiency: ${single_efficiency:.2f} per ton CO2e")

            # One entry per metric so labels and bars always line up.
            metrics = ['Cost Efficiency ($/ton)']
            multi_values = [multi_efficiency]
            single_values = [single_efficiency]

            if dac_col is not None:
                multi_dac = multi_df[dac_col].mean()
                single_dac = single_df[dac_col].mean()
                print(f"\nMulti-agency DAC benefit: {multi_dac:.2f}%")
                print(f"Single-agency DAC benefit: {single_dac:.2f}%")
                metrics.append('DAC Benefit (%)')
                multi_values.append(multi_dac)
                single_values.append(single_dac)

            if not self._plotting_available():
                return

            fig, axes = plt.subplots(2, 2, figsize=(15, 10))
            pies = [
                (axes[0][0], len(multi_df), len(single_df), 'Distribution of Projects'),
                (axes[0][1], multi_funding, single_funding, 'Distribution of Funding'),
                (axes[1][0], multi_ghg, single_ghg, 'Distribution of GHG Reductions'),
            ]
            for ax, multi_value, single_value, title in pies:
                shares = pd.Series({
                    'Multi-agency Programs': multi_value,
                    'Single-agency Programs': single_value,
                })
                self._draw(shares, ax, 'pie', autopct='%1.1f%%', startangle=90)
                ax.set_title(title)
                ax.set_ylabel('')

            # 4. Efficiency & DAC comparison
            ax_perf = axes[1][1]
            x = np.arange(len(metrics))
            width = 0.35
            ax_perf.bar(x - width / 2, multi_values, width, label='Multi-agency')
            ax_perf.bar(x + width / 2, single_values, width, label='Single-agency')
            ax_perf.set_xlabel('Metric')
            ax_perf.set_ylabel('Value')
            ax_perf.set_title('Performance Comparison')
            ax_perf.set_xticks(x)
            ax_perf.set_xticklabels(metrics)
            ax_perf.legend()

            self._save_and_show(
                fig, "collaboration_analysis.png", "Collaboration analysis"
            )
        except Exception as e:
            logger.error(f"Error in collaboration analysis: {e}")


def main():
    """Run the full analysis on the bundled example data set."""
    analyzer = CCIDataAnalyzer(data_path="data/cci_programs_data_reduced.csv")

    if analyzer.load_data():
        print("Data loaded successfully!")
        analyzer.analyze_data()

        # Run agency comparison analysis
        analyzer.plot_agency_comparison()

        # Run CARB vs non-CARB analysis
        analyzer.plot_carb_analysis()

        # Run temporal analysis
        analyzer.plot_temporal_analysis()

        # Run collaboration analysis
        analyzer.identify_collaboration_patterns()
    else:
        print("Failed to load data. Check file path and format.")


# Usage example
if __name__ == "__main__":
    main()