I hope we're getting somewhere

2025-04-09 22:51:07 -07:00
parent 28ad830bef
commit 81ec68b3cc
11 changed files with 148547 additions and 144416 deletions
--- a/01_analyzer.ipynb
+++ b/01_analyzer.ipynb
--- a/CCIDataAnalyzer.py
+++ b/CCIDataAnalyzer.py
@@ -0,0 +1,857 @@
 import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
 import seaborn as sns
 from pathlib import Path
 import logging
 import warnings
 # Configure the root logger once at import time: INFO level, with timestamp,
 # logger name and level in front of every message.
 logging.basicConfig(level=logging.INFO, 
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 # Module-level logger used by the CCIDataAnalyzer methods below.
 logger = logging.getLogger("cci_analyzer")
 # NOTE: no category or module filter is given, so this silences *every*
 # Python warning for the whole process, not only those raised by pandas.
 warnings.filterwarnings('ignore')
 class CCIDataAnalyzer:
    """Simplified analyzer for California Climate Investments data."""
    def __init__(self, data_path, output_path="./output"):
        self.data_path = Path(data_path)
        self.output_path = Path(output_path)
        self.output_path.mkdir(parents=True, exist_ok=True)
        self.data = {}
        logger.info(f"Initialized with data path: {self.data_path}")
    def load_data(self):
        """Load CCI data with special handling for encoding issues."""
        try:
            logger.info(f"Loading data from {self.data_path}")
            # Read as string to avoid conversion errors
            df = pd.read_csv(self.data_path, dtype=str)
            logger.info(f"Successfully loaded {len(df)} rows with {len(df.columns)} columns")
            # Clean and process the data
            df = self._clean_data(df)
            # Store in data dictionary
            self.data['cci_projects'] = df
            # Create separate datasets for CARB and non-CARB projects
            self._create_carb_datasets()
            return True
        except Exception as e:
            logger.error(f"Error loading data: {e}")
            return False
    def _clean_data(self, df):
        """Clean and process the CCI data."""
        try:
            # 1. Fix column names
            df.columns = [col.strip().lower().replace(' ', '_') for col in df.columns]
            # 2. Handle the problematic lat_long column with LibreOffice encoding
            if 'lat_long' in df.columns:
                logger.info("Processing coordinates with special encoding handling")
                # Function to clean LibreOffice encoding
                def clean_libreoffice_encoding(text):
                    if pd.isna(text):
                        return text
                    # Special LibreOffice character replacements
                    replacements = {
                        '+AC0-': '-',  # Minus sign
                        '+ACI-': '"',  # Quote mark
                    }
                    cleaned = str(text)
                    for code, char in replacements.items():
                        cleaned = cleaned.replace(code, char)
                    return cleaned
                # Clean the lat_long column
                df['lat_long'] = df['lat_long'].apply(clean_libreoffice_encoding)
                # Extract latitude and longitude
                def extract_coords(coord_str):
                    if pd.isna(coord_str):
                        return (np.nan, np.nan)
                    try:
                        # Try to split by comma
                        if ',' in coord_str:
                            parts = coord_str.split(',')
                            if len(parts) >= 2:
                                lon = parts[0].strip()
                                lat = parts[1].strip()
                                return (float(lat), float(lon))
                    except:
                        pass
                    return (np.nan, np.nan)
                # Extract coordinates safely
                try:
                    coords = df['lat_long'].apply(extract_coords)
                    df['latitude'] = coords.apply(lambda x: x[0])
                    df['longitude'] = coords.apply(lambda x: x[1])
                except Exception as e:
                    logger.error(f"Error extracting coordinates: {e}")
            # 3. Convert numeric columns
            numeric_cols = [
                'total_project_cost', 
                'total_program_ggrffunding', 
                'project_life_years',
                'total_project_ghgreductions', 
                'annual_project_ghgreductions'
            ]
            for col in df.columns:
                # Find matching columns (case insensitive)
                if any(num_col in col.lower() for num_col in 
                      ['cost', 'funding', 'ghg', 'reductions', 'years']):
                    df[col] = pd.to_numeric(df[col], errors='coerce')
            # 4. Convert date columns
            date_cols = [col for col in df.columns if 'date' in col.lower()]
            for col in date_cols:
                df[col] = pd.to_datetime(df[col], errors='coerce')
            # 5. Extract funding year
            fiscal_year_cols = [col for col in df.columns if 'fiscal_year' in col.lower()]
            if fiscal_year_cols:
                try:
                    # Handle different possible formats of fiscal year column
                    year_col = fiscal_year_cols[0]
                    # Try multiple approaches to extract year
                    try:
                        # Handle standard fiscal year format like "2019-20"
                        df['funding_year'] = df[year_col].astype(str).str.extract(r'(\d{4})').astype('Int64')
                    except Exception:
                        logger.warning(f"Could not extract year with regex pattern, trying direct conversion")
                        # Try direct conversion if it's already a year
                        df['funding_year'] = pd.to_numeric(df[year_col], errors='coerce').astype('Int64')
                except Exception as e:
                    logger.error(f"Error extracting funding year: {e}")
            # 6. Calculate derived metrics if columns exist
            funding_col = [col for col in df.columns if 'total_program' in col.lower() and 'funding' in col.lower()]
            ghg_col = [col for col in df.columns if 'total_project' in col.lower() and 'ghg' in col.lower()]
            dac_col = [col for col in df.columns if 'funding_benefiting' in col.lower()]
            if funding_col and ghg_col:
                df['ghg_efficiency'] = np.where(
                    df[ghg_col[0]] > 0,
                    df[funding_col[0]] / df[ghg_col[0]],
                    np.nan
                )
            if funding_col and dac_col:
                df['dac_benefit_percentage'] = np.where(
                    df[funding_col[0]] > 0,
                    100 * df[dac_col[0]] / df[funding_col[0]],
                    0
                )
            logger.info("Data cleaning and processing complete")
            return df
        except Exception as e:
            logger.error(f"Error cleaning data: {e}")
            return df
    def _create_carb_datasets(self):
        """Create separate datasets for CARB and non-CARB projects."""
        if 'cci_projects' not in self.data:
            logger.error("No data available to create CARB datasets")
            return
        df = self.data['cci_projects']
        try:
            # Check if agency_name column exists
            if 'agency_name' not in df.columns:
                logger.error("agency_name column not found")
                return
            # Create CARB dataset
            carb_mask = df['agency_name'].str.contains('Air Resources Board', case=False, na=False)
            self.data['carb_projects'] = df[carb_mask].copy()
            self.data['non_carb_projects'] = df[~carb_mask].copy()
            logger.info(f"Created CARB dataset with {len(self.data['carb_projects'])} projects")
            logger.info(f"Created non-CARB dataset with {len(self.data['non_carb_projects'])} projects")
            # Identify EV rebate/voucher projects within CARB
            if len(self.data['carb_projects']) > 0:
                carb_df = self.data['carb_projects']
                # Look for EV-related projects using various columns
                ev_indicators = ['electric vehicle', 'ev ', 'rebate', 'voucher', 'clean vehicle']
                # Check program name for EV indicators
                if 'program_name' in carb_df.columns:
                    ev_mask = carb_df['program_name'].str.lower().str.contains('|'.join(ev_indicators), na=False)
                elif 'sub_program_name' in carb_df.columns:
                    ev_mask = carb_df['sub_program_name'].str.lower().str.contains('|'.join(ev_indicators), na=False)
                else:
                    # If specific columns not found, try to find any column that might indicate EV projects
                    ev_mask = pd.Series(False, index=carb_df.index)
                    for col in carb_df.columns:
                        if carb_df[col].dtype == 'object':
                            try:
                                ev_mask = ev_mask | carb_df[col].astype(str).str.lower().str.contains('|'.join(ev_indicators), na=False)
                            except:
                                pass
                self.data['ev_projects'] = carb_df[ev_mask].copy()
                logger.info(f"Identified {len(self.data['ev_projects'])} potential EV rebate/voucher projects")
        except Exception as e:
            logger.error(f"Error creating CARB datasets: {e}")
    def analyze_data(self, include_carb_breakdown=True):
        """Basic analysis of CCI data with optional CARB breakdown."""
        if 'cci_projects' not in self.data:
            logger.error("No data available for analysis")
            return None
        df = self.data['cci_projects']
        # Get agency information
        if 'agency_name' in df.columns:
            agency_counts = df['agency_name'].value_counts()
            print("\nAgencies involved in CCI projects:")
            for agency, count in agency_counts.head(10).items():
                print(f"  {agency}: {count} projects")
        # Analyze funding distribution
        funding_col = [col for col in df.columns if 'total_program' in col.lower() and 'funding' in col.lower()]
        if funding_col:
            total_funding = df[funding_col[0]].sum()
            print(f"\nTotal CCI funding: ${total_funding:,.2f}")
            print(f"Average project funding: ${df[funding_col[0]].mean():,.2f}")
        # Analyze GHG reductions
        ghg_col = [col for col in df.columns if 'total_project' in col.lower() and 'ghg' in col.lower()]
        if ghg_col:
            total_ghg = df[ghg_col[0]].sum()
            print(f"\nTotal GHG reductions: {total_ghg:,.2f} tons")
            print(f"Average GHG reduction per project: {df[ghg_col[0]].mean():,.2f} tons")
        # Analyze DAC benefits
        if 'dac_benefit_percentage' in df.columns:
            avg_dac = df['dac_benefit_percentage'].mean()
            print(f"\nAverage DAC benefit percentage: {avg_dac:.2f}%")
        # CARB vs. Non-CARB Analysis
        if include_carb_breakdown and 'carb_projects' in self.data and 'non_carb_projects' in self.data:
            carb_df = self.data['carb_projects']
            non_carb_df = self.data['non_carb_projects']
            print("\n--- CARB vs. Non-CARB Analysis ---")
            print(f"CARB projects: {len(carb_df)} ({len(carb_df)/len(df)*100:.1f}% of total)")
            print(f"Non-CARB projects: {len(non_carb_df)} ({len(non_carb_df)/len(df)*100:.1f}% of total)")
            if funding_col:
                carb_funding = carb_df[funding_col[0]].sum()
                non_carb_funding = non_carb_df[funding_col[0]].sum()
                print(f"\nCARB funding: ${carb_funding:,.2f} ({carb_funding/total_funding*100:.1f}% of total)")
                print(f"Non-CARB funding: ${non_carb_funding:,.2f} ({non_carb_funding/total_funding*100:.1f}% of total)")
                print(f"Average CARB project: ${carb_df[funding_col[0]].mean():,.2f}")
                print(f"Average non-CARB project: ${non_carb_df[funding_col[0]].mean():,.2f}")
            if ghg_col:
                carb_ghg = carb_df[ghg_col[0]].sum()
                non_carb_ghg = non_carb_df[ghg_col[0]].sum()
                print(f"\nCARB GHG reductions: {carb_ghg:,.2f} tons ({carb_ghg/total_ghg*100:.1f}% of total)")
                print(f"Non-CARB GHG reductions: {non_carb_ghg:,.2f} tons ({non_carb_ghg/total_ghg*100:.1f}% of total)")
                # Calculate efficiency
                if funding_col:
                    carb_efficiency = carb_funding / carb_ghg if carb_ghg > 0 else 0
                    non_carb_efficiency = non_carb_funding / non_carb_ghg if non_carb_ghg > 0 else 0
                    print(f"\nCARB efficiency: ${carb_efficiency:.2f} per ton CO2e")
                    print(f"Non-CARB efficiency: ${non_carb_efficiency:.2f} per ton CO2e")
            # EV Projects Analysis
            if 'ev_projects' in self.data:
                ev_df = self.data['ev_projects']
                print("\n--- Electric Vehicle Projects Analysis ---")
                print(f"EV projects: {len(ev_df)} ({len(ev_df)/len(carb_df)*100:.1f}% of CARB projects)")
                if funding_col:
                    ev_funding = ev_df[funding_col[0]].sum()
                    print(f"EV funding: ${ev_funding:,.2f} ({ev_funding/carb_funding*100:.1f}% of CARB funding)")
                    print(f"Average EV project: ${ev_df[funding_col[0]].mean():,.2f}")
                if ghg_col:
                    ev_ghg = ev_df[ghg_col[0]].sum()
                    print(f"EV GHG reductions: {ev_ghg:,.2f} tons ({ev_ghg/carb_ghg*100:.1f}% of CARB reductions)")
                    # Calculate efficiency
                    if funding_col:
                        ev_efficiency = ev_funding / ev_ghg if ev_ghg > 0 else 0
                        print(f"EV efficiency: ${ev_efficiency:.2f} per ton CO2e")
        return {
            "total_projects": len(df),
            "total_funding": total_funding if funding_col else None,
            "total_ghg_reductions": total_ghg if ghg_col else None,
            "carb_projects": len(self.data['carb_projects']) if 'carb_projects' in self.data else None,
            "ev_projects": len(self.data['ev_projects']) if 'ev_projects' in self.data else None
        }
    def plot_agency_comparison(self):
        """Create visualizations comparing agencies."""
        if 'cci_projects' not in self.data:
            logger.error("No data available for visualization")
            return
        df = self.data['cci_projects']
        # Ensure agency_name column exists
        if 'agency_name' not in df.columns:
            logger.error("agency_name column not found")
            return
        # Find funding column
        funding_col = [col for col in df.columns if 'total_program' in col.lower() and 'funding' in col.lower()]
        if not funding_col:
            logger.error("Funding column not found")
            return
        funding_col = funding_col[0]
        # Find GHG reduction column
        ghg_col = [col for col in df.columns if 'total_project' in col.lower() and 'ghg' in col.lower()]
        if not ghg_col:
            logger.error("GHG reduction column not found")
            return
        ghg_col = ghg_col[0]
        # Create figure
        plt.figure(figsize=(15, 12))
        # 1. Project count by agency
        plt.subplot(2, 2, 1)
        agency_counts = df['agency_name'].value_counts().head(10)
        agency_counts.plot(kind='barh')
        plt.title('Number of Projects by Agency (Top 10)')
        plt.xlabel('Number of Projects')
        # 2. Funding by agency
        plt.subplot(2, 2, 2)
        agency_funding = df.groupby('agency_name')[funding_col].sum().sort_values(ascending=False).head(10) / 1_000_000
        agency_funding.plot(kind='barh')
        plt.title('Total Funding by Agency ($ Millions)')
        plt.xlabel('Funding ($ Millions)')
        # 3. GHG reductions by agency
        plt.subplot(2, 2, 3)
        agency_ghg = df.groupby('agency_name')[ghg_col].sum().sort_values(ascending=False).head(10) / 1_000
        agency_ghg.plot(kind='barh')
        plt.title('GHG Reductions by Agency (Thousand Tons)')
        plt.xlabel('GHG Reductions (Thousand Tons)')
        # 4. Efficiency by agency ($/ton)
        plt.subplot(2, 2, 4)
        agency_efficiency = df.groupby('agency_name').apply(
            lambda x: x[funding_col].sum() / x[ghg_col].sum() if x[ghg_col].sum() > 0 else np.nan
        ).dropna().sort_values().head(10)
        agency_efficiency.plot(kind='barh')
        plt.title('Cost Efficiency by Agency ($ per Ton CO2e)')
        plt.xlabel('Cost per Ton GHG Reduced ($)')
        plt.tight_layout()
        # Save visualization
        output_file = self.output_path / "agency_comparison.png"
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        logger.info(f"Agency comparison visualization saved to {output_file}")
        plt.show()
    def plot_carb_analysis(self):
        """Create visualizations specifically for CARB vs non-CARB analysis."""
        if 'carb_projects' not in self.data or 'non_carb_projects' not in self.data:
            logger.error("CARB datasets not available")
            return
        # Find funding column
        funding_col = None
        ghg_col = None
        # Check if we have funding data
        for key in ['carb_projects', 'non_carb_projects']:
            df = self.data[key]
            funding_cols = [col for col in df.columns if 'total_program' in col.lower() and 'funding' in col.lower()]
            if funding_cols:
                funding_col = funding_cols[0]
            ghg_cols = [col for col in df.columns if 'total_project' in col.lower() and 'ghg' in col.lower()]
            if ghg_cols:
                ghg_col = ghg_cols[0]
        if not funding_col or not ghg_col:
            logger.error("Required columns not found")
            return
        # Prepare data for comparison
        carb_df = self.data['carb_projects']
        non_carb_df = self.data['non_carb_projects']
        ev_df = self.data.get('ev_projects', pd.DataFrame())
        # Create figure
        plt.figure(figsize=(15, 12))
        # 1. Project count comparison
        plt.subplot(2, 2, 1)
        project_counts = pd.Series({
            'CARB (non-EV)': len(carb_df) - len(ev_df), 
            'CARB (EV Projects)': len(ev_df),
            'Non-CARB': len(non_carb_df)
        })
        project_counts.plot(kind='pie', autopct='%1.1f%%', startangle=90)
        plt.title('Distribution of Projects')
        plt.ylabel('')  # Hide ylabel
        # 2. Funding comparison
        plt.subplot(2, 2, 2)
        if funding_col:
            carb_non_ev_funding = carb_df[~carb_df.index.isin(ev_df.index)][funding_col].sum() if not ev_df.empty else carb_df[funding_col].sum()
            ev_funding = ev_df[funding_col].sum() if not ev_df.empty else 0
            non_carb_funding = non_carb_df[funding_col].sum()
            funding_distribution = pd.Series({
                'CARB (non-EV)': carb_non_ev_funding,
                'CARB (EV Projects)': ev_funding,
                'Non-CARB': non_carb_funding
            })
            funding_distribution.plot(kind='pie', autopct='%1.1f%%', startangle=90)
            plt.title('Distribution of Funding')
            plt.ylabel('')  # Hide ylabel
        # 3. GHG reductions comparison
        plt.subplot(2, 2, 3)
        if ghg_col:
            carb_non_ev_ghg = carb_df[~carb_df.index.isin(ev_df.index)][ghg_col].sum() if not ev_df.empty else carb_df[ghg_col].sum()
            ev_ghg = ev_df[ghg_col].sum() if not ev_df.empty else 0
            non_carb_ghg = non_carb_df[ghg_col].sum()
            ghg_distribution = pd.Series({
                'CARB (non-EV)': carb_non_ev_ghg,
                'CARB (EV Projects)': ev_ghg,
                'Non-CARB': non_carb_ghg
            })
            ghg_distribution.plot(kind='pie', autopct='%1.1f%%', startangle=90)
            plt.title('Distribution of GHG Reductions')
            plt.ylabel('')  # Hide ylabel
        # 4. Efficiency comparison ($/ton)
        plt.subplot(2, 2, 4)
        if funding_col and ghg_col:
            carb_non_ev_efficiency = carb_non_ev_funding / carb_non_ev_ghg if carb_non_ev_ghg > 0 else 0
            ev_efficiency = ev_funding / ev_ghg if ev_ghg > 0 else 0
            non_carb_efficiency = non_carb_funding / non_carb_ghg if non_carb_ghg > 0 else 0
            efficiency_comparison = pd.Series({
                'CARB (non-EV)': carb_non_ev_efficiency,
                'CARB (EV Projects)': ev_efficiency,
                'Non-CARB': non_carb_efficiency
            })
            efficiency_comparison.plot(kind='bar')
            plt.title('Cost Efficiency ($ per Ton CO2e)')
            plt.ylabel('Cost per Ton GHG Reduced ($)')
            plt.xticks(rotation=45)
        plt.tight_layout()
        # Save visualization
        output_file = self.output_path / "carb_analysis.png"
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        logger.info(f"CARB analysis visualization saved to {output_file}")
        plt.show()
    def plot_temporal_analysis(self):
        """Create visualizations showing trends over time."""
        if 'cci_projects' not in self.data:
            logger.error("No data available for visualization")
            return
        df = self.data['cci_projects']
        # Check if we have year data
        if 'funding_year' not in df.columns:
            logger.error("funding_year column not found")
            return
        # Find funding and GHG columns
        funding_col = [col for col in df.columns if 'total_program' in col.lower() and 'funding' in col.lower()]
        ghg_col = [col for col in df.columns if 'total_project' in col.lower() and 'ghg' in col.lower()]
        dac_col = [col for col in df.columns if 'dac_benefit_percentage' in col.lower()]
        if not funding_col or not ghg_col:
            logger.error("Required columns not found")
            return
        funding_col = funding_col[0]
        ghg_col = ghg_col[0]
        dac_col = dac_col[0] if dac_col else None
        # Separate CARB data if available
        carb_df = self.data.get('carb_projects', None)
        ev_df = self.data.get('ev_projects', None)
        # Create figure
        plt.figure(figsize=(15, 12))
        # 1. Funding by year
        plt.subplot(2, 2, 1)
        yearly_funding = df.groupby('funding_year')[funding_col].sum() / 1_000_000
        # Add CARB and EV breakdowns if available
        if carb_df is not None:
            carb_yearly = carb_df.groupby('funding_year')[funding_col].sum() / 1_000_000
            non_carb_yearly = yearly_funding - carb_yearly
            if ev_df is not None:
                ev_yearly = ev_df.groupby('funding_year')[funding_col].sum() / 1_000_000
                carb_non_ev_yearly = carb_yearly - ev_yearly
                # Plot stacked bar chart
                years = sorted(yearly_funding.index)
                bottom = np.zeros(len(years))
                plt.bar(years, non_carb_yearly.reindex(years, fill_value=0), label='Non-CARB', bottom=bottom)
                bottom += non_carb_yearly.reindex(years, fill_value=0)
                plt.bar(years, carb_non_ev_yearly.reindex(years, fill_value=0), label='CARB (non-EV)', bottom=bottom)
                bottom += carb_non_ev_yearly.reindex(years, fill_value=0)
                plt.bar(years, ev_yearly.reindex(years, fill_value=0), label='CARB (EV Projects)', bottom=bottom)
                plt.legend()
            else:
                # Plot CARB vs non-CARB
                years = sorted(yearly_funding.index)
                plt.bar(years, non_carb_yearly.reindex(years, fill_value=0), label='Non-CARB')
                plt.bar(years, carb_yearly.reindex(years, fill_value=0), label='CARB', bottom=non_carb_yearly.reindex(years, fill_value=0))
                plt.legend()
        else:
            # Simple yearly plot
            yearly_funding.plot(kind='bar')
        plt.title('CCI Funding by Year')
        plt.xlabel('Funding Year')
        plt.ylabel('Funding ($ Millions)')
        plt.xticks(rotation=45)
        # 2. GHG reductions by year
        plt.subplot(2, 2, 2)
        yearly_ghg = df.groupby('funding_year')[ghg_col].sum() / 1_000
        # Add CARB and EV breakdowns if available
        if carb_df is not None:
            carb_yearly_ghg = carb_df.groupby('funding_year')[ghg_col].sum() / 1_000
            non_carb_yearly_ghg = yearly_ghg - carb_yearly_ghg
            if ev_df is not None:
                ev_yearly_ghg = ev_df.groupby('funding_year')[ghg_col].sum() / 1_000
                carb_non_ev_yearly_ghg = carb_yearly_ghg - ev_yearly_ghg
                # Plot stacked bar chart
                years = sorted(yearly_ghg.index)
                bottom = np.zeros(len(years))
                plt.bar(years, non_carb_yearly_ghg.reindex(years, fill_value=0), label='Non-CARB', bottom=bottom)
                bottom += non_carb_yearly_ghg.reindex(years, fill_value=0)
                plt.bar(years, carb_non_ev_yearly_ghg.reindex(years, fill_value=0), label='CARB (non-EV)', bottom=bottom)
                bottom += carb_non_ev_yearly_ghg.reindex(years, fill_value=0)
                plt.bar(years, ev_yearly_ghg.reindex(years, fill_value=0), label='CARB (EV Projects)', bottom=bottom)
                plt.legend()
            else:
                # Plot CARB vs non-CARB
                years = sorted(yearly_ghg.index)
                plt.bar(years, non_carb_yearly_ghg.reindex(years, fill_value=0), label='Non-CARB')
                plt.bar(years, carb_yearly_ghg.reindex(years, fill_value=0), label='CARB', bottom=non_carb_yearly_ghg.reindex(years, fill_value=0))
                plt.legend()
        else:
            # Simple yearly plot
            yearly_ghg.plot(kind='bar')
        plt.title('GHG Reductions by Year')
        plt.xlabel('Funding Year')
        plt.ylabel('GHG Reductions (Thousand Tons)')
        plt.xticks(rotation=45)
        # 3. Project counts by year
        plt.subplot(2, 2, 3)
        yearly_projects = df.groupby('funding_year').size()
        # Add CARB and EV breakdowns if available
        if carb_df is not None:
            carb_yearly_projects = carb_df.groupby('funding_year').size()
            non_carb_yearly_projects = yearly_projects - carb_yearly_projects
            if ev_df is not None:
                ev_yearly_projects = ev_df.groupby('funding_year').size()
                carb_non_ev_yearly_projects = carb_yearly_projects - ev_yearly_projects
                # Plot stacked bar chart
                years = sorted(yearly_projects.index)
                bottom = np.zeros(len(years))
                plt.bar(years, non_carb_yearly_projects.reindex(years, fill_value=0), label='Non-CARB', bottom=bottom)
                bottom += non_carb_yearly_projects.reindex(years, fill_value=0)
                plt.bar(years, carb_non_ev_yearly_projects.reindex(years, fill_value=0), label='CARB (non-EV)', bottom=bottom)
                bottom += carb_non_ev_yearly_projects.reindex(years, fill_value=0)
                plt.bar(years, ev_yearly_projects.reindex(years, fill_value=0), label='CARB (EV Projects)', bottom=bottom)
                plt.legend()
            else:
                # Plot CARB vs non-CARB
                years = sorted(yearly_projects.index)
                plt.bar(years, non_carb_yearly_projects.reindex(years, fill_value=0), label='Non-CARB')
                plt.bar(years, carb_yearly_projects.reindex(years, fill_value=0), label='CARB', bottom=non_carb_yearly_projects.reindex(years, fill_value=0))
                plt.legend()
        else:
            # Simple yearly plot
            yearly_projects.plot(kind='bar')
        plt.title('Number of Projects by Year')
        plt.xlabel('Funding Year')
        plt.ylabel('Number of Projects')
        plt.xticks(rotation=45)
        # 4. DAC benefit percentage by year
        plt.subplot(2, 2, 4)
        if dac_col:
            yearly_dac = df.groupby('funding_year')[dac_col].mean()
            # Compare CARB vs non-CARB if available
            if carb_df is not None:
                carb_yearly_dac = carb_df.groupby('funding_year')[dac_col].mean()
                non_carb_yearly_dac = self.data['non_carb_projects'].groupby('funding_year')[dac_col].mean()
                # Plot lines
                years = sorted(yearly_dac.index)
                plt.plot(years, yearly_dac.reindex(years), 'k-', label='Overall', linewidth=2)
                plt.plot(years, carb_yearly_dac.reindex(years), 'b-', label='CARB', linewidth=1.5)
                plt.plot(years, non_carb_yearly_dac.reindex(years), 'r-', label='Non-CARB', linewidth=1.5)
                if ev_df is not None and not ev_df.empty:
                    ev_yearly_dac = ev_df.groupby('funding_year')[dac_col].mean()
                    plt.plot(years, ev_yearly_dac.reindex(years), 'g-', label='EV Projects', linewidth=1.5)
                plt.legend()
            else:
                yearly_dac.plot(kind='line', marker='o')
            plt.title('DAC Benefit Percentage by Year')
            plt.xlabel('Funding Year')
            plt.ylabel('Average DAC Benefit (%)')
            plt.grid(True, linestyle='--', alpha=0.7)
            plt.xticks(rotation=45)
        plt.tight_layout()
        # Save visualization
        output_file = self.output_path / "temporal_analysis.png"
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        logger.info(f"Temporal analysis visualization saved to {output_file}")
        plt.show()
    def identify_collaboration_patterns(self):
        """
        Analyze collaboration patterns in CCI projects to address the research question.
        This examines how inter-agency collaboration affects outcomes.
        """
        if 'cci_projects' not in self.data:
            logger.error("No data available for analysis")
            return
        df = self.data['cci_projects']
        # Check if we can identify collaborative projects
        collab_indicators = []
        # Look for program name patterns that might indicate collaboration
        if 'program_name' in df.columns:
            collab_indicators.append('program_name')
        if 'sub_program_name' in df.columns:
            collab_indicators.append('sub_program_name')
        if 'agency_name' in df.columns:
            collab_indicators.append('agency_name')
        if not collab_indicators:
            logger.error("Could not identify columns for collaboration analysis")
            return
        print("\n--- Collaboration Analysis ---")
        try:
            # Identify unique programs
            if 'program_name' in df.columns:
                unique_programs = df['program_name'].nunique()
                print(f"Number of unique programs: {unique_programs}")
                # Count agencies per program
                program_agencies = df.groupby('program_name')['agency_name'].nunique().sort_values(ascending=False)
                multi_agency_programs = program_agencies[program_agencies > 1]
                print(f"Programs with multiple agencies: {len(multi_agency_programs)} ({len(multi_agency_programs)/unique_programs*100:.1f}% of programs)")
                if len(multi_agency_programs) > 0:
                    print("\nTop multi-agency programs:")
                    for program, count in multi_agency_programs.head(5).items():
                        print(f"  {program}: {count} agencies")
                    # Analyze outcomes for multi-agency vs single-agency programs
                    funding_col = [col for col in df.columns if 'total_program' in col.lower() and 'funding' in col.lower()]
                    ghg_col = [col for col in df.columns if 'total_project' in col.lower() and 'ghg' in col.lower()]
                    dac_col = [col for col in df.columns if 'dac_benefit_percentage' in col.lower()]
                    if funding_col and ghg_col:
                        funding_col = funding_col[0]
                        ghg_col = ghg_col[0]
                        # Create multi-agency flag
                        df['multi_agency_program'] = df['program_name'].map(lambda x: program_agencies[x] > 1 if x in program_agencies else False)
                        # Group by multi-agency flag
                        multi_df = df[df['multi_agency_program']].copy()
                        single_df = df[~df['multi_agency_program']].copy()
                        # Compare outcomes
                        print("\nComparison of Multi-agency vs Single-agency Programs:")
                        print(f"Multi-agency projects: {len(multi_df)} ({len(multi_df)/len(df)*100:.1f}% of total)")
                        print(f"Single-agency projects: {len(single_df)} ({len(single_df)/len(df)*100:.1f}% of total)")
                        multi_funding = multi_df[funding_col].sum()
                        single_funding = single_df[funding_col].sum()
                        total_funding = df[funding_col].sum()
                        print(f"\nMulti-agency funding: ${multi_funding:,.2f} ({multi_funding/total_funding*100:.1f}% of total)")
                        print(f"Single-agency funding: ${single_funding:,.2f} ({single_funding/total_funding*100:.1f}% of total)")
                        multi_ghg = multi_df[ghg_col].sum()
                        single_ghg = single_df[ghg_col].sum()
                        total_ghg = df[ghg_col].sum()
                        print(f"\nMulti-agency GHG reductions: {multi_ghg:,.2f} tons ({multi_ghg/total_ghg*100:.1f}% of total)")
                        print(f"Single-agency GHG reductions: {single_ghg:,.2f} tons ({single_ghg/total_ghg*100:.1f}% of total)")
                        # Calculate efficiency
                        multi_efficiency = multi_funding / multi_ghg if multi_ghg > 0 else 0
                        single_efficiency = single_funding / single_ghg if single_ghg > 0 else 0
                        print(f"\nMulti-agency efficiency: ${multi_efficiency:.2f} per ton CO2e")
                        print(f"Single-agency efficiency: ${single_efficiency:.2f} per ton CO2e")
                        # DAC benefits
                        if dac_col:
                            dac_col = dac_col[0]
                            multi_dac = multi_df[dac_col].mean()
                            single_dac = single_df[dac_col].mean()
                            print(f"\nMulti-agency DAC benefit: {multi_dac:.2f}%")
                            print(f"Single-agency DAC benefit: {single_dac:.2f}%")
                        # Create visualization
                        plt.figure(figsize=(15, 10))
                        # 1. Project distribution
                        plt.subplot(2, 2, 1)
                        project_dist = pd.Series({
                            'Multi-agency Programs': len(multi_df),
                            'Single-agency Programs': len(single_df)
                        })
                        project_dist.plot(kind='pie', autopct='%1.1f%%', startangle=90)
                        plt.title('Distribution of Projects')
                        plt.ylabel('')
                        # 2. Funding distribution
                        plt.subplot(2, 2, 2)
                        funding_dist = pd.Series({
                            'Multi-agency Programs': multi_funding,
                            'Single-agency Programs': single_funding
                        })
                        funding_dist.plot(kind='pie', autopct='%1.1f%%', startangle=90)
                        plt.title('Distribution of Funding')
                        plt.ylabel('')
                        # 3. GHG reduction distribution
                        plt.subplot(2, 2, 3)
                        ghg_dist = pd.Series({
                            'Multi-agency Programs': multi_ghg,
                            'Single-agency Programs': single_ghg
                        })
                        ghg_dist.plot(kind='pie', autopct='%1.1f%%', startangle=90)
                        plt.title('Distribution of GHG Reductions')
                        plt.ylabel('')
                        # 4. Efficiency & DAC comparison
                        plt.subplot(2, 2, 4)
                        metrics = ['Cost Efficiency ($/ton)', 'DAC Benefit (%)']
                        multi_values = [multi_efficiency]
                        single_values = [single_efficiency]
                        if dac_col:
                            multi_values.append(multi_dac)
                            single_values.append(single_dac)
                        x = np.arange(len(metrics))
                        width = 0.35
                        plt.bar(x - width/2, multi_values, width, label='Multi-agency')
                        plt.bar(x + width/2, single_values, width, label='Single-agency')
                        plt.xlabel('Metric')
                        plt.ylabel('Value')
                        plt.title('Performance Comparison')
                        plt.xticks(x, metrics)
                        plt.legend()
                        plt.tight_layout()
                        # Save visualization
                        output_file = self.output_path / "collaboration_analysis.png"
                        plt.savefig(output_file, dpi=300, bbox_inches='tight')
                        logger.info(f"Collaboration analysis visualization saved to {output_file}")
                        plt.show()
        except Exception as e:
            logger.error(f"Error in collaboration analysis: {e}")
 # Script entry point: load the reduced CCI dataset, then run every analysis.
 if __name__ == "__main__":
    analyzer = CCIDataAnalyzer(data_path="data/cci_programs_data_reduced.csv")
    # load_data() reports failure through its return value, so handle the
    # failure branch first and keep the analysis pipeline in the else branch.
    if not analyzer.load_data():
        print("Failed to load data. Check file path and format.")
    else:
        print("Data loaded successfully!")
        # Keep the return value of the general analysis bound to `results`.
        results = analyzer.analyze_data()
        # Follow-up analyses, executed in this order: agency comparison,
        # CARB vs non-CARB, temporal, and collaboration. Each method is looked
        # up right before it is called.
        for step_name in (
            "plot_agency_comparison",
            "plot_carb_analysis",
            "plot_temporal_analysis",
            "identify_collaboration_patterns",
        ):
            getattr(analyzer, step_name)()
--- a/cci_analyzer.py
+++ b/cci_analyzer.py
--- a/cci_collaboration_analysis.py
+++ b/cci_collaboration_analysis.py
--- a/data/cci_programs_data_reduced.csv
+++ b/data/cci_programs_data_reduced.csv
--- a/data/cci_programs_data_reduced_header.csv
+++ b/data/cci_programs_data_reduced_header.csv
@@ -1,7 +0,0 @@
 Project IDNumber,Reporting Cycle Name,Agency Name,Program Name,Sub Program Name,Record Type,Census Tract,Lat Long,"Senate
 District","Assembly
 District",County,Total Project Cost,Total Program GGRFFunding,Project Life Years,Total Project GHGReductions,Annual Project GHGReductions,Project Count,Fiscal Year Funding Project,Is Benefit Disadvantaged Communities,Disadvantaged Community Criteria,Total GGRFDisadvantaged Community Funding,Funding Benefiting Disadvantaged Communities,Estimated Num Vehicles In Service,Funding Within Disadvantage Communities,VMTReductions,Number Of Housing Units,Number Of Affordable Housing Units,Estimated Number Of Trees To Be Planted,Energy Cost Savings,Estimated Energy Saved KWH,Estimated Energy Saved Therms,Estimated Water Saved Gallons,Estimated Energy Generated KWH,Estimated Fuel Use Reduction Gal,Vouchers Benefiting Disadvantaged Communities,Number Of Rebates Issued,Rebates Within Disadvantaged Communities,Date Operational,Project Completion Date,Is AB1550Buffer Region,Is Benefit DAC1550Communities,Is Low Income Communities
 G14-LCTI-01,2015,California Air Resources Board,Low Carbon Transportation,Clean Cars 4 All,IMPLEMENT,6019001201,,14,31,Fresno,0,5000,3,14,0,1,,True,1-1A,5000,0,0,5000,0,0,0,0,0,0,0,0,0,0,0,1,0,,05/01/2018,False,False,False
 G14-LCTI-01,2015,California Air Resources Board,Low Carbon Transportation,Clean Cars 4 All,IMPLEMENT,6019001410,,14,31,Fresno,0,5000,3,12,0,1,,True,1-1A,5000,0,0,5000,0,0,0,0,0,0,0,0,0,0,0,1,0,,05/01/2018,False,False,False
 G14-LCTI-01,2015,California Air Resources Board,Low Carbon Transportation,Clean Cars 4 All,IMPLEMENT,6019001414,,14,31,Fresno,0,5000,3,15,0,2,,True,1-1A,5000,0,0,5000,0,0,0,0,0,0,0,0,0,0,0,2,0,,05/01/2018,False,False,False
 G14-LCTI-01,2015,California Air Resources Board,Low Carbon Transportation,Clean Cars 4 All,IMPLEMENT,6019001800,,14,31,Fresno,0,2500,3,8,0,1,,True,1-1A,2500,0,0,2500,0,0,0,0,0,0,0,0,0,0,0,1,0,,05/01/2018,False,False,False
--- a/output/agency_comparison.png
+++ b/output/agency_comparison.png
--- a/output/carb_analysis.png
+++ b/output/carb_analysis.png
--- a/output/cci_overview.png
+++ b/output/cci_overview.png
--- a/output/collaboration_analysis.png
+++ b/output/collaboration_analysis.png
--- a/output/temporal_analysis.png
+++ b/output/temporal_analysis.png