import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger("regional_analysis")


def _save_and_close(output_dir, filename):
    """Save the current figure as ``filename`` in ``output_dir`` (if set), then close it."""
    if output_dir:
        plt.savefig(output_dir / filename, dpi=300)
    plt.close()


def _valid_efficiency_mask(df):
    """Return a boolean mask of rows with a positive GHG efficiency below the 95th percentile.

    The percentile is taken over the whole ``ghg_efficiency`` column (before the
    positivity filter), matching how the outlier cut-off has always been computed.
    """
    efficiency = df['ghg_efficiency']
    return efficiency.notna() & (efficiency > 0) & (efficiency < efficiency.quantile(0.95))


def _plot_project_counts(region_counts, output_dir):
    """Draw the bar chart of project counts per region."""
    plt.figure(figsize=(10, 6))
    region_counts.plot(kind='bar')
    plt.title('Number of CCI Projects by Region')
    plt.xlabel('Region')
    plt.ylabel('Number of Projects')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    _save_and_close(output_dir, "regional_distribution.png")


def _plot_ev_comparison(df, output_dir):
    """Chart the regional share of EV-voucher projects against all other projects."""
    # Compare against True instead of using the column as a mask directly:
    # a column holding NaN or 0/1 values is not a valid boolean indexer, and
    # `~` on it would raise or flip bits. Missing flags count as non-EV.
    is_ev = df['is_ev_voucher'].eq(True)
    ev_by_region = df.loc[is_ev, 'ca_region'].value_counts()
    nonev_by_region = df.loc[~is_ev, 'ca_region'].value_counts()

    # Calculate percentages within each project type
    ev_percent = 100 * ev_by_region / ev_by_region.sum()
    nonev_percent = 100 * nonev_by_region / nonev_by_region.sum()

    # Regions present for only one project type become NaN when combined; show them as 0
    comparison_df = pd.DataFrame({
        'EV Vouchers': ev_percent,
        'Non-EV Projects': nonev_percent,
    }).fillna(0)

    # DataFrame.plot opens its own figure unless given axes, which would leave
    # an empty, never-closed figure behind and ignore the requested size.
    _, ax = plt.subplots(figsize=(12, 6))
    comparison_df.plot(kind='bar', ax=ax)
    plt.title('Regional Distribution: EV Vouchers vs. Non-EV Projects')
    plt.xlabel('Region')
    plt.ylabel('Percentage of Projects')
    plt.xticks(rotation=45, ha='right')
    plt.legend(title='Project Type')
    plt.tight_layout()
    _save_and_close(output_dir, "regional_ev_comparison.png")


def _analyze_efficiency(df, output_dir):
    """Log and chart the median GHG efficiency per region; return it sorted ascending."""
    valid_data = df[_valid_efficiency_mask(df)]
    efficiency_by_region = valid_data.groupby('ca_region')['ghg_efficiency'].median().sort_values()

    logger.info("GHG efficiency by region ($ per ton CO2e, median):")
    for region, efficiency in efficiency_by_region.items():
        logger.info(f" {region}: ${efficiency:.2f}")

    if efficiency_by_region.empty:
        # pandas refuses to plot an empty Series, so skip the chart instead of crashing
        logger.warning("No valid GHG efficiency values; skipping the efficiency chart")
        return efficiency_by_region

    plt.figure(figsize=(10, 6))
    efficiency_by_region.plot(kind='barh')
    plt.title('GHG Efficiency by Region (lower is better)')
    plt.xlabel('GHG Efficiency ($ per ton CO2e)')
    plt.ylabel('Region')
    plt.grid(axis='x', alpha=0.3)
    plt.tight_layout()
    _save_and_close(output_dir, "regional_efficiency.png")
    return efficiency_by_region


def _analyze_dac_benefit(df, output_dir):
    """Log and chart the mean DAC benefit percentage per region; return it sorted descending."""
    dac_by_region = df.groupby('ca_region')['dac_benefit_percentage'].mean().sort_values(ascending=False)

    logger.info("DAC benefit percentage by region:")
    for region, dac in dac_by_region.items():
        logger.info(f" {region}: {dac:.2f}%")

    plt.figure(figsize=(10, 6))
    dac_by_region.plot(kind='barh')
    plt.title('DAC Benefit Percentage by Region')
    plt.xlabel('DAC Benefit Percentage')
    plt.ylabel('Region')
    plt.grid(axis='x', alpha=0.3)
    plt.tight_layout()
    _save_and_close(output_dir, "regional_dac_benefit.png")
    return dac_by_region


def _compute_region_metrics(df):
    """Return per-region median efficiency, mean DAC benefit and project count.

    Only rows with a valid efficiency (see ``_valid_efficiency_mask``) and a
    non-missing DAC benefit percentage are included.
    """
    valid_data = df[_valid_efficiency_mask(df) & df['dac_benefit_percentage'].notna()]
    grouped = valid_data.groupby('ca_region')
    region_metrics = grouped.agg({
        'ghg_efficiency': 'median',
        'dac_benefit_percentage': 'mean',
    })
    # Group size equals the count of the (never-missing) grouping key, without
    # relying on aggregating the grouping column itself.
    region_metrics['project_count'] = grouped.size()
    return region_metrics


def _plot_efficiency_vs_equity(region_metrics, output_dir):
    """Scatter regions by efficiency (x) and DAC benefit (y) with median quadrant lines."""
    if region_metrics.empty:
        logger.warning("No regions with both efficiency and DAC data; skipping the efficiency/equity chart")
        return

    plt.figure(figsize=(10, 8))
    plt.scatter(
        region_metrics['ghg_efficiency'],
        region_metrics['dac_benefit_percentage'],
        s=region_metrics['project_count'] / 10,  # Size based on project count
        alpha=0.7,
    )

    # Add region labels
    for region, x, y in zip(region_metrics.index,
                            region_metrics['ghg_efficiency'],
                            region_metrics['dac_benefit_percentage']):
        plt.annotate(region, (x, y), textcoords="offset points", xytext=(5, 5), ha='left')

    # Add quadrant lines at the medians of the regional values
    plt.axvline(x=region_metrics['ghg_efficiency'].median(), color='gray', linestyle='--', alpha=0.5)
    plt.axhline(y=region_metrics['dac_benefit_percentage'].median(), color='gray', linestyle='--', alpha=0.5)

    # Add quadrant labels (positions are in axes coordinates, i.e. the plot corners)
    quadrant_labels = (
        (0.98, 0.98, 'High Cost,\nHigh Equity', 'right', 'top'),
        (0.02, 0.98, 'Low Cost,\nHigh Equity', 'left', 'top'),
        (0.98, 0.02, 'High Cost,\nLow Equity', 'right', 'bottom'),
        (0.02, 0.02, 'Low Cost,\nLow Equity', 'left', 'bottom'),
    )
    for x, y, text, ha, va in quadrant_labels:
        plt.text(x, y, text, transform=plt.gca().transAxes, ha=ha, va=va,
                 bbox=dict(facecolor='white', alpha=0.7))

    plt.xlabel('GHG Efficiency ($ per ton CO2e)')
    plt.ylabel('DAC Benefit Percentage')
    plt.title('Efficiency vs. Equity by Region')
    plt.grid(True, linestyle='--', alpha=0.7)
    _save_and_close(output_dir, "regional_efficiency_equity.png")


def _key_findings(efficiency_by_region, dac_by_region, region_metrics):
    """Build the key-finding sentences (without numbering) for the summary file."""
    findings = []

    # Identify top performing regions; both series arrive sorted best-first
    if not efficiency_by_region.empty:
        findings.append(
            f"{efficiency_by_region.index[0]} achieves the best GHG efficiency "
            f"(${efficiency_by_region.iloc[0]:.2f} per ton)."
        )
    if not dac_by_region.empty:
        findings.append(
            f"{dac_by_region.index[0]} achieves the highest DAC benefit "
            f"({dac_by_region.iloc[0]:.2f}%)."
        )

    # Identify balanced regions: cheaper than the median region and above-median DAC benefit
    median_efficiency = region_metrics['ghg_efficiency'].median()
    median_dac = region_metrics['dac_benefit_percentage'].median()
    low_cost_high_equity = region_metrics[
        (region_metrics['ghg_efficiency'] < median_efficiency)
        & (region_metrics['dac_benefit_percentage'] > median_dac)
    ]
    if len(low_cost_high_equity) > 0:
        findings.append(
            f"{low_cost_high_equity.index[0]} achieves the best balance between efficiency and equity."
        )

    # Check for regional disparities (ratio falls back to 0 when there is no positive minimum)
    lowest_cost = efficiency_by_region.min()
    max_efficiency_diff = efficiency_by_region.max() / lowest_cost if lowest_cost > 0 else 0
    max_dac_diff = dac_by_region.max() - dac_by_region.min()
    findings.append(
        f"Regional disparities: {max_efficiency_diff:.1f}x variation in efficiency, "
        f"{max_dac_diff:.1f} percentage point variation in DAC benefits."
    )
    return findings


def _write_summary(output_dir, region_counts, region_percent,
                   efficiency_by_region, dac_by_region, region_metrics):
    """Write ``regional_analysis_summary.txt``; ``None`` arguments mark analyses that did not run."""
    summary_path = output_dir / "regional_analysis_summary.txt"
    with open(summary_path, 'w', encoding='utf-8') as f:
        f.write("CALIFORNIA CLIMATE INVESTMENTS (CCI) REGIONAL ANALYSIS\n")
        f.write("===================================================\n\n")

        f.write("REGIONAL DISTRIBUTION\n")
        for region, count in region_counts.items():
            f.write(f"{region}: {count} projects ({region_percent[region]:.1f}%)\n")

        if efficiency_by_region is not None:
            f.write("\nGHG EFFICIENCY BY REGION ($ PER TON CO2E, MEDIAN)\n")
            for region, efficiency in efficiency_by_region.items():
                f.write(f"{region}: ${efficiency:.2f}\n")

        if dac_by_region is not None:
            f.write("\nDAC BENEFIT PERCENTAGE BY REGION\n")
            for region, dac in dac_by_region.items():
                f.write(f"{region}: {dac:.2f}%\n")

        f.write("\nKEY FINDINGS\n")
        # Findings need both metrics; number them as written so a skipped
        # finding never leaves a gap in the list.
        if region_metrics is not None:
            findings = _key_findings(efficiency_by_region, dac_by_region, region_metrics)
            for number, finding in enumerate(findings, start=1):
                f.write(f"{number}. {finding}\n")

    logger.info(f"Saved regional analysis summary to {summary_path}")


def analyze_regional_distribution(input_path, output_path=None):
    """
    Analyze the regional distribution of CCI projects and its relationship to
    GHG efficiency and DAC benefits.

    Reads the CSV, then runs each analysis whose columns are present:
    project counts per ``ca_region`` (required), EV vs. non-EV share
    (``is_ev_voucher``), median ``ghg_efficiency`` per region, mean
    ``dac_benefit_percentage`` per region, and an efficiency-vs-equity scatter
    when both metrics exist. Charts and a text summary are written only when
    ``output_path`` is given.

    The function logs an error and returns early when ``ca_region`` is missing,
    and logs a warning and returns early when no row has a region value.

    Parameters:
    input_path (str): Path to the cleaned CCI data CSV file
    output_path (str, optional): Directory to save findings and visualizations;
        created (with parents) if it does not exist

    Returns:
    None
    """
    logger.info(f"Loading data from {input_path}")

    # Load the data
    df = pd.read_csv(input_path, low_memory=False)
    logger.info(f"Successfully loaded {len(df)} rows with {len(df.columns)} columns")

    # Check if we have the regional data
    if 'ca_region' not in df.columns:
        logger.error("Regional data not found in the dataset")
        return

    # Define output directory if provided
    output_dir = None
    if output_path:
        output_dir = Path(output_path)
        output_dir.mkdir(parents=True, exist_ok=True)

    has_efficiency = 'ghg_efficiency' in df.columns
    has_dac = 'dac_benefit_percentage' in df.columns

    # 1. Basic regional distribution analysis
    region_counts = df['ca_region'].value_counts()
    region_percent = df['ca_region'].value_counts(normalize=True) * 100
    if region_counts.empty:
        # Every later step groups by region, so there is nothing to chart or summarize
        logger.warning("No rows with a region value found; nothing to analyze")
        return

    logger.info("Regional distribution of CCI projects:")
    for region, count in region_counts.items():
        logger.info(f" {region}: {count} projects ({region_percent[region]:.1f}%)")
    _plot_project_counts(region_counts, output_dir)

    # 2. EV vouchers vs non-EV projects by region
    if 'is_ev_voucher' in df.columns:
        _plot_ev_comparison(df, output_dir)

    # 3. GHG efficiency by region
    efficiency_by_region = _analyze_efficiency(df, output_dir) if has_efficiency else None

    # 4. DAC benefit by region
    dac_by_region = _analyze_dac_benefit(df, output_dir) if has_dac else None

    # 5. Efficiency vs Equity by Region
    region_metrics = None
    if has_efficiency and has_dac:
        region_metrics = _compute_region_metrics(df)
        _plot_efficiency_vs_equity(region_metrics, output_dir)

    # 6. Generate a summary text file
    if output_dir:
        _write_summary(output_dir, region_counts, region_percent,
                       efficiency_by_region, dac_by_region, region_metrics)

    logger.info("Regional analysis completed")


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description='Analyze regional distribution of CCI projects')
    parser.add_argument('--input_path', type=str, required=True, help='Path to the cleaned CCI data CSV file')
    parser.add_argument('--output_path', type=str, help='Path to save findings and visualizations')

    args = parser.parse_args()

    analyze_regional_distribution(args.input_path, args.output_path)