262 lines
11 KiB
Python
262 lines
11 KiB
Python
import pandas as pd
|
|
import numpy as np
|
|
import matplotlib.pyplot as plt
|
|
import seaborn as sns
|
|
from pathlib import Path
|
|
import logging
|
|
|
|
# Logging setup: configure the root logger once at import time (INFO level,
# timestamped records) and expose the module's named logger.
_LOG_RECORD_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'

logging.basicConfig(format=_LOG_RECORD_FORMAT, level=logging.INFO)

logger = logging.getLogger("regional_analysis")
|
|
|
|
def _finish_figure(fig, output_dir, filename):
    """Save *fig* to ``output_dir / filename`` when a directory is set, then close it."""
    if output_dir:
        fig.savefig(output_dir / filename, dpi=300)
    # Close this exact figure (not "whatever is current") so no figure is leaked.
    plt.close(fig)


def _valid_efficiency_rows(df, also_required=()):
    """Return rows whose GHG efficiency is non-null, positive and below the 95th percentile.

    The percentile cut-off is computed over the whole ``ghg_efficiency`` column.
    Columns named in *also_required* must additionally be non-null.
    """
    efficiency = df['ghg_efficiency']
    keep = efficiency.notna() & (efficiency > 0) & (efficiency < efficiency.quantile(0.95))
    for column in also_required:
        keep = keep & df[column].notna()
    return df[keep]


def _plot_ranked_barh(values, title, xlabel, output_dir, filename):
    """Draw a horizontal bar chart of a per-region Series, then save/close it."""
    fig, ax = plt.subplots(figsize=(10, 6))
    values.plot(kind='barh', ax=ax)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Region')
    ax.grid(axis='x', alpha=0.3)
    fig.tight_layout()
    _finish_figure(fig, output_dir, filename)


def _plot_efficiency_equity(region_metrics, output_dir):
    """Scatter median efficiency against mean DAC benefit, one labelled point per region."""
    fig, ax = plt.subplots(figsize=(10, 8))

    ax.scatter(
        region_metrics['ghg_efficiency'],
        region_metrics['dac_benefit_percentage'],
        s=region_metrics['project_count'] / 10,  # Size based on project count
        alpha=0.7,
    )

    # Label every point with its region.
    for region, row in region_metrics.iterrows():
        ax.annotate(
            str(region),
            (row['ghg_efficiency'], row['dac_benefit_percentage']),
            textcoords="offset points",
            xytext=(5, 5),
            ha='left',
        )

    # Quadrant lines at the medians of the per-region metrics.
    ax.axvline(x=region_metrics['ghg_efficiency'].median(),
               color='gray', linestyle='--', alpha=0.5)
    ax.axhline(y=region_metrics['dac_benefit_percentage'].median(),
               color='gray', linestyle='--', alpha=0.5)

    # Quadrant captions are pinned to the axes corners (axes coordinates);
    # the x axis is cost per ton, so "High Cost" sits on the right.
    corner_captions = (
        (0.98, 0.98, 'High Cost,\nHigh Equity', 'right', 'top'),
        (0.02, 0.98, 'Low Cost,\nHigh Equity', 'left', 'top'),
        (0.98, 0.02, 'High Cost,\nLow Equity', 'right', 'bottom'),
        (0.02, 0.02, 'Low Cost,\nLow Equity', 'left', 'bottom'),
    )
    for x_pos, y_pos, caption, h_align, v_align in corner_captions:
        ax.text(x_pos, y_pos, caption, transform=ax.transAxes,
                ha=h_align, va=v_align, bbox=dict(facecolor='white', alpha=0.7))

    ax.set_xlabel('GHG Efficiency ($ per ton CO2e)')
    ax.set_ylabel('DAC Benefit Percentage')
    ax.set_title('Efficiency vs. Equity by Region')
    ax.grid(True, linestyle='--', alpha=0.7)

    _finish_figure(fig, output_dir, "regional_efficiency_equity.png")


def _write_summary(summary_path, region_counts, region_percent,
                   efficiency_by_region, dac_by_region, region_metrics):
    """Write the plain-text summary report.

    *efficiency_by_region*, *dac_by_region* and *region_metrics* are ``None``
    when the corresponding input columns are absent; they may also be empty
    when no row survived filtering, in which case the dependent findings are
    skipped instead of raising.
    """
    # Explicit encoding: region names come from the data and may be non-ASCII.
    with open(summary_path, 'w', encoding='utf-8') as f:
        f.write("CALIFORNIA CLIMATE INVESTMENTS (CCI) REGIONAL ANALYSIS\n")
        f.write("===================================================\n\n")

        f.write("REGIONAL DISTRIBUTION\n")
        for region, count in region_counts.items():
            f.write(f"{region}: {count} projects ({region_percent[region]:.1f}%)\n")

        if efficiency_by_region is not None:
            f.write("\nGHG EFFICIENCY BY REGION ($ PER TON CO2E, MEDIAN)\n")
            for region, efficiency in efficiency_by_region.items():
                f.write(f"{region}: ${efficiency:.2f}\n")

        if dac_by_region is not None:
            f.write("\nDAC BENEFIT PERCENTAGE BY REGION\n")
            for region, dac in dac_by_region.items():
                f.write(f"{region}: {dac:.2f}%\n")

        f.write("\nKEY FINDINGS\n")

        # Key findings need both metrics.
        if efficiency_by_region is None or dac_by_region is None:
            return

        # Regions whose mean DAC benefit is NaN cannot be ranked.
        ranked_dac = dac_by_region.dropna()

        if not efficiency_by_region.empty:
            # Sorted ascending, so the first entry is the lowest cost per ton.
            best_efficiency_region = efficiency_by_region.index[0]
            f.write(f"1. {best_efficiency_region} achieves the best GHG efficiency (${efficiency_by_region.iloc[0]:.2f} per ton).\n")

        if not ranked_dac.empty:
            # Sorted descending, so the first entry is the highest DAC benefit.
            best_dac_region = ranked_dac.index[0]
            f.write(f"2. {best_dac_region} achieves the highest DAC benefit ({ranked_dac.iloc[0]:.2f}%).\n")

        if region_metrics is not None and not region_metrics.empty:
            median_efficiency = region_metrics['ghg_efficiency'].median()
            median_dac = region_metrics['dac_benefit_percentage'].median()
            low_cost_high_equity = region_metrics[
                (region_metrics['ghg_efficiency'] < median_efficiency)
                & (region_metrics['dac_benefit_percentage'] > median_dac)
            ]
            if len(low_cost_high_equity) > 0:
                # NOTE(review): this takes the first qualifying region in index
                # order; it is not ranked within the quadrant.
                top_balanced = low_cost_high_equity.index[0]
                f.write(f"3. {top_balanced} achieves the best balance between efficiency and equity.\n")

        if not efficiency_by_region.empty and not ranked_dac.empty:
            lowest_cost = efficiency_by_region.min()
            max_efficiency_diff = efficiency_by_region.max() / lowest_cost if lowest_cost > 0 else 0
            max_dac_diff = ranked_dac.max() - ranked_dac.min()
            f.write(f"4. Regional disparities: {max_efficiency_diff:.1f}x variation in efficiency, {max_dac_diff:.1f} percentage point variation in DAC benefits.\n")


def analyze_regional_distribution(input_path, output_path=None):
    """
    Analyze the regional distribution of CCI projects and its
    relationship to GHG efficiency and DAC benefits.

    The CSV must contain a ``ca_region`` column; otherwise an error is logged
    and the function returns without doing anything else. The optional columns
    ``is_ev_voucher``, ``ghg_efficiency`` and ``dac_benefit_percentage`` each
    enable an additional analysis step when present.

    Parameters:
        input_path (str): Path to the cleaned CCI data CSV file
        output_path (str, optional): Directory in which to save findings and
            visualizations. It is created if missing. When omitted, results
            are only logged and figures are built and discarded.

    Returns:
        None

    Side effects (only when ``output_path`` is given): writes
    ``regional_distribution.png``, ``regional_ev_comparison.png``,
    ``regional_efficiency.png``, ``regional_dac_benefit.png``,
    ``regional_efficiency_equity.png`` (each only if its data is available and
    non-empty) and ``regional_analysis_summary.txt``.
    """
    logger.info(f"Loading data from {input_path}")

    # Load the data
    df = pd.read_csv(input_path, low_memory=False)

    logger.info(f"Successfully loaded {len(df)} rows with {len(df.columns)} columns")

    # Check if we have the regional data
    if 'ca_region' not in df.columns:
        logger.error("Regional data not found in the dataset")
        return

    # Define output directory if provided
    output_dir = None
    if output_path:
        output_dir = Path(output_path)
        output_dir.mkdir(parents=True, exist_ok=True)

    has_efficiency = 'ghg_efficiency' in df.columns
    has_dac = 'dac_benefit_percentage' in df.columns

    # 1. Basic regional distribution analysis
    region_counts = df['ca_region'].value_counts()
    region_percent = df['ca_region'].value_counts(normalize=True) * 100

    logger.info("Regional distribution of CCI projects:")
    for region, count in region_counts.items():
        logger.info(f" {region}: {count} projects ({region_percent[region]:.1f}%)")

    # Visualize regional distribution (an empty Series cannot be bar-plotted).
    if region_counts.empty:
        logger.warning("No regional values found; skipping regional distribution plot")
    else:
        fig, ax = plt.subplots(figsize=(10, 6))
        region_counts.plot(kind='bar', ax=ax)
        ax.set_title('Number of CCI Projects by Region')
        ax.set_xlabel('Region')
        ax.set_ylabel('Number of Projects')
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
        fig.tight_layout()
        _finish_figure(fig, output_dir, "regional_distribution.png")

    # 2. EV vouchers vs non-EV projects by region
    if 'is_ev_voucher' in df.columns:
        # Build a strict boolean mask: only values equal to True count as EV
        # vouchers, so missing values fall into the non-EV group instead of
        # breaking boolean indexing / negation.
        is_ev = df['is_ev_voucher'].eq(True)
        ev_by_region = df.loc[is_ev, 'ca_region'].value_counts()
        nonev_by_region = df.loc[~is_ev, 'ca_region'].value_counts()

        # Calculate percentages
        ev_percent = 100 * ev_by_region / ev_by_region.sum()
        nonev_percent = 100 * nonev_by_region / nonev_by_region.sum()

        # Combine for comparison; regions missing from one group get 0
        comparison_df = pd.DataFrame({
            'EV Vouchers': ev_percent,
            'Non-EV Projects': nonev_percent,
        }).fillna(0)

        # Visualize comparison
        if comparison_df.empty:
            logger.warning("No data for EV vs. non-EV comparison; skipping plot")
        else:
            # Pass the axes explicitly: DataFrame.plot otherwise opens its own
            # figure, ignoring the requested size and orphaning an empty one.
            fig, ax = plt.subplots(figsize=(12, 6))
            comparison_df.plot(kind='bar', ax=ax)
            ax.set_title('Regional Distribution: EV Vouchers vs. Non-EV Projects')
            ax.set_xlabel('Region')
            ax.set_ylabel('Percentage of Projects')
            plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
            ax.legend(title='Project Type')
            fig.tight_layout()
            _finish_figure(fig, output_dir, "regional_ev_comparison.png")

    # 3. GHG efficiency by region
    efficiency_by_region = None
    if has_efficiency:
        # Filter to valid efficiency values and non-extreme outliers
        valid_data = _valid_efficiency_rows(df)

        # Calculate median efficiency by region (ascending: lowest cost first)
        efficiency_by_region = (
            valid_data.groupby('ca_region')['ghg_efficiency'].median().sort_values()
        )

        logger.info("GHG efficiency by region ($ per ton CO2e, median):")
        for region, efficiency in efficiency_by_region.items():
            logger.info(f" {region}: ${efficiency:.2f}")

        # Visualize efficiency by region
        if efficiency_by_region.empty:
            logger.warning("No valid GHG efficiency values; skipping efficiency plot")
        else:
            _plot_ranked_barh(
                efficiency_by_region,
                'GHG Efficiency by Region (lower is better)',
                'GHG Efficiency ($ per ton CO2e)',
                output_dir,
                "regional_efficiency.png",
            )

    # 4. DAC benefit by region
    dac_by_region = None
    if has_dac:
        # Calculate mean DAC benefit by region (descending: highest first)
        dac_by_region = (
            df.groupby('ca_region')['dac_benefit_percentage']
            .mean()
            .sort_values(ascending=False)
        )

        logger.info("DAC benefit percentage by region:")
        for region, dac in dac_by_region.items():
            logger.info(f" {region}: {dac:.2f}%")

        # Visualize DAC benefit by region
        if dac_by_region.empty:
            logger.warning("No DAC benefit values; skipping DAC benefit plot")
        else:
            _plot_ranked_barh(
                dac_by_region,
                'DAC Benefit Percentage by Region',
                'DAC Benefit Percentage',
                output_dir,
                "regional_dac_benefit.png",
            )

    # 5. Efficiency vs Equity by Region
    region_metrics = None
    if has_efficiency and has_dac:
        # Filter to valid data
        valid_data = _valid_efficiency_rows(df, also_required=('dac_benefit_percentage',))

        # Calculate regional metrics. Every surviving row has a non-null
        # efficiency, so counting that column gives the rows per region.
        region_metrics = valid_data.groupby('ca_region').agg(
            ghg_efficiency=('ghg_efficiency', 'median'),
            dac_benefit_percentage=('dac_benefit_percentage', 'mean'),
            project_count=('ghg_efficiency', 'count'),
        )

        if region_metrics.empty:
            logger.warning("No rows with both efficiency and DAC data; skipping scatter plot")
        else:
            _plot_efficiency_equity(region_metrics, output_dir)

    # 6. Generate a summary text file
    if output_dir:
        _write_summary(
            output_dir / "regional_analysis_summary.txt",
            region_counts,
            region_percent,
            efficiency_by_region,
            dac_by_region,
            region_metrics,
        )

        logger.info(f"Saved regional analysis summary to {output_dir / 'regional_analysis_summary.txt'}")

    logger.info("Regional analysis completed")
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: read the input CSV path (required) and the
    # optional output directory, then run the regional analysis.
    import argparse

    cli = argparse.ArgumentParser(description='Analyze regional distribution of CCI projects')

    # (flag, keyword arguments) pairs, registered in declaration order.
    cli_options = (
        ('--input_path', dict(type=str, required=True, help='Path to the cleaned CCI data CSV file')),
        ('--output_path', dict(type=str, help='Path to save findings and visualizations')),
    )
    for flag, settings in cli_options:
        cli.add_argument(flag, **settings)

    parsed = cli.parse_args()

    analyze_regional_distribution(parsed.input_path, parsed.output_path)