This commit is contained in:
2025-04-10 00:03:30 -07:00
parent 81ec68b3cc
commit 03ae352949
12 changed files with 150373 additions and 0 deletions

262
regional_analysis_script.py Normal file
View File

@@ -0,0 +1,262 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import logging
# Root logging setup: show INFO and above, each record prefixed with its
# timestamp, originating logger name and severity level.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Logger shared by everything in this script.
logger = logging.getLogger("regional_analysis")
def _save_current_figure(output_dir, filename):
    """
    Tighten the layout of the current figure, save it if requested, close it.
    Parameters:
    output_dir (Path or None): Directory to save into; nothing is written when None
    filename (str): Name of the PNG file created inside output_dir
    """
    plt.tight_layout()
    if output_dir is not None:
        plt.savefig(output_dir / filename, dpi=300)
    # Always close so figures do not accumulate across sections.
    plt.close()


def _valid_efficiency_rows(df, require_dac=False, outlier_quantile=0.95):
    """
    Select rows with a usable GHG efficiency value.
    A row is kept when 'ghg_efficiency' is present, strictly positive and
    below the given quantile of the whole column (drops extreme outliers).
    Parameters:
    df (DataFrame): Data containing a 'ghg_efficiency' column
    require_dac (bool): Also require a non-missing 'dac_benefit_percentage'
    outlier_quantile (float): Upper quantile used as the outlier cut-off
    Returns:
    DataFrame: The subset of df passing every filter
    """
    efficiency = df['ghg_efficiency']
    keep = (
        efficiency.notna()
        & (efficiency > 0)
        & (efficiency < efficiency.quantile(outlier_quantile))
    )
    if require_dac:
        keep &= df['dac_benefit_percentage'].notna()
    return df[keep]


def _plot_efficiency_vs_equity(region_metrics, median_efficiency, median_dac, output_dir):
    """
    Draw the per-region efficiency vs. DAC benefit scatter plot with quadrants.
    Parameters:
    region_metrics (DataFrame): Indexed by region with 'ghg_efficiency',
        'dac_benefit_percentage' and 'project_count' columns
    median_efficiency (float): x position of the vertical quadrant line
    median_dac (float): y position of the horizontal quadrant line
    output_dir (Path or None): Directory to save the PNG into, if any
    """
    plt.figure(figsize=(10, 8))
    plt.scatter(
        region_metrics['ghg_efficiency'],
        region_metrics['dac_benefit_percentage'],
        s=region_metrics['project_count'] / 10,  # Size based on project count
        alpha=0.7,
    )

    # Label every point with its region name.
    points = zip(
        region_metrics.index,
        region_metrics['ghg_efficiency'],
        region_metrics['dac_benefit_percentage'],
    )
    for region, cost, equity in points:
        plt.annotate(
            region,
            (cost, equity),
            textcoords="offset points",
            xytext=(5, 5),
            ha='left',
        )

    # Quadrant lines at the median of the regional values.
    plt.axvline(x=median_efficiency, color='gray', linestyle='--', alpha=0.5)
    plt.axhline(y=median_dac, color='gray', linestyle='--', alpha=0.5)

    # Quadrant labels are positioned in axes coordinates (corners of the plot).
    ax = plt.gca()
    quadrant_labels = (
        (0.98, 0.98, 'High Cost,\nHigh Equity', 'right', 'top'),
        (0.02, 0.98, 'Low Cost,\nHigh Equity', 'left', 'top'),
        (0.98, 0.02, 'High Cost,\nLow Equity', 'right', 'bottom'),
        (0.02, 0.02, 'Low Cost,\nLow Equity', 'left', 'bottom'),
    )
    for x_pos, y_pos, text, h_align, v_align in quadrant_labels:
        ax.text(x_pos, y_pos, text, transform=ax.transAxes,
                ha=h_align, va=v_align, bbox=dict(facecolor='white', alpha=0.7))

    plt.xlabel('GHG Efficiency ($ per ton CO2e)')
    plt.ylabel('DAC Benefit Percentage')
    plt.title('Efficiency vs. Equity by Region')
    plt.grid(True, linestyle='--', alpha=0.7)
    _save_current_figure(output_dir, "regional_efficiency_equity.png")


def _build_summary_lines(region_counts, region_percent, efficiency_by_region,
                         dac_by_region, region_metrics, median_efficiency, median_dac):
    """
    Assemble the text of the regional analysis summary report.
    Parameters:
    region_counts (Series): Project count per region
    region_percent (Series): Share of projects per region, in percent
    efficiency_by_region (Series or None): Median efficiency per region sorted
        ascending, or None when the dataset has no 'ghg_efficiency' column
    dac_by_region (Series or None): Mean DAC benefit per region sorted
        descending, or None when the dataset has no 'dac_benefit_percentage' column
    region_metrics (DataFrame or None): Combined per-region metrics
    median_efficiency (float or None): Median of the regional efficiency values
    median_dac (float or None): Median of the regional DAC benefit values
    Returns:
    list of str: Report lines, each already terminated with a newline
    """
    lines = [
        "CALIFORNIA CLIMATE INVESTMENTS (CCI) REGIONAL ANALYSIS\n",
        "===================================================\n\n",
        "REGIONAL DISTRIBUTION\n",
    ]
    lines.extend(
        f"{region}: {count} projects ({region_percent[region]:.1f}%)\n"
        for region, count in region_counts.items()
    )

    if efficiency_by_region is not None:
        lines.append("\nGHG EFFICIENCY BY REGION ($ PER TON CO2E, MEDIAN)\n")
        lines.extend(
            f"{region}: ${efficiency:.2f}\n"
            for region, efficiency in efficiency_by_region.items()
        )

    if dac_by_region is not None:
        lines.append("\nDAC BENEFIT PERCENTAGE BY REGION\n")
        lines.extend(
            f"{region}: {dac:.2f}%\n"
            for region, dac in dac_by_region.items()
        )

    lines.append("\nKEY FINDINGS\n")
    # Key findings need both metrics.
    if efficiency_by_region is None or dac_by_region is None:
        return lines
    if efficiency_by_region.empty or dac_by_region.empty:
        # Nothing survived filtering; avoid indexing into an empty Series.
        lines.append("Insufficient data to derive key findings.\n")
        return lines

    # Both Series are pre-sorted so that position 0 is the best region.
    best_efficiency_region = efficiency_by_region.index[0]
    best_efficiency = efficiency_by_region.iloc[0]
    best_dac_region = dac_by_region.index[0]
    best_dac = dac_by_region.iloc[0]
    lines.append(f"1. {best_efficiency_region} achieves the best GHG efficiency (${best_efficiency:.2f} per ton).\n")
    lines.append(f"2. {best_dac_region} achieves the highest DAC benefit ({best_dac:.2f}%).\n")

    # Balanced regions: cheaper than the median and more equitable than the median.
    low_cost_high_equity = region_metrics[
        (region_metrics['ghg_efficiency'] < median_efficiency)
        & (region_metrics['dac_benefit_percentage'] > median_dac)
    ]
    if len(low_cost_high_equity) > 0:
        # NOTE(review): this takes the first qualifying region in index order
        # (groupby sorts keys), not a ranked "best" -- confirm this is intended.
        top_balanced = low_cost_high_equity.index[0]
        lines.append(f"3. {top_balanced} achieves the best balance between efficiency and equity.\n")

    # Regional disparities: ratio for cost, absolute spread for DAC benefit.
    lowest_cost = efficiency_by_region.min()
    max_efficiency_diff = efficiency_by_region.max() / lowest_cost if lowest_cost > 0 else 0
    max_dac_diff = dac_by_region.max() - dac_by_region.min()
    lines.append(f"4. Regional disparities: {max_efficiency_diff:.1f}x variation in efficiency, {max_dac_diff:.1f} percentage point variation in DAC benefits.\n")
    return lines


def analyze_regional_distribution(input_path, output_path=None):
    """
    Analyze the regional distribution of CCI projects and its
    relationship to GHG efficiency and DAC benefits.
    Results are logged; when output_path is given, PNG charts and a text
    summary (regional_analysis_summary.txt) are also written there. Sections
    whose columns ('is_ev_voucher', 'ghg_efficiency', 'dac_benefit_percentage')
    are absent from the data are skipped.
    Parameters:
    input_path (str): Path to the cleaned CCI data CSV file
    output_path (str, optional): Path to save findings and visualizations
    Returns:
    None. Returns early (after logging an error) when the data has no
    'ca_region' column or no row carries a region value.
    """
    logger.info(f"Loading data from {input_path}")
    df = pd.read_csv(input_path, low_memory=False)
    logger.info(f"Successfully loaded {len(df)} rows with {len(df.columns)} columns")

    # Check if we have the regional data
    if 'ca_region' not in df.columns:
        logger.error("Regional data not found in the dataset")
        return

    # Define output directory if provided
    output_dir = None
    if output_path:
        output_dir = Path(output_path)
        output_dir.mkdir(parents=True, exist_ok=True)

    has_efficiency = 'ghg_efficiency' in df.columns
    has_dac = 'dac_benefit_percentage' in df.columns

    # 1. Basic regional distribution analysis
    region_counts = df['ca_region'].value_counts()
    if region_counts.empty:
        # Every later step groups by region; an empty chart would also fail.
        logger.error("No rows with a region value found in the dataset")
        return
    region_percent = region_counts / region_counts.sum() * 100
    logger.info("Regional distribution of CCI projects:")
    for region, count in region_counts.items():
        logger.info(f" {region}: {count} projects ({region_percent[region]:.1f}%)")

    _, ax = plt.subplots(figsize=(10, 6))
    region_counts.plot(kind='bar', ax=ax)
    plt.title('Number of CCI Projects by Region')
    plt.xlabel('Region')
    plt.ylabel('Number of Projects')
    plt.xticks(rotation=45, ha='right')
    _save_current_figure(output_dir, "regional_distribution.png")

    # 2. EV vouchers vs non-EV projects by region
    if 'is_ev_voucher' in df.columns:
        # Build a strict boolean mask: works for bool, 0/1 and columns with
        # missing values (missing is treated as non-EV) instead of relying on
        # `~` over a possibly non-boolean column.
        is_ev = df['is_ev_voucher'].eq(True)
        ev_by_region = df.loc[is_ev, 'ca_region'].value_counts()
        nonev_by_region = df.loc[~is_ev, 'ca_region'].value_counts()

        # Share of each project type falling in each region; regions missing
        # from one side become 0.
        comparison_df = pd.DataFrame({
            'EV Vouchers': 100 * ev_by_region / ev_by_region.sum(),
            'Non-EV Projects': 100 * nonev_by_region / nonev_by_region.sum(),
        }).fillna(0)

        # DataFrame.plot opens its own figure unless given an axes, so pass
        # one explicitly to honour the figure size and avoid a stray figure.
        _, ax = plt.subplots(figsize=(12, 6))
        comparison_df.plot(kind='bar', ax=ax)
        plt.title('Regional Distribution: EV Vouchers vs. Non-EV Projects')
        plt.xlabel('Region')
        plt.ylabel('Percentage of Projects')
        plt.xticks(rotation=45, ha='right')
        plt.legend(title='Project Type')
        _save_current_figure(output_dir, "regional_ev_comparison.png")

    # 3. GHG efficiency by region
    efficiency_by_region = None
    if has_efficiency:
        valid_data = _valid_efficiency_rows(df)
        efficiency_by_region = (
            valid_data.groupby('ca_region')['ghg_efficiency'].median().sort_values()
        )
        logger.info("GHG efficiency by region ($ per ton CO2e, median):")
        for region, efficiency in efficiency_by_region.items():
            logger.info(f" {region}: ${efficiency:.2f}")

        if efficiency_by_region.empty:
            logger.warning("No valid GHG efficiency values; skipping efficiency chart")
        else:
            _, ax = plt.subplots(figsize=(10, 6))
            efficiency_by_region.plot(kind='barh', ax=ax)
            plt.title('GHG Efficiency by Region (lower is better)')
            plt.xlabel('GHG Efficiency ($ per ton CO2e)')
            plt.ylabel('Region')
            plt.grid(axis='x', alpha=0.3)
            _save_current_figure(output_dir, "regional_efficiency.png")

    # 4. DAC benefit by region
    dac_by_region = None
    if has_dac:
        dac_by_region = (
            df.groupby('ca_region')['dac_benefit_percentage']
            .mean()
            .sort_values(ascending=False)
        )
        logger.info("DAC benefit percentage by region:")
        for region, dac in dac_by_region.items():
            logger.info(f" {region}: {dac:.2f}%")

        _, ax = plt.subplots(figsize=(10, 6))
        dac_by_region.plot(kind='barh', ax=ax)
        plt.title('DAC Benefit Percentage by Region')
        plt.xlabel('DAC Benefit Percentage')
        plt.ylabel('Region')
        plt.grid(axis='x', alpha=0.3)
        _save_current_figure(output_dir, "regional_dac_benefit.png")

    # 5. Efficiency vs Equity by Region
    region_metrics = None
    median_efficiency = None
    median_dac = None
    if has_efficiency and has_dac:
        grouped = _valid_efficiency_rows(df, require_dac=True).groupby('ca_region')
        region_metrics = pd.DataFrame({
            'ghg_efficiency': grouped['ghg_efficiency'].median(),
            'dac_benefit_percentage': grouped['dac_benefit_percentage'].mean(),
            'project_count': grouped.size(),
        })
        median_efficiency = region_metrics['ghg_efficiency'].median()
        median_dac = region_metrics['dac_benefit_percentage'].median()

        if region_metrics.empty:
            logger.warning("No rows with both efficiency and DAC data; skipping scatter plot")
        else:
            _plot_efficiency_vs_equity(region_metrics, median_efficiency, median_dac, output_dir)

    # 6. Generate a summary text file
    if output_dir is not None:
        summary_path = output_dir / "regional_analysis_summary.txt"
        summary_lines = _build_summary_lines(
            region_counts, region_percent, efficiency_by_region,
            dac_by_region, region_metrics, median_efficiency, median_dac,
        )
        # Explicit encoding so the report does not depend on the platform default.
        with open(summary_path, 'w', encoding='utf-8') as f:
            f.writelines(summary_lines)
        logger.info(f"Saved regional analysis summary to {summary_path}")

    logger.info("Regional analysis completed")
if __name__ == "__main__":
    # Command-line entry point: read the CSV location (mandatory) and the
    # optional results directory, then run the regional analysis.
    import argparse

    cli = argparse.ArgumentParser(
        description='Analyze regional distribution of CCI projects',
    )
    cli.add_argument(
        '--input_path',
        type=str,
        required=True,
        help='Path to the cleaned CCI data CSV file',
    )
    cli.add_argument(
        '--output_path',
        type=str,
        help='Path to save findings and visualizations',
    )
    options = cli.parse_args()

    analyze_regional_distribution(options.input_path, options.output_path)