idk
This commit is contained in:
262
regional_analysis_script.py
Normal file
262
regional_analysis_script.py
Normal file
@@ -0,0 +1,262 @@
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
from pathlib import Path
|
||||
import logging
|
||||
|
||||
# Set up process-wide logging once at import time: INFO and above, each
# record stamped with time, logger name and severity.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Logger shared by everything in this script.
logger = logging.getLogger("regional_analysis")
|
||||
|
||||
def _valid_efficiency_rows(df, upper_quantile=0.95):
    """Return the rows of ``df`` whose GHG efficiency is usable for comparison.

    A row is kept when ``ghg_efficiency`` is present, strictly positive and
    strictly below the ``upper_quantile`` of the whole column, which trims
    the most extreme high-cost outliers.
    """
    efficiency = df['ghg_efficiency']
    cutoff = efficiency.quantile(upper_quantile)
    return df[efficiency.notna() & (efficiency > 0) & (efficiency < cutoff)]


def _plot_regional_bars(data, output_dir, filename, *, title, xlabel, ylabel,
                        figsize=(10, 6), horizontal=False, legend_title=None):
    """Draw a per-region bar chart on its own figure, save it, and close it.

    ``data`` may be a Series or a DataFrame indexed by region. The figure is
    written to ``output_dir / filename`` only when ``output_dir`` is not
    None, and is closed in every case.
    """
    if data.empty:
        # pandas cannot draw a bar chart from zero rows.
        logger.warning(f"No data available for '{title}'; skipping plot")
        return

    fig, ax = plt.subplots(figsize=figsize)
    try:
        # Passing ax explicitly matters: DataFrame.plot would otherwise open
        # a second figure, ignoring figsize and leaking this one.
        data.plot(kind='barh' if horizontal else 'bar', ax=ax)
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        if horizontal:
            ax.grid(axis='x', alpha=0.3)
        else:
            plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
        if legend_title is not None:
            ax.legend(title=legend_title)
        fig.tight_layout()
        if output_dir is not None:
            fig.savefig(output_dir / filename, dpi=300)
    finally:
        plt.close(fig)


def _plot_efficiency_vs_equity(region_metrics, median_efficiency, median_dac,
                               output_dir):
    """Scatter regions by median GHG efficiency (x) and mean DAC benefit (y).

    Marker area scales with the region's project count; dashed lines at the
    cross-region medians split the plot into four labelled quadrants.
    """
    if region_metrics.empty:
        logger.warning("No data available for 'Efficiency vs. Equity by Region'; skipping plot")
        return

    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        ax.scatter(
            region_metrics['ghg_efficiency'],
            region_metrics['dac_benefit_percentage'],
            s=region_metrics['project_count'] / 10,  # Size based on project count
            alpha=0.7,
        )

        # Label every point with its region name.
        for region, row in region_metrics.iterrows():
            ax.annotate(
                region,
                (row['ghg_efficiency'], row['dac_benefit_percentage']),
                textcoords="offset points",
                xytext=(5, 5),
                ha='left',
            )

        # Quadrant lines at the medians of the per-region values.
        ax.axvline(x=median_efficiency, color='gray', linestyle='--', alpha=0.5)
        ax.axhline(y=median_dac, color='gray', linestyle='--', alpha=0.5)

        # Quadrant labels, positioned in axes coordinates (0-1).
        quadrant_labels = (
            (0.98, 0.98, 'High Cost,\nHigh Equity', 'right', 'top'),
            (0.02, 0.98, 'Low Cost,\nHigh Equity', 'left', 'top'),
            (0.98, 0.02, 'High Cost,\nLow Equity', 'right', 'bottom'),
            (0.02, 0.02, 'Low Cost,\nLow Equity', 'left', 'bottom'),
        )
        for x_pos, y_pos, label, h_align, v_align in quadrant_labels:
            ax.text(x_pos, y_pos, label, transform=ax.transAxes,
                    ha=h_align, va=v_align,
                    bbox=dict(facecolor='white', alpha=0.7))

        ax.set_xlabel('GHG Efficiency ($ per ton CO2e)')
        ax.set_ylabel('DAC Benefit Percentage')
        ax.set_title('Efficiency vs. Equity by Region')
        ax.grid(True, linestyle='--', alpha=0.7)

        if output_dir is not None:
            fig.savefig(output_dir / "regional_efficiency_equity.png", dpi=300)
    finally:
        plt.close(fig)


def _key_findings(efficiency_by_region, dac_by_region, region_metrics,
                  median_efficiency, median_dac):
    """Build the un-numbered key-finding sentences for the summary file.

    ``efficiency_by_region`` is expected sorted ascending (cheapest first)
    and ``dac_by_region`` sorted descending (highest benefit first). Findings
    that cannot be computed from empty inputs are omitted.
    """
    findings = []

    if not efficiency_by_region.empty:
        best_efficiency_region = efficiency_by_region.index[0]
        findings.append(
            f"{best_efficiency_region} achieves the best GHG efficiency "
            f"(${efficiency_by_region.iloc[0]:.2f} per ton)."
        )

    if not dac_by_region.empty:
        best_dac_region = dac_by_region.index[0]
        findings.append(
            f"{best_dac_region} achieves the highest DAC benefit "
            f"({dac_by_region.iloc[0]:.2f}%)."
        )

    # Balanced regions: cheaper than the median AND above-median DAC benefit.
    low_cost_high_equity = region_metrics[
        (region_metrics['ghg_efficiency'] < median_efficiency)
        & (region_metrics['dac_benefit_percentage'] > median_dac)
    ]
    if len(low_cost_high_equity) > 0:
        top_balanced = low_cost_high_equity.index[0]
        findings.append(
            f"{top_balanced} achieves the best balance between efficiency and equity."
        )

    if not efficiency_by_region.empty and not dac_by_region.empty:
        lowest = efficiency_by_region.min()
        max_efficiency_diff = efficiency_by_region.max() / lowest if lowest > 0 else 0
        max_dac_diff = dac_by_region.max() - dac_by_region.min()
        findings.append(
            f"Regional disparities: {max_efficiency_diff:.1f}x variation in efficiency, "
            f"{max_dac_diff:.1f} percentage point variation in DAC benefits."
        )

    return findings


def _write_summary(summary_path, region_counts, region_percent,
                   efficiency_by_region, dac_by_region, region_metrics,
                   median_efficiency, median_dac):
    """Write the plain-text regional summary to ``summary_path``.

    ``efficiency_by_region``, ``dac_by_region`` and ``region_metrics`` are
    None when the corresponding input columns were absent; their sections
    are then left out.
    """
    lines = [
        "CALIFORNIA CLIMATE INVESTMENTS (CCI) REGIONAL ANALYSIS\n",
        "===================================================\n\n",
        "REGIONAL DISTRIBUTION\n",
    ]
    lines.extend(
        f"{region}: {count} projects ({region_percent[region]:.1f}%)\n"
        for region, count in region_counts.items()
    )

    if efficiency_by_region is not None:
        lines.append("\nGHG EFFICIENCY BY REGION ($ PER TON CO2E, MEDIAN)\n")
        lines.extend(
            f"{region}: ${efficiency:.2f}\n"
            for region, efficiency in efficiency_by_region.items()
        )

    if dac_by_region is not None:
        lines.append("\nDAC BENEFIT PERCENTAGE BY REGION\n")
        lines.extend(
            f"{region}: {dac:.2f}%\n" for region, dac in dac_by_region.items()
        )

    lines.append("\nKEY FINDINGS\n")
    if region_metrics is not None:
        findings = _key_findings(efficiency_by_region, dac_by_region,
                                 region_metrics, median_efficiency, median_dac)
        # Number at write time so the list stays consecutive when a finding
        # is omitted.
        lines.extend(
            f"{number}. {text}\n" for number, text in enumerate(findings, start=1)
        )

    with open(summary_path, 'w', encoding='utf-8') as f:
        f.writelines(lines)


def analyze_regional_distribution(input_path, output_path=None):
    """
    Analyze the regional distribution of CCI projects and its
    relationship to GHG efficiency and DAC benefits.

    The CSV must contain a ``ca_region`` column; without it an error is
    logged and the function returns early. The optional columns
    ``is_ev_voucher``, ``ghg_efficiency`` and ``dac_benefit_percentage``
    each enable an additional section of the analysis. Results are logged,
    and charts plus a text summary are written only when ``output_path``
    is given.

    Parameters:
        input_path (str): Path to the cleaned CCI data CSV file
        output_path (str, optional): Directory to save findings and
            visualizations; created if it does not exist

    Returns:
        None
    """
    logger.info(f"Loading data from {input_path}")

    # Load the data
    df = pd.read_csv(input_path, low_memory=False)

    logger.info(f"Successfully loaded {len(df)} rows with {len(df.columns)} columns")

    # Nothing below can be computed without the region column.
    if 'ca_region' not in df.columns:
        logger.error("Regional data not found in the dataset")
        return

    # Define output directory if provided
    output_dir = None
    if output_path:
        output_dir = Path(output_path)
        output_dir.mkdir(parents=True, exist_ok=True)

    # 1. Basic regional distribution analysis
    region_counts = df['ca_region'].value_counts()
    region_percent = df['ca_region'].value_counts(normalize=True) * 100

    logger.info("Regional distribution of CCI projects:")
    for region, count in region_counts.items():
        logger.info(f" {region}: {count} projects ({region_percent[region]:.1f}%)")

    _plot_regional_bars(
        region_counts, output_dir, "regional_distribution.png",
        title='Number of CCI Projects by Region',
        xlabel='Region',
        ylabel='Number of Projects',
    )

    # 2. EV vouchers vs non-EV projects by region
    if 'is_ev_voucher' in df.columns:
        # eq(True) always yields a clean boolean mask, even if the CSV
        # round-trip produced an object or 0/1 column; rows with a missing
        # flag are counted as non-EV.
        is_ev = df['is_ev_voucher'].eq(True)
        ev_by_region = df.loc[is_ev, 'ca_region'].value_counts()
        nonev_by_region = df.loc[~is_ev, 'ca_region'].value_counts()

        # Share of each project type that falls in each region.
        ev_percent = 100 * ev_by_region / ev_by_region.sum()
        nonev_percent = 100 * nonev_by_region / nonev_by_region.sum()

        # Regions present for only one project type get 0 for the other.
        comparison_df = pd.DataFrame({
            'EV Vouchers': ev_percent,
            'Non-EV Projects': nonev_percent,
        }).fillna(0)

        _plot_regional_bars(
            comparison_df, output_dir, "regional_ev_comparison.png",
            title='Regional Distribution: EV Vouchers vs. Non-EV Projects',
            xlabel='Region',
            ylabel='Percentage of Projects',
            figsize=(12, 6),
            legend_title='Project Type',
        )

    has_efficiency = 'ghg_efficiency' in df.columns
    has_dac = 'dac_benefit_percentage' in df.columns

    # 3. GHG efficiency by region
    efficiency_by_region = None
    if has_efficiency:
        # Median is used because the cost distribution is outlier-trimmed
        # but may still be skewed; sorted so the cheapest region is first.
        efficiency_by_region = (
            _valid_efficiency_rows(df)
            .groupby('ca_region')['ghg_efficiency']
            .median()
            .sort_values()
        )

        logger.info("GHG efficiency by region ($ per ton CO2e, median):")
        for region, efficiency in efficiency_by_region.items():
            logger.info(f" {region}: ${efficiency:.2f}")

        _plot_regional_bars(
            efficiency_by_region, output_dir, "regional_efficiency.png",
            title='GHG Efficiency by Region (lower is better)',
            xlabel='GHG Efficiency ($ per ton CO2e)',
            ylabel='Region',
            horizontal=True,
        )

    # 4. DAC benefit by region
    dac_by_region = None
    if has_dac:
        # Mean DAC benefit per region, highest first.
        dac_by_region = (
            df.groupby('ca_region')['dac_benefit_percentage']
            .mean()
            .sort_values(ascending=False)
        )

        logger.info("DAC benefit percentage by region:")
        for region, dac in dac_by_region.items():
            logger.info(f" {region}: {dac:.2f}%")

        _plot_regional_bars(
            dac_by_region, output_dir, "regional_dac_benefit.png",
            title='DAC Benefit Percentage by Region',
            xlabel='DAC Benefit Percentage',
            ylabel='Region',
            horizontal=True,
        )

    # 5. Efficiency vs Equity by Region
    region_metrics = None
    median_efficiency = None
    median_dac = None
    if has_efficiency and has_dac:
        # Same efficiency filter as section 3, additionally requiring a DAC value.
        valid_data = _valid_efficiency_rows(df)
        valid_data = valid_data[valid_data['dac_benefit_percentage'].notna()]

        # Calculate regional metrics
        region_metrics = valid_data.groupby('ca_region').agg({
            'ghg_efficiency': 'median',
            'dac_benefit_percentage': 'mean',
            'ca_region': 'count',
        }).rename(columns={'ca_region': 'project_count'})

        median_efficiency = region_metrics['ghg_efficiency'].median()
        median_dac = region_metrics['dac_benefit_percentage'].median()

        _plot_efficiency_vs_equity(region_metrics, median_efficiency,
                                   median_dac, output_dir)

    # 6. Generate a summary text file
    if output_dir is not None:
        summary_path = output_dir / "regional_analysis_summary.txt"
        _write_summary(summary_path, region_counts, region_percent,
                       efficiency_by_region, dac_by_region, region_metrics,
                       median_efficiency, median_dac)
        logger.info(f"Saved regional analysis summary to {output_dir / 'regional_analysis_summary.txt'}")

    logger.info("Regional analysis completed")
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: parse the two path options and run the
    # analysis once.
    import argparse

    cli = argparse.ArgumentParser(
        description='Analyze regional distribution of CCI projects'
    )
    cli.add_argument(
        '--input_path',
        type=str,
        required=True,
        help='Path to the cleaned CCI data CSV file',
    )
    cli.add_argument(
        '--output_path',
        type=str,
        help='Path to save findings and visualizations',
    )
    options = cli.parse_args()

    analyze_regional_distribution(options.input_path, options.output_path)
|
||||
Reference in New Issue
Block a user