I hope we're getting somewhere
This commit is contained in:
857
CCIDataAnalyzer.py
Normal file
857
CCIDataAnalyzer.py
Normal file
@@ -0,0 +1,857 @@
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
from pathlib import Path
|
||||
import logging
|
||||
import warnings
|
||||
|
||||
# Logging is configured once, at import time; everything in this module logs
# through the "cci_analyzer" logger created below.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger("cci_analyzer")

# No category is given, so this filter silences every Python warning raised
# after this point, not only the ones coming from pandas.
warnings.filterwarnings(action='ignore')
|
||||
|
||||
class CCIDataAnalyzer:
    """Simplified analyzer for California Climate Investments (CCI) data.

    Typical use: construct with the CSV path, call ``load_data()``, then call
    ``analyze_data()`` and any of the ``plot_*`` / ``identify_*`` methods.

    Attributes:
        data_path: ``Path`` of the CSV file read by ``load_data()``.
        output_path: ``Path`` of the directory that receives saved figures;
            it is created when the analyzer is constructed.
        data: dict of DataFrames. ``load_data()`` fills ``'cci_projects'``
            and, when an ``agency_name`` column exists, ``'carb_projects'``
            and ``'non_carb_projects'``; ``'ev_projects'`` is added only when
            at least one CARB project was found.
    """

    # Escape sequences seen in the lat_long column, mapped to the characters
    # they stand for (both are UTF-7 encodings of those characters).
    _ENCODING_REPLACEMENTS = {
        '+AC0-': '-',  # Minus sign
        '+ACI-': '"',  # Quote mark
    }

    # A column whose name contains any of these fragments is parsed as numeric.
    _NUMERIC_NAME_HINTS = ('cost', 'funding', 'ghg', 'reductions', 'years')

    # Lower-case fragments that flag a CARB project as a potential EV
    # rebate/voucher project; they are joined into one regex alternation.
    _EV_INDICATORS = ('electric vehicle', 'ev ', 'rebate', 'voucher', 'clean vehicle')

    def __init__(self, data_path, output_path="./output"):
        """Remember the input/output locations and create the output directory.

        Args:
            data_path: Path to the CCI CSV file.
            output_path: Directory for generated figures (created if missing).
        """
        self.data_path = Path(data_path)
        self.output_path = Path(output_path)
        self.output_path.mkdir(parents=True, exist_ok=True)
        self.data = {}

        logger.info(f"Initialized with data path: {self.data_path}")

    # ------------------------------------------------------------------
    # Small shared helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _match_columns(df, *fragments):
        """Return the columns of ``df`` whose lower-cased name contains every fragment."""
        return [
            col for col in df.columns
            if all(fragment in col.lower() for fragment in fragments)
        ]

    @staticmethod
    def _share(part, whole):
        """Return ``part`` as a percentage of ``whole``; NaN when ``whole`` is zero."""
        return part / whole * 100 if whole else float('nan')

    @staticmethod
    def _cost_per_ton(funding, ghg):
        """Return dollars per ton, or 0 when there are no positive reductions."""
        return funding / ghg if ghg > 0 else 0

    @classmethod
    def _clean_libreoffice_encoding(cls, text):
        """Replace the known escape sequences in ``text``; missing values pass through."""
        if pd.isna(text):
            return text

        cleaned = str(text)
        for code, char in cls._ENCODING_REPLACEMENTS.items():
            cleaned = cleaned.replace(code, char)
        return cleaned

    @staticmethod
    def _extract_coords(coord_str):
        """Parse a comma-separated coordinate pair into ``(latitude, longitude)``.

        Surrounding whitespace, quotes and parentheses are ignored. Anything
        that cannot be parsed yields ``(nan, nan)``.
        """
        missing = (np.nan, np.nan)
        if pd.isna(coord_str):
            return missing

        parts = str(coord_str).split(',')
        if len(parts) < 2:
            return missing

        try:
            # NOTE(review): the first value is read as the longitude and the
            # second as the latitude, although the column is called
            # "lat_long" - confirm against the real data before relying on it.
            lon, lat = (float(part.strip(' \t"\'()')) for part in parts[:2])
        except ValueError:
            return missing
        return (lat, lon)

    @staticmethod
    def _to_number(series):
        """Convert a column to numbers; unparseable entries become NaN.

        Text values may carry "$" signs, thousands separators (commas) and
        spaces, e.g. "$1,000.50".
        """
        if pd.api.types.is_numeric_dtype(series):
            return pd.to_numeric(series, errors='coerce')

        text = series.astype(str).str.replace(r'[$,\s]', '', regex=True)
        # astype(str) turned missing values into text; restore them as missing.
        return pd.to_numeric(text.where(series.notna()), errors='coerce')

    @staticmethod
    def _draw_pie(values, title):
        """Draw the Series ``values`` as a pie chart on the current axes."""
        # Wedge sizes are undefined when everything is zero, so only the
        # title is drawn in that case.
        if values.sum() > 0:
            values.plot(kind='pie', autopct='%1.1f%%', startangle=90)
        plt.title(title)
        plt.ylabel('')  # Hide ylabel

    def _finish_figure(self, fig, filename, description):
        """Lay out, save, show and finally close ``fig``."""
        plt.tight_layout()

        output_file = self.output_path / filename
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        logger.info(f"{description} visualization saved to {output_file}")
        plt.show()
        # pyplot keeps every figure alive until it is closed explicitly.
        plt.close(fig)

    # ------------------------------------------------------------------
    # Loading and cleaning
    # ------------------------------------------------------------------

    def load_data(self):
        """Read the CSV, clean it and build the CARB / non-CARB / EV datasets.

        Returns:
            True on success, False when anything went wrong (the error is
            logged, not raised).
        """
        try:
            logger.info(f"Loading data from {self.data_path}")

            # Read as string to avoid conversion errors
            df = pd.read_csv(self.data_path, dtype=str)
            logger.info(f"Successfully loaded {len(df)} rows with {len(df.columns)} columns")

            self.data['cci_projects'] = self._clean_data(df)

            # Create separate datasets for CARB and non-CARB projects
            self._create_carb_datasets()
            return True

        except Exception as e:
            logger.error(f"Error loading data: {e}")
            return False

    def _clean_data(self, df):
        """Clean and process the CCI data in place.

        Steps: normalize column names, decode and split ``lat_long`` into
        ``latitude`` / ``longitude``, parse numeric and date columns, derive
        ``funding_year``, ``ghg_efficiency`` and ``dac_benefit_percentage``.

        Args:
            df: Raw DataFrame (modified in place).

        Returns:
            The same DataFrame. If a step fails, the error is logged and the
            frame is returned as far as it had been processed.
        """
        try:
            # 1. Fix column names
            df.columns = [col.strip().lower().replace(' ', '_') for col in df.columns]

            # 2. Handle the problematic lat_long column
            if 'lat_long' in df.columns:
                logger.info("Processing coordinates with special encoding handling")
                df['lat_long'] = df['lat_long'].apply(self._clean_libreoffice_encoding)

                coords = [self._extract_coords(value) for value in df['lat_long']]
                df['latitude'] = [lat for lat, _ in coords]
                df['longitude'] = [lon for _, lon in coords]

            # 3. Convert numeric columns (matched by name fragment)
            for col in df.columns:
                name = col.lower()
                # Fiscal-year and date columns are parsed by steps 4 and 5;
                # coercing them here first would turn values such as
                # "2019-20" into NaN before the year can be extracted.
                if 'fiscal_year' in name or 'date' in name:
                    continue
                if any(hint in name for hint in self._NUMERIC_NAME_HINTS):
                    df[col] = self._to_number(df[col])

            # 4. Convert date columns
            for col in self._match_columns(df, 'date'):
                df[col] = pd.to_datetime(df[col], errors='coerce')

            # 5. Extract funding year: the first 4-digit run of the first
            #    fiscal-year column, so both "2019-20" and "2019" give 2019.
            fiscal_year_cols = self._match_columns(df, 'fiscal_year')
            if fiscal_year_cols:
                try:
                    year_text = (
                        df[fiscal_year_cols[0]]
                        .astype(str)
                        .str.extract(r'(\d{4})', expand=False)
                    )
                    df['funding_year'] = pd.to_numeric(year_text, errors='coerce').astype('Int64')
                except (TypeError, ValueError) as e:
                    logger.error(f"Error extracting funding year: {e}")

            # 6. Calculate derived metrics if columns exist
            funding_col = self._match_columns(df, 'total_program', 'funding')
            ghg_col = self._match_columns(df, 'total_project', 'ghg')
            dac_col = self._match_columns(df, 'funding_benefiting')

            if funding_col and ghg_col:
                funding, ghg = df[funding_col[0]], df[ghg_col[0]]
                # Dollars per ton; undefined (NaN) without positive reductions.
                df['ghg_efficiency'] = funding.div(ghg).where(ghg > 0)

            if funding_col and dac_col:
                funding, dac = df[funding_col[0]], df[dac_col[0]]
                # Share of funding in the first "funding_benefiting" column;
                # 0 when the project has no positive funding.
                df['dac_benefit_percentage'] = (100 * dac / funding).where(funding > 0, 0)

            logger.info("Data cleaning and processing complete")
            return df

        except Exception as e:
            logger.error(f"Error cleaning data: {e}")
            return df

    def _create_carb_datasets(self):
        """Split ``cci_projects`` into CARB, non-CARB and potential EV datasets.

        CARB projects are rows whose ``agency_name`` contains
        "Air Resources Board" (case-insensitive). EV candidates are CARB rows
        whose program name (or sub-program name, or any text column as a last
        resort) contains one of ``_EV_INDICATORS``.
        """
        if 'cci_projects' not in self.data:
            logger.error("No data available to create CARB datasets")
            return

        # Drop splits left over from an earlier run so they can never
        # describe a different 'cci_projects' frame.
        for stale_key in ('carb_projects', 'non_carb_projects', 'ev_projects'):
            self.data.pop(stale_key, None)

        df = self.data['cci_projects']

        try:
            if 'agency_name' not in df.columns:
                logger.error("agency_name column not found")
                return

            carb_mask = df['agency_name'].str.contains('Air Resources Board', case=False, na=False)
            carb_df = df[carb_mask].copy()
            self.data['carb_projects'] = carb_df
            self.data['non_carb_projects'] = df[~carb_mask].copy()

            logger.info(f"Created CARB dataset with {len(self.data['carb_projects'])} projects")
            logger.info(f"Created non-CARB dataset with {len(self.data['non_carb_projects'])} projects")

            # EV projects are only looked for within CARB projects.
            if carb_df.empty:
                return

            ev_pattern = '|'.join(self._EV_INDICATORS)

            def mentions_ev(column):
                """Flag the rows of ``column`` that contain an EV indicator."""
                return column.astype(str).str.lower().str.contains(ev_pattern, na=False)

            if 'program_name' in carb_df.columns:
                ev_mask = mentions_ev(carb_df['program_name'])
            elif 'sub_program_name' in carb_df.columns:
                ev_mask = mentions_ev(carb_df['sub_program_name'])
            else:
                # Neither name column exists: search every text column.
                ev_mask = pd.Series(False, index=carb_df.index)
                for col in carb_df.columns:
                    if carb_df[col].dtype == 'object':
                        ev_mask |= mentions_ev(carb_df[col])

            self.data['ev_projects'] = carb_df[ev_mask].copy()
            logger.info(f"Identified {len(self.data['ev_projects'])} potential EV rebate/voucher projects")

        except Exception as e:
            logger.error(f"Error creating CARB datasets: {e}")

    # ------------------------------------------------------------------
    # Text analysis
    # ------------------------------------------------------------------

    def analyze_data(self, include_carb_breakdown=True):
        """Print a basic analysis of the CCI data and return headline figures.

        Args:
            include_carb_breakdown: Also print the CARB vs. non-CARB
                comparison (and the EV section) when those datasets exist.

        Returns:
            None when no data is loaded; otherwise a dict with
            ``total_projects``, ``total_funding``, ``total_ghg_reductions``,
            ``carb_projects`` and ``ev_projects`` (an entry is None when the
            column or dataset it needs is missing).
        """
        if 'cci_projects' not in self.data:
            logger.error("No data available for analysis")
            return None

        df = self.data['cci_projects']

        # Get agency information
        if 'agency_name' in df.columns:
            print("\nAgencies involved in CCI projects:")
            for agency, count in df['agency_name'].value_counts().head(10).items():
                print(f" {agency}: {count} projects")

        # Analyze funding distribution
        funding_col = self._match_columns(df, 'total_program', 'funding')
        total_funding = None
        if funding_col:
            total_funding = df[funding_col[0]].sum()
            print(f"\nTotal CCI funding: ${total_funding:,.2f}")
            print(f"Average project funding: ${df[funding_col[0]].mean():,.2f}")

        # Analyze GHG reductions
        ghg_col = self._match_columns(df, 'total_project', 'ghg')
        total_ghg = None
        if ghg_col:
            total_ghg = df[ghg_col[0]].sum()
            print(f"\nTotal GHG reductions: {total_ghg:,.2f} tons")
            print(f"Average GHG reduction per project: {df[ghg_col[0]].mean():,.2f} tons")

        # Analyze DAC benefits
        if 'dac_benefit_percentage' in df.columns:
            avg_dac = df['dac_benefit_percentage'].mean()
            print(f"\nAverage DAC benefit percentage: {avg_dac:.2f}%")

        has_carb_split = 'carb_projects' in self.data and 'non_carb_projects' in self.data
        if include_carb_breakdown and has_carb_split:
            self._print_carb_breakdown(df, funding_col, ghg_col, total_funding, total_ghg)

        return {
            "total_projects": len(df),
            "total_funding": total_funding,
            "total_ghg_reductions": total_ghg,
            "carb_projects": len(self.data['carb_projects']) if 'carb_projects' in self.data else None,
            "ev_projects": len(self.data['ev_projects']) if 'ev_projects' in self.data else None,
        }

    def _print_carb_breakdown(self, df, funding_col, ghg_col, total_funding, total_ghg):
        """Print the CARB vs. non-CARB comparison, then the EV section if available."""
        carb_df = self.data['carb_projects']
        non_carb_df = self.data['non_carb_projects']

        print("\n--- CARB vs. Non-CARB Analysis ---")
        print(f"CARB projects: {len(carb_df)} ({self._share(len(carb_df), len(df)):.1f}% of total)")
        print(f"Non-CARB projects: {len(non_carb_df)} ({self._share(len(non_carb_df), len(df)):.1f}% of total)")

        carb_funding = carb_ghg = None

        if funding_col:
            col = funding_col[0]
            carb_funding = carb_df[col].sum()
            non_carb_funding = non_carb_df[col].sum()
            print(f"\nCARB funding: ${carb_funding:,.2f} ({self._share(carb_funding, total_funding):.1f}% of total)")
            print(f"Non-CARB funding: ${non_carb_funding:,.2f} ({self._share(non_carb_funding, total_funding):.1f}% of total)")
            print(f"Average CARB project: ${carb_df[col].mean():,.2f}")
            print(f"Average non-CARB project: ${non_carb_df[col].mean():,.2f}")

        if ghg_col:
            col = ghg_col[0]
            carb_ghg = carb_df[col].sum()
            non_carb_ghg = non_carb_df[col].sum()
            print(f"\nCARB GHG reductions: {carb_ghg:,.2f} tons ({self._share(carb_ghg, total_ghg):.1f}% of total)")
            print(f"Non-CARB GHG reductions: {non_carb_ghg:,.2f} tons ({self._share(non_carb_ghg, total_ghg):.1f}% of total)")

            # Calculate efficiency
            if funding_col:
                carb_efficiency = self._cost_per_ton(carb_funding, carb_ghg)
                non_carb_efficiency = self._cost_per_ton(non_carb_funding, non_carb_ghg)
                print(f"\nCARB efficiency: ${carb_efficiency:.2f} per ton CO2e")
                print(f"Non-CARB efficiency: ${non_carb_efficiency:.2f} per ton CO2e")

        if 'ev_projects' in self.data:
            self._print_ev_breakdown(carb_df, funding_col, ghg_col, carb_funding, carb_ghg)

    def _print_ev_breakdown(self, carb_df, funding_col, ghg_col, carb_funding, carb_ghg):
        """Print the EV figures relative to the CARB totals."""
        ev_df = self.data['ev_projects']
        print("\n--- Electric Vehicle Projects Analysis ---")
        print(f"EV projects: {len(ev_df)} ({self._share(len(ev_df), len(carb_df)):.1f}% of CARB projects)")

        ev_funding = None
        if funding_col:
            col = funding_col[0]
            ev_funding = ev_df[col].sum()
            print(f"EV funding: ${ev_funding:,.2f} ({self._share(ev_funding, carb_funding):.1f}% of CARB funding)")
            print(f"Average EV project: ${ev_df[col].mean():,.2f}")

        if ghg_col:
            ev_ghg = ev_df[ghg_col[0]].sum()
            print(f"EV GHG reductions: {ev_ghg:,.2f} tons ({self._share(ev_ghg, carb_ghg):.1f}% of CARB reductions)")

            # Calculate efficiency
            if funding_col:
                ev_efficiency = self._cost_per_ton(ev_funding, ev_ghg)
                print(f"EV efficiency: ${ev_efficiency:.2f} per ton CO2e")

    # ------------------------------------------------------------------
    # Visualizations
    # ------------------------------------------------------------------

    def plot_agency_comparison(self):
        """Save and show a 2x2 figure comparing agencies.

        Panels: project count, total funding, total GHG reductions and cost
        per ton (the ten lowest), each as a horizontal bar chart. The figure
        is written to ``agency_comparison.png`` in ``output_path``.
        """
        if 'cci_projects' not in self.data:
            logger.error("No data available for visualization")
            return

        df = self.data['cci_projects']

        if 'agency_name' not in df.columns:
            logger.error("agency_name column not found")
            return

        funding_cols = self._match_columns(df, 'total_program', 'funding')
        if not funding_cols:
            logger.error("Funding column not found")
            return

        ghg_cols = self._match_columns(df, 'total_project', 'ghg')
        if not ghg_cols:
            logger.error("GHG reduction column not found")
            return

        funding_col, ghg_col = funding_cols[0], ghg_cols[0]

        totals = df.groupby('agency_name')[[funding_col, ghg_col]].sum()
        # Cost per ton is only defined for agencies with positive reductions.
        efficiency = (totals[funding_col] / totals[ghg_col]).where(totals[ghg_col] > 0)

        panels = [
            (df['agency_name'].value_counts().head(10),
             'Number of Projects by Agency (Top 10)', 'Number of Projects'),
            (totals[funding_col].sort_values(ascending=False).head(10) / 1_000_000,
             'Total Funding by Agency ($ Millions)', 'Funding ($ Millions)'),
            (totals[ghg_col].sort_values(ascending=False).head(10) / 1_000,
             'GHG Reductions by Agency (Thousand Tons)', 'GHG Reductions (Thousand Tons)'),
            (efficiency.dropna().sort_values().head(10),
             'Cost Efficiency by Agency ($ per Ton CO2e)', 'Cost per Ton GHG Reduced ($)'),
        ]

        fig = plt.figure(figsize=(15, 12))
        for position, (series, title, xlabel) in enumerate(panels, start=1):
            plt.subplot(2, 2, position)
            if not series.empty:
                series.plot(kind='barh')
            plt.title(title)
            plt.xlabel(xlabel)

        self._finish_figure(fig, "agency_comparison.png", "Agency comparison")

    def plot_carb_analysis(self):
        """Save and show a 2x2 figure for CARB (non-EV) vs. EV vs. non-CARB.

        Panels: share of projects, funding and GHG reductions as pies, and
        cost per ton as a bar chart. The figure is written to
        ``carb_analysis.png`` in ``output_path``.
        """
        if 'carb_projects' not in self.data or 'non_carb_projects' not in self.data:
            logger.error("CARB datasets not available")
            return

        carb_df = self.data['carb_projects']
        non_carb_df = self.data['non_carb_projects']
        ev_df = self.data.get('ev_projects', pd.DataFrame())

        funding_cols = (self._match_columns(non_carb_df, 'total_program', 'funding')
                        or self._match_columns(carb_df, 'total_program', 'funding'))
        ghg_cols = (self._match_columns(non_carb_df, 'total_project', 'ghg')
                    or self._match_columns(carb_df, 'total_project', 'ghg'))

        if not funding_cols or not ghg_cols:
            logger.error("Required columns not found")
            return

        funding_col, ghg_col = funding_cols[0], ghg_cols[0]

        groups = {
            'CARB (non-EV)': carb_df[~carb_df.index.isin(ev_df.index)],
            'CARB (EV Projects)': ev_df,
            'Non-CARB': non_carb_df,
        }

        def total(frame, column):
            """Sum ``column``; an empty frame (which may lack the column) counts as 0."""
            return frame[column].sum() if not frame.empty else 0

        project_counts = pd.Series({
            'CARB (non-EV)': len(carb_df) - len(ev_df),
            'CARB (EV Projects)': len(ev_df),
            'Non-CARB': len(non_carb_df),
        })
        funding = pd.Series({label: total(frame, funding_col) for label, frame in groups.items()})
        ghg = pd.Series({label: total(frame, ghg_col) for label, frame in groups.items()})
        efficiency = pd.Series({
            label: self._cost_per_ton(funding[label], ghg[label]) for label in groups
        })

        fig = plt.figure(figsize=(15, 12))

        plt.subplot(2, 2, 1)
        self._draw_pie(project_counts, 'Distribution of Projects')

        plt.subplot(2, 2, 2)
        self._draw_pie(funding, 'Distribution of Funding')

        plt.subplot(2, 2, 3)
        self._draw_pie(ghg, 'Distribution of GHG Reductions')

        plt.subplot(2, 2, 4)
        efficiency.plot(kind='bar')
        plt.title('Cost Efficiency ($ per Ton CO2e)')
        plt.ylabel('Cost per Ton GHG Reduced ($)')
        plt.xticks(rotation=45)

        self._finish_figure(fig, "carb_analysis.png", "CARB analysis")

    def plot_temporal_analysis(self):
        """Save and show a 2x2 figure of trends by funding year.

        Panels: funding, GHG reductions and project counts per year (stacked
        by non-CARB / CARB non-EV / EV when those datasets exist) and the
        average DAC benefit percentage per year. The figure is written to
        ``temporal_analysis.png`` in ``output_path``.
        """
        if 'cci_projects' not in self.data:
            logger.error("No data available for visualization")
            return

        df = self.data['cci_projects']

        if 'funding_year' not in df.columns:
            logger.error("funding_year column not found")
            return

        funding_cols = self._match_columns(df, 'total_program', 'funding')
        ghg_cols = self._match_columns(df, 'total_project', 'ghg')
        dac_cols = self._match_columns(df, 'dac_benefit_percentage')

        if not funding_cols or not ghg_cols:
            logger.error("Required columns not found")
            return

        dac_col = dac_cols[0] if dac_cols else None

        # Separate CARB data if available
        carb_df = self.data.get('carb_projects')
        ev_df = self.data.get('ev_projects')

        def per_year(frame, column, scale):
            """Yearly sum of ``column`` divided by ``scale``; row count when column is None."""
            grouped = frame.groupby('funding_year')
            if column is None:
                return grouped.size()
            return grouped[column].sum() / scale

        def stacked_panel(position, column, scale, title, ylabel):
            """Draw one yearly bar panel, stacked by dataset when a split exists."""
            plt.subplot(2, 2, position)
            overall = per_year(df, column, scale)

            if carb_df is None:
                # Simple yearly plot
                overall.plot(kind='bar')
            else:
                carb = per_year(carb_df, column, scale)
                # fill_value=0: a year without CARB (or EV) rows must count
                # as zero for that group instead of turning the bar into NaN.
                layers = [('Non-CARB', overall.sub(carb, fill_value=0))]
                if ev_df is not None:
                    ev = per_year(ev_df, column, scale)
                    layers.append(('CARB (non-EV)', carb.sub(ev, fill_value=0)))
                    layers.append(('CARB (EV Projects)', ev))
                else:
                    layers.append(('CARB', carb))

                years = sorted(overall.index)
                bottom = np.zeros(len(years))
                for label, series in layers:
                    heights = series.reindex(years, fill_value=0).to_numpy(dtype=float, na_value=np.nan)
                    plt.bar(years, heights, label=label, bottom=bottom)
                    bottom = bottom + heights
                plt.legend()

            plt.title(title)
            plt.xlabel('Funding Year')
            plt.ylabel(ylabel)
            plt.xticks(rotation=45)

        fig = plt.figure(figsize=(15, 12))

        stacked_panel(1, funding_cols[0], 1_000_000, 'CCI Funding by Year', 'Funding ($ Millions)')
        stacked_panel(2, ghg_cols[0], 1_000, 'GHG Reductions by Year', 'GHG Reductions (Thousand Tons)')
        stacked_panel(3, None, 1, 'Number of Projects by Year', 'Number of Projects')

        # 4. DAC benefit percentage by year
        plt.subplot(2, 2, 4)
        if dac_col:
            yearly_dac = df.groupby('funding_year')[dac_col].mean()

            if carb_df is not None:
                years = sorted(yearly_dac.index)
                lines = [
                    ('Overall', df, 'k-', 2),
                    ('CARB', carb_df, 'b-', 1.5),
                    ('Non-CARB', self.data['non_carb_projects'], 'r-', 1.5),
                ]
                if ev_df is not None and not ev_df.empty:
                    lines.append(('EV Projects', ev_df, 'g-', 1.5))

                for label, frame, style, line_width in lines:
                    yearly_mean = frame.groupby('funding_year')[dac_col].mean()
                    plt.plot(years, yearly_mean.reindex(years), style, label=label, linewidth=line_width)
                plt.legend()
            else:
                yearly_dac.plot(kind='line', marker='o')

            plt.title('DAC Benefit Percentage by Year')
            plt.xlabel('Funding Year')
            plt.ylabel('Average DAC Benefit (%)')
            plt.grid(True, linestyle='--', alpha=0.7)
            plt.xticks(rotation=45)

        self._finish_figure(fig, "temporal_analysis.png", "Temporal analysis")

    def identify_collaboration_patterns(self):
        """Compare programs run by several agencies with single-agency programs.

        A program counts as multi-agency when more than one distinct
        ``agency_name`` appears under its ``program_name``. Prints the
        comparison and, when funding and GHG columns exist, saves and shows
        a 2x2 figure (``collaboration_analysis.png``).

        Side effect: adds a boolean ``multi_agency_program`` column to the
        ``cci_projects`` DataFrame.
        """
        if 'cci_projects' not in self.data:
            logger.error("No data available for analysis")
            return

        df = self.data['cci_projects']

        candidate_cols = ('program_name', 'sub_program_name', 'agency_name')
        if not any(col in df.columns for col in candidate_cols):
            logger.error("Could not identify columns for collaboration analysis")
            return

        print("\n--- Collaboration Analysis ---")

        try:
            if 'program_name' not in df.columns:
                logger.error("program_name column not found")
                return

            unique_programs = df['program_name'].nunique()
            print(f"Number of unique programs: {unique_programs}")

            if 'agency_name' not in df.columns:
                logger.error("agency_name column not found")
                return

            # Count agencies per program
            program_agencies = (
                df.groupby('program_name')['agency_name'].nunique().sort_values(ascending=False)
            )
            multi_agency_programs = program_agencies[program_agencies > 1]

            print(f"Programs with multiple agencies: {len(multi_agency_programs)} ({self._share(len(multi_agency_programs), unique_programs):.1f}% of programs)")

            if len(multi_agency_programs) > 0:
                print("\nTop multi-agency programs:")
                for program, count in multi_agency_programs.head(5).items():
                    print(f" {program}: {count} agencies")

            # Analyze outcomes for multi-agency vs single-agency programs
            funding_cols = self._match_columns(df, 'total_program', 'funding')
            ghg_cols = self._match_columns(df, 'total_project', 'ghg')
            dac_cols = self._match_columns(df, 'dac_benefit_percentage')

            if not (funding_cols and ghg_cols):
                return

            funding_col, ghg_col = funding_cols[0], ghg_cols[0]
            dac_col = dac_cols[0] if dac_cols else None

            # Rows without a known program map to NaN, which compares False.
            df['multi_agency_program'] = df['program_name'].map(program_agencies).gt(1)

            multi_df = df[df['multi_agency_program']].copy()
            single_df = df[~df['multi_agency_program']].copy()

            # Compare outcomes
            print("\nComparison of Multi-agency vs Single-agency Programs:")
            print(f"Multi-agency projects: {len(multi_df)} ({self._share(len(multi_df), len(df)):.1f}% of total)")
            print(f"Single-agency projects: {len(single_df)} ({self._share(len(single_df), len(df)):.1f}% of total)")

            multi_funding = multi_df[funding_col].sum()
            single_funding = single_df[funding_col].sum()
            total_funding = df[funding_col].sum()

            print(f"\nMulti-agency funding: ${multi_funding:,.2f} ({self._share(multi_funding, total_funding):.1f}% of total)")
            print(f"Single-agency funding: ${single_funding:,.2f} ({self._share(single_funding, total_funding):.1f}% of total)")

            multi_ghg = multi_df[ghg_col].sum()
            single_ghg = single_df[ghg_col].sum()
            total_ghg = df[ghg_col].sum()

            print(f"\nMulti-agency GHG reductions: {multi_ghg:,.2f} tons ({self._share(multi_ghg, total_ghg):.1f}% of total)")
            print(f"Single-agency GHG reductions: {single_ghg:,.2f} tons ({self._share(single_ghg, total_ghg):.1f}% of total)")

            # Calculate efficiency
            multi_efficiency = self._cost_per_ton(multi_funding, multi_ghg)
            single_efficiency = self._cost_per_ton(single_funding, single_ghg)

            print(f"\nMulti-agency efficiency: ${multi_efficiency:.2f} per ton CO2e")
            print(f"Single-agency efficiency: ${single_efficiency:.2f} per ton CO2e")

            # The metric labels and both value lists grow together so every
            # bar sits under the label it belongs to.
            metrics = ['Cost Efficiency ($/ton)']
            multi_values = [multi_efficiency]
            single_values = [single_efficiency]

            # DAC benefits
            if dac_col:
                multi_dac = multi_df[dac_col].mean()
                single_dac = single_df[dac_col].mean()

                print(f"\nMulti-agency DAC benefit: {multi_dac:.2f}%")
                print(f"Single-agency DAC benefit: {single_dac:.2f}%")

                metrics.append('DAC Benefit (%)')
                multi_values.append(multi_dac)
                single_values.append(single_dac)

            # Create visualization
            fig = plt.figure(figsize=(15, 10))

            plt.subplot(2, 2, 1)
            self._draw_pie(
                pd.Series({
                    'Multi-agency Programs': len(multi_df),
                    'Single-agency Programs': len(single_df),
                }),
                'Distribution of Projects',
            )

            plt.subplot(2, 2, 2)
            self._draw_pie(
                pd.Series({
                    'Multi-agency Programs': multi_funding,
                    'Single-agency Programs': single_funding,
                }),
                'Distribution of Funding',
            )

            plt.subplot(2, 2, 3)
            self._draw_pie(
                pd.Series({
                    'Multi-agency Programs': multi_ghg,
                    'Single-agency Programs': single_ghg,
                }),
                'Distribution of GHG Reductions',
            )

            # 4. Efficiency & DAC comparison
            plt.subplot(2, 2, 4)
            x = np.arange(len(metrics))
            width = 0.35

            plt.bar(x - width / 2, multi_values, width, label='Multi-agency')
            plt.bar(x + width / 2, single_values, width, label='Single-agency')

            plt.xlabel('Metric')
            plt.ylabel('Value')
            plt.title('Performance Comparison')
            plt.xticks(x, metrics)
            plt.legend()

            self._finish_figure(fig, "collaboration_analysis.png", "Collaboration analysis")

        except Exception as e:
            logger.error(f"Error in collaboration analysis: {e}")
|
||||
|
||||
# Usage example: load the reduced CCI extract, print the text analysis, then
# produce every figure in turn.
if __name__ == "__main__":
    analyzer = CCIDataAnalyzer(data_path="data/cci_programs_data_reduced.csv")

    if not analyzer.load_data():
        print("Failed to load data. Check file path and format.")
    else:
        print("Data loaded successfully!")
        results = analyzer.analyze_data()

        # Agency comparison, CARB vs non-CARB, temporal trends and
        # collaboration patterns, in that order.
        for run_step in (
            analyzer.plot_agency_comparison,
            analyzer.plot_carb_analysis,
            analyzer.plot_temporal_analysis,
            analyzer.identify_collaboration_patterns,
        ):
            run_step()
|
||||
Reference in New Issue
Block a user