I hope we're getting somewhere

This commit is contained in:
2025-04-09 22:51:07 -07:00
parent 28ad830bef
commit 81ec68b3cc
11 changed files with 148547 additions and 144416 deletions

File diff suppressed because one or more lines are too long

857
CCIDataAnalyzer.py Normal file
View File

@@ -0,0 +1,857 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import logging
import warnings
# Root logging setup: INFO and above, one "<time> - <logger> - <level> - <message>" record per line.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Module-wide logger used by every CCIDataAnalyzer method.
logger = logging.getLogger("cci_analyzer")
# NOTE: this filter silences *all* Python warnings for the process, not only pandas ones.
warnings.filterwarnings(action='ignore')
class CCIDataAnalyzer:
    """Simplified analyzer for California Climate Investments data.

    Typical use: construct with the CSV path, call ``load_data()``, then
    ``analyze_data()``, the ``plot_*`` methods and
    ``identify_collaboration_patterns()``.

    Loaded frames are kept in ``self.data`` under the keys ``'cci_projects'``
    (everything), ``'carb_projects'`` / ``'non_carb_projects'`` (split on the
    agency name) and ``'ev_projects'`` (only set when CARB rows exist).
    """

    # Substrings that mark a normalised column name as numeric.
    _NUMERIC_KEYWORDS = ('cost', 'funding', 'ghg', 'reductions', 'years')
    # Regex alternatives (matched against lower-cased text) that flag EV rebate/voucher projects.
    _EV_INDICATORS = ('electric vehicle', 'ev ', 'rebate', 'voucher', 'clean vehicle')
    # UTF-7 escape sequences for '-' and '"' found in the coordinate text
    # (attributed to a LibreOffice export by the original author).
    _ENCODED_CHARS = {'+AC0-': '-', '+ACI-': '"'}

    def __init__(self, data_path, output_path="./output"):
        """Remember the input CSV path and create the output directory.

        Args:
            data_path: Path to the CCI CSV file.
            output_path: Directory for saved figures; created if missing.
        """
        self.data_path = Path(data_path)
        self.output_path = Path(output_path)
        self.output_path.mkdir(parents=True, exist_ok=True)
        self.data = {}
        logger.info(f"Initialized with data path: {self.data_path}")

    # ------------------------------------------------------------------
    # Small shared helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _first_column(df, *keywords):
        """Return the first column whose lower-cased name contains every keyword, else None."""
        for col in df.columns:
            name = col.lower()
            if all(keyword in name for keyword in keywords):
                return col
        return None

    @staticmethod
    def _pct(part, whole):
        """Return ``part`` as a percentage of ``whole``; 0.0 when ``whole`` is zero."""
        return part / whole * 100 if whole else 0.0

    @staticmethod
    def _cost_per_ton(funding, ghg):
        """Return dollars per ton reduced, or 0 when there are no positive reductions."""
        return funding / ghg if ghg > 0 else 0

    @classmethod
    def _decode_export_text(cls, text):
        """Replace the known UTF-7 escape sequences in ``text``; missing values pass through."""
        if pd.isna(text):
            return text
        cleaned = str(text)
        for code, char in cls._ENCODED_CHARS.items():
            cleaned = cleaned.replace(code, char)
        return cleaned

    @staticmethod
    def _parse_coords(coord_str):
        """Parse a comma-separated coordinate pair into ``(latitude, longitude)``.

        Returns ``(nan, nan)`` for missing or unparseable input.
        """
        missing = (np.nan, np.nan)
        try:
            if pd.isna(coord_str):
                return missing
            first, second = coord_str.split(',')[:2]
            # Quotes decoded from '+ACI-' may wrap the values; float() cannot parse them.
            first = first.strip().strip('"').strip()
            second = second.strip().strip('"').strip()
            # NOTE(review): the first field is treated as longitude and the second as
            # latitude even though the column is called lat_long -- confirm against the data.
            return (float(second), float(first))
        except (AttributeError, TypeError, ValueError):
            return missing

    @staticmethod
    def _plot_yearly_stack(total, carb, ev):
        """Draw one per-year bar panel on the current axes.

        Args:
            total: Per-year values for all projects (indexed by funding year).
            carb: Per-year values for CARB projects, or None for a plain bar chart.
            ev: Per-year values for EV projects, or None for a two-way stack.
        """
        if carb is None:
            total.plot(kind='bar')
            return
        years = sorted(total.index)
        # Align on the full year list *before* subtracting: subtracting Series with
        # different indexes yields NaN for years missing on either side.
        total_vals = total.reindex(years, fill_value=0).to_numpy(dtype=float)
        carb_vals = carb.reindex(years, fill_value=0).to_numpy(dtype=float)
        non_carb_vals = total_vals - carb_vals
        plt.bar(years, non_carb_vals, label='Non-CARB')
        if ev is None:
            plt.bar(years, carb_vals, label='CARB', bottom=non_carb_vals)
        else:
            ev_vals = ev.reindex(years, fill_value=0).to_numpy(dtype=float)
            carb_non_ev_vals = carb_vals - ev_vals
            plt.bar(years, carb_non_ev_vals, label='CARB (non-EV)', bottom=non_carb_vals)
            plt.bar(years, ev_vals, label='CARB (EV Projects)',
                    bottom=non_carb_vals + carb_non_ev_vals)
        plt.legend()

    # ------------------------------------------------------------------
    # Loading and cleaning
    # ------------------------------------------------------------------
    def load_data(self):
        """Load CCI data with special handling for encoding issues.

        Reads the CSV as text, cleans it, stores it under
        ``self.data['cci_projects']`` and builds the CARB subsets.

        Returns:
            True on success, False if anything failed (the error is logged).
        """
        try:
            logger.info(f"Loading data from {self.data_path}")
            # Read as string to avoid conversion errors
            df = pd.read_csv(self.data_path, dtype=str)
            logger.info(f"Successfully loaded {len(df)} rows with {len(df.columns)} columns")
            df = self._clean_data(df)
            self.data['cci_projects'] = df
            self._create_carb_datasets()
            return True
        except Exception as e:
            logger.error(f"Error loading data: {e}")
            return False

    def _clean_data(self, df):
        """Clean and process the CCI data.

        Normalises column names, derives ``latitude``/``longitude``, converts
        numeric and date columns, derives ``funding_year`` and the
        ``ghg_efficiency`` / ``dac_benefit_percentage`` metrics. The frame is
        modified in place and also returned. On an unexpected error the
        partially cleaned frame is returned after logging.
        """
        try:
            # 1. Fix column names
            df.columns = [col.strip().lower().replace(' ', '_') for col in df.columns]

            # 2. Decode and split the coordinate column
            if 'lat_long' in df.columns:
                logger.info("Processing coordinates with special encoding handling")
                df['lat_long'] = df['lat_long'].apply(self._decode_export_text)
                coords = [self._parse_coords(value) for value in df['lat_long']]
                df['latitude'] = [lat for lat, _ in coords]
                df['longitude'] = [lon for _, lon in coords]

            # 3. Convert numeric columns
            for col in df.columns:
                name = col.lower()
                if 'fiscal_year' in name:
                    # e.g. "fiscal_year_funding_project" contains "funding" but holds
                    # text such as "2019-20"; coercing it would wipe the year out.
                    continue
                if any(keyword in name for keyword in self._NUMERIC_KEYWORDS):
                    df[col] = pd.to_numeric(df[col], errors='coerce')

            # 4. Convert date columns
            for col in [c for c in df.columns if 'date' in c.lower()]:
                df[col] = pd.to_datetime(df[col], errors='coerce')

            # 5. Extract funding year (first 4-digit run, so "2019-20" -> 2019)
            year_col = self._first_column(df, 'fiscal_year')
            if year_col is not None:
                try:
                    year_text = df[year_col].astype(str).str.extract(r'(\d{4})', expand=False)
                    df['funding_year'] = pd.to_numeric(year_text, errors='coerce').astype('Int64')
                except Exception as e:
                    logger.error(f"Error extracting funding year: {e}")

            # 6. Calculate derived metrics if columns exist
            funding_col = self._first_column(df, 'total_program', 'funding')
            ghg_col = self._first_column(df, 'total_project', 'ghg')
            dac_col = self._first_column(df, 'funding_benefiting')
            if funding_col and ghg_col:
                df['ghg_efficiency'] = np.where(
                    df[ghg_col] > 0,
                    df[funding_col] / df[ghg_col],
                    np.nan
                )
            if funding_col and dac_col:
                df['dac_benefit_percentage'] = np.where(
                    df[funding_col] > 0,
                    100 * df[dac_col] / df[funding_col],
                    0
                )
            logger.info("Data cleaning and processing complete")
            return df
        except Exception as e:
            logger.error(f"Error cleaning data: {e}")
            return df

    def _create_carb_datasets(self):
        """Create separate datasets for CARB and non-CARB projects.

        Splits ``self.data['cci_projects']`` on whether ``agency_name``
        contains "Air Resources Board" and, when CARB rows exist, flags EV
        rebate/voucher projects by searching the program and sub-program
        names (or every text column if neither exists).
        """
        if 'cci_projects' not in self.data:
            logger.error("No data available to create CARB datasets")
            return
        df = self.data['cci_projects']
        try:
            if 'agency_name' not in df.columns:
                logger.error("agency_name column not found")
                return
            carb_mask = df['agency_name'].str.contains('Air Resources Board', case=False, na=False)
            self.data['carb_projects'] = df[carb_mask].copy()
            self.data['non_carb_projects'] = df[~carb_mask].copy()
            logger.info(f"Created CARB dataset with {len(self.data['carb_projects'])} projects")
            logger.info(f"Created non-CARB dataset with {len(self.data['non_carb_projects'])} projects")

            carb_df = self.data['carb_projects']
            if len(carb_df) == 0:
                return

            # Search both name columns: EV programs are often identified only by the
            # sub-program name, which an if/elif on program_name would never reach.
            search_cols = [c for c in ('program_name', 'sub_program_name') if c in carb_df.columns]
            if not search_cols:
                search_cols = [c for c in carb_df.columns if carb_df[c].dtype == 'object']
            pattern = '|'.join(self._EV_INDICATORS)
            ev_mask = pd.Series(False, index=carb_df.index)
            for col in search_cols:
                ev_mask |= carb_df[col].astype(str).str.lower().str.contains(pattern, na=False)
            self.data['ev_projects'] = carb_df[ev_mask].copy()
            logger.info(f"Identified {len(self.data['ev_projects'])} potential EV rebate/voucher projects")
        except Exception as e:
            logger.error(f"Error creating CARB datasets: {e}")

    # ------------------------------------------------------------------
    # Text analysis
    # ------------------------------------------------------------------
    def analyze_data(self, include_carb_breakdown=True):
        """Basic analysis of CCI data with optional CARB breakdown.

        Prints a summary to stdout.

        Args:
            include_carb_breakdown: Also print CARB / non-CARB / EV comparisons
                when those datasets are available.

        Returns:
            A dict with ``total_projects``, ``total_funding``,
            ``total_ghg_reductions``, ``carb_projects`` and ``ev_projects``
            (``None`` where unavailable), or ``None`` if no data is loaded.
        """
        if 'cci_projects' not in self.data:
            logger.error("No data available for analysis")
            return None
        df = self.data['cci_projects']
        pct = self._pct
        cost_per_ton = self._cost_per_ton

        if 'agency_name' in df.columns:
            agency_counts = df['agency_name'].value_counts()
            print("\nAgencies involved in CCI projects:")
            for agency, count in agency_counts.head(10).items():
                print(f" {agency}: {count} projects")

        funding_col = self._first_column(df, 'total_program', 'funding')
        total_funding = None
        if funding_col:
            total_funding = df[funding_col].sum()
            print(f"\nTotal CCI funding: ${total_funding:,.2f}")
            print(f"Average project funding: ${df[funding_col].mean():,.2f}")

        ghg_col = self._first_column(df, 'total_project', 'ghg')
        total_ghg = None
        if ghg_col:
            total_ghg = df[ghg_col].sum()
            print(f"\nTotal GHG reductions: {total_ghg:,.2f} tons")
            print(f"Average GHG reduction per project: {df[ghg_col].mean():,.2f} tons")

        if 'dac_benefit_percentage' in df.columns:
            avg_dac = df['dac_benefit_percentage'].mean()
            print(f"\nAverage DAC benefit percentage: {avg_dac:.2f}%")

        if include_carb_breakdown and 'carb_projects' in self.data and 'non_carb_projects' in self.data:
            carb_df = self.data['carb_projects']
            non_carb_df = self.data['non_carb_projects']
            print("\n--- CARB vs. Non-CARB Analysis ---")
            # pct() guards every ratio so an empty dataset prints 0.0% instead of raising.
            print(f"CARB projects: {len(carb_df)} ({pct(len(carb_df), len(df)):.1f}% of total)")
            print(f"Non-CARB projects: {len(non_carb_df)} ({pct(len(non_carb_df), len(df)):.1f}% of total)")
            if funding_col:
                carb_funding = carb_df[funding_col].sum()
                non_carb_funding = non_carb_df[funding_col].sum()
                print(f"\nCARB funding: ${carb_funding:,.2f} ({pct(carb_funding, total_funding):.1f}% of total)")
                print(f"Non-CARB funding: ${non_carb_funding:,.2f} ({pct(non_carb_funding, total_funding):.1f}% of total)")
                print(f"Average CARB project: ${carb_df[funding_col].mean():,.2f}")
                print(f"Average non-CARB project: ${non_carb_df[funding_col].mean():,.2f}")
            if ghg_col:
                carb_ghg = carb_df[ghg_col].sum()
                non_carb_ghg = non_carb_df[ghg_col].sum()
                print(f"\nCARB GHG reductions: {carb_ghg:,.2f} tons ({pct(carb_ghg, total_ghg):.1f}% of total)")
                print(f"Non-CARB GHG reductions: {non_carb_ghg:,.2f} tons ({pct(non_carb_ghg, total_ghg):.1f}% of total)")
                if funding_col:
                    carb_efficiency = cost_per_ton(carb_funding, carb_ghg)
                    non_carb_efficiency = cost_per_ton(non_carb_funding, non_carb_ghg)
                    print(f"\nCARB efficiency: ${carb_efficiency:.2f} per ton CO2e")
                    print(f"Non-CARB efficiency: ${non_carb_efficiency:.2f} per ton CO2e")

            if 'ev_projects' in self.data:
                ev_df = self.data['ev_projects']
                print("\n--- Electric Vehicle Projects Analysis ---")
                print(f"EV projects: {len(ev_df)} ({pct(len(ev_df), len(carb_df)):.1f}% of CARB projects)")
                if funding_col:
                    ev_funding = ev_df[funding_col].sum()
                    print(f"EV funding: ${ev_funding:,.2f} ({pct(ev_funding, carb_funding):.1f}% of CARB funding)")
                    print(f"Average EV project: ${ev_df[funding_col].mean():,.2f}")
                if ghg_col:
                    ev_ghg = ev_df[ghg_col].sum()
                    print(f"EV GHG reductions: {ev_ghg:,.2f} tons ({pct(ev_ghg, carb_ghg):.1f}% of CARB reductions)")
                    if funding_col:
                        ev_efficiency = cost_per_ton(ev_funding, ev_ghg)
                        print(f"EV efficiency: ${ev_efficiency:.2f} per ton CO2e")

        return {
            "total_projects": len(df),
            "total_funding": total_funding if funding_col else None,
            "total_ghg_reductions": total_ghg if ghg_col else None,
            "carb_projects": len(self.data['carb_projects']) if 'carb_projects' in self.data else None,
            "ev_projects": len(self.data['ev_projects']) if 'ev_projects' in self.data else None
        }

    # ------------------------------------------------------------------
    # Visualizations
    # ------------------------------------------------------------------
    def plot_agency_comparison(self):
        """Create visualizations comparing agencies.

        Draws a 2x2 figure (project counts, funding, GHG reductions and cost
        per ton for the top 10 agencies), saves it as
        ``agency_comparison.png`` in the output directory and shows it.
        """
        if 'cci_projects' not in self.data:
            logger.error("No data available for visualization")
            return
        df = self.data['cci_projects']
        if 'agency_name' not in df.columns:
            logger.error("agency_name column not found")
            return
        funding_col = self._first_column(df, 'total_program', 'funding')
        if funding_col is None:
            logger.error("Funding column not found")
            return
        ghg_col = self._first_column(df, 'total_project', 'ghg')
        if ghg_col is None:
            logger.error("GHG reduction column not found")
            return

        # One aggregation feeds panels 2-4.
        totals = df.groupby('agency_name')[[funding_col, ghg_col]].sum()

        fig = plt.figure(figsize=(15, 12))

        # 1. Project count by agency
        plt.subplot(2, 2, 1)
        df['agency_name'].value_counts().head(10).plot(kind='barh')
        plt.title('Number of Projects by Agency (Top 10)')
        plt.xlabel('Number of Projects')

        # 2. Funding by agency
        plt.subplot(2, 2, 2)
        agency_funding = totals[funding_col].sort_values(ascending=False).head(10) / 1_000_000
        agency_funding.plot(kind='barh')
        plt.title('Total Funding by Agency ($ Millions)')
        plt.xlabel('Funding ($ Millions)')

        # 3. GHG reductions by agency
        plt.subplot(2, 2, 3)
        agency_ghg = totals[ghg_col].sort_values(ascending=False).head(10) / 1_000
        agency_ghg.plot(kind='barh')
        plt.title('GHG Reductions by Agency (Thousand Tons)')
        plt.xlabel('GHG Reductions (Thousand Tons)')

        # 4. Efficiency by agency ($/ton); agencies without positive reductions are left out
        plt.subplot(2, 2, 4)
        with_reductions = totals[totals[ghg_col] > 0]
        agency_efficiency = (
            with_reductions[funding_col] / with_reductions[ghg_col]
        ).dropna().sort_values().head(10)
        agency_efficiency.plot(kind='barh')
        plt.title('Cost Efficiency by Agency ($ per Ton CO2e)')
        plt.xlabel('Cost per Ton GHG Reduced ($)')

        plt.tight_layout()
        output_file = self.output_path / "agency_comparison.png"
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        logger.info(f"Agency comparison visualization saved to {output_file}")
        plt.show()
        plt.close(fig)  # release the figure once it has been saved and shown

    def plot_carb_analysis(self):
        """Create visualizations specifically for CARB vs non-CARB analysis.

        Draws pies for project, funding and GHG shares plus a cost-per-ton bar
        chart across CARB (non-EV), CARB (EV) and non-CARB projects, saves the
        figure as ``carb_analysis.png`` and shows it.
        """
        if 'carb_projects' not in self.data or 'non_carb_projects' not in self.data:
            logger.error("CARB datasets not available")
            return
        carb_df = self.data['carb_projects']
        non_carb_df = self.data['non_carb_projects']
        ev_df = self.data.get('ev_projects', pd.DataFrame())

        funding_col = (self._first_column(non_carb_df, 'total_program', 'funding')
                       or self._first_column(carb_df, 'total_program', 'funding'))
        ghg_col = (self._first_column(non_carb_df, 'total_project', 'ghg')
                   or self._first_column(carb_df, 'total_project', 'ghg'))
        if not funding_col or not ghg_col:
            logger.error("Required columns not found")
            return

        has_ev = not ev_df.empty
        carb_non_ev_df = carb_df[~carb_df.index.isin(ev_df.index)] if has_ev else carb_df

        def three_way(column):
            """Sum ``column`` for CARB (non-EV), EV and non-CARB projects."""
            return (
                carb_non_ev_df[column].sum(),
                ev_df[column].sum() if has_ev else 0,
                non_carb_df[column].sum(),
            )

        labels = ['CARB (non-EV)', 'CARB (EV Projects)', 'Non-CARB']
        funding_parts = three_way(funding_col)
        ghg_parts = three_way(ghg_col)
        count_parts = (len(carb_df) - len(ev_df), len(ev_df), len(non_carb_df))

        fig = plt.figure(figsize=(15, 12))

        pies = [
            (count_parts, 'Distribution of Projects'),
            (funding_parts, 'Distribution of Funding'),
            (ghg_parts, 'Distribution of GHG Reductions'),
        ]
        for position, (parts, title) in enumerate(pies, start=1):
            plt.subplot(2, 2, position)
            pd.Series(dict(zip(labels, parts))).plot(kind='pie', autopct='%1.1f%%', startangle=90)
            plt.title(title)
            plt.ylabel('')  # Hide ylabel

        # 4. Efficiency comparison ($/ton)
        plt.subplot(2, 2, 4)
        efficiency_comparison = pd.Series({
            label: self._cost_per_ton(funding, ghg)
            for label, funding, ghg in zip(labels, funding_parts, ghg_parts)
        })
        efficiency_comparison.plot(kind='bar')
        plt.title('Cost Efficiency ($ per Ton CO2e)')
        plt.ylabel('Cost per Ton GHG Reduced ($)')
        plt.xticks(rotation=45)

        plt.tight_layout()
        output_file = self.output_path / "carb_analysis.png"
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        logger.info(f"CARB analysis visualization saved to {output_file}")
        plt.show()
        plt.close(fig)  # release the figure once it has been saved and shown

    def plot_temporal_analysis(self):
        """Create visualizations showing trends over time.

        Draws per-funding-year funding, GHG reductions and project counts
        (stacked by CARB / EV when those datasets exist) and the average DAC
        benefit percentage, saves the figure as ``temporal_analysis.png`` and
        shows it.
        """
        if 'cci_projects' not in self.data:
            logger.error("No data available for visualization")
            return
        df = self.data['cci_projects']
        if 'funding_year' not in df.columns:
            logger.error("funding_year column not found")
            return
        funding_col = self._first_column(df, 'total_program', 'funding')
        ghg_col = self._first_column(df, 'total_project', 'ghg')
        dac_col = self._first_column(df, 'dac_benefit_percentage')
        if not funding_col or not ghg_col:
            logger.error("Required columns not found")
            return

        carb_df = self.data.get('carb_projects', None)
        ev_df = self.data.get('ev_projects', None)
        non_carb_df = self.data.get('non_carb_projects', None)

        def yearly(frame, column, scale):
            """Per-year sum of ``column`` (or row count when column is None), or None."""
            if frame is None:
                return None
            grouped = frame.groupby('funding_year')
            if column is None:
                return grouped.size()
            return grouped[column].sum() / scale

        fig = plt.figure(figsize=(15, 12))

        # Panels 1-3 share one layout; only the aggregated column and labels differ.
        panels = [
            (funding_col, 1_000_000, 'CCI Funding by Year', 'Funding ($ Millions)'),
            (ghg_col, 1_000, 'GHG Reductions by Year', 'GHG Reductions (Thousand Tons)'),
            (None, 1, 'Number of Projects by Year', 'Number of Projects'),
        ]
        for position, (column, scale, title, ylabel) in enumerate(panels, start=1):
            plt.subplot(2, 2, position)
            self._plot_yearly_stack(
                yearly(df, column, scale),
                yearly(carb_df, column, scale),
                yearly(ev_df, column, scale),
            )
            plt.title(title)
            plt.xlabel('Funding Year')
            plt.ylabel(ylabel)
            plt.xticks(rotation=45)

        # 4. DAC benefit percentage by year
        plt.subplot(2, 2, 4)
        if dac_col:
            yearly_dac = df.groupby('funding_year')[dac_col].mean()
            if carb_df is not None:
                years = sorted(yearly_dac.index)

                def line(frame, style, label, linewidth):
                    """Plot the yearly mean DAC benefit of ``frame`` over ``years``."""
                    means = frame.groupby('funding_year')[dac_col].mean()
                    plt.plot(years, means.reindex(years).to_numpy(dtype=float),
                             style, label=label, linewidth=linewidth)

                line(df, 'k-', 'Overall', 2)
                line(carb_df, 'b-', 'CARB', 1.5)
                if non_carb_df is not None:
                    line(non_carb_df, 'r-', 'Non-CARB', 1.5)
                if ev_df is not None and not ev_df.empty:
                    line(ev_df, 'g-', 'EV Projects', 1.5)
                plt.legend()
            else:
                yearly_dac.plot(kind='line', marker='o')
            plt.title('DAC Benefit Percentage by Year')
            plt.xlabel('Funding Year')
            plt.ylabel('Average DAC Benefit (%)')
            plt.grid(True, linestyle='--', alpha=0.7)
            plt.xticks(rotation=45)

        plt.tight_layout()
        output_file = self.output_path / "temporal_analysis.png"
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        logger.info(f"Temporal analysis visualization saved to {output_file}")
        plt.show()
        plt.close(fig)  # release the figure once it has been saved and shown

    def identify_collaboration_patterns(self):
        """
        Analyze collaboration patterns in CCI projects to address the research question.
        This examines how inter-agency collaboration affects outcomes.

        A program counts as "multi-agency" when more than one distinct
        ``agency_name`` appears under its ``program_name``. Prints the
        comparison, adds a boolean ``multi_agency_program`` column to the
        stored ``cci_projects`` frame, and saves/shows
        ``collaboration_analysis.png``.
        """
        if 'cci_projects' not in self.data:
            logger.error("No data available for analysis")
            return
        df = self.data['cci_projects']
        collab_indicators = [
            col for col in ('program_name', 'sub_program_name', 'agency_name')
            if col in df.columns
        ]
        if not collab_indicators:
            logger.error("Could not identify columns for collaboration analysis")
            return
        print("\n--- Collaboration Analysis ---")
        try:
            if 'program_name' not in df.columns:
                return
            if 'agency_name' not in df.columns:
                # Agencies per program cannot be counted without this column.
                logger.error("agency_name column not found")
                return
            pct = self._pct

            unique_programs = df['program_name'].nunique()
            print(f"Number of unique programs: {unique_programs}")
            program_agencies = df.groupby('program_name')['agency_name'].nunique().sort_values(ascending=False)
            multi_agency_programs = program_agencies[program_agencies > 1]
            print(f"Programs with multiple agencies: {len(multi_agency_programs)} ({pct(len(multi_agency_programs), unique_programs):.1f}% of programs)")
            if len(multi_agency_programs) > 0:
                print("\nTop multi-agency programs:")
                for program, count in multi_agency_programs.head(5).items():
                    print(f" {program}: {count} agencies")

            funding_col = self._first_column(df, 'total_program', 'funding')
            ghg_col = self._first_column(df, 'total_project', 'ghg')
            dac_col = self._first_column(df, 'dac_benefit_percentage')
            if not (funding_col and ghg_col):
                return

            # Rows whose program is unknown to the lookup map to NaN and compare False.
            df['multi_agency_program'] = df['program_name'].map(program_agencies).gt(1)
            multi_df = df[df['multi_agency_program']].copy()
            single_df = df[~df['multi_agency_program']].copy()

            print("\nComparison of Multi-agency vs Single-agency Programs:")
            print(f"Multi-agency projects: {len(multi_df)} ({pct(len(multi_df), len(df)):.1f}% of total)")
            print(f"Single-agency projects: {len(single_df)} ({pct(len(single_df), len(df)):.1f}% of total)")
            multi_funding = multi_df[funding_col].sum()
            single_funding = single_df[funding_col].sum()
            total_funding = df[funding_col].sum()
            print(f"\nMulti-agency funding: ${multi_funding:,.2f} ({pct(multi_funding, total_funding):.1f}% of total)")
            print(f"Single-agency funding: ${single_funding:,.2f} ({pct(single_funding, total_funding):.1f}% of total)")
            multi_ghg = multi_df[ghg_col].sum()
            single_ghg = single_df[ghg_col].sum()
            total_ghg = df[ghg_col].sum()
            print(f"\nMulti-agency GHG reductions: {multi_ghg:,.2f} tons ({pct(multi_ghg, total_ghg):.1f}% of total)")
            print(f"Single-agency GHG reductions: {single_ghg:,.2f} tons ({pct(single_ghg, total_ghg):.1f}% of total)")
            multi_efficiency = self._cost_per_ton(multi_funding, multi_ghg)
            single_efficiency = self._cost_per_ton(single_funding, single_ghg)
            print(f"\nMulti-agency efficiency: ${multi_efficiency:.2f} per ton CO2e")
            print(f"Single-agency efficiency: ${single_efficiency:.2f} per ton CO2e")

            # Build labels and values together so the bar chart shapes always match.
            metrics = ['Cost Efficiency ($/ton)']
            multi_values = [multi_efficiency]
            single_values = [single_efficiency]
            if dac_col:
                multi_dac = multi_df[dac_col].mean()
                single_dac = single_df[dac_col].mean()
                print(f"\nMulti-agency DAC benefit: {multi_dac:.2f}%")
                print(f"Single-agency DAC benefit: {single_dac:.2f}%")
                metrics.append('DAC Benefit (%)')
                multi_values.append(multi_dac)
                single_values.append(single_dac)

            fig = plt.figure(figsize=(15, 10))
            pies = [
                ('Distribution of Projects', len(multi_df), len(single_df)),
                ('Distribution of Funding', multi_funding, single_funding),
                ('Distribution of GHG Reductions', multi_ghg, single_ghg),
            ]
            for position, (title, multi_part, single_part) in enumerate(pies, start=1):
                plt.subplot(2, 2, position)
                pd.Series({
                    'Multi-agency Programs': multi_part,
                    'Single-agency Programs': single_part
                }).plot(kind='pie', autopct='%1.1f%%', startangle=90)
                plt.title(title)
                plt.ylabel('')

            # 4. Efficiency & DAC comparison
            plt.subplot(2, 2, 4)
            x = np.arange(len(metrics))
            width = 0.35
            plt.bar(x - width/2, multi_values, width, label='Multi-agency')
            plt.bar(x + width/2, single_values, width, label='Single-agency')
            plt.xlabel('Metric')
            plt.ylabel('Value')
            plt.title('Performance Comparison')
            plt.xticks(x, metrics)
            plt.legend()

            plt.tight_layout()
            output_file = self.output_path / "collaboration_analysis.png"
            plt.savefig(output_file, dpi=300, bbox_inches='tight')
            logger.info(f"Collaboration analysis visualization saved to {output_file}")
            plt.show()
            plt.close(fig)  # release the figure once it has been saved and shown
        except Exception as e:
            logger.error(f"Error in collaboration analysis: {e}")
# Usage example: load the reduced CCI export, print the summary, then render every figure.
if __name__ == "__main__":
    analyzer = CCIDataAnalyzer(data_path="data/cci_programs_data_reduced.csv")
    if not analyzer.load_data():
        print("Failed to load data. Check file path and format.")
    else:
        print("Data loaded successfully!")
        results = analyzer.analyze_data()
        # Agency comparison figure
        analyzer.plot_agency_comparison()
        # CARB vs non-CARB figure
        analyzer.plot_carb_analysis()
        # Trends over funding years
        analyzer.plot_temporal_analysis()
        # Multi-agency vs single-agency programs
        analyzer.identify_collaboration_patterns()

File diff suppressed because it is too large Load Diff

View File

File diff suppressed because it is too large Load Diff

View File

@@ -1,7 +0,0 @@
Project IDNumber,Reporting Cycle Name,Agency Name,Program Name,Sub Program Name,Record Type,Census Tract,Lat Long,"Senate
District","Assembly
District",County,Total Project Cost,Total Program GGRFFunding,Project Life Years,Total Project GHGReductions,Annual Project GHGReductions,Project Count,Fiscal Year Funding Project,Is Benefit Disadvantaged Communities,Disadvantaged Community Criteria,Total GGRFDisadvantaged Community Funding,Funding Benefiting Disadvantaged Communities,Estimated Num Vehicles In Service,Funding Within Disadvantage Communities,VMTReductions,Number Of Housing Units,Number Of Affordable Housing Units,Estimated Number Of Trees To Be Planted,Energy Cost Savings,Estimated Energy Saved KWH,Estimated Energy Saved Therms,Estimated Water Saved Gallons,Estimated Energy Generated KWH,Estimated Fuel Use Reduction Gal,Vouchers Benefiting Disadvantaged Communities,Number Of Rebates Issued,Rebates Within Disadvantaged Communities,Date Operational,Project Completion Date,Is AB1550Buffer Region,Is Benefit DAC1550Communities,Is Low Income Communities
G14-LCTI-01,2015,California Air Resources Board,Low Carbon Transportation,Clean Cars 4 All,IMPLEMENT,6019001201,,14,31,Fresno,0,5000,3,14,0,1,,True,1-1A,5000,0,0,5000,0,0,0,0,0,0,0,0,0,0,0,1,0,,05/01/2018,False,False,False
G14-LCTI-01,2015,California Air Resources Board,Low Carbon Transportation,Clean Cars 4 All,IMPLEMENT,6019001410,,14,31,Fresno,0,5000,3,12,0,1,,True,1-1A,5000,0,0,5000,0,0,0,0,0,0,0,0,0,0,0,1,0,,05/01/2018,False,False,False
G14-LCTI-01,2015,California Air Resources Board,Low Carbon Transportation,Clean Cars 4 All,IMPLEMENT,6019001414,,14,31,Fresno,0,5000,3,15,0,2,,True,1-1A,5000,0,0,5000,0,0,0,0,0,0,0,0,0,0,0,2,0,,05/01/2018,False,False,False
G14-LCTI-01,2015,California Air Resources Board,Low Carbon Transportation,Clean Cars 4 All,IMPLEMENT,6019001800,,14,31,Fresno,0,2500,3,8,0,1,,True,1-1A,2500,0,0,2500,0,0,0,0,0,0,0,0,0,0,0,1,0,,05/01/2018,False,False,False
1 Project IDNumber Reporting Cycle Name Agency Name Program Name Sub Program Name Record Type Census Tract Lat Long Senate District Assembly District County Total Project Cost Total Program GGRFFunding Project Life Years Total Project GHGReductions Annual Project GHGReductions Project Count Fiscal Year Funding Project Is Benefit Disadvantaged Communities Disadvantaged Community Criteria Total GGRFDisadvantaged Community Funding Funding Benefiting Disadvantaged Communities Estimated Num Vehicles In Service Funding Within Disadvantage Communities VMTReductions Number Of Housing Units Number Of Affordable Housing Units Estimated Number Of Trees To Be Planted Energy Cost Savings Estimated Energy Saved KWH Estimated Energy Saved Therms Estimated Water Saved Gallons Estimated Energy Generated KWH Estimated Fuel Use Reduction Gal Vouchers Benefiting Disadvantaged Communities Number Of Rebates Issued Rebates Within Disadvantaged Communities Date Operational Project Completion Date Is AB1550Buffer Region Is Benefit DAC1550Communities Is Low Income Communities
2 G14-LCTI-01 2015 California Air Resources Board Low Carbon Transportation Clean Cars 4 All IMPLEMENT 6019001201 14 31 Fresno 0 5000 3 14 0 1 True 1-1A 5000 0 0 5000 0 0 0 0 0 0 0 0 0 0 0 1 0 05/01/2018 False False False
3 G14-LCTI-01 2015 California Air Resources Board Low Carbon Transportation Clean Cars 4 All IMPLEMENT 6019001410 14 31 Fresno 0 5000 3 12 0 1 True 1-1A 5000 0 0 5000 0 0 0 0 0 0 0 0 0 0 0 1 0 05/01/2018 False False False
4 G14-LCTI-01 2015 California Air Resources Board Low Carbon Transportation Clean Cars 4 All IMPLEMENT 6019001414 14 31 Fresno 0 5000 3 15 0 2 True 1-1A 5000 0 0 5000 0 0 0 0 0 0 0 0 0 0 0 2 0 05/01/2018 False False False
5 G14-LCTI-01 2015 California Air Resources Board Low Carbon Transportation Clean Cars 4 All IMPLEMENT 6019001800 14 31 Fresno 0 2500 3 8 0 1 True 1-1A 2500 0 0 2500 0 0 0 0 0 0 0 0 0 0 0 1 0 05/01/2018 False False False

Binary file not shown.

Before

Width:  |  Height:  |  Size: 761 KiB

After

Width:  |  Height:  |  Size: 751 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 363 KiB

After

Width:  |  Height:  |  Size: 371 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 328 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 275 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 555 KiB

After

Width:  |  Height:  |  Size: 329 KiB