This commit is contained in:
2025-04-10 00:03:30 -07:00
parent 81ec68b3cc
commit 03ae352949
12 changed files with 150373 additions and 0 deletions

Binary file not shown.

Binary file not shown.

View File

@@ -0,0 +1,94 @@
"""
Cleaned version of the CCICollaborationAnalyzer script.
This script is structured and corrected for proper exception handling and visualization generation.
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import logging
import warnings
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from scipy import stats
from cci_analyzer import CCIDataAnalyzer
# Configure module-level logging for the collaboration analysis.
logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
logger = logging.getLogger("cci_collaboration_analysis")
# Suppress pandas warnings.
# NOTE(review): this filter is process-wide and silences ALL warnings, not
# just pandas' — confirm that hiding e.g. deprecation warnings is intended.
warnings.filterwarnings("ignore")
class CCICollaborationAnalyzer:
    """Analyze inter-agency collaboration patterns in CCI project data.

    Wraps a ``CCIDataAnalyzer`` for data loading and adds collaboration-
    specific containers (temporal, regional, EV-voucher analyses) plus
    visualization hooks.
    """

    def __init__(self, data_path, output_path="./output/collaboration"):
        """Load CCI data via the base analyzer and prepare analysis state.

        Parameters:
            data_path (str): Path to the CCI data file.
            output_path (str): Directory where outputs are written
                (created if missing).
        """
        self.data_path = Path(data_path)
        self.output_path = Path(output_path)
        self.output_path.mkdir(parents=True, exist_ok=True)
        # Initialize every attribute up front so that a failed load leaves a
        # fully-formed (if empty) object. The original returned early before
        # these assignments, so later method calls raised AttributeError.
        self.data = {}
        self.collaboration_metrics = {}
        self.temporal_analysis = {}
        self.regional_analysis = {}
        self.ev_vouchers_analysis = {}
        self.base_analyzer = CCIDataAnalyzer(data_path, output_path=str(self.output_path))
        if not self.base_analyzer.load_data():
            logger.error("Failed to load data through base analyzer")
            return
        self.data = self.base_analyzer.data
        self._separate_ev_vouchers()

    def _separate_ev_vouchers(self):
        """Split CARB Low Carbon Transportation EV vouchers from other projects.

        Populates ``self.data['ev_vouchers']`` and ``self.data['non_ev_projects']``.
        """
        if 'cci_projects' not in self.data:
            logger.error("No project data available to separate EV vouchers")
            return
        df = self.data['cci_projects']
        try:
            # EV vouchers are CARB's Low Carbon Transportation sub-programs
            # for consumer vehicle incentives (Clean Cars 4 All / CVRP /
            # Financing Assistance).
            ev_mask = ((df['agency_name'].str.contains('Air Resources Board', case=False, na=False)) &
                       (df['program_name'].str.contains('Low Carbon Transportation', case=False, na=False)) &
                       (df['sub_program_name'].str.contains('Clean Cars 4 All|CVRP|Financing Assistance',
                                                            case=False, na=False)))
            self.data['ev_vouchers'] = df[ev_mask].copy()
            self.data['non_ev_projects'] = df[~ev_mask].copy()
            logger.info(f"Separated {len(self.data['ev_vouchers'])} EV vouchers from {len(self.data['non_ev_projects'])} other projects")
        except Exception as e:
            # Missing columns or non-string dtypes end up here; keep going so
            # the rest of the analysis can still run on the raw data.
            logger.error(f"Error separating EV vouchers: {e}")

    def _generate_visualizations(self):
        """Generate visualizations of key findings."""
        logger.info("Generating visualizations")
        try:
            self._plot_collaboration_impact()
            self._plot_temporal_trends()
            self._plot_regional_analysis()
            self._plot_ev_vouchers_analysis()
            self._plot_efficiency_equity_tradeoff()
            logger.info(f"All visualizations completed and saved to {self.output_path}")
        except Exception as e:
            logger.error(f"Error generating visualizations: {e}")

    # Stubs for the required plotting methods.
    def _plot_collaboration_impact(self):
        logger.info("Plotting collaboration impact...")
        # Implementation goes here

    def _plot_temporal_trends(self):
        logger.info("Plotting temporal trends...")
        # Implementation goes here

    def _plot_regional_analysis(self):
        logger.info("Plotting regional analysis...")
        # Implementation goes here

    def _plot_ev_vouchers_analysis(self):
        logger.info("Plotting EV vouchers analysis...")
        # Implementation goes here

    def _plot_efficiency_equity_tradeoff(self):
        logger.info("Plotting efficiency-equity tradeoff...")
        # Implementation goes here

View File

@@ -0,0 +1,125 @@
import pandas as pd
import numpy as np
from pathlib import Path
import logging
# Configure logging
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger("collaboration_detection")


def investigate_collaboration(input_path, output_path=None):
    """
    Investigate potential collaboration patterns in the CCI data
    that might not be captured by the current approach.

    Parameters:
        input_path (str): Path to the CCI data CSV file
        output_path (str, optional): Path to save findings

    Returns:
        int: Number of programs associated with more than one agency.
    """
    logger.info(f"Loading data from {input_path}")
    # Load the data
    df = pd.read_csv(input_path, low_memory=False)
    logger.info(f"Successfully loaded {len(df)} rows with {len(df.columns)} columns")

    # 1. Look at unique agency_name values. The original used an
    # unparenthesized conditional expression here; '+' binds tighter than
    # 'if/else', so the short-list branch silently dropped the "Agencies: "
    # prefix. An explicit if/else avoids the precedence trap.
    agencies = df['agency_name'].unique()
    logger.info(f"Found {len(agencies)} unique agencies")
    if len(agencies) > 10:
        logger.info("Agencies: " + ", ".join(sorted(agencies)[:10]) + "...")
    else:
        logger.info("Agencies: " + ", ".join(sorted(agencies)))

    # 2. Look at how agency_name is associated with program_name.
    program_agency_counts = df.groupby('program_name')['agency_name'].nunique()
    # Look for programs with multiple agencies
    multi_agency_programs = program_agency_counts[program_agency_counts > 1]
    logger.info(f"Found {len(multi_agency_programs)} programs with multiple agencies")
    if len(multi_agency_programs) > 0:
        logger.info("Multi-agency programs:")
        # Use a distinct name for the per-program agency list: the original
        # reused 'agencies' here, clobbering the overall list that is written
        # to the report below.
        for program, count in multi_agency_programs.items():
            program_agency_list = df[df['program_name'] == program]['agency_name'].unique()
            logger.info(f"  {program}: {count} agencies ({', '.join(program_agency_list)})")

    # 3. Look for other potential indicators of collaboration
    # Check if there are other columns that might indicate collaboration
    potential_collab_indicators = [
        'agency_name', 'program_name', 'sub_program_name',
        # Add other potential columns here
    ]
    # Look for terms that might indicate collaboration
    collab_terms = ['collab', 'partner', 'joint', 'multi', 'together', 'coop']
    # Search for collaboration terms across relevant columns
    for col in potential_collab_indicators:
        if col in df.columns and df[col].dtype == 'object':
            # Search for collaboration terms in the column
            matches = []
            for term in collab_terms:
                term_matches = df[df[col].str.contains(term, case=False, na=False)]
                if len(term_matches) > 0:
                    matches.append((term, len(term_matches)))
            if matches:
                logger.info(f"Found potential collaboration indicators in column '{col}':")
                for term, count in matches:
                    logger.info(f"  Term '{term}': {count} matches")

    # 4. Look for potential co-funding patterns
    # Sometimes collaboration is indicated by multiple funding sources
    funding_cols = [col for col in df.columns if 'funding' in col.lower()]
    logger.info(f"Found {len(funding_cols)} funding-related columns: {', '.join(funding_cols)}")

    # 5. Generate a set of recommendations for identifying collaboration
    recommendations = [
        "1. Consider using a different approach to identify multi-agency programs:",
        "   - Look at sub_program_name for indicators of collaboration",
        "   - Check if there are text fields with partnership information",
        "   - Consider if programs can have multiple sub-agencies that aren't captured in agency_name",
        "2. Explore if collaboration occurs at higher levels (program level) rather than project level",
        "3. Check if some agencies have been merged or renamed in the data",
        "4. For temporal analysis, check if collaboration patterns changed over time",
        "5. Consider if collaboration is defined by regions/jurisdictions rather than state agencies"
    ]

    if output_path:
        with open(output_path, 'w') as f:
            f.write("CALIFORNIA CLIMATE INVESTMENTS (CCI) COLLABORATION DETECTION ANALYSIS\n")
            f.write("==================================================================\n\n")
            f.write("AGENCY ANALYSIS\n")
            f.write(f"Found {len(agencies)} unique agencies\n")
            # Same precedence fix as above: both branches keep the prefix and
            # the trailing blank line.
            if len(agencies) > 20:
                f.write("Agencies: " + ", ".join(sorted(agencies)[:20]) + "...\n\n")
            else:
                f.write("Agencies: " + ", ".join(sorted(agencies)) + "\n\n")
            f.write("MULTI-AGENCY PROGRAM ANALYSIS\n")
            f.write(f"Found {len(multi_agency_programs)} programs with multiple agencies\n\n")
            if len(multi_agency_programs) > 0:
                f.write("Multi-agency programs:\n")
                for program, count in multi_agency_programs.items():
                    program_agency_list = df[df['program_name'] == program]['agency_name'].unique()
                    f.write(f"  {program}: {count} agencies ({', '.join(program_agency_list)})\n")
            else:
                f.write("No multi-agency programs found using current detection method\n")
            f.write("\nRECOMMENDATIONS\n")
            for rec in recommendations:
                f.write(f"{rec}\n")
        logger.info(f"Saved collaboration detection analysis to {output_path}")

    # Return the number of multi-agency programs
    return len(multi_agency_programs)
if __name__ == "__main__":
    import argparse

    # Command-line entry point: point at the CSV export and optionally give
    # a destination for the written findings.
    cli = argparse.ArgumentParser(description='Investigate collaboration patterns in CCI data')
    cli.add_argument('--input_path', type=str, required=True, help='Path to the CCI data CSV file')
    cli.add_argument('--output_path', type=str, help='Path to save findings')
    options = cli.parse_args()
    investigate_collaboration(options.input_path, options.output_path)

540
data_cleaning_script.py Normal file
View File

@@ -0,0 +1,540 @@
import pandas as pd
import numpy as np
from pathlib import Path
import logging
# Configure logging
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger("cci_data_prep")


def clean_and_prepare_cci_data(input_path, output_path=None):
    """
    Clean and prepare the CCI data for analysis, fixing specific issues identified.

    Parameters:
        input_path (str): Path to the original CCI data file
        output_path (str, optional): Path to save the cleaned data

    Returns:
        pd.DataFrame: The cleaned and prepared data
    """
    logger.info(f"Loading data from {input_path}")
    # Try different encodings if needed
    try:
        df = pd.read_csv(input_path)
    except UnicodeDecodeError:
        logger.info("Trying different encoding (latin-1)")
        df = pd.read_csv(input_path, encoding='latin-1')
    logger.info(f"Successfully loaded {len(df)} rows with {len(df.columns)} columns")

    # 1. Fix column names - standardize to lowercase with underscores
    df.columns = [col.strip().lower().replace(' ', '_') for col in df.columns]

    # 2. Identify and mark EV vouchers/rebates
    logger.info("Identifying EV vouchers and rebates")
    # Check if required columns exist
    required_cols = ['agency_name', 'program_name']
    if not all(col in df.columns for col in required_cols):
        missing = [col for col in required_cols if col not in df.columns]
        logger.error(f"Missing required columns: {missing}")
        return df

    # Identify CARB's Low Carbon Transportation projects — the umbrella that
    # holds the consumer EV incentive programs.
    carb_mask = df['agency_name'].str.contains('Air Resources Board', case=False, na=False)
    lct_mask = df['program_name'].str.contains('Low Carbon Transportation', case=False, na=False)
    # Create CARB indicator
    df['is_carb'] = carb_mask

    # 3. Mark EV projects using multiple methods: sub-program keywords,
    # rebate counts, and voucher-sized funding amounts.
    ev_mask = pd.Series(False, index=df.index)
    if 'sub_program_name' in df.columns:
        ev_indicators = ['Clean Cars 4 All', 'CVRP', 'Clean Vehicle', 'EV', 'Electric Vehicle',
                         'Hybrid', 'Rebate', 'Voucher', 'ZEV', 'Zero Emission']
        ev_subprogram_mask = df['sub_program_name'].str.contains('|'.join(ev_indicators),
                                                                 case=False, na=False)
        ev_mask = ev_mask | (carb_mask & lct_mask & ev_subprogram_mask)
    # Check project count column
    if 'number_of_rebates_issued' in df.columns:
        rebate_mask = df['number_of_rebates_issued'] > 0
        ev_mask = ev_mask | (carb_mask & rebate_mask)
    # Locate the total-program-funding column by name pattern.
    funding_col = None
    for col in df.columns:
        if 'total_program' in col.lower() and 'funding' in col.lower():
            funding_col = col
            break
    if funding_col:
        # Small positive amounts under CARB/LCT are typical of individual vouchers.
        small_funding_mask = (df[funding_col] > 0) & (df[funding_col] < 10000) & carb_mask & lct_mask
        ev_mask = ev_mask | small_funding_mask
    # Mark EV vouchers
    df['is_ev_voucher'] = ev_mask
    count_ev = ev_mask.sum()
    logger.info(f"Identified {count_ev} EV vouchers/rebates")

    # 4. Create funding year if needed
    if 'funding_year' not in df.columns and 'fiscal_year_funding_project' in df.columns:
        # Extract year from fiscal year format (e.g., "2019-20" -> 2019).
        # expand=False makes str.extract return a Series rather than a
        # one-column DataFrame, which assigns cleanly to a single column.
        try:
            df['funding_year'] = df['fiscal_year_funding_project'].str.extract(
                r'(\d{4})', expand=False).astype('Int64')
            logger.info("Created funding_year column from fiscal year data")
        except Exception as e:
            logger.error(f"Error creating funding_year: {e}")

    # 5. Calculate GHG efficiency ($ per ton CO2e)
    if funding_col:
        ghg_col = None
        for col in df.columns:
            if 'total_project' in col.lower() and 'ghg' in col.lower():
                ghg_col = col
                break
        if ghg_col:
            # Mask with .where instead of np.where so rows with non-positive
            # GHG values become NaN without tripping divide-by-zero warnings.
            df['ghg_efficiency'] = df[funding_col].div(df[ghg_col]).where(df[ghg_col] > 0)
            logger.info("Calculated GHG efficiency ($ per ton CO2e)")

    # 6. Calculate DAC (disadvantaged community) benefit percentage
    dac_funding_col = None
    for col in df.columns:
        if 'funding_benefiting' in col.lower() and 'disadvantaged' in col.lower():
            dac_funding_col = col
            break
    if dac_funding_col and funding_col:
        # Rows with zero/missing total funding get 0% rather than a division error.
        df['dac_benefit_percentage'] = (100 * df[dac_funding_col] / df[funding_col]).where(
            df[funding_col] > 0, 0)
        logger.info("Calculated DAC benefit percentage")

    # 7. Identify multi-agency programs
    logger.info("Identifying multi-agency programs")
    program_agencies = df.groupby('program_name')['agency_name'].nunique()
    df['num_agencies_in_program'] = df['program_name'].map(program_agencies)
    df['is_multi_agency'] = df['num_agencies_in_program'] > 1
    multi_agency_count = int(df['is_multi_agency'].sum())
    logger.info(f"Found {multi_agency_count} projects in multi-agency programs")

    # 8. Identify regional scope (how many counties each program spans)
    if 'county' in df.columns:
        logger.info("Determining regional scope of projects")
        program_counties = df.groupby('program_name')['county'].nunique()
        df['num_counties'] = df['program_name'].map(program_counties)
        # Define region categories
        df['regional_scope'] = pd.cut(
            df['num_counties'],
            bins=[0, 1, 3, 10, np.inf],
            labels=['Single County', 'Limited Regional', 'Regional', 'Multi-Regional']
        )

    # 9. Assign California region based on county
    if 'county' in df.columns:
        logger.info("Assigning California regions")
        # Define California regions
        ca_regions = {
            'Bay Area': ['Alameda', 'Contra Costa', 'Marin', 'Napa', 'San Francisco', 'San Mateo', 'Santa Clara', 'Solano', 'Sonoma'],
            'Sacramento Region': ['El Dorado', 'Placer', 'Sacramento', 'Sutter', 'Yolo', 'Yuba'],
            'San Joaquin Valley': ['Fresno', 'Kern', 'Kings', 'Madera', 'Merced', 'San Joaquin', 'Stanislaus', 'Tulare'],
            'Southern California': ['Imperial', 'Los Angeles', 'Orange', 'Riverside', 'San Bernardino', 'San Diego', 'Ventura'],
            'Central Coast': ['Monterey', 'San Benito', 'San Luis Obispo', 'Santa Barbara', 'Santa Cruz'],
            'Northern California': ['Butte', 'Colusa', 'Del Norte', 'Glenn', 'Humboldt', 'Lake', 'Lassen', 'Mendocino', 'Modoc', 'Nevada', 'Plumas', 'Shasta', 'Sierra', 'Siskiyou', 'Tehama', 'Trinity'],
            'Sierra Nevada': ['Alpine', 'Amador', 'Calaveras', 'Inyo', 'Mariposa', 'Mono', 'Tuolumne']
        }
        # Create mapping dictionary
        county_to_region = {}
        for region, counties in ca_regions.items():
            for county in counties:
                county_to_region[county] = region
        # Apply mapping
        df['ca_region'] = df['county'].map(county_to_region)
        # For multi-county programs, relabel rows as 'Multi-Region' when the
        # program's counties span more than one California region.
        multi_county_programs = program_counties[program_counties > 1].index
        for program in multi_county_programs:
            program_df = df[df['program_name'] == program]
            unique_regions = program_df['ca_region'].nunique()
            if unique_regions > 1:
                df.loc[df['program_name'] == program, 'ca_region'] = 'Multi-Region'

    # 10. Create temporal period indicator (pre/post 2020)
    if 'funding_year' in df.columns:
        logger.info("Creating temporal period indicator (pre/post 2020)")
        # Guard against missing years: pd.NA cannot be used in a bare
        # comparison-as-condition, so the original lambda raised TypeError on
        # rows with no parseable fiscal year. Unknown years fall into Pre-2020.
        df['period'] = df['funding_year'].apply(
            lambda x: 'Post-2020' if pd.notna(x) and x >= 2020 else 'Pre-2020')

    # 11. Handle outliers in GHG efficiency and DAC benefit
    if 'ghg_efficiency' in df.columns:
        # Cap extreme values at 95th percentile, then log-transform for analysis.
        upper_limit = df['ghg_efficiency'].quantile(0.95)
        df['ghg_efficiency_capped'] = df['ghg_efficiency'].clip(upper=upper_limit)
        df['ghg_efficiency_log'] = np.log1p(df['ghg_efficiency_capped'])
        logger.info(f"Handled outliers in GHG efficiency (capped at ${upper_limit:.2f} per ton)")
    if 'dac_benefit_percentage' in df.columns:
        # Data errors can report more DAC benefit than total funding; cap at 100%.
        df['dac_benefit_percentage'] = df['dac_benefit_percentage'].clip(upper=100)
        logger.info("Capped DAC benefit percentage at 100%")

    # Save cleaned data if output path provided
    if output_path:
        output_file = Path(output_path)
        logger.info(f"Saving cleaned data to {output_file}")
        df.to_csv(output_file, index=False)
    return df
def json_serializable(obj):
    """Convert NumPy types to Python standard types for JSON serialization.

    Intended as the ``default=`` hook for ``json.dump``; values that are not
    recognized NumPy types pass through unchanged.
    """
    converters = (
        ((np.integer, np.int64), int),
        ((np.floating, np.float64), float),
        ((np.ndarray,), lambda arr: arr.tolist()),
    )
    for numpy_types, convert in converters:
        if isinstance(obj, numpy_types):
            return convert(obj)
    return obj
def generate_data_summary(df, output_path=None):
    """
    Generate a summary of the cleaned CCI data.

    Computes dataset-wide, CARB-vs-non-CARB, EV-voucher, funding, GHG,
    efficiency, DAC-benefit, multi-agency, temporal, and regional statistics.
    Columns are discovered by name pattern, and every derived section is
    guarded by a column-existence check, so partial inputs are tolerated.

    Parameters:
        df (pd.DataFrame): The cleaned CCI data
        output_path (str, optional): Path to save the summary

    Returns:
        dict: Summary statistics
    """
    summary = {}
    # 1. Basic dataset stats
    summary['total_projects'] = len(df)
    summary['total_agencies'] = df['agency_name'].nunique()
    summary['total_programs'] = df['program_name'].nunique()
    if 'sub_program_name' in df.columns:
        summary['total_subprograms'] = df['sub_program_name'].nunique()
    # 2. CARB vs Non-CARB breakdown
    # NOTE: carb_df/non_carb_df are reused by later sections; those sections
    # are guarded by the same 'is_carb' column check, so they stay defined.
    if 'is_carb' in df.columns:
        carb_df = df[df['is_carb']]
        non_carb_df = df[~df['is_carb']]
        summary['carb_projects'] = len(carb_df)
        summary['non_carb_projects'] = len(non_carb_df)
        summary['carb_percentage'] = len(carb_df) / len(df) * 100
    # 3. EV vouchers breakdown
    if 'is_ev_voucher' in df.columns:
        ev_df = df[df['is_ev_voucher']]
        summary['ev_vouchers'] = len(ev_df)
        summary['ev_percentage'] = len(ev_df) / len(df) * 100
        if 'is_carb' in df.columns:
            summary['ev_percentage_of_carb'] = len(ev_df) / len(carb_df) * 100 if len(carb_df) > 0 else 0
    # 4. Funding statistics
    # The funding column is located by name pattern rather than a fixed name.
    funding_col = None
    for col in df.columns:
        if 'total_program' in col.lower() and 'funding' in col.lower():
            funding_col = col
            break
    if funding_col:
        summary['total_funding'] = df[funding_col].sum()
        summary['avg_funding_per_project'] = df[funding_col].mean()
        if 'is_carb' in df.columns:
            summary['carb_funding'] = carb_df[funding_col].sum()
            summary['non_carb_funding'] = non_carb_df[funding_col].sum()
            summary['carb_funding_percentage'] = carb_df[funding_col].sum() / df[funding_col].sum() * 100
            summary['avg_carb_funding'] = carb_df[funding_col].mean()
            summary['avg_non_carb_funding'] = non_carb_df[funding_col].mean()
        if 'is_ev_voucher' in df.columns:
            summary['ev_funding'] = ev_df[funding_col].sum()
            summary['ev_funding_percentage'] = ev_df[funding_col].sum() / df[funding_col].sum() * 100
            summary['avg_ev_funding'] = ev_df[funding_col].mean()
    # 5. GHG reduction statistics
    ghg_col = None
    for col in df.columns:
        if 'total_project' in col.lower() and 'ghg' in col.lower():
            ghg_col = col
            break
    if ghg_col:
        summary['total_ghg_reduction'] = df[ghg_col].sum()
        summary['avg_ghg_reduction_per_project'] = df[ghg_col].mean()
        if 'is_carb' in df.columns:
            summary['carb_ghg_reduction'] = carb_df[ghg_col].sum()
            summary['non_carb_ghg_reduction'] = non_carb_df[ghg_col].sum()
            summary['carb_ghg_percentage'] = carb_df[ghg_col].sum() / df[ghg_col].sum() * 100
        if 'is_ev_voucher' in df.columns:
            summary['ev_ghg_reduction'] = ev_df[ghg_col].sum()
            summary['ev_ghg_percentage'] = ev_df[ghg_col].sum() / df[ghg_col].sum() * 100
    # 6. Efficiency statistics
    if 'ghg_efficiency' in df.columns:
        # Use median for efficiency due to skewness
        valid_efficiency = df[df['ghg_efficiency'].notna() & (df['ghg_efficiency'] > 0)]
        if len(valid_efficiency) > 0:
            summary['median_ghg_efficiency'] = valid_efficiency['ghg_efficiency'].median()
        if 'is_carb' in df.columns:
            valid_carb = carb_df[carb_df['ghg_efficiency'].notna() & (carb_df['ghg_efficiency'] > 0)]
            valid_non_carb = non_carb_df[non_carb_df['ghg_efficiency'].notna() & (non_carb_df['ghg_efficiency'] > 0)]
            if len(valid_carb) > 0:
                summary['median_carb_efficiency'] = valid_carb['ghg_efficiency'].median()
            if len(valid_non_carb) > 0:
                summary['median_non_carb_efficiency'] = valid_non_carb['ghg_efficiency'].median()
        if 'is_ev_voucher' in df.columns:
            valid_ev = ev_df[ev_df['ghg_efficiency'].notna() & (ev_df['ghg_efficiency'] > 0)]
            if len(valid_ev) > 0:
                summary['median_ev_efficiency'] = valid_ev['ghg_efficiency'].median()
    # 7. DAC benefit statistics
    if 'dac_benefit_percentage' in df.columns:
        summary['avg_dac_benefit'] = df['dac_benefit_percentage'].mean()
        if 'is_carb' in df.columns:
            summary['avg_carb_dac_benefit'] = carb_df['dac_benefit_percentage'].mean()
            summary['avg_non_carb_dac_benefit'] = non_carb_df['dac_benefit_percentage'].mean()
        if 'is_ev_voucher' in df.columns:
            summary['avg_ev_dac_benefit'] = ev_df['dac_benefit_percentage'].mean()
    # 8. Multi-agency statistics
    if 'is_multi_agency' in df.columns:
        multi_df = df[df['is_multi_agency']]
        single_df = df[~df['is_multi_agency']]
        summary['multi_agency_projects'] = len(multi_df)
        summary['multi_agency_percentage'] = len(multi_df) / len(df) * 100
        if 'num_agencies_in_program' in df.columns:
            summary['avg_agencies_per_program'] = df['num_agencies_in_program'].mean()
        if 'ghg_efficiency' in df.columns:
            valid_multi = multi_df[multi_df['ghg_efficiency'].notna() & (multi_df['ghg_efficiency'] > 0)]
            valid_single = single_df[single_df['ghg_efficiency'].notna() & (single_df['ghg_efficiency'] > 0)]
            if len(valid_multi) > 0:
                summary['median_multi_agency_efficiency'] = valid_multi['ghg_efficiency'].median()
            if len(valid_single) > 0:
                summary['median_single_agency_efficiency'] = valid_single['ghg_efficiency'].median()
        if 'dac_benefit_percentage' in df.columns:
            # NOTE(review): mean() of an empty subset is NaN, which json.dump
            # later serializes as bare NaN (not strict JSON) — confirm
            # downstream consumers accept that.
            summary['avg_multi_agency_dac_benefit'] = multi_df['dac_benefit_percentage'].mean()
            summary['avg_single_agency_dac_benefit'] = single_df['dac_benefit_percentage'].mean()
    # 9. Temporal statistics
    if 'period' in df.columns:
        pre_df = df[df['period'] == 'Pre-2020']
        post_df = df[df['period'] == 'Post-2020']
        summary['pre_2020_projects'] = len(pre_df)
        summary['post_2020_projects'] = len(post_df)
        if 'num_agencies_in_program' in df.columns:
            summary['pre_2020_avg_agencies'] = pre_df['num_agencies_in_program'].mean()
            summary['post_2020_avg_agencies'] = post_df['num_agencies_in_program'].mean()
            # Percentage changes guard against a zero pre-2020 baseline.
            summary['agency_change_percentage'] = ((post_df['num_agencies_in_program'].mean() -
                                                   pre_df['num_agencies_in_program'].mean()) /
                                                  pre_df['num_agencies_in_program'].mean() * 100) if pre_df['num_agencies_in_program'].mean() > 0 else 0
        if funding_col:
            summary['pre_2020_avg_funding'] = pre_df[funding_col].mean()
            summary['post_2020_avg_funding'] = post_df[funding_col].mean()
            summary['funding_change_percentage'] = ((post_df[funding_col].mean() -
                                                    pre_df[funding_col].mean()) /
                                                   pre_df[funding_col].mean() * 100) if pre_df[funding_col].mean() > 0 else 0
        if 'dac_benefit_percentage' in df.columns:
            summary['pre_2020_avg_dac_benefit'] = pre_df['dac_benefit_percentage'].mean()
            summary['post_2020_avg_dac_benefit'] = post_df['dac_benefit_percentage'].mean()
            summary['dac_change_percentage'] = ((post_df['dac_benefit_percentage'].mean() -
                                                pre_df['dac_benefit_percentage'].mean()) /
                                               pre_df['dac_benefit_percentage'].mean() * 100) if pre_df['dac_benefit_percentage'].mean() > 0 else 0
    # 10. Regional statistics
    if 'ca_region' in df.columns:
        region_counts = df['ca_region'].value_counts()
        region_percentages = df['ca_region'].value_counts(normalize=True) * 100
        summary['region_counts'] = region_counts.to_dict()
        summary['region_percentages'] = region_percentages.to_dict()
        # Get efficiency and DAC benefit by region
        if 'ghg_efficiency' in df.columns:
            region_efficiency = df.groupby('ca_region')['ghg_efficiency'].median()
            summary['region_efficiency'] = region_efficiency.to_dict()
        if 'dac_benefit_percentage' in df.columns:
            region_dac = df.groupby('ca_region')['dac_benefit_percentage'].mean()
            summary['region_dac_benefit'] = region_dac.to_dict()
    # Save summary if output path provided
    if output_path:
        import json
        output_file = Path(output_path)
        output_file.parent.mkdir(parents=True, exist_ok=True)
        with open(output_file, 'w') as f:
            # json_serializable converts NumPy scalars/arrays to plain types.
            json.dump(summary, f, indent=2, default=json_serializable)
        logger.info(f"Saved data summary to {output_file}")
        # Also create a readable text version
        text_file = output_file.with_suffix('.txt')
        with open(text_file, 'w') as f:
            f.write("CALIFORNIA CLIMATE INVESTMENTS (CCI) DATA SUMMARY\n")
            f.write("================================================\n\n")
            f.write("DATASET OVERVIEW\n")
            f.write(f"Total Projects: {summary['total_projects']:,}\n")
            f.write(f"Total Agencies: {summary['total_agencies']}\n")
            f.write(f"Total Programs: {summary['total_programs']}\n")
            if 'total_subprograms' in summary:
                f.write(f"Total Subprograms: {summary['total_subprograms']}\n")
            f.write("\nCARB VS NON-CARB BREAKDOWN\n")
            if 'carb_projects' in summary:
                f.write(f"CARB Projects: {summary['carb_projects']:,} ({summary['carb_percentage']:.1f}%)\n")
                f.write(f"Non-CARB Projects: {summary['non_carb_projects']:,} ({100-summary['carb_percentage']:.1f}%)\n")
            if 'carb_funding' in summary:
                f.write(f"CARB Funding: ${summary['carb_funding']:,.2f} ({summary['carb_funding_percentage']:.1f}%)\n")
                f.write(f"Non-CARB Funding: ${summary['non_carb_funding']:,.2f} ({100-summary['carb_funding_percentage']:.1f}%)\n")
                f.write(f"Average CARB Project: ${summary['avg_carb_funding']:,.2f}\n")
                f.write(f"Average Non-CARB Project: ${summary['avg_non_carb_funding']:,.2f}\n")
            if 'carb_ghg_reduction' in summary:
                f.write(f"CARB GHG Reductions: {summary['carb_ghg_reduction']:,.2f} tons ({summary['carb_ghg_percentage']:.1f}%)\n")
                f.write(f"Non-CARB GHG Reductions: {summary['non_carb_ghg_reduction']:,.2f} tons ({100-summary['carb_ghg_percentage']:.1f}%)\n")
            if 'median_carb_efficiency' in summary and 'median_non_carb_efficiency' in summary:
                f.write(f"CARB Efficiency: ${summary['median_carb_efficiency']:,.2f} per ton CO2e\n")
                f.write(f"Non-CARB Efficiency: ${summary['median_non_carb_efficiency']:,.2f} per ton CO2e\n")
            f.write("\nEV VOUCHERS BREAKDOWN\n")
            if 'ev_vouchers' in summary:
                f.write(f"EV Vouchers: {summary['ev_vouchers']:,} ({summary['ev_percentage']:.1f}% of total)\n")
                if 'ev_percentage_of_carb' in summary:
                    f.write(f"Percentage of CARB Projects: {summary['ev_percentage_of_carb']:.1f}%\n")
            if 'ev_funding' in summary:
                f.write(f"EV Funding: ${summary['ev_funding']:,.2f} ({summary['ev_funding_percentage']:.1f}% of total)\n")
                f.write(f"Average Voucher Amount: ${summary['avg_ev_funding']:,.2f}\n")
            if 'ev_ghg_reduction' in summary:
                f.write(f"EV GHG Reductions: {summary['ev_ghg_reduction']:,.2f} tons ({summary['ev_ghg_percentage']:.1f}% of total)\n")
            if 'median_ev_efficiency' in summary:
                f.write(f"EV Efficiency: ${summary['median_ev_efficiency']:,.2f} per ton CO2e\n")
            f.write("\nMULTI-AGENCY COLLABORATION\n")
            if 'multi_agency_projects' in summary:
                f.write(f"Multi-Agency Projects: {summary['multi_agency_projects']:,} ({summary['multi_agency_percentage']:.1f}%)\n")
            if 'avg_agencies_per_program' in summary:
                f.write(f"Average Agencies per Program: {summary['avg_agencies_per_program']:.2f}\n")
            if 'median_multi_agency_efficiency' in summary and 'median_single_agency_efficiency' in summary:
                f.write(f"Multi-Agency Efficiency: ${summary['median_multi_agency_efficiency']:,.2f} per ton CO2e\n")
                f.write(f"Single-Agency Efficiency: ${summary['median_single_agency_efficiency']:,.2f} per ton CO2e\n")
            if 'avg_multi_agency_dac_benefit' in summary and 'avg_single_agency_dac_benefit' in summary:
                f.write(f"Multi-Agency DAC Benefit: {summary['avg_multi_agency_dac_benefit']:.2f}%\n")
                f.write(f"Single-Agency DAC Benefit: {summary['avg_single_agency_dac_benefit']:.2f}%\n")
            f.write("\nTEMPORAL TRENDS (PRE/POST 2020)\n")
            if 'pre_2020_projects' in summary and 'post_2020_projects' in summary:
                f.write(f"Pre-2020 Projects: {summary['pre_2020_projects']:,}\n")
                f.write(f"Post-2020 Projects: {summary['post_2020_projects']:,}\n")
            if 'agency_change_percentage' in summary:
                f.write(f"Change in Average Agencies: {summary['agency_change_percentage']:+.1f}%\n")
            if 'funding_change_percentage' in summary:
                f.write(f"Change in Average Funding: {summary['funding_change_percentage']:+.1f}%\n")
            if 'dac_change_percentage' in summary:
                f.write(f"Change in DAC Benefit: {summary['dac_change_percentage']:+.1f}%\n")
            f.write("\nREGIONAL ANALYSIS\n")
            if 'region_counts' in summary:
                # Regions listed largest-first; efficiency cheapest-first.
                for region, count in sorted(summary['region_counts'].items(), key=lambda x: x[1], reverse=True):
                    f.write(f"{region}: {count:,} projects ({summary['region_percentages'][region]:.1f}%)\n")
            f.write("\nEfficiency by Region ($ per ton CO2e):\n")
            if 'region_efficiency' in summary:
                for region, efficiency in sorted(summary['region_efficiency'].items(), key=lambda x: x[1]):
                    f.write(f"{region}: ${efficiency:,.2f}\n")
            f.write("\nDAC Benefit by Region:\n")
            if 'region_dac_benefit' in summary:
                for region, dac in sorted(summary['region_dac_benefit'].items(), key=lambda x: x[1], reverse=True):
                    f.write(f"{region}: {dac:.2f}%\n")
        logger.info(f"Saved readable summary to {text_file}")
    return summary
if __name__ == "__main__":
    import argparse

    # CLI entry point: clean the raw export, optionally writing the cleaned
    # CSV and a JSON/text summary.
    cli = argparse.ArgumentParser(description='Clean and prepare CCI data for analysis')
    cli.add_argument('--input_path', type=str, required=True, help='Path to the input CCI data file')
    cli.add_argument('--output_path', type=str, help='Path to save the cleaned data')
    cli.add_argument('--summary_path', type=str, help='Path to save the data summary')
    options = cli.parse_args()
    prepared = clean_and_prepare_cci_data(options.input_path, options.output_path)
    if options.summary_path:
        generate_data_summary(prepared, options.summary_path)

146898
output/cleaned_cci_data.csv Normal file

File diff suppressed because it is too large Load Diff

70
output/data_summary.json Normal file
View File

@@ -0,0 +1,70 @@
{
"total_projects": 146305,
"total_agencies": 21,
"total_programs": 39,
"total_subprograms": 76,
"carb_projects": 125581,
"non_carb_projects": 20724,
"carb_percentage": 85.83507057175079,
"ev_vouchers": 109270,
"ev_percentage": 74.68644270530741,
"ev_percentage_of_carb": 87.01157022160996,
"total_funding": 11588544819,
"avg_funding_per_project": 79208.12562113394,
"carb_funding": 3372893006,
"non_carb_funding": 8215651813,
"carb_funding_percentage": 29.10540588728598,
"avg_carb_funding": 26858.30663874312,
"avg_non_carb_funding": 396431.76090523065,
"ev_funding": 1714222371,
"ev_funding_percentage": 14.792386773095501,
"avg_ev_funding": 15687.950681797383,
"total_ghg_reduction": 112749573,
"avg_ghg_reduction_per_project": 770.6474351525922,
"carb_ghg_reduction": 5011819,
"non_carb_ghg_reduction": 107737754,
"carb_ghg_percentage": 4.44508911798717,
"ev_ghg_reduction": 4193168,
"ev_ghg_percentage": 3.71901009327991,
"median_ghg_efficiency": 312.5,
"median_carb_efficiency": 312.5,
"median_non_carb_efficiency": 197.3,
"median_ev_efficiency": 312.5,
"avg_dac_benefit": 1.29624968537629,
"avg_carb_dac_benefit": 1.3918376975701816,
"avg_non_carb_dac_benefit": 0.9900241837968561,
"avg_ev_dac_benefit": 1.39305362075886,
"multi_agency_projects": 0,
"multi_agency_percentage": 0.0,
"avg_agencies_per_program": 1.0,
"median_single_agency_efficiency": 312.5,
"avg_multi_agency_dac_benefit": NaN,
"avg_single_agency_dac_benefit": 1.29624968537629,
"pre_2020_projects": 144185,
"post_2020_projects": 2120,
"pre_2020_avg_agencies": 1.0,
"post_2020_avg_agencies": 1.0,
"agency_change_percentage": 0.0,
"pre_2020_avg_funding": 71885.7690813885,
"post_2020_avg_funding": 577214.7188679245,
"funding_change_percentage": 702.9610397774371,
"pre_2020_avg_dac_benefit": 1.2963895281933258,
"post_2020_avg_dac_benefit": 0.0,
"dac_change_percentage": -100.0,
"region_counts": {
"Multi-Region": 146221,
"Bay Area": 84
},
"region_percentages": {
"Multi-Region": 99.94258569426883,
"Bay Area": 0.05741430573117801
},
"region_efficiency": {
"Bay Area": NaN,
"Multi-Region": 312.5
},
"region_dac_benefit": {
"Bay Area": NaN,
"Multi-Region": 1.29624968537629
}
}

53
output/data_summary.txt Normal file
View File

@@ -0,0 +1,53 @@
CALIFORNIA CLIMATE INVESTMENTS (CCI) DATA SUMMARY
================================================
DATASET OVERVIEW
Total Projects: 146,305
Total Agencies: 21
Total Programs: 39
Total Subprograms: 76
CARB VS NON-CARB BREAKDOWN
CARB Projects: 125,581 (85.8%)
Non-CARB Projects: 20,724 (14.2%)
CARB Funding: $3,372,893,006.00 (29.1%)
Non-CARB Funding: $8,215,651,813.00 (70.9%)
Average CARB Project: $26,858.31
Average Non-CARB Project: $396,431.76
CARB GHG Reductions: 5,011,819.00 tons (4.4%)
Non-CARB GHG Reductions: 107,737,754.00 tons (95.6%)
CARB Efficiency: $312.50 per ton CO2e
Non-CARB Efficiency: $197.30 per ton CO2e
EV VOUCHERS BREAKDOWN
EV Vouchers: 109,270 (74.7% of total)
Percentage of CARB Projects: 87.0%
EV Funding: $1,714,222,371.00 (14.8% of total)
Average Voucher Amount: $15,687.95
EV GHG Reductions: 4,193,168.00 tons (3.7% of total)
EV Efficiency: $312.50 per ton CO2e
MULTI-AGENCY COLLABORATION
Multi-Agency Projects: 0 (0.0%)
Average Agencies per Program: 1.00
Multi-Agency DAC Benefit: nan%
Single-Agency DAC Benefit: 1.30%
TEMPORAL TRENDS (PRE/POST 2020)
Pre-2020 Projects: 144,185
Post-2020 Projects: 2,120
Change in Average Agencies: +0.0%
Change in Average Funding: +703.0%
Change in DAC Benefit: -100.0%
REGIONAL ANALYSIS
Multi-Region: 146,221 projects (99.9%)
Bay Area: 84 projects (0.1%)
Efficiency by Region ($ per ton CO2e):
Bay Area: $nan
Multi-Region: $312.50
DAC Benefit by Region:
Bay Area: nan%
Multi-Region: 1.30%

262
regional_analysis_script.py Normal file
View File

@@ -0,0 +1,262 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import logging
# Configure module-wide logging: timestamped INFO-level records; the named
# logger is used by analyze_regional_distribution below.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger("regional_analysis")
def analyze_regional_distribution(input_path, output_path=None):
    """
    Analyze the regional distribution of CCI projects and its
    relationship to GHG efficiency and DAC benefits.

    The analysis runs in six steps: (1) per-region project counts,
    (2) EV-voucher vs. non-EV regional shares, (3) median GHG efficiency
    per region, (4) mean DAC-benefit percentage per region, (5) an
    efficiency-vs-equity scatter, and (6) a plain-text summary file.
    Figures and the summary are written only when *output_path* is given;
    all metrics are always logged.

    Parameters:
        input_path (str): Path to the cleaned CCI data CSV file
        output_path (str, optional): Path to save findings and visualizations
    """
    logger.info(f"Loading data from {input_path}")
    # Load the data
    df = pd.read_csv(input_path, low_memory=False)
    logger.info(f"Successfully loaded {len(df)} rows with {len(df.columns)} columns")
    # Check if we have the regional data; nothing below applies without it.
    if 'ca_region' not in df.columns:
        logger.error("Regional data not found in the dataset")
        return
    # Define output directory if provided (created eagerly, parents included)
    output_dir = None
    if output_path:
        output_dir = Path(output_path)
        output_dir.mkdir(parents=True, exist_ok=True)
    # 1. Basic regional distribution analysis
    region_counts = df['ca_region'].value_counts()
    region_percent = df['ca_region'].value_counts(normalize=True) * 100
    logger.info("Regional distribution of CCI projects:")
    for region, count in region_counts.items():
        logger.info(f" {region}: {count} projects ({region_percent[region]:.1f}%)")
    # Visualize regional distribution
    plt.figure(figsize=(10, 6))
    region_counts.plot(kind='bar')
    plt.title('Number of CCI Projects by Region')
    plt.xlabel('Region')
    plt.ylabel('Number of Projects')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    if output_dir:
        plt.savefig(output_dir / "regional_distribution.png", dpi=300)
    plt.close()
    # 2. EV vouchers vs non-EV projects by region
    # (only when the cleaning step added the boolean 'is_ev_voucher' flag)
    if 'is_ev_voucher' in df.columns:
        ev_by_region = df[df['is_ev_voucher']]['ca_region'].value_counts()
        nonev_by_region = df[~df['is_ev_voucher']]['ca_region'].value_counts()
        # Calculate percentages (share of each region within its project type)
        ev_percent = 100 * ev_by_region / ev_by_region.sum()
        nonev_percent = 100 * nonev_by_region / nonev_by_region.sum()
        # Combine for comparison
        comparison_df = pd.DataFrame({
            'EV Vouchers': ev_percent,
            'Non-EV Projects': nonev_percent
        })
        # Fill missing values with 0 (regions present in only one group)
        comparison_df = comparison_df.fillna(0)
        # Visualize comparison
        plt.figure(figsize=(12, 6))
        comparison_df.plot(kind='bar')
        plt.title('Regional Distribution: EV Vouchers vs. Non-EV Projects')
        plt.xlabel('Region')
        plt.ylabel('Percentage of Projects')
        plt.xticks(rotation=45, ha='right')
        plt.legend(title='Project Type')
        plt.tight_layout()
        if output_dir:
            plt.savefig(output_dir / "regional_ev_comparison.png", dpi=300)
        plt.close()
    # 3. GHG efficiency by region
    if 'ghg_efficiency' in df.columns:
        # Filter to valid efficiency values and non-extreme outliers
        # (drops NaN, non-positive values, and the top 5% tail)
        valid_data = df[(df['ghg_efficiency'].notna()) &
                        (df['ghg_efficiency'] > 0) &
                        (df['ghg_efficiency'] < df['ghg_efficiency'].quantile(0.95))]
        # Calculate median efficiency by region; ascending sort means the
        # cheapest ($/ton) region comes first.
        efficiency_by_region = valid_data.groupby('ca_region')['ghg_efficiency'].median().sort_values()
        logger.info("GHG efficiency by region ($ per ton CO2e, median):")
        for region, efficiency in efficiency_by_region.items():
            logger.info(f" {region}: ${efficiency:.2f}")
        # Visualize efficiency by region
        plt.figure(figsize=(10, 6))
        efficiency_by_region.plot(kind='barh')
        plt.title('GHG Efficiency by Region (lower is better)')
        plt.xlabel('GHG Efficiency ($ per ton CO2e)')
        plt.ylabel('Region')
        plt.grid(axis='x', alpha=0.3)
        plt.tight_layout()
        if output_dir:
            plt.savefig(output_dir / "regional_efficiency.png", dpi=300)
        plt.close()
    # 4. DAC benefit by region
    if 'dac_benefit_percentage' in df.columns:
        # Calculate mean DAC benefit by region (highest benefit first)
        dac_by_region = df.groupby('ca_region')['dac_benefit_percentage'].mean().sort_values(ascending=False)
        logger.info("DAC benefit percentage by region:")
        for region, dac in dac_by_region.items():
            logger.info(f" {region}: {dac:.2f}%")
        # Visualize DAC benefit by region
        plt.figure(figsize=(10, 6))
        dac_by_region.plot(kind='barh')
        plt.title('DAC Benefit Percentage by Region')
        plt.xlabel('DAC Benefit Percentage')
        plt.ylabel('Region')
        plt.grid(axis='x', alpha=0.3)
        plt.tight_layout()
        if output_dir:
            plt.savefig(output_dir / "regional_dac_benefit.png", dpi=300)
        plt.close()
    # 5. Efficiency vs Equity by Region
    if 'ghg_efficiency' in df.columns and 'dac_benefit_percentage' in df.columns:
        # Filter to valid data (same outlier rule as step 3, plus DAC non-null)
        valid_data = df[(df['ghg_efficiency'].notna()) &
                        (df['dac_benefit_percentage'].notna()) &
                        (df['ghg_efficiency'] > 0) &
                        (df['ghg_efficiency'] < df['ghg_efficiency'].quantile(0.95))]
        # Calculate regional metrics
        # NOTE(review): aggregating the groupby key itself ('ca_region': 'count')
        # is deprecated in newer pandas versions — confirm against the pinned
        # pandas release.
        region_metrics = valid_data.groupby('ca_region').agg({
            'ghg_efficiency': 'median',
            'dac_benefit_percentage': 'mean',
            'ca_region': 'count'
        }).rename(columns={'ca_region': 'project_count'})
        # Create scatter plot
        plt.figure(figsize=(10, 8))
        scatter = plt.scatter(
            region_metrics['ghg_efficiency'],
            region_metrics['dac_benefit_percentage'],
            s=region_metrics['project_count'] / 10,  # Size based on project count
            alpha=0.7
        )
        # Add region labels
        for region in region_metrics.index:
            plt.annotate(
                region,
                (region_metrics.loc[region, 'ghg_efficiency'],
                 region_metrics.loc[region, 'dac_benefit_percentage']),
                textcoords="offset points",
                xytext=(5, 5),
                ha='left'
            )
        # Add quadrant lines at the cross-region medians
        median_efficiency = region_metrics['ghg_efficiency'].median()
        median_dac = region_metrics['dac_benefit_percentage'].median()
        plt.axvline(x=median_efficiency, color='gray', linestyle='--', alpha=0.5)
        plt.axhline(y=median_dac, color='gray', linestyle='--', alpha=0.5)
        # Add quadrant labels (axes-fraction coordinates, one per corner)
        plt.text(0.98, 0.98, 'High Cost,\nHigh Equity', transform=plt.gca().transAxes,
                 ha='right', va='top', bbox=dict(facecolor='white', alpha=0.7))
        plt.text(0.02, 0.98, 'Low Cost,\nHigh Equity', transform=plt.gca().transAxes,
                 ha='left', va='top', bbox=dict(facecolor='white', alpha=0.7))
        plt.text(0.98, 0.02, 'High Cost,\nLow Equity', transform=plt.gca().transAxes,
                 ha='right', va='bottom', bbox=dict(facecolor='white', alpha=0.7))
        plt.text(0.02, 0.02, 'Low Cost,\nLow Equity', transform=plt.gca().transAxes,
                 ha='left', va='bottom', bbox=dict(facecolor='white', alpha=0.7))
        plt.xlabel('GHG Efficiency ($ per ton CO2e)')
        plt.ylabel('DAC Benefit Percentage')
        plt.title('Efficiency vs. Equity by Region')
        plt.grid(True, linestyle='--', alpha=0.7)
        if output_dir:
            plt.savefig(output_dir / "regional_efficiency_equity.png", dpi=300)
        plt.close()
    # 6. Generate a summary text file
    if output_dir:
        with open(output_dir / "regional_analysis_summary.txt", 'w') as f:
            f.write("CALIFORNIA CLIMATE INVESTMENTS (CCI) REGIONAL ANALYSIS\n")
            f.write("===================================================\n\n")
            f.write("REGIONAL DISTRIBUTION\n")
            for region, count in region_counts.items():
                f.write(f"{region}: {count} projects ({region_percent[region]:.1f}%)\n")
            if 'ghg_efficiency' in df.columns:
                f.write("\nGHG EFFICIENCY BY REGION ($ PER TON CO2E, MEDIAN)\n")
                for region, efficiency in efficiency_by_region.items():
                    f.write(f"{region}: ${efficiency:.2f}\n")
            if 'dac_benefit_percentage' in df.columns:
                f.write("\nDAC BENEFIT PERCENTAGE BY REGION\n")
                for region, dac in dac_by_region.items():
                    f.write(f"{region}: {dac:.2f}%\n")
            f.write("\nKEY FINDINGS\n")
            # Add key findings based on the analysis
            if 'ghg_efficiency' in df.columns and 'dac_benefit_percentage' in df.columns:
                # Identify top performing regions: both series are pre-sorted,
                # so index[0] is the best in each dimension.
                best_efficiency_region = efficiency_by_region.index[0]
                best_dac_region = dac_by_region.index[0]
                f.write(f"1. {best_efficiency_region} achieves the best GHG efficiency (${efficiency_by_region[best_efficiency_region]:.2f} per ton).\n")
                f.write(f"2. {best_dac_region} achieves the highest DAC benefit ({dac_by_region[best_dac_region]:.2f}%).\n")
                # Identify balanced regions (good in both dimensions:
                # below-median cost AND above-median DAC benefit)
                low_cost_high_equity = region_metrics[(region_metrics['ghg_efficiency'] < median_efficiency) &
                                                      (region_metrics['dac_benefit_percentage'] > median_dac)]
                if len(low_cost_high_equity) > 0:
                    top_balanced = low_cost_high_equity.index[0]
                    f.write(f"3. {top_balanced} achieves the best balance between efficiency and equity.\n")
                # Check for regional disparities (ratio guard avoids div-by-zero)
                max_efficiency_diff = efficiency_by_region.max() / efficiency_by_region.min() if efficiency_by_region.min() > 0 else 0
                max_dac_diff = dac_by_region.max() - dac_by_region.min()
                f.write(f"4. Regional disparities: {max_efficiency_diff:.1f}x variation in efficiency, {max_dac_diff:.1f} percentage point variation in DAC benefits.\n")
        logger.info(f"Saved regional analysis summary to {output_dir / 'regional_analysis_summary.txt'}")
    logger.info("Regional analysis completed")
if __name__ == "__main__":
    import argparse

    # CLI entry point: parse the input/output paths and run the analysis.
    cli = argparse.ArgumentParser(description='Analyze regional distribution of CCI projects')
    cli.add_argument('--input_path', type=str, required=True,
                     help='Path to the cleaned CCI data CSV file')
    cli.add_argument('--output_path', type=str,
                     help='Path to save findings and visualizations')
    parsed = cli.parse_args()
    analyze_regional_distribution(parsed.input_path, parsed.output_path)

2209
research_analysis_script.py Normal file

File diff suppressed because it is too large Load Diff

122
run_cci_analysis.py Normal file
View File

@@ -0,0 +1,122 @@
#!/usr/bin/env python3
"""
California Climate Investments (CCI) Collaboration Analysis Workflow
This script runs the complete workflow for analyzing collaboration patterns
in California's Climate Investments program and their impact on greenhouse
gas reduction efficiency and equity outcomes.
Usage:
python run_cci_analysis.py --data_path data/cci_programs_data_reduced.csv --output_dir output
"""
import os
import argparse
import logging
from pathlib import Path
# Configure module-wide logging: timestamped INFO-level records; the named
# logger is used throughout the workflow steps in main().
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger("cci_workflow")
def main():
    """Run the complete CCI collaboration analysis workflow.

    Three optional steps, each guarded by a --skip_* CLI flag:
      1. Clean/prepare the raw data (data_cleaning_script).
      2. Detailed collaboration analysis (cci_collaboration_analysis).
      3. Research-question analysis (research_analysis_script).
    Steps 2 and 3 read the cleaned CSV produced (or expected) from step 1.
    A failure in step 1 aborts the workflow; failures in steps 2 and 3
    are logged and the workflow continues.
    """
    parser = argparse.ArgumentParser(description='Run CCI Collaboration Analysis Workflow')
    parser.add_argument('--data_path', type=str, required=True, help='Path to the raw CCI data CSV file')
    parser.add_argument('--output_dir', type=str, default='./output', help='Directory to save all outputs')
    parser.add_argument('--skip_cleaning', action='store_true', help='Skip the data cleaning step')
    parser.add_argument('--skip_analysis', action='store_true', help='Skip the detailed analysis step')
    parser.add_argument('--skip_research', action='store_true', help='Skip the research questions analysis')
    args = parser.parse_args()
    # Create output directory
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    # File paths shared by the steps below
    raw_data_path = args.data_path
    cleaned_data_path = output_dir / "cleaned_cci_data.csv"
    data_summary_path = output_dir / "data_summary.json"
    # Create subdirectories for different analysis outputs
    # NOTE(review): cleaned_output_dir is created but never written to in
    # this function — confirm whether a downstream module uses it.
    cleaned_output_dir = output_dir / "cleaned"
    analysis_output_dir = output_dir / "analysis"
    research_output_dir = output_dir / "research"
    for directory in [cleaned_output_dir, analysis_output_dir, research_output_dir]:
        directory.mkdir(parents=True, exist_ok=True)
    # Step 1: Clean and prepare the data
    if not args.skip_cleaning:
        logger.info("Step 1: Cleaning and preparing the CCI data")
        try:
            # Imported lazily so a missing step module only breaks the step
            # that needs it (and not steps that are skipped).
            from data_cleaning_script import clean_and_prepare_cci_data, generate_data_summary
            # Clean and prepare the data
            cleaned_df = clean_and_prepare_cci_data(raw_data_path, cleaned_data_path)
            # Generate data summary
            generate_data_summary(cleaned_df, data_summary_path)
            logger.info(f"Data cleaning complete. Cleaned data saved to {cleaned_data_path}")
            logger.info(f"Data summary saved to {data_summary_path}")
        except Exception as e:
            # Cleaning failures are fatal: later steps require the cleaned CSV.
            logger.error(f"Error in data cleaning step: {e}")
            return
    else:
        logger.info("Skipping data cleaning step")
        # Check if cleaned data exists from a previous run before proceeding
        if not cleaned_data_path.exists():
            logger.error(f"Cleaned data file {cleaned_data_path} not found. Cannot proceed without data.")
            return
    # Step 2: Run the detailed collaboration analysis
    if not args.skip_analysis:
        logger.info("Step 2: Running detailed collaboration analysis")
        try:
            from cci_collaboration_analysis import CCICollaborationAnalyzer
            # Initialize the analyzer
            analyzer = CCICollaborationAnalyzer(cleaned_data_path, str(analysis_output_dir))
            # Run full analysis
            analyzer.run_full_analysis()
            logger.info(f"Detailed analysis complete. Results saved to {analysis_output_dir}")
        except Exception as e:
            # Non-fatal: step 3 can still run on the cleaned data.
            logger.error(f"Error in detailed analysis step: {e}")
            logger.error("Continuing to research analysis with available data...")
    else:
        logger.info("Skipping detailed analysis step")
    # Step 3: Analyze specific research questions
    if not args.skip_research:
        logger.info("Step 3: Analyzing research questions")
        try:
            from research_analysis_script import analyze_research_questions
            # Run research analysis; a falsy return signals failure.
            findings = analyze_research_questions(cleaned_data_path, str(research_output_dir))
            if findings:
                logger.info(f"Research analysis complete. Results saved to {research_output_dir}")
            else:
                logger.error("Research analysis failed to complete successfully")
        except Exception as e:
            logger.error(f"Error in research analysis step: {e}")
    else:
        logger.info("Skipping research analysis step")
    logger.info("CCI Collaboration Analysis Workflow complete!")
# Standard script entry point: run the workflow only when executed directly.
if __name__ == "__main__":
    main()