Add CCI data cleaning, collaboration detection, and regional analysis scripts with generated outputs
BIN
__pycache__/cci_collaboration_analysis.cpython-313.pyc
Normal file
Binary file not shown.
BIN
__pycache__/data_cleaning_script.cpython-313.pyc
Normal file
Binary file not shown.
94
cci_collaboration_analysis.py
Normal file
@@ -0,0 +1,94 @@
"""
Cleaned version of the CCICollaborationAnalyzer script.

This script is structured and corrected for proper exception handling and visualization generation.
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import logging
import warnings
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from scipy import stats

from cci_analyzer import CCIDataAnalyzer

# Configure logging
logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
logger = logging.getLogger("cci_collaboration_analysis")

# Suppress pandas warnings
warnings.filterwarnings("ignore")


class CCICollaborationAnalyzer:
    def __init__(self, data_path, output_path="./output/collaboration"):
        self.data_path = Path(data_path)
        self.output_path = Path(output_path)
        self.output_path.mkdir(parents=True, exist_ok=True)

        self.base_analyzer = CCIDataAnalyzer(data_path, output_path=str(self.output_path))
        if not self.base_analyzer.load_data():
            logger.error("Failed to load data through base analyzer")
            return

        self.data = self.base_analyzer.data
        self.collaboration_metrics = {}
        self.temporal_analysis = {}
        self.regional_analysis = {}
        self.ev_vouchers_analysis = {}
        self._separate_ev_vouchers()

    def _separate_ev_vouchers(self):
        if 'cci_projects' not in self.data:
            logger.error("No project data available to separate EV vouchers")
            return
        df = self.data['cci_projects']
        try:
            ev_mask = ((df['agency_name'].str.contains('Air Resources Board', case=False, na=False)) &
                       (df['program_name'].str.contains('Low Carbon Transportation', case=False, na=False)) &
                       (df['sub_program_name'].str.contains('Clean Cars 4 All|CVRP|Financing Assistance',
                                                            case=False, na=False)))
            self.data['ev_vouchers'] = df[ev_mask].copy()
            self.data['non_ev_projects'] = df[~ev_mask].copy()
            logger.info(f"Separated {len(self.data['ev_vouchers'])} EV vouchers from {len(self.data['non_ev_projects'])} other projects")
        except Exception as e:
            logger.error(f"Error separating EV vouchers: {e}")

    def _generate_visualizations(self):
        """Generate visualizations of key findings."""
        logger.info("Generating visualizations")
        try:
            self._plot_collaboration_impact()
            self._plot_temporal_trends()
            self._plot_regional_analysis()
            self._plot_ev_vouchers_analysis()
            self._plot_efficiency_equity_tradeoff()
            logger.info(f"All visualizations completed and saved to {self.output_path}")
        except Exception as e:
            logger.error(f"Error generating visualizations: {e}")

    # Define stubs for the required plotting methods
    def _plot_collaboration_impact(self):
        logger.info("Plotting collaboration impact...")
        # Implementation goes here

    def _plot_temporal_trends(self):
        logger.info("Plotting temporal trends...")
        # Implementation goes here

    def _plot_regional_analysis(self):
        logger.info("Plotting regional analysis...")
        # Implementation goes here

    def _plot_ev_vouchers_analysis(self):
        logger.info("Plotting EV vouchers analysis...")
        # Implementation goes here

    def _plot_efficiency_equity_tradeoff(self):
        logger.info("Plotting efficiency-equity tradeoff...")
        # Implementation goes here
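Note that run_cci_analysis.py (further down in this commit) calls analyzer.run_full_analysis(), which this cleaned module does not yet define, so the workflow's step 2 will currently raise and log an error. A minimal sketch of the missing method, a hypothetical addition that only chains methods already present in the class, might look like:

    def run_full_analysis(self):
        # Hypothetical orchestration entry point (not part of this commit);
        # it only calls attributes and methods defined above.
        if not hasattr(self, 'data') or 'cci_projects' not in self.data:
            logger.error("No data available; aborting full analysis")
            return False
        self._generate_visualizations()
        logger.info("Full collaboration analysis complete")
        return True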
125
collaboration_detection_script.py
Normal file
@@ -0,0 +1,125 @@
import pandas as pd
import numpy as np
from pathlib import Path
import logging

# Configure logging
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger("collaboration_detection")


def investigate_collaboration(input_path, output_path=None):
    """
    Investigate potential collaboration patterns in the CCI data
    that might not be captured by the current approach.

    Parameters:
        input_path (str): Path to the CCI data CSV file
        output_path (str, optional): Path to save findings
    """
    logger.info(f"Loading data from {input_path}")

    # Load the data
    df = pd.read_csv(input_path, low_memory=False)

    logger.info(f"Successfully loaded {len(df)} rows with {len(df.columns)} columns")

    # 1. Look at unique agency_name values
    agencies = df['agency_name'].unique()
    logger.info(f"Found {len(agencies)} unique agencies")
    # Parenthesize the conditional so the "Agencies: " prefix applies to both branches
    logger.info("Agencies: " + (", ".join(sorted(agencies)[:10]) + "..." if len(agencies) > 10 else ", ".join(sorted(agencies))))

    # 2. Look at how agency_name is associated with program_name
    program_agency_counts = df.groupby('program_name')['agency_name'].nunique()

    # Look for programs with multiple agencies
    multi_agency_programs = program_agency_counts[program_agency_counts > 1]
    logger.info(f"Found {len(multi_agency_programs)} programs with multiple agencies")

    if len(multi_agency_programs) > 0:
        logger.info("Multi-agency programs:")
        for program, count in multi_agency_programs.items():
            # Use a distinct name so the outer `agencies` is not clobbered
            # (it is reused when writing the output file below)
            program_agencies = df[df['program_name'] == program]['agency_name'].unique()
            logger.info(f"  {program}: {count} agencies ({', '.join(program_agencies)})")

    # 3. Look for other potential indicators of collaboration
    # Check if there are other columns that might indicate collaboration
    potential_collab_indicators = [
        'agency_name', 'program_name', 'sub_program_name',
        # Add other potential columns here
    ]

    # Look for terms that might indicate collaboration
    collab_terms = ['collab', 'partner', 'joint', 'multi', 'together', 'coop']

    # Search for collaboration terms across relevant columns
    for col in potential_collab_indicators:
        if col in df.columns and df[col].dtype == 'object':
            # Search for collaboration terms in the column
            matches = []
            for term in collab_terms:
                term_matches = df[df[col].str.contains(term, case=False, na=False)]
                if len(term_matches) > 0:
                    matches.append((term, len(term_matches)))

            if matches:
                logger.info(f"Found potential collaboration indicators in column '{col}':")
                for term, count in matches:
                    logger.info(f"  Term '{term}': {count} matches")

    # 4. Look for potential co-funding patterns
    # Sometimes collaboration is indicated by multiple funding sources
    funding_cols = [col for col in df.columns if 'funding' in col.lower()]
    logger.info(f"Found {len(funding_cols)} funding-related columns: {', '.join(funding_cols)}")

    # 5. Generate a set of recommendations for identifying collaboration
    recommendations = [
        "1. Consider using a different approach to identify multi-agency programs:",
        "   - Look at sub_program_name for indicators of collaboration",
        "   - Check if there are text fields with partnership information",
        "   - Consider if programs can have multiple sub-agencies that aren't captured in agency_name",
        "2. Explore if collaboration occurs at higher levels (program level) rather than project level",
        "3. Check if some agencies have been merged or renamed in the data",
        "4. For temporal analysis, check if collaboration patterns changed over time",
        "5. Consider if collaboration is defined by regions/jurisdictions rather than state agencies"
    ]

    if output_path:
        with open(output_path, 'w') as f:
            f.write("CALIFORNIA CLIMATE INVESTMENTS (CCI) COLLABORATION DETECTION ANALYSIS\n")
            f.write("==================================================================\n\n")

            f.write("AGENCY ANALYSIS\n")
            f.write(f"Found {len(agencies)} unique agencies\n")
            # Parenthesize the conditional so the prefix and trailing newlines
            # apply to both branches
            f.write("Agencies: " + (", ".join(sorted(agencies)[:20]) + "...\n\n" if len(agencies) > 20 else ", ".join(sorted(agencies)) + "\n\n"))

            f.write("MULTI-AGENCY PROGRAM ANALYSIS\n")
            f.write(f"Found {len(multi_agency_programs)} programs with multiple agencies\n\n")

            if len(multi_agency_programs) > 0:
                f.write("Multi-agency programs:\n")
                for program, count in multi_agency_programs.items():
                    program_agencies = df[df['program_name'] == program]['agency_name'].unique()
                    f.write(f"  {program}: {count} agencies ({', '.join(program_agencies)})\n")
            else:
                f.write("No multi-agency programs found using current detection method\n")

            f.write("\nRECOMMENDATIONS\n")
            for rec in recommendations:
                f.write(f"{rec}\n")

        logger.info(f"Saved collaboration detection analysis to {output_path}")

    # Return the number of multi-agency programs
    return len(multi_agency_programs)


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description='Investigate collaboration patterns in CCI data')
    parser.add_argument('--input_path', type=str, required=True, help='Path to the CCI data CSV file')
    parser.add_argument('--output_path', type=str, help='Path to save findings')

    args = parser.parse_args()

    investigate_collaboration(args.input_path, args.output_path)
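Recommendation 1 above points at sub_program_name; a sketch of that check, reusing the same groupby pattern the script already applies at the program level and meant to slot into investigate_collaboration() (a sketch against the columns named above, not committed code):

    # Look for collaboration one level down: sub-programs run by several agencies
    sub_agency_counts = df.groupby('sub_program_name')['agency_name'].nunique()
    multi_agency_subprograms = sub_agency_counts[sub_agency_counts > 1]
    logger.info(f"Sub-programs with multiple agencies: {len(multi_agency_subprograms)}")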
540
data_cleaning_script.py
Normal file
@@ -0,0 +1,540 @@
import pandas as pd
import numpy as np
from pathlib import Path
import logging

# Configure logging
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger("cci_data_prep")


def clean_and_prepare_cci_data(input_path, output_path=None):
    """
    Clean and prepare the CCI data for analysis, fixing specific issues identified.

    Parameters:
        input_path (str): Path to the original CCI data file
        output_path (str, optional): Path to save the cleaned data

    Returns:
        pd.DataFrame: The cleaned and prepared data
    """
    logger.info(f"Loading data from {input_path}")

    # Try different encodings if needed
    try:
        df = pd.read_csv(input_path)
    except UnicodeDecodeError:
        logger.info("Trying different encoding (latin-1)")
        df = pd.read_csv(input_path, encoding='latin-1')

    logger.info(f"Successfully loaded {len(df)} rows with {len(df.columns)} columns")

    # 1. Fix column names - standardize to lowercase with underscores
    df.columns = [col.strip().lower().replace(' ', '_') for col in df.columns]

    # 2. Identify and mark EV vouchers/rebates
    logger.info("Identifying EV vouchers and rebates")

    # Check if required columns exist
    required_cols = ['agency_name', 'program_name']
    if not all(col in df.columns for col in required_cols):
        missing = [col for col in required_cols if col not in df.columns]
        logger.error(f"Missing required columns: {missing}")
        return df

    # Identify CARB's Low Carbon Transportation projects
    carb_mask = df['agency_name'].str.contains('Air Resources Board', case=False, na=False)
    lct_mask = df['program_name'].str.contains('Low Carbon Transportation', case=False, na=False)

    # Create CARB indicator
    df['is_carb'] = carb_mask

    # 3. Mark EV projects using multiple methods
    # Start with subprogram if available
    ev_mask = pd.Series(False, index=df.index)

    if 'sub_program_name' in df.columns:
        ev_indicators = ['Clean Cars 4 All', 'CVRP', 'Clean Vehicle', 'EV', 'Electric Vehicle',
                         'Hybrid', 'Rebate', 'Voucher', 'ZEV', 'Zero Emission']

        ev_subprogram_mask = df['sub_program_name'].str.contains('|'.join(ev_indicators),
                                                                 case=False, na=False)
        ev_mask = ev_mask | (carb_mask & lct_mask & ev_subprogram_mask)

    # Check project count column
    if 'number_of_rebates_issued' in df.columns:
        rebate_mask = df['number_of_rebates_issued'] > 0
        ev_mask = ev_mask | (carb_mask & rebate_mask)

    # Check for small funding amounts typical of vouchers
    funding_col = None
    for col in df.columns:
        if 'total_program' in col.lower() and 'funding' in col.lower():
            funding_col = col
            break

    if funding_col:
        # Identify potential vouchers by small funding amount (for individual vouchers)
        small_funding_mask = (df[funding_col] > 0) & (df[funding_col] < 10000) & carb_mask & lct_mask
        ev_mask = ev_mask | small_funding_mask

    # Mark EV vouchers
    df['is_ev_voucher'] = ev_mask

    count_ev = ev_mask.sum()
    logger.info(f"Identified {count_ev} EV vouchers/rebates")

    # 4. Create funding year if needed
    if 'funding_year' not in df.columns and 'fiscal_year_funding_project' in df.columns:
        # Extract year from fiscal year format (e.g., "2019-20" -> 2019);
        # to_numeric with errors='coerce' keeps unparseable values as <NA>
        # instead of failing the whole cast
        try:
            df['funding_year'] = pd.to_numeric(
                df['fiscal_year_funding_project'].str.extract(r'(\d{4})', expand=False),
                errors='coerce'
            ).astype('Int64')
            logger.info("Created funding_year column from fiscal year data")
        except Exception as e:
            logger.error(f"Error creating funding_year: {e}")

    # 5. Calculate GHG efficiency
    if funding_col:
        ghg_col = None
        for col in df.columns:
            if 'total_project' in col.lower() and 'ghg' in col.lower():
                ghg_col = col
                break

        if ghg_col:
            df['ghg_efficiency'] = np.where(
                df[ghg_col] > 0,
                df[funding_col] / df[ghg_col],
                np.nan
            )
            logger.info("Calculated GHG efficiency ($ per ton CO2e)")

    # 6. Calculate DAC benefit percentage
    dac_funding_col = None
    for col in df.columns:
        if 'funding_benefiting' in col.lower() and 'disadvantaged' in col.lower():
            dac_funding_col = col
            break

    if dac_funding_col and funding_col:
        df['dac_benefit_percentage'] = np.where(
            df[funding_col] > 0,
            100 * df[dac_funding_col] / df[funding_col],
            0
        )
        logger.info("Calculated DAC benefit percentage")

    # 7. Identify multi-agency programs
    logger.info("Identifying multi-agency programs")
    program_agencies = df.groupby('program_name')['agency_name'].nunique()
    df['num_agencies_in_program'] = df['program_name'].map(program_agencies)
    df['is_multi_agency'] = df['num_agencies_in_program'] > 1

    multi_agency_count = df['is_multi_agency'].sum()
    logger.info(f"Found {multi_agency_count} projects in multi-agency programs")

    # 8. Identify regional scope
    if 'county' in df.columns:
        logger.info("Determining regional scope of projects")
        program_counties = df.groupby('program_name')['county'].nunique()
        df['num_counties'] = df['program_name'].map(program_counties)

        # Define region categories
        df['regional_scope'] = pd.cut(
            df['num_counties'],
            bins=[0, 1, 3, 10, np.inf],
            labels=['Single County', 'Limited Regional', 'Regional', 'Multi-Regional']
        )

    # 9. Assign California region based on county
    if 'county' in df.columns:
        logger.info("Assigning California regions")

        # Define California regions
        ca_regions = {
            'Bay Area': ['Alameda', 'Contra Costa', 'Marin', 'Napa', 'San Francisco', 'San Mateo', 'Santa Clara', 'Solano', 'Sonoma'],
            'Sacramento Region': ['El Dorado', 'Placer', 'Sacramento', 'Sutter', 'Yolo', 'Yuba'],
            'San Joaquin Valley': ['Fresno', 'Kern', 'Kings', 'Madera', 'Merced', 'San Joaquin', 'Stanislaus', 'Tulare'],
            'Southern California': ['Imperial', 'Los Angeles', 'Orange', 'Riverside', 'San Bernardino', 'San Diego', 'Ventura'],
            'Central Coast': ['Monterey', 'San Benito', 'San Luis Obispo', 'Santa Barbara', 'Santa Cruz'],
            'Northern California': ['Butte', 'Colusa', 'Del Norte', 'Glenn', 'Humboldt', 'Lake', 'Lassen', 'Mendocino', 'Modoc', 'Nevada', 'Plumas', 'Shasta', 'Sierra', 'Siskiyou', 'Tehama', 'Trinity'],
            'Sierra Nevada': ['Alpine', 'Amador', 'Calaveras', 'Inyo', 'Mariposa', 'Mono', 'Tuolumne']
        }

        # Create mapping dictionary
        county_to_region = {}
        for region, counties in ca_regions.items():
            for county in counties:
                county_to_region[county] = region

        # Apply mapping
        df['ca_region'] = df['county'].map(county_to_region)

        # For projects with multiple counties, determine if they are multi-region
        multi_county_programs = program_counties[program_counties > 1].index

        # For multi-county programs, check if they span multiple regions
        for program in multi_county_programs:
            program_df = df[df['program_name'] == program]
            unique_regions = program_df['ca_region'].nunique()

            if unique_regions > 1:
                df.loc[df['program_name'] == program, 'ca_region'] = 'Multi-Region'

    # 10. Create temporal period indicator (pre/post 2020)
    if 'funding_year' in df.columns:
        logger.info("Creating temporal period indicator (pre/post 2020)")
        # Guard against <NA> years; they fall into the Pre-2020 bucket
        df['period'] = df['funding_year'].apply(
            lambda x: 'Post-2020' if pd.notna(x) and x >= 2020 else 'Pre-2020')

    # 11. Handle outliers in GHG efficiency and DAC benefit
    if 'ghg_efficiency' in df.columns:
        # Cap extreme values at 95th percentile
        upper_limit = df['ghg_efficiency'].quantile(0.95)
        df['ghg_efficiency_capped'] = df['ghg_efficiency'].clip(upper=upper_limit)

        # Log transform for analysis
        df['ghg_efficiency_log'] = np.log1p(df['ghg_efficiency_capped'])

        logger.info(f"Handled outliers in GHG efficiency (capped at ${upper_limit:.2f} per ton)")

    if 'dac_benefit_percentage' in df.columns:
        # Handle values > 100%
        df['dac_benefit_percentage'] = df['dac_benefit_percentage'].clip(upper=100)
        logger.info("Capped DAC benefit percentage at 100%")

    # Save cleaned data if output path provided
    if output_path:
        output_file = Path(output_path)
        logger.info(f"Saving cleaned data to {output_file}")
        df.to_csv(output_file, index=False)

    return df


def json_serializable(obj):
    """Convert NumPy types to Python standard types for JSON serialization."""
    if isinstance(obj, (np.integer, np.int64)):
        return int(obj)
    elif isinstance(obj, (np.floating, np.float64)):
        return float(obj)
    elif isinstance(obj, (np.ndarray,)):
        return obj.tolist()
    else:
        return obj


def generate_data_summary(df, output_path=None):
    """
    Generate a summary of the cleaned CCI data.

    Parameters:
        df (pd.DataFrame): The cleaned CCI data
        output_path (str, optional): Path to save the summary

    Returns:
        dict: Summary statistics
    """
    summary = {}

    # 1. Basic dataset stats
    summary['total_projects'] = len(df)
    summary['total_agencies'] = df['agency_name'].nunique()
    summary['total_programs'] = df['program_name'].nunique()

    if 'sub_program_name' in df.columns:
        summary['total_subprograms'] = df['sub_program_name'].nunique()

    # 2. CARB vs Non-CARB breakdown
    if 'is_carb' in df.columns:
        carb_df = df[df['is_carb']]
        non_carb_df = df[~df['is_carb']]

        summary['carb_projects'] = len(carb_df)
        summary['non_carb_projects'] = len(non_carb_df)
        summary['carb_percentage'] = len(carb_df) / len(df) * 100

    # 3. EV vouchers breakdown
    if 'is_ev_voucher' in df.columns:
        ev_df = df[df['is_ev_voucher']]

        summary['ev_vouchers'] = len(ev_df)
        summary['ev_percentage'] = len(ev_df) / len(df) * 100

        if 'is_carb' in df.columns:
            summary['ev_percentage_of_carb'] = len(ev_df) / len(carb_df) * 100 if len(carb_df) > 0 else 0

    # 4. Funding statistics
    funding_col = None
    for col in df.columns:
        if 'total_program' in col.lower() and 'funding' in col.lower():
            funding_col = col
            break

    if funding_col:
        summary['total_funding'] = df[funding_col].sum()
        summary['avg_funding_per_project'] = df[funding_col].mean()

        if 'is_carb' in df.columns:
            summary['carb_funding'] = carb_df[funding_col].sum()
            summary['non_carb_funding'] = non_carb_df[funding_col].sum()
            summary['carb_funding_percentage'] = carb_df[funding_col].sum() / df[funding_col].sum() * 100

            summary['avg_carb_funding'] = carb_df[funding_col].mean()
            summary['avg_non_carb_funding'] = non_carb_df[funding_col].mean()

        if 'is_ev_voucher' in df.columns:
            summary['ev_funding'] = ev_df[funding_col].sum()
            summary['ev_funding_percentage'] = ev_df[funding_col].sum() / df[funding_col].sum() * 100
            summary['avg_ev_funding'] = ev_df[funding_col].mean()

    # 5. GHG reduction statistics
    ghg_col = None
    for col in df.columns:
        if 'total_project' in col.lower() and 'ghg' in col.lower():
            ghg_col = col
            break

    if ghg_col:
        summary['total_ghg_reduction'] = df[ghg_col].sum()
        summary['avg_ghg_reduction_per_project'] = df[ghg_col].mean()

        if 'is_carb' in df.columns:
            summary['carb_ghg_reduction'] = carb_df[ghg_col].sum()
            summary['non_carb_ghg_reduction'] = non_carb_df[ghg_col].sum()
            summary['carb_ghg_percentage'] = carb_df[ghg_col].sum() / df[ghg_col].sum() * 100

        if 'is_ev_voucher' in df.columns:
            summary['ev_ghg_reduction'] = ev_df[ghg_col].sum()
            summary['ev_ghg_percentage'] = ev_df[ghg_col].sum() / df[ghg_col].sum() * 100

    # 6. Efficiency statistics
    if 'ghg_efficiency' in df.columns:
        # Use median for efficiency due to skewness
        valid_efficiency = df[df['ghg_efficiency'].notna() & (df['ghg_efficiency'] > 0)]

        if len(valid_efficiency) > 0:
            summary['median_ghg_efficiency'] = valid_efficiency['ghg_efficiency'].median()

        if 'is_carb' in df.columns:
            valid_carb = carb_df[carb_df['ghg_efficiency'].notna() & (carb_df['ghg_efficiency'] > 0)]
            valid_non_carb = non_carb_df[non_carb_df['ghg_efficiency'].notna() & (non_carb_df['ghg_efficiency'] > 0)]

            if len(valid_carb) > 0:
                summary['median_carb_efficiency'] = valid_carb['ghg_efficiency'].median()

            if len(valid_non_carb) > 0:
                summary['median_non_carb_efficiency'] = valid_non_carb['ghg_efficiency'].median()

        if 'is_ev_voucher' in df.columns:
            valid_ev = ev_df[ev_df['ghg_efficiency'].notna() & (ev_df['ghg_efficiency'] > 0)]

            if len(valid_ev) > 0:
                summary['median_ev_efficiency'] = valid_ev['ghg_efficiency'].median()

    # 7. DAC benefit statistics
    if 'dac_benefit_percentage' in df.columns:
        summary['avg_dac_benefit'] = df['dac_benefit_percentage'].mean()

        if 'is_carb' in df.columns:
            summary['avg_carb_dac_benefit'] = carb_df['dac_benefit_percentage'].mean()
            summary['avg_non_carb_dac_benefit'] = non_carb_df['dac_benefit_percentage'].mean()

        if 'is_ev_voucher' in df.columns:
            summary['avg_ev_dac_benefit'] = ev_df['dac_benefit_percentage'].mean()

    # 8. Multi-agency statistics
    if 'is_multi_agency' in df.columns:
        multi_df = df[df['is_multi_agency']]
        single_df = df[~df['is_multi_agency']]

        summary['multi_agency_projects'] = len(multi_df)
        summary['multi_agency_percentage'] = len(multi_df) / len(df) * 100

        if 'num_agencies_in_program' in df.columns:
            summary['avg_agencies_per_program'] = df['num_agencies_in_program'].mean()

        if 'ghg_efficiency' in df.columns:
            valid_multi = multi_df[multi_df['ghg_efficiency'].notna() & (multi_df['ghg_efficiency'] > 0)]
            valid_single = single_df[single_df['ghg_efficiency'].notna() & (single_df['ghg_efficiency'] > 0)]

            if len(valid_multi) > 0:
                summary['median_multi_agency_efficiency'] = valid_multi['ghg_efficiency'].median()

            if len(valid_single) > 0:
                summary['median_single_agency_efficiency'] = valid_single['ghg_efficiency'].median()

        if 'dac_benefit_percentage' in df.columns:
            summary['avg_multi_agency_dac_benefit'] = multi_df['dac_benefit_percentage'].mean()
            summary['avg_single_agency_dac_benefit'] = single_df['dac_benefit_percentage'].mean()

    # 9. Temporal statistics
    if 'period' in df.columns:
        pre_df = df[df['period'] == 'Pre-2020']
        post_df = df[df['period'] == 'Post-2020']

        summary['pre_2020_projects'] = len(pre_df)
        summary['post_2020_projects'] = len(post_df)

        if 'num_agencies_in_program' in df.columns:
            summary['pre_2020_avg_agencies'] = pre_df['num_agencies_in_program'].mean()
            summary['post_2020_avg_agencies'] = post_df['num_agencies_in_program'].mean()
            summary['agency_change_percentage'] = ((post_df['num_agencies_in_program'].mean() -
                                                    pre_df['num_agencies_in_program'].mean()) /
                                                   pre_df['num_agencies_in_program'].mean() * 100) if pre_df['num_agencies_in_program'].mean() > 0 else 0

        if funding_col:
            summary['pre_2020_avg_funding'] = pre_df[funding_col].mean()
            summary['post_2020_avg_funding'] = post_df[funding_col].mean()
            summary['funding_change_percentage'] = ((post_df[funding_col].mean() -
                                                     pre_df[funding_col].mean()) /
                                                    pre_df[funding_col].mean() * 100) if pre_df[funding_col].mean() > 0 else 0

        if 'dac_benefit_percentage' in df.columns:
            summary['pre_2020_avg_dac_benefit'] = pre_df['dac_benefit_percentage'].mean()
            summary['post_2020_avg_dac_benefit'] = post_df['dac_benefit_percentage'].mean()
            summary['dac_change_percentage'] = ((post_df['dac_benefit_percentage'].mean() -
                                                 pre_df['dac_benefit_percentage'].mean()) /
                                                pre_df['dac_benefit_percentage'].mean() * 100) if pre_df['dac_benefit_percentage'].mean() > 0 else 0

    # 10. Regional statistics
    if 'ca_region' in df.columns:
        region_counts = df['ca_region'].value_counts()
        region_percentages = df['ca_region'].value_counts(normalize=True) * 100

        summary['region_counts'] = region_counts.to_dict()
        summary['region_percentages'] = region_percentages.to_dict()

        # Get efficiency and DAC benefit by region
        if 'ghg_efficiency' in df.columns:
            region_efficiency = df.groupby('ca_region')['ghg_efficiency'].median()
            summary['region_efficiency'] = region_efficiency.to_dict()

        if 'dac_benefit_percentage' in df.columns:
            region_dac = df.groupby('ca_region')['dac_benefit_percentage'].mean()
            summary['region_dac_benefit'] = region_dac.to_dict()

    # Save summary if output path provided
    if output_path:
        import json

        output_file = Path(output_path)
        output_file.parent.mkdir(parents=True, exist_ok=True)

        with open(output_file, 'w') as f:
            json.dump(summary, f, indent=2, default=json_serializable)

        logger.info(f"Saved data summary to {output_file}")

        # Also create a readable text version
        text_file = output_file.with_suffix('.txt')

        with open(text_file, 'w') as f:
            f.write("CALIFORNIA CLIMATE INVESTMENTS (CCI) DATA SUMMARY\n")
            f.write("================================================\n\n")

            f.write("DATASET OVERVIEW\n")
            f.write(f"Total Projects: {summary['total_projects']:,}\n")
            f.write(f"Total Agencies: {summary['total_agencies']}\n")
            f.write(f"Total Programs: {summary['total_programs']}\n")
            if 'total_subprograms' in summary:
                f.write(f"Total Subprograms: {summary['total_subprograms']}\n")

            f.write("\nCARB VS NON-CARB BREAKDOWN\n")
            if 'carb_projects' in summary:
                f.write(f"CARB Projects: {summary['carb_projects']:,} ({summary['carb_percentage']:.1f}%)\n")
                f.write(f"Non-CARB Projects: {summary['non_carb_projects']:,} ({100-summary['carb_percentage']:.1f}%)\n")

            if 'carb_funding' in summary:
                f.write(f"CARB Funding: ${summary['carb_funding']:,.2f} ({summary['carb_funding_percentage']:.1f}%)\n")
                f.write(f"Non-CARB Funding: ${summary['non_carb_funding']:,.2f} ({100-summary['carb_funding_percentage']:.1f}%)\n")
                f.write(f"Average CARB Project: ${summary['avg_carb_funding']:,.2f}\n")
                f.write(f"Average Non-CARB Project: ${summary['avg_non_carb_funding']:,.2f}\n")

            if 'carb_ghg_reduction' in summary:
                f.write(f"CARB GHG Reductions: {summary['carb_ghg_reduction']:,.2f} tons ({summary['carb_ghg_percentage']:.1f}%)\n")
                f.write(f"Non-CARB GHG Reductions: {summary['non_carb_ghg_reduction']:,.2f} tons ({100-summary['carb_ghg_percentage']:.1f}%)\n")

            if 'median_carb_efficiency' in summary and 'median_non_carb_efficiency' in summary:
                f.write(f"CARB Efficiency: ${summary['median_carb_efficiency']:,.2f} per ton CO2e\n")
                f.write(f"Non-CARB Efficiency: ${summary['median_non_carb_efficiency']:,.2f} per ton CO2e\n")

            f.write("\nEV VOUCHERS BREAKDOWN\n")
            if 'ev_vouchers' in summary:
                f.write(f"EV Vouchers: {summary['ev_vouchers']:,} ({summary['ev_percentage']:.1f}% of total)\n")
                if 'ev_percentage_of_carb' in summary:
                    f.write(f"Percentage of CARB Projects: {summary['ev_percentage_of_carb']:.1f}%\n")

            if 'ev_funding' in summary:
                f.write(f"EV Funding: ${summary['ev_funding']:,.2f} ({summary['ev_funding_percentage']:.1f}% of total)\n")
                f.write(f"Average Voucher Amount: ${summary['avg_ev_funding']:,.2f}\n")

            if 'ev_ghg_reduction' in summary:
                f.write(f"EV GHG Reductions: {summary['ev_ghg_reduction']:,.2f} tons ({summary['ev_ghg_percentage']:.1f}% of total)\n")

            if 'median_ev_efficiency' in summary:
                f.write(f"EV Efficiency: ${summary['median_ev_efficiency']:,.2f} per ton CO2e\n")

            f.write("\nMULTI-AGENCY COLLABORATION\n")
            if 'multi_agency_projects' in summary:
                f.write(f"Multi-Agency Projects: {summary['multi_agency_projects']:,} ({summary['multi_agency_percentage']:.1f}%)\n")

            if 'avg_agencies_per_program' in summary:
                f.write(f"Average Agencies per Program: {summary['avg_agencies_per_program']:.2f}\n")

            if 'median_multi_agency_efficiency' in summary and 'median_single_agency_efficiency' in summary:
                f.write(f"Multi-Agency Efficiency: ${summary['median_multi_agency_efficiency']:,.2f} per ton CO2e\n")
                f.write(f"Single-Agency Efficiency: ${summary['median_single_agency_efficiency']:,.2f} per ton CO2e\n")

            if 'avg_multi_agency_dac_benefit' in summary and 'avg_single_agency_dac_benefit' in summary:
                f.write(f"Multi-Agency DAC Benefit: {summary['avg_multi_agency_dac_benefit']:.2f}%\n")
                f.write(f"Single-Agency DAC Benefit: {summary['avg_single_agency_dac_benefit']:.2f}%\n")

            f.write("\nTEMPORAL TRENDS (PRE/POST 2020)\n")
            if 'pre_2020_projects' in summary and 'post_2020_projects' in summary:
                f.write(f"Pre-2020 Projects: {summary['pre_2020_projects']:,}\n")
                f.write(f"Post-2020 Projects: {summary['post_2020_projects']:,}\n")

            if 'agency_change_percentage' in summary:
                f.write(f"Change in Average Agencies: {summary['agency_change_percentage']:+.1f}%\n")

            if 'funding_change_percentage' in summary:
                f.write(f"Change in Average Funding: {summary['funding_change_percentage']:+.1f}%\n")

            if 'dac_change_percentage' in summary:
                f.write(f"Change in DAC Benefit: {summary['dac_change_percentage']:+.1f}%\n")

            f.write("\nREGIONAL ANALYSIS\n")
            if 'region_counts' in summary:
                for region, count in sorted(summary['region_counts'].items(), key=lambda x: x[1], reverse=True):
                    f.write(f"{region}: {count:,} projects ({summary['region_percentages'][region]:.1f}%)\n")

            f.write("\nEfficiency by Region ($ per ton CO2e):\n")
            if 'region_efficiency' in summary:
                for region, efficiency in sorted(summary['region_efficiency'].items(), key=lambda x: x[1]):
                    f.write(f"{region}: ${efficiency:,.2f}\n")

            f.write("\nDAC Benefit by Region:\n")
            if 'region_dac_benefit' in summary:
                for region, dac in sorted(summary['region_dac_benefit'].items(), key=lambda x: x[1], reverse=True):
                    f.write(f"{region}: {dac:.2f}%\n")

        logger.info(f"Saved readable summary to {text_file}")

    return summary


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description='Clean and prepare CCI data for analysis')
    parser.add_argument('--input_path', type=str, required=True, help='Path to the input CCI data file')
    parser.add_argument('--output_path', type=str, help='Path to save the cleaned data')
    parser.add_argument('--summary_path', type=str, help='Path to save the data summary')

    args = parser.parse_args()

    # Clean and prepare the data
    cleaned_df = clean_and_prepare_cci_data(args.input_path, args.output_path)

    # Generate summary
    if args.summary_path:
        generate_data_summary(cleaned_df, args.summary_path)
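The fiscal-year parse in step 4 is the most fragile transform above; a small self-contained check of the pd.to_numeric route used in the cleaned code (the values here are illustrative, not drawn from the dataset):

    import pandas as pd

    fy = pd.Series(["2019-20", "2021-22", None])
    # Pull the leading 4-digit year, coerce bad values to <NA>, keep nullable ints
    years = pd.to_numeric(fy.str.extract(r"(\d{4})", expand=False), errors="coerce").astype("Int64")
    print(years.tolist())  # [2019, 2021, <NA>]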
146898
output/cleaned_cci_data.csv
Normal file
File diff suppressed because it is too large
70
output/data_summary.json
Normal file
@@ -0,0 +1,70 @@
{
  "total_projects": 146305,
  "total_agencies": 21,
  "total_programs": 39,
  "total_subprograms": 76,
  "carb_projects": 125581,
  "non_carb_projects": 20724,
  "carb_percentage": 85.83507057175079,
  "ev_vouchers": 109270,
  "ev_percentage": 74.68644270530741,
  "ev_percentage_of_carb": 87.01157022160996,
  "total_funding": 11588544819,
  "avg_funding_per_project": 79208.12562113394,
  "carb_funding": 3372893006,
  "non_carb_funding": 8215651813,
  "carb_funding_percentage": 29.10540588728598,
  "avg_carb_funding": 26858.30663874312,
  "avg_non_carb_funding": 396431.76090523065,
  "ev_funding": 1714222371,
  "ev_funding_percentage": 14.792386773095501,
  "avg_ev_funding": 15687.950681797383,
  "total_ghg_reduction": 112749573,
  "avg_ghg_reduction_per_project": 770.6474351525922,
  "carb_ghg_reduction": 5011819,
  "non_carb_ghg_reduction": 107737754,
  "carb_ghg_percentage": 4.44508911798717,
  "ev_ghg_reduction": 4193168,
  "ev_ghg_percentage": 3.71901009327991,
  "median_ghg_efficiency": 312.5,
  "median_carb_efficiency": 312.5,
  "median_non_carb_efficiency": 197.3,
  "median_ev_efficiency": 312.5,
  "avg_dac_benefit": 1.29624968537629,
  "avg_carb_dac_benefit": 1.3918376975701816,
  "avg_non_carb_dac_benefit": 0.9900241837968561,
  "avg_ev_dac_benefit": 1.39305362075886,
  "multi_agency_projects": 0,
  "multi_agency_percentage": 0.0,
  "avg_agencies_per_program": 1.0,
  "median_single_agency_efficiency": 312.5,
  "avg_multi_agency_dac_benefit": NaN,
  "avg_single_agency_dac_benefit": 1.29624968537629,
  "pre_2020_projects": 144185,
  "post_2020_projects": 2120,
  "pre_2020_avg_agencies": 1.0,
  "post_2020_avg_agencies": 1.0,
  "agency_change_percentage": 0.0,
  "pre_2020_avg_funding": 71885.7690813885,
  "post_2020_avg_funding": 577214.7188679245,
  "funding_change_percentage": 702.9610397774371,
  "pre_2020_avg_dac_benefit": 1.2963895281933258,
  "post_2020_avg_dac_benefit": 0.0,
  "dac_change_percentage": -100.0,
  "region_counts": {
    "Multi-Region": 146221,
    "Bay Area": 84
  },
  "region_percentages": {
    "Multi-Region": 99.94258569426883,
    "Bay Area": 0.05741430573117801
  },
  "region_efficiency": {
    "Bay Area": NaN,
    "Multi-Region": 312.5
  },
  "region_dac_benefit": {
    "Bay Area": NaN,
    "Multi-Region": 1.29624968537629
  }
}
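The literal NaN values above (e.g. avg_multi_agency_dac_benefit) are emitted because json.dump allows them by default; the result is not strict JSON and many parsers reject it. The default= hook in data_cleaning_script.py cannot intercept them, since plain floats are already serializable, so a pre-pass over the summary dict is needed; a minimal sketch:

    import json
    import math

    def sanitize(value):
        # Recursively replace NaN floats with None so the dump is strict JSON
        if isinstance(value, dict):
            return {k: sanitize(v) for k, v in value.items()}
        if isinstance(value, float) and math.isnan(value):
            return None
        return value

    print(json.dumps(sanitize({"avg_multi_agency_dac_benefit": float("nan")})))
    # {"avg_multi_agency_dac_benefit": null}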
53
output/data_summary.txt
Normal file
@@ -0,0 +1,53 @@
CALIFORNIA CLIMATE INVESTMENTS (CCI) DATA SUMMARY
================================================

DATASET OVERVIEW
Total Projects: 146,305
Total Agencies: 21
Total Programs: 39
Total Subprograms: 76

CARB VS NON-CARB BREAKDOWN
CARB Projects: 125,581 (85.8%)
Non-CARB Projects: 20,724 (14.2%)
CARB Funding: $3,372,893,006.00 (29.1%)
Non-CARB Funding: $8,215,651,813.00 (70.9%)
Average CARB Project: $26,858.31
Average Non-CARB Project: $396,431.76
CARB GHG Reductions: 5,011,819.00 tons (4.4%)
Non-CARB GHG Reductions: 107,737,754.00 tons (95.6%)
CARB Efficiency: $312.50 per ton CO2e
Non-CARB Efficiency: $197.30 per ton CO2e

EV VOUCHERS BREAKDOWN
EV Vouchers: 109,270 (74.7% of total)
Percentage of CARB Projects: 87.0%
EV Funding: $1,714,222,371.00 (14.8% of total)
Average Voucher Amount: $15,687.95
EV GHG Reductions: 4,193,168.00 tons (3.7% of total)
EV Efficiency: $312.50 per ton CO2e

MULTI-AGENCY COLLABORATION
Multi-Agency Projects: 0 (0.0%)
Average Agencies per Program: 1.00
Multi-Agency DAC Benefit: nan%
Single-Agency DAC Benefit: 1.30%

TEMPORAL TRENDS (PRE/POST 2020)
Pre-2020 Projects: 144,185
Post-2020 Projects: 2,120
Change in Average Agencies: +0.0%
Change in Average Funding: +703.0%
Change in DAC Benefit: -100.0%

REGIONAL ANALYSIS
Multi-Region: 146,221 projects (99.9%)
Bay Area: 84 projects (0.1%)

Efficiency by Region ($ per ton CO2e):
Bay Area: $nan
Multi-Region: $312.50

DAC Benefit by Region:
Bay Area: nan%
Multi-Region: 1.30%
262
regional_analysis_script.py
Normal file
@@ -0,0 +1,262 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import logging

# Configure logging
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger("regional_analysis")


def analyze_regional_distribution(input_path, output_path=None):
    """
    Analyze the regional distribution of CCI projects and its
    relationship to GHG efficiency and DAC benefits.

    Parameters:
        input_path (str): Path to the cleaned CCI data CSV file
        output_path (str, optional): Path to save findings and visualizations
    """
    logger.info(f"Loading data from {input_path}")

    # Load the data
    df = pd.read_csv(input_path, low_memory=False)

    logger.info(f"Successfully loaded {len(df)} rows with {len(df.columns)} columns")

    # Check if we have the regional data
    if 'ca_region' not in df.columns:
        logger.error("Regional data not found in the dataset")
        return

    # Define output directory if provided
    output_dir = None
    if output_path:
        output_dir = Path(output_path)
        output_dir.mkdir(parents=True, exist_ok=True)

    # 1. Basic regional distribution analysis
    region_counts = df['ca_region'].value_counts()
    region_percent = df['ca_region'].value_counts(normalize=True) * 100

    logger.info("Regional distribution of CCI projects:")
    for region, count in region_counts.items():
        logger.info(f"  {region}: {count} projects ({region_percent[region]:.1f}%)")

    # Visualize regional distribution
    plt.figure(figsize=(10, 6))
    region_counts.plot(kind='bar')
    plt.title('Number of CCI Projects by Region')
    plt.xlabel('Region')
    plt.ylabel('Number of Projects')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()

    if output_dir:
        plt.savefig(output_dir / "regional_distribution.png", dpi=300)
    plt.close()

    # 2. EV vouchers vs non-EV projects by region
    if 'is_ev_voucher' in df.columns:
        ev_by_region = df[df['is_ev_voucher']]['ca_region'].value_counts()
        nonev_by_region = df[~df['is_ev_voucher']]['ca_region'].value_counts()

        # Calculate percentages
        ev_percent = 100 * ev_by_region / ev_by_region.sum()
        nonev_percent = 100 * nonev_by_region / nonev_by_region.sum()

        # Combine for comparison
        comparison_df = pd.DataFrame({
            'EV Vouchers': ev_percent,
            'Non-EV Projects': nonev_percent
        })

        # Fill missing values with 0
        comparison_df = comparison_df.fillna(0)

        # Visualize comparison
        plt.figure(figsize=(12, 6))
        comparison_df.plot(kind='bar')
        plt.title('Regional Distribution: EV Vouchers vs. Non-EV Projects')
        plt.xlabel('Region')
        plt.ylabel('Percentage of Projects')
        plt.xticks(rotation=45, ha='right')
        plt.legend(title='Project Type')
        plt.tight_layout()

        if output_dir:
            plt.savefig(output_dir / "regional_ev_comparison.png", dpi=300)
        plt.close()

    # 3. GHG efficiency by region
    if 'ghg_efficiency' in df.columns:
        # Filter to valid efficiency values and non-extreme outliers
        valid_data = df[(df['ghg_efficiency'].notna()) &
                        (df['ghg_efficiency'] > 0) &
                        (df['ghg_efficiency'] < df['ghg_efficiency'].quantile(0.95))]

        # Calculate median efficiency by region
        efficiency_by_region = valid_data.groupby('ca_region')['ghg_efficiency'].median().sort_values()

        logger.info("GHG efficiency by region ($ per ton CO2e, median):")
        for region, efficiency in efficiency_by_region.items():
            logger.info(f"  {region}: ${efficiency:.2f}")

        # Visualize efficiency by region
        plt.figure(figsize=(10, 6))
        efficiency_by_region.plot(kind='barh')
        plt.title('GHG Efficiency by Region (lower is better)')
        plt.xlabel('GHG Efficiency ($ per ton CO2e)')
        plt.ylabel('Region')
        plt.grid(axis='x', alpha=0.3)
        plt.tight_layout()

        if output_dir:
            plt.savefig(output_dir / "regional_efficiency.png", dpi=300)
        plt.close()

    # 4. DAC benefit by region
    if 'dac_benefit_percentage' in df.columns:
        # Calculate mean DAC benefit by region
        dac_by_region = df.groupby('ca_region')['dac_benefit_percentage'].mean().sort_values(ascending=False)

        logger.info("DAC benefit percentage by region:")
        for region, dac in dac_by_region.items():
            logger.info(f"  {region}: {dac:.2f}%")

        # Visualize DAC benefit by region
        plt.figure(figsize=(10, 6))
        dac_by_region.plot(kind='barh')
        plt.title('DAC Benefit Percentage by Region')
        plt.xlabel('DAC Benefit Percentage')
        plt.ylabel('Region')
        plt.grid(axis='x', alpha=0.3)
        plt.tight_layout()

        if output_dir:
            plt.savefig(output_dir / "regional_dac_benefit.png", dpi=300)
        plt.close()

    # 5. Efficiency vs Equity by Region
    if 'ghg_efficiency' in df.columns and 'dac_benefit_percentage' in df.columns:
        # Filter to valid data
        valid_data = df[(df['ghg_efficiency'].notna()) &
                        (df['dac_benefit_percentage'].notna()) &
                        (df['ghg_efficiency'] > 0) &
                        (df['ghg_efficiency'] < df['ghg_efficiency'].quantile(0.95))]

        # Calculate regional metrics
        region_metrics = valid_data.groupby('ca_region').agg({
            'ghg_efficiency': 'median',
            'dac_benefit_percentage': 'mean',
            'ca_region': 'count'
        }).rename(columns={'ca_region': 'project_count'})

        # Create scatter plot
        plt.figure(figsize=(10, 8))

        plt.scatter(
            region_metrics['ghg_efficiency'],
            region_metrics['dac_benefit_percentage'],
            s=region_metrics['project_count'] / 10,  # Size based on project count
            alpha=0.7
        )

        # Add region labels
        for region in region_metrics.index:
            plt.annotate(
                region,
                (region_metrics.loc[region, 'ghg_efficiency'],
                 region_metrics.loc[region, 'dac_benefit_percentage']),
                textcoords="offset points",
                xytext=(5, 5),
                ha='left'
            )

        # Add quadrant lines
        median_efficiency = region_metrics['ghg_efficiency'].median()
        median_dac = region_metrics['dac_benefit_percentage'].median()

        plt.axvline(x=median_efficiency, color='gray', linestyle='--', alpha=0.5)
        plt.axhline(y=median_dac, color='gray', linestyle='--', alpha=0.5)

        # Add quadrant labels
        plt.text(0.98, 0.98, 'High Cost,\nHigh Equity', transform=plt.gca().transAxes,
                 ha='right', va='top', bbox=dict(facecolor='white', alpha=0.7))
        plt.text(0.02, 0.98, 'Low Cost,\nHigh Equity', transform=plt.gca().transAxes,
                 ha='left', va='top', bbox=dict(facecolor='white', alpha=0.7))
        plt.text(0.98, 0.02, 'High Cost,\nLow Equity', transform=plt.gca().transAxes,
                 ha='right', va='bottom', bbox=dict(facecolor='white', alpha=0.7))
        plt.text(0.02, 0.02, 'Low Cost,\nLow Equity', transform=plt.gca().transAxes,
                 ha='left', va='bottom', bbox=dict(facecolor='white', alpha=0.7))

        plt.xlabel('GHG Efficiency ($ per ton CO2e)')
        plt.ylabel('DAC Benefit Percentage')
        plt.title('Efficiency vs. Equity by Region')
        plt.grid(True, linestyle='--', alpha=0.7)

        if output_dir:
            plt.savefig(output_dir / "regional_efficiency_equity.png", dpi=300)
        plt.close()

    # 6. Generate a summary text file
    if output_dir:
        with open(output_dir / "regional_analysis_summary.txt", 'w') as f:
            f.write("CALIFORNIA CLIMATE INVESTMENTS (CCI) REGIONAL ANALYSIS\n")
            f.write("===================================================\n\n")

            f.write("REGIONAL DISTRIBUTION\n")
            for region, count in region_counts.items():
                f.write(f"{region}: {count} projects ({region_percent[region]:.1f}%)\n")

            if 'ghg_efficiency' in df.columns:
                f.write("\nGHG EFFICIENCY BY REGION ($ PER TON CO2E, MEDIAN)\n")
                for region, efficiency in efficiency_by_region.items():
                    f.write(f"{region}: ${efficiency:.2f}\n")

            if 'dac_benefit_percentage' in df.columns:
                f.write("\nDAC BENEFIT PERCENTAGE BY REGION\n")
                for region, dac in dac_by_region.items():
                    f.write(f"{region}: {dac:.2f}%\n")

            f.write("\nKEY FINDINGS\n")

            # Add key findings based on the analysis
            if 'ghg_efficiency' in df.columns and 'dac_benefit_percentage' in df.columns:
                # Identify top performing regions (efficiency_by_region is sorted
                # ascending, so index[0] is the lowest cost per ton)
                best_efficiency_region = efficiency_by_region.index[0]
                best_dac_region = dac_by_region.index[0]

                f.write(f"1. {best_efficiency_region} achieves the best GHG efficiency (${efficiency_by_region[best_efficiency_region]:.2f} per ton).\n")
                f.write(f"2. {best_dac_region} achieves the highest DAC benefit ({dac_by_region[best_dac_region]:.2f}%).\n")

                # Identify balanced regions (good in both dimensions)
                low_cost_high_equity = region_metrics[(region_metrics['ghg_efficiency'] < median_efficiency) &
                                                      (region_metrics['dac_benefit_percentage'] > median_dac)]

                if len(low_cost_high_equity) > 0:
                    top_balanced = low_cost_high_equity.index[0]
                    f.write(f"3. {top_balanced} achieves the best balance between efficiency and equity.\n")

                # Check for regional disparities
                max_efficiency_diff = efficiency_by_region.max() / efficiency_by_region.min() if efficiency_by_region.min() > 0 else 0
                max_dac_diff = dac_by_region.max() - dac_by_region.min()

                f.write(f"4. Regional disparities: {max_efficiency_diff:.1f}x variation in efficiency, {max_dac_diff:.1f} percentage point variation in DAC benefits.\n")

        logger.info(f"Saved regional analysis summary to {output_dir / 'regional_analysis_summary.txt'}")

    logger.info("Regional analysis completed")


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description='Analyze regional distribution of CCI projects')
    parser.add_argument('--input_path', type=str, required=True, help='Path to the cleaned CCI data CSV file')
    parser.add_argument('--output_path', type=str, help='Path to save findings and visualizations')

    args = parser.parse_args()

    analyze_regional_distribution(args.input_path, args.output_path)
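The regional results this script produces are dominated by the cleaning step's program-level overwrite: every project in a program that spans regions is relabeled 'Multi-Region', which is how 99.9% of rows land in that bucket in output/data_summary.txt. A sketch of an alternative that preserves the per-project county signal, assuming the county_to_region mapping from data_cleaning_script.py is in scope (not committed code):

    import numpy as np

    # Keep the county-derived region on each project row...
    df['ca_region_project'] = df['county'].map(county_to_region)
    # ...and record program scope in a separate column instead of overwriting.
    regions_per_program = df.groupby('program_name')['ca_region_project'].nunique()
    df['program_scope'] = np.where(df['program_name'].map(regions_per_program) > 1,
                                   'Multi-Region', 'Single-Region')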
2209
research_analysis_script.py
Normal file
File diff suppressed because it is too large
122
run_cci_analysis.py
Normal file
@@ -0,0 +1,122 @@
#!/usr/bin/env python3
"""
California Climate Investments (CCI) Collaboration Analysis Workflow

This script runs the complete workflow for analyzing collaboration patterns
in California's Climate Investments program and their impact on greenhouse
gas reduction efficiency and equity outcomes.

Usage:
    python run_cci_analysis.py --data_path data/cci_programs_data_reduced.csv --output_dir output
"""

import argparse
import logging
from pathlib import Path

# Configure logging
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger("cci_workflow")


def main():
    """Run the complete CCI collaboration analysis workflow."""
    parser = argparse.ArgumentParser(description='Run CCI Collaboration Analysis Workflow')
    parser.add_argument('--data_path', type=str, required=True, help='Path to the raw CCI data CSV file')
    parser.add_argument('--output_dir', type=str, default='./output', help='Directory to save all outputs')
    parser.add_argument('--skip_cleaning', action='store_true', help='Skip the data cleaning step')
    parser.add_argument('--skip_analysis', action='store_true', help='Skip the detailed analysis step')
    parser.add_argument('--skip_research', action='store_true', help='Skip the research questions analysis')

    args = parser.parse_args()

    # Create output directory
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # File paths
    raw_data_path = args.data_path
    cleaned_data_path = output_dir / "cleaned_cci_data.csv"
    data_summary_path = output_dir / "data_summary.json"

    # Create subdirectories for different analysis outputs
    cleaned_output_dir = output_dir / "cleaned"
    analysis_output_dir = output_dir / "analysis"
    research_output_dir = output_dir / "research"

    for directory in [cleaned_output_dir, analysis_output_dir, research_output_dir]:
        directory.mkdir(parents=True, exist_ok=True)

    # Step 1: Clean and prepare the data
    if not args.skip_cleaning:
        logger.info("Step 1: Cleaning and preparing the CCI data")

        try:
            from data_cleaning_script import clean_and_prepare_cci_data, generate_data_summary

            # Clean and prepare the data
            cleaned_df = clean_and_prepare_cci_data(raw_data_path, cleaned_data_path)

            # Generate data summary
            generate_data_summary(cleaned_df, data_summary_path)

            logger.info(f"Data cleaning complete. Cleaned data saved to {cleaned_data_path}")
            logger.info(f"Data summary saved to {data_summary_path}")

        except Exception as e:
            logger.error(f"Error in data cleaning step: {e}")
            return
    else:
        logger.info("Skipping data cleaning step")
        # Check if cleaned data exists
        if not cleaned_data_path.exists():
            logger.error(f"Cleaned data file {cleaned_data_path} not found. Cannot proceed without data.")
            return

    # Step 2: Run the detailed collaboration analysis
    if not args.skip_analysis:
        logger.info("Step 2: Running detailed collaboration analysis")

        try:
            from cci_collaboration_analysis import CCICollaborationAnalyzer

            # Initialize the analyzer
            analyzer = CCICollaborationAnalyzer(cleaned_data_path, str(analysis_output_dir))

            # Run full analysis
            analyzer.run_full_analysis()

            logger.info(f"Detailed analysis complete. Results saved to {analysis_output_dir}")

        except Exception as e:
            logger.error(f"Error in detailed analysis step: {e}")
            logger.error("Continuing to research analysis with available data...")
    else:
        logger.info("Skipping detailed analysis step")

    # Step 3: Analyze specific research questions
    if not args.skip_research:
        logger.info("Step 3: Analyzing research questions")

        try:
            from research_analysis_script import analyze_research_questions

            # Run research analysis
            findings = analyze_research_questions(cleaned_data_path, str(research_output_dir))

            if findings:
                logger.info(f"Research analysis complete. Results saved to {research_output_dir}")
            else:
                logger.error("Research analysis failed to complete successfully")

        except Exception as e:
            logger.error(f"Error in research analysis step: {e}")
    else:
        logger.info("Skipping research analysis step")

    logger.info("CCI Collaboration Analysis Workflow complete!")


if __name__ == "__main__":
    main()
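Once a first run has written output/cleaned_cci_data.csv, the skip flags defined above let the later steps be re-run without repeating the cleaning, for example:

    python run_cci_analysis.py --data_path data/cci_programs_data_reduced.csv --output_dir output --skip_cleaning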