import logging
from pathlib import Path

import numpy as np
import pandas as pd

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger("collaboration_detection")


def _agency_summary(names, limit):
    """Return "Agencies: a, b, c", truncated to `limit` names plus "..." when longer."""
    ordered = sorted(names)
    listing = ", ".join(ordered[:limit])
    if len(ordered) > limit:
        listing += "..."
    # The prefix is added unconditionally; the original conditional expression
    # dropped it whenever the list was short enough not to be truncated.
    return "Agencies: " + listing


def investigate_collaboration(input_path, output_path=None):
    """
    Investigate potential collaboration patterns in the CCI data that might
    not be captured by the current approach.

    The CSV must contain 'agency_name' and 'program_name' columns. Findings
    are logged and, when output_path is given, also written there as a
    plain-text report.

    Parameters:
        input_path (str): Path to the CCI data CSV file
        output_path (str, optional): Path to save findings

    Returns:
        int: Number of programs associated with more than one agency
    """
    logger.info(f"Loading data from {input_path}")

    # Load the data
    df = pd.read_csv(input_path, low_memory=False)
    logger.info(f"Successfully loaded {len(df)} rows with {len(df.columns)} columns")

    # 1. Look at unique agency_name values.
    # Missing names are dropped and values coerced to str so that sorting and
    # joining cannot fail on NaN / non-string entries.
    all_agencies = list(df['agency_name'].dropna().astype(str).unique())
    logger.info(f"Found {len(all_agencies)} unique agencies")
    logger.info(_agency_summary(all_agencies, 10))

    # 2. Look at how agency_name is associated with program_name
    program_agency_counts = df.groupby('program_name')['agency_name'].nunique()

    # Look for programs with multiple agencies
    multi_agency_programs = program_agency_counts[program_agency_counts > 1]
    logger.info(f"Found {len(multi_agency_programs)} programs with multiple agencies")

    # Per-program agency lists are computed once and kept under their own name
    # so they never overwrite the dataset-wide agency list used in the report.
    program_agencies = {
        program: [
            str(name)
            for name in df.loc[df['program_name'] == program, 'agency_name'].dropna().unique()
        ]
        for program in multi_agency_programs.index
    }

    if len(multi_agency_programs) > 0:
        logger.info("Multi-agency programs:")
        for program, count in multi_agency_programs.items():
            logger.info(f" {program}: {count} agencies ({', '.join(program_agencies[program])})")

    # 3. Look for other potential indicators of collaboration
    # Check if there are other columns that might indicate collaboration
    potential_collab_indicators = [
        'agency_name',
        'program_name',
        'sub_program_name',
        # Add other potential columns here
    ]

    # Look for terms that might indicate collaboration
    collab_terms = ['collab', 'partner', 'joint', 'multi', 'together', 'coop']

    # Search for collaboration terms across relevant columns
    for col in potential_collab_indicators:
        if col not in df.columns:
            continue
        col_dtype = df[col].dtype
        # Accept both classic object columns and pandas' dedicated string dtype.
        if not (col_dtype == 'object' or isinstance(col_dtype, pd.StringDtype)):
            continue

        matches = []
        for term in collab_terms:
            # Terms are literal substrings, not patterns, hence regex=False.
            hits = int(df[col].str.contains(term, case=False, na=False, regex=False).sum())
            if hits > 0:
                matches.append((term, hits))

        if matches:
            logger.info(f"Found potential collaboration indicators in column '{col}':")
            for term, count in matches:
                logger.info(f" Term '{term}': {count} matches")

    # 4. Look for potential co-funding patterns
    # Sometimes collaboration is indicated by multiple funding sources
    funding_cols = [col for col in df.columns if 'funding' in col.lower()]
    logger.info(f"Found {len(funding_cols)} funding-related columns: {', '.join(funding_cols)}")

    # 5. Generate a set of recommendations for identifying collaboration
    recommendations = [
        "1. Consider using a different approach to identify multi-agency programs:",
        " - Look at sub_program_name for indicators of collaboration",
        " - Check if there are text fields with partnership information",
        " - Consider if programs can have multiple sub-agencies that aren't captured in agency_name",
        "2. Explore if collaboration occurs at higher levels (program level) rather than project level",
        "3. Check if some agencies have been merged or renamed in the data",
        "4. For temporal analysis, check if collaboration patterns changed over time",
        "5. Consider if collaboration is defined by regions/jurisdictions rather than state agencies",
    ]

    if output_path:
        report = [
            "CALIFORNIA CLIMATE INVESTMENTS (CCI) COLLABORATION DETECTION ANALYSIS\n",
            "==================================================================\n\n",
            "AGENCY ANALYSIS\n",
            f"Found {len(all_agencies)} unique agencies\n",
            _agency_summary(all_agencies, 20) + "\n\n",
            "MULTI-AGENCY PROGRAM ANALYSIS\n",
            f"Found {len(multi_agency_programs)} programs with multiple agencies\n\n",
        ]

        if len(multi_agency_programs) > 0:
            report.append("Multi-agency programs:\n")
            for program, count in multi_agency_programs.items():
                report.append(
                    f" {program}: {count} agencies ({', '.join(program_agencies[program])})\n"
                )
        else:
            report.append("No multi-agency programs found using current detection method\n")

        report.append("\nRECOMMENDATIONS\n")
        report.extend(f"{rec}\n" for rec in recommendations)

        # Explicit encoding so agency/program names with non-ASCII characters
        # are written identically on every platform.
        with open(output_path, 'w', encoding='utf-8') as f:
            f.writelines(report)

        logger.info(f"Saved collaboration detection analysis to {output_path}")

    # Return the number of multi-agency programs
    return len(multi_agency_programs)


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description='Investigate collaboration patterns in CCI data')
    parser.add_argument('--input_path', type=str, required=True, help='Path to the CCI data CSV file')
    parser.add_argument('--output_path', type=str, help='Path to save findings')

    args = parser.parse_args()

    investigate_collaboration(args.input_path, args.output_path)