# File-viewer metadata (extraction residue): 125 lines, 5.8 KiB, Python
import pandas as pd
|
|
import numpy as np
|
|
from pathlib import Path
|
|
import logging
|
|
|
|
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger("collaboration_detection")

# Columns the agency/program analysis cannot run without.
_REQUIRED_COLUMNS = ('agency_name', 'program_name')

# Text columns searched for collaboration-related wording
# (add other potential columns here).
_COLLAB_COLUMNS = ('agency_name', 'program_name', 'sub_program_name')

# Case-insensitive substrings that might indicate collaboration.
_COLLAB_TERMS = ('collab', 'partner', 'joint', 'multi', 'together', 'coop')

# Static follow-up suggestions appended to every written report.
_RECOMMENDATIONS = (
    "1. Consider using a different approach to identify multi-agency programs:",
    " - Look at sub_program_name for indicators of collaboration",
    " - Check if there are text fields with partnership information",
    " - Consider if programs can have multiple sub-agencies that aren't captured in agency_name",
    "2. Explore if collaboration occurs at higher levels (program level) rather than project level",
    "3. Check if some agencies have been merged or renamed in the data",
    "4. For temporal analysis, check if collaboration patterns changed over time",
    "5. Consider if collaboration is defined by regions/jurisdictions rather than state agencies",
)


def _summarize_names(names, limit):
    """Join *names* with ", ", keeping at most *limit* entries plus a trailing "..."."""
    if len(names) > limit:
        return ", ".join(names[:limit]) + "..."
    return ", ".join(names)


def _agencies_by_program(df, programs):
    """Map each program in *programs* to its non-null agencies, in order of first appearance."""
    if len(programs) == 0:
        return {}
    # One filtered group-by pass instead of a full-frame boolean filter per program.
    rows = df[df['program_name'].isin(programs)]
    return {
        program: [str(name) for name in names.dropna().unique()]
        for program, names in rows.groupby('program_name')['agency_name']
    }


def _find_term_matches(column, terms):
    """Return ``(term, row_count)`` for every term occurring in at least one row of *column*."""
    found = []
    for term in terms:
        # The terms are plain substrings, so skip regex interpretation entirely.
        hits = int(column.str.contains(term, case=False, na=False, regex=False).sum())
        if hits > 0:
            found.append((term, hits))
    return found


def _write_report(output_path, agencies, multi_agency_programs, program_agencies):
    """Write the plain-text findings report to *output_path* (UTF-8)."""
    lines = [
        "CALIFORNIA CLIMATE INVESTMENTS (CCI) COLLABORATION DETECTION ANALYSIS\n",
        "==================================================================\n\n",
        "AGENCY ANALYSIS\n",
        f"Found {len(agencies)} unique agencies\n",
        "Agencies: " + _summarize_names(agencies, 20) + "\n\n",
        "MULTI-AGENCY PROGRAM ANALYSIS\n",
        f"Found {len(multi_agency_programs)} programs with multiple agencies\n\n",
    ]

    if len(multi_agency_programs) > 0:
        lines.append("Multi-agency programs:\n")
        for program, count in multi_agency_programs.items():
            names = ", ".join(program_agencies.get(program, []))
            lines.append(f" {program}: {count} agencies ({names})\n")
    else:
        lines.append("No multi-agency programs found using current detection method\n")

    lines.append("\nRECOMMENDATIONS\n")
    lines.extend(f"{rec}\n" for rec in _RECOMMENDATIONS)

    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(lines)


def investigate_collaboration(input_path, output_path=None):
    """
    Investigate potential collaboration patterns in the CCI data
    that might not be captured by the current approach.

    The analysis logs (and optionally writes to a report) the unique
    agencies, the programs associated with more than one agency, the
    collaboration-related terms found in selected text columns, and the
    funding-related column names.

    Parameters:
        input_path (str): Path to the CCI data CSV file
        output_path (str, optional): Path to save findings

    Returns:
        int: Number of programs associated with more than one agency

    Raises:
        KeyError: If the CSV lacks the 'agency_name' or 'program_name' column
    """
    logger.info("Loading data from %s", input_path)

    # Load the data
    df = pd.read_csv(input_path, low_memory=False)
    logger.info("Successfully loaded %d rows with %d columns", len(df), len(df.columns))

    missing = [col for col in _REQUIRED_COLUMNS if col not in df.columns]
    if missing:
        raise KeyError(f"Input data is missing required column(s): {', '.join(missing)}")

    # 1. Look at unique agency_name values.
    # Missing values are dropped (matching nunique() below) and names are
    # coerced to str so sorting/joining cannot fail on NaN.
    agencies = sorted(str(name) for name in df['agency_name'].dropna().unique())
    logger.info("Found %d unique agencies", len(agencies))
    logger.info("Agencies: %s", _summarize_names(agencies, 10))

    # 2. Look at how agency_name is associated with program_name
    program_agency_counts = df.groupby('program_name')['agency_name'].nunique()

    # Look for programs with multiple agencies
    multi_agency_programs = program_agency_counts[program_agency_counts > 1]
    # Kept in its own name: the dataset-wide `agencies` list must survive
    # for the report.
    program_agencies = _agencies_by_program(df, multi_agency_programs.index)
    logger.info("Found %d programs with multiple agencies", len(multi_agency_programs))

    if len(multi_agency_programs) > 0:
        logger.info("Multi-agency programs:")
        for program, count in multi_agency_programs.items():
            logger.info(
                " %s: %s agencies (%s)",
                program, count, ", ".join(program_agencies.get(program, [])),
            )

    # 3. Look for other potential indicators of collaboration:
    # search for collaboration terms across the relevant text columns.
    for col in _COLLAB_COLUMNS:
        if col not in df.columns:
            continue
        column = df[col]
        # Accept both classic object columns and pandas string dtypes.
        if not (pd.api.types.is_object_dtype(column)
                or pd.api.types.is_string_dtype(column)):
            continue

        matches = _find_term_matches(column, _COLLAB_TERMS)
        if matches:
            logger.info("Found potential collaboration indicators in column '%s':", col)
            for term, count in matches:
                logger.info(" Term '%s': %d matches", term, count)

    # 4. Look for potential co-funding patterns
    # Sometimes collaboration is indicated by multiple funding sources
    funding_cols = [str(col) for col in df.columns if 'funding' in str(col).lower()]
    logger.info(
        "Found %d funding-related columns: %s",
        len(funding_cols), ", ".join(funding_cols),
    )

    # 5. Save the findings together with the recommendations
    if output_path:
        _write_report(output_path, agencies, multi_agency_programs, program_agencies)
        logger.info("Saved collaboration detection analysis to %s", output_path)

    # Return the number of multi-agency programs
    return len(multi_agency_programs)
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: parse the CSV path (and optional report
    # path), then run the collaboration analysis once.
    import argparse

    cli = argparse.ArgumentParser(
        description='Investigate collaboration patterns in CCI data',
    )
    cli.add_argument(
        '--input_path',
        type=str,
        required=True,
        help='Path to the CCI data CSV file',
    )
    cli.add_argument(
        '--output_path',
        type=str,
        help='Path to save findings',
    )
    options = cli.parse_args()

    investigate_collaboration(options.input_path, options.output_path)