This commit is contained in:
2025-04-10 00:03:30 -07:00
parent 81ec68b3cc
commit 03ae352949
12 changed files with 150373 additions and 0 deletions

View File

@@ -0,0 +1,125 @@
import pandas as pd
import numpy as np
from pathlib import Path
import logging
# Module-wide logging setup: INFO and above, each record prefixed with the
# timestamp, logger name and severity.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Named logger used by every function in this script.
logger = logging.getLogger("collaboration_detection")
def _format_agency_list(agency_names, limit):
    """Return an ``Agencies: a, b, ...`` line listing at most *limit* sorted names.

    The ``Agencies: `` prefix is always present; ``...`` is appended only when
    the sorted list had to be truncated.
    """
    ordered = sorted(agency_names)
    listing = ", ".join(ordered[:limit])
    if len(ordered) > limit:
        listing += "..."
    return "Agencies: " + listing


def _build_report(all_agencies, multi_agency_programs, agencies_by_program,
                  recommendations):
    """Assemble the plain-text findings report and return it as one string."""
    lines = [
        "CALIFORNIA CLIMATE INVESTMENTS (CCI) COLLABORATION DETECTION ANALYSIS\n",
        "==================================================================\n\n",
        "AGENCY ANALYSIS\n",
        f"Found {len(all_agencies)} unique agencies\n",
        _format_agency_list(all_agencies, 20) + "\n\n",
        "MULTI-AGENCY PROGRAM ANALYSIS\n",
        f"Found {len(multi_agency_programs)} programs with multiple agencies\n\n",
    ]
    if len(multi_agency_programs) > 0:
        lines.append("Multi-agency programs:\n")
        for program, count in multi_agency_programs.items():
            names = ', '.join(agencies_by_program[program])
            lines.append(f" {program}: {count} agencies ({names})\n")
    else:
        lines.append("No multi-agency programs found using current detection method\n")
    lines.append("\nRECOMMENDATIONS\n")
    lines.extend(f"{rec}\n" for rec in recommendations)
    return "".join(lines)


def investigate_collaboration(input_path, output_path=None):
    """
    Investigate potential collaboration patterns in the CCI data
    that might not be captured by the current approach.

    The CSV is loaded with pandas, then the function logs (and optionally
    writes to a text report): the distinct ``agency_name`` values, every
    ``program_name`` associated with more than one agency, how often a set of
    collaboration-related terms occurs in selected text columns, the
    funding-related column names, and a fixed list of recommendations.

    Parameters:
    input_path (str): Path to the CCI data CSV file
    output_path (str, optional): Path to save findings. When falsy, no
        report file is written.

    Returns:
    int: Number of programs associated with more than one agency.

    Raises:
    KeyError: If the CSV has no ``agency_name`` or ``program_name`` column.
    """
    logger.info(f"Loading data from {input_path}")
    df = pd.read_csv(input_path, low_memory=False)
    logger.info(f"Successfully loaded {len(df)} rows with {len(df.columns)} columns")

    # 1. Distinct agencies across the whole dataset. Missing values are
    # dropped and names coerced to str so sorting/joining cannot fail on NaN.
    # This list keeps its own name: it must not be overwritten by the
    # per-program lookups below, because the report needs the global list.
    all_agencies = [str(name) for name in df['agency_name'].dropna().unique()]
    logger.info(f"Found {len(all_agencies)} unique agencies")
    logger.info(_format_agency_list(all_agencies, 10))

    # 2. Programs associated with more than one distinct agency
    # (nunique ignores missing agency names).
    program_agency_counts = df.groupby('program_name')['agency_name'].nunique()
    multi_agency_programs = program_agency_counts[program_agency_counts > 1]
    logger.info(f"Found {len(multi_agency_programs)} programs with multiple agencies")

    # Resolve the agency names of each multi-agency program once, in order of
    # first appearance, so the log and the report share the same data and the
    # frame is not re-filtered per program.
    multi_rows = df[df['program_name'].isin(multi_agency_programs.index)]
    agencies_by_program = {
        program: [str(name) for name in names.dropna().unique()]
        for program, names in multi_rows.groupby('program_name')['agency_name']
    }
    if len(multi_agency_programs) > 0:
        logger.info("Multi-agency programs:")
        for program, count in multi_agency_programs.items():
            names = ', '.join(agencies_by_program[program])
            logger.info(f" {program}: {count} agencies ({names})")

    # 3. Look for other potential indicators of collaboration
    potential_collab_indicators = [
        'agency_name', 'program_name', 'sub_program_name',
        # Add other potential columns here
    ]
    collab_terms = ['collab', 'partner', 'joint', 'multi', 'together', 'coop']
    for col in potential_collab_indicators:
        if col not in df.columns:
            continue
        column = df[col]
        # Accept both object columns and pandas' dedicated string dtype.
        if not (pd.api.types.is_object_dtype(column)
                or pd.api.types.is_string_dtype(column)):
            continue
        matches = []
        for term in collab_terms:
            # Terms are literal substrings, so regex matching is disabled.
            hits = int(column.str.contains(term, case=False, na=False, regex=False).sum())
            if hits > 0:
                matches.append((term, hits))
        if matches:
            logger.info(f"Found potential collaboration indicators in column '{col}':")
            for term, count in matches:
                logger.info(f" Term '{term}': {count} matches")

    # 4. Look for potential co-funding patterns
    # Sometimes collaboration is indicated by multiple funding sources
    funding_cols = [col for col in df.columns if 'funding' in col.lower()]
    logger.info(f"Found {len(funding_cols)} funding-related columns: {', '.join(funding_cols)}")

    # 5. Generate a set of recommendations for identifying collaboration
    recommendations = [
        "1. Consider using a different approach to identify multi-agency programs:",
        " - Look at sub_program_name for indicators of collaboration",
        " - Check if there are text fields with partnership information",
        " - Consider if programs can have multiple sub-agencies that aren't captured in agency_name",
        "2. Explore if collaboration occurs at higher levels (program level) rather than project level",
        "3. Check if some agencies have been merged or renamed in the data",
        "4. For temporal analysis, check if collaboration patterns changed over time",
        "5. Consider if collaboration is defined by regions/jurisdictions rather than state agencies"
    ]

    if output_path:
        report = _build_report(
            all_agencies, multi_agency_programs, agencies_by_program, recommendations
        )
        # Explicit encoding so non-ASCII names do not depend on the locale.
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(report)
        logger.info(f"Saved collaboration detection analysis to {output_path}")

    # Return the number of multi-agency programs
    return len(multi_agency_programs)
if __name__ == "__main__":
    # Script entry point: read the CSV path (required) and the optional
    # report path from the command line, then run the analysis.
    import argparse

    cli = argparse.ArgumentParser(
        description='Investigate collaboration patterns in CCI data',
    )
    cli.add_argument(
        '--input_path',
        type=str,
        required=True,
        help='Path to the CCI data CSV file',
    )
    cli.add_argument(
        '--output_path',
        type=str,
        help='Path to save findings',
    )
    options = cli.parse_args()
    investigate_collaboration(options.input_path, options.output_path)