# california-equity-git/collaboration_detection_script.py

import pandas as pd
import numpy as np
from pathlib import Path
import logging

# Configure logging
logging.basicConfig(level=logging.INFO,
                   format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger("collaboration_detection")


def _unique_names(series):
    """Return the distinct non-null values of `series` as strings, in first-seen order."""
    # Dropping NaN and casting to str keeps sorted() and str.join() from
    # raising TypeError on missing or non-string cells.
    return series.dropna().astype(str).unique().tolist()


def _format_agency_list(names, limit):
    """Return an "Agencies: ..." line listing at most `limit` sorted names."""
    ordered = sorted(names)
    if len(ordered) > limit:
        return "Agencies: " + ", ".join(ordered[:limit]) + "..."
    return "Agencies: " + ", ".join(ordered)


def investigate_collaboration(input_path, output_path=None):
    """
    Investigate potential collaboration patterns in the CCI data
    that might not be captured by the current approach.

    The function logs (and optionally writes to a text report):
      - the distinct values of the 'agency_name' column,
      - every 'program_name' associated with more than one distinct agency,
      - how often a fixed list of collaboration-related terms appears in
        'agency_name', 'program_name' and 'sub_program_name' (log only),
      - the columns whose name contains "funding" (log only),
      - a static list of follow-up recommendations (report only).

    Parameters:
        input_path (str): Path to the CCI data CSV file
        output_path (str, optional): Path to save findings

    Returns:
        int: Number of programs associated with more than one agency.

    Raises:
        KeyError: If the CSV lacks an 'agency_name' or 'program_name' column.
    """
    logger.info(f"Loading data from {input_path}")

    # Load the data
    df = pd.read_csv(input_path, low_memory=False)

    logger.info(f"Successfully loaded {len(df)} rows with {len(df.columns)} columns")

    # 1. Look at unique agency_name values
    # Kept under its own name so the per-program lookups below cannot
    # overwrite the dataset-wide list that the report needs later.
    all_agencies = _unique_names(df['agency_name'])
    logger.info(f"Found {len(all_agencies)} unique agencies")
    logger.info(_format_agency_list(all_agencies, 10))

    # 2. Look at how agency_name is associated with program_name
    program_agency_counts = df.groupby('program_name')['agency_name'].nunique()

    # Look for programs with multiple agencies
    multi_agency_programs = program_agency_counts[program_agency_counts > 1]
    logger.info(f"Found {len(multi_agency_programs)} programs with multiple agencies")

    # Build each program's summary line once; it is reused for both the log
    # output and the optional report.
    multi_agency_lines = []
    for program, count in multi_agency_programs.items():
        members = _unique_names(df.loc[df['program_name'] == program, 'agency_name'])
        multi_agency_lines.append(f"  {program}: {count} agencies ({', '.join(members)})")

    if multi_agency_lines:
        logger.info("Multi-agency programs:")
        for line in multi_agency_lines:
            logger.info(line)

    # 3. Look for other potential indicators of collaboration
    # Check if there are other columns that might indicate collaboration
    potential_collab_indicators = [
        'agency_name', 'program_name', 'sub_program_name',
        # Add other potential columns here
    ]

    # Look for terms that might indicate collaboration
    collab_terms = ['collab', 'partner', 'joint', 'multi', 'together', 'coop']

    # Search for collaboration terms across relevant columns
    for col in potential_collab_indicators:
        # is_string_dtype accepts both object columns and pandas' dedicated
        # string dtypes, so text columns are searched regardless of how
        # read_csv chose to store them.
        if col not in df.columns or not pd.api.types.is_string_dtype(df[col].dtype):
            continue

        matches = []
        for term in collab_terms:
            hit_count = int(df[col].str.contains(term, case=False, na=False).sum())
            if hit_count > 0:
                matches.append((term, hit_count))

        if matches:
            logger.info(f"Found potential collaboration indicators in column '{col}':")
            for term, count in matches:
                logger.info(f"  Term '{term}': {count} matches")

    # 4. Look for potential co-funding patterns
    # Sometimes collaboration is indicated by multiple funding sources
    funding_cols = [col for col in df.columns if 'funding' in col.lower()]
    logger.info(f"Found {len(funding_cols)} funding-related columns: {', '.join(funding_cols)}")

    # 5. Generate a set of recommendations for identifying collaboration
    recommendations = [
        "1. Consider using a different approach to identify multi-agency programs:",
        "   - Look at sub_program_name for indicators of collaboration",
        "   - Check if there are text fields with partnership information",
        "   - Consider if programs can have multiple sub-agencies that aren't captured in agency_name",
        "2. Explore if collaboration occurs at higher levels (program level) rather than project level",
        "3. Check if some agencies have been merged or renamed in the data",
        "4. For temporal analysis, check if collaboration patterns changed over time",
        "5. Consider if collaboration is defined by regions/jurisdictions rather than state agencies"
    ]

    if output_path:
        # Explicit UTF-8 so non-ASCII agency/program names are written the
        # same way on every platform.
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write("CALIFORNIA CLIMATE INVESTMENTS (CCI) COLLABORATION DETECTION ANALYSIS\n")
            f.write("==================================================================\n\n")

            f.write("AGENCY ANALYSIS\n")
            f.write(f"Found {len(all_agencies)} unique agencies\n")
            f.write(_format_agency_list(all_agencies, 20) + "\n\n")

            f.write("MULTI-AGENCY PROGRAM ANALYSIS\n")
            f.write(f"Found {len(multi_agency_programs)} programs with multiple agencies\n\n")

            if multi_agency_lines:
                f.write("Multi-agency programs:\n")
                for line in multi_agency_lines:
                    f.write(f"{line}\n")
            else:
                f.write("No multi-agency programs found using current detection method\n")

            f.write("\nRECOMMENDATIONS\n")
            for rec in recommendations:
                f.write(f"{rec}\n")

        logger.info(f"Saved collaboration detection analysis to {output_path}")

    # Return the number of multi-agency programs
    return len(multi_agency_programs)

if __name__ == "__main__":
    # Command-line entry point: parse the CSV location (required) and the
    # optional report destination, then run the investigation.
    import argparse

    cli = argparse.ArgumentParser(
        description='Investigate collaboration patterns in CCI data',
    )
    cli.add_argument(
        '--input_path',
        type=str,
        required=True,
        help='Path to the CCI data CSV file',
    )
    cli.add_argument(
        '--output_path',
        type=str,
        help='Path to save findings',
    )

    options = cli.parse_args()

    investigate_collaboration(
        input_path=options.input_path,
        output_path=options.output_path,
    )