Put a new local Mistral LLM to work on spills. EJ analysis

2025-07-05 00:12:30 -07:00
parent 7b398324e8
commit e07ce642df
10 changed files with 1124 additions and 0 deletions
--- a/data/spill_analysis.py
+++ b/data/spill_analysis.py
@@ -0,0 +1,307 @@
+import pandas as pd
+import requests
+import json
+from collections import Counter, defaultdict
+import numpy as np
+
+def query_ollama(prompt, model="mistral", timeout=600):
+    """Send a prompt to the local Ollama instance and return its reply text.
+
+    Args:
+        prompt: Text prompt forwarded to the model.
+        model: Name of the model to request.
+        timeout: Seconds to wait on the HTTP request before giving up, so a
+            stalled server cannot hang the whole analysis.
+
+    Returns:
+        The ``response`` field of the JSON reply, or ``None`` when the request
+        fails, returns a non-2xx status, or the reply lacks that field. The
+        error is printed in that case.
+    """
+    try:
+        response = requests.post(
+            'http://localhost:11434/api/generate',
+            json={
+                'model': model,
+                'prompt': prompt,
+                'stream': False,
+            },
+            timeout=timeout,
+        )
+        # Report HTTP-level failures as such, rather than as a KeyError on
+        # the missing 'response' field further down.
+        response.raise_for_status()
+        return response.json()['response']
+    except (requests.RequestException, KeyError, ValueError, TypeError) as e:
+        # Best-effort by design: callers receive None and decide what to do.
+        print(f"Error querying Ollama: {e}")
+        return None
+
+def analyze_spill_demographics(df):
+    """Summarize demographics of spill locations plus basic EJ counts.
+
+    Args:
+        df: DataFrame with one row per spill. Reads the columns
+            ``median_household_income``, ``percent_poverty``,
+            ``percent_white``, ``percent_hispanic``, ``unemployment_rate``,
+            ``Produced Water Spill Volume`` and
+            ``More than five barrels spilled``.
+
+    Returns:
+        Tuple ``(demo_stats, ej_analysis)``. ``demo_stats`` holds the row
+        count and column means; ``ej_analysis`` holds counts for high-poverty
+        (>15% poverty) and minority (<70% white) rows, spill counts per income
+        quartile, and counts of rows flagged ``'Y'`` for more than five
+        barrels, split by poverty group.
+    """
+    # Basic demographic statistics
+    demo_stats = {
+        'total_spills': len(df),
+        'avg_median_income': df['median_household_income'].mean(),
+        'avg_poverty_rate': df['percent_poverty'].mean(),
+        'avg_white_percentage': df['percent_white'].mean(),
+        'avg_hispanic_percentage': df['percent_hispanic'].mean(),
+        'avg_unemployment': df['unemployment_rate'].mean(),
+    }
+
+    # High-poverty communities: more than 15% poverty rate.
+    high_poverty = df[df['percent_poverty'] > 15]
+    low_poverty = df[df['percent_poverty'] <= 15]
+
+    # Minority communities: less than 70% white.
+    minority_communities = df[df['percent_white'] < 70]
+
+    # Non-numeric volumes such as 'Unknown' become NaN and are skipped by mean().
+    high_poverty_volumes = pd.to_numeric(
+        high_poverty['Produced Water Spill Volume'], errors='coerce'
+    )
+
+    income_quartile = pd.qcut(
+        df['median_household_income'],
+        4,
+        labels=['Q1(Lowest)', 'Q2', 'Q3', 'Q4(Highest)'],
+    )
+    major_flag = 'More than five barrels spilled'
+
+    ej_analysis = {
+        'high_poverty_spills': len(high_poverty),
+        # The key promises an average, so use mean() rather than the total.
+        'high_poverty_avg_volume': high_poverty_volumes.mean(),
+        'minority_community_spills': len(minority_communities),
+        # observed=False keeps every quartile label in the result.
+        'spills_by_income_quartile': df.groupby(income_quartile, observed=False).size().to_dict(),
+        'major_spills_by_poverty': {
+            'high_poverty_major': len(high_poverty[high_poverty[major_flag] == 'Y']),
+            'low_poverty_major': len(low_poverty[low_poverty[major_flag] == 'Y']),
+        },
+    }
+
+    return demo_stats, ej_analysis
+
+def analyze_root_causes(df):
+    """Tally the pre-assigned cause flags and the most frequent root-cause texts.
+
+    Args:
+        df: DataFrame with the flag columns ``Human Error``,
+            ``Equipment Failure``, ``Historical Unkown`` and ``Other``, plus a
+            free-text ``Root Cause`` column.
+
+    Returns:
+        Tuple ``(cause_counts, root_causes)``: a dict with the sum of each
+        flag column (missing values counted as 0), and a Series holding the
+        ten most common non-null ``Root Cause`` values with their counts.
+    """
+    # Result key -> source column. 'Historical Unkown' is misspelled in the
+    # data itself, so the column name must keep the typo.
+    flag_columns = {
+        'human_error': 'Human Error',
+        'equipment_failure': 'Equipment Failure',
+        'historical_unknown': 'Historical Unkown',
+        'other': 'Other',
+    }
+    cause_counts = {
+        key: df[column].fillna(0).sum()
+        for key, column in flag_columns.items()
+    }
+
+    # Ten most frequent specific root-cause descriptions.
+    stated_causes = df['Root Cause'].dropna()
+    root_causes = stated_causes.value_counts().head(10)
+
+    return cause_counts, root_causes
+
+def analyze_spill_themes_llm(df, sample_size=50):
+    """Ask the LLM for recurring themes in a random sample of spill descriptions.
+
+    Args:
+        df: DataFrame with a ``Spill Description`` column.
+        sample_size: Upper bound on how many descriptions go into the prompt.
+
+    Returns:
+        Whatever ``query_ollama`` returns for the prompt, or a fixed message
+        when the column has no non-null descriptions.
+    """
+    available = df['Spill Description'].dropna()
+    if available.empty:
+        return "No spill descriptions available for analysis."
+
+    # Only a bounded random sample is sent, to keep the prompt manageable.
+    how_many = min(sample_size, len(available))
+    picked = available.sample(how_many).tolist()
+
+    # One block of text with a visible divider between incidents.
+    combined_text = "\n---\n".join(picked)
+
+    prompt = f"""
+    Analyze these oil and gas spill incident descriptions to identify themes and patterns.
+    Focus on:
+    1. Common equipment failures (tanks, valves, pipelines, etc.)
+    2. Operational issues (overflow, leaks, maintenance problems)
+    3. Environmental factors (weather, terrain, wildlife)
+    4. Human factors (operator error, maintenance issues)
+    5. Discovery methods (routine inspection, alarms, third-party reports)
+    6. Spill severity indicators
+    
+    Incident descriptions:
+    {combined_text}
+    
+    Provide a structured analysis with:
+    - Top 5 equipment failure patterns
+    - Most common operational issues  
+    - Environmental risk factors
+    - Human factor patterns
+    - Recommendations for prevention based on these patterns
+    
+    Format as a concise regulatory summary suitable for policy recommendations.
+    """
+
+    return query_ollama(prompt)
+
+def demographic_spill_analysis(df):
+    """Count spills by income, poverty and race categories.
+
+    Args:
+        df: DataFrame with ``median_household_income``, ``percent_poverty``,
+            ``percent_white`` and ``More than five barrels spilled`` columns.
+            The input is not modified; categories are added to a copy.
+
+    Returns:
+        Dict with spill counts per income tercile (three equal-width bins),
+        per poverty band (0-10, 10-20, 20-100 percent), per race category
+        (>=70% white is 'Majority White', <70% is 'Minority Community'), and
+        counts of rows flagged ``'Y'`` for more than five barrels in
+        high-poverty (>15%) and minority rows.
+    """
+    df_analysis = df.copy()
+
+    df_analysis['income_category'] = pd.cut(
+        df_analysis['median_household_income'],
+        bins=3,
+        labels=['Low Income', 'Middle Income', 'High Income'],
+    )
+    # include_lowest=True keeps rows with exactly 0% poverty in 'Low Poverty';
+    # without it the first bin is (0, 10] and those rows are silently dropped.
+    df_analysis['poverty_category'] = pd.cut(
+        df_analysis['percent_poverty'],
+        bins=[0, 10, 20, 100],
+        labels=['Low Poverty', 'Moderate Poverty', 'High Poverty'],
+        include_lowest=True,
+    )
+
+    # Rows with a missing percent_white stay unlabelled (and so are left out
+    # of the groupby) instead of being counted as 'Minority Community'. This
+    # keeps the category counts consistent with the `< 70` mask used below.
+    majority_mask = df_analysis['percent_white'] >= 70
+    minority_mask = df_analysis['percent_white'] < 70
+    df_analysis['race_category'] = None
+    df_analysis.loc[majority_mask, 'race_category'] = 'Majority White'
+    df_analysis.loc[minority_mask, 'race_category'] = 'Minority Community'
+
+    major_mask = df_analysis['More than five barrels spilled'].astype(str) == 'Y'
+    high_poverty_mask = df_analysis['percent_poverty'] > 15
+
+    demo_patterns = {
+        # observed=False keeps zero-count categories in the output.
+        'spills_by_income': df_analysis.groupby('income_category', observed=False).size().to_dict(),
+        'spills_by_poverty': df_analysis.groupby('poverty_category', observed=False).size().to_dict(),
+        'spills_by_race': df_analysis.groupby('race_category').size().to_dict(),
+        'volume_by_demographics': {
+            'high_poverty_major_spills': len(df_analysis[high_poverty_mask & major_mask]),
+            'minority_major_spills': len(df_analysis[minority_mask & major_mask]),
+        },
+    }
+
+    return demo_patterns
+
+def analyze_environmental_justice(df, sample_descriptions=20):
+    """Ask the LLM to assess environmental-justice concerns in sampled spills.
+
+    Args:
+        df: DataFrame with ``percent_poverty``, ``percent_white`` and
+            ``Spill Description`` columns.
+        sample_descriptions: Total description budget; half is drawn from
+            high-poverty rows (>15% poverty) and half from minority rows
+            (<70% white).
+
+    Returns:
+        Whatever ``query_ollama`` returns for the prompt, or a fixed message
+        when either group has no non-null descriptions.
+    """
+    high_poverty_desc = df[df['percent_poverty'] > 15]['Spill Description'].dropna()
+    minority_desc = df[df['percent_white'] < 70]['Spill Description'].dropna()
+
+    if len(high_poverty_desc) == 0 or len(minority_desc) == 0:
+        return "Insufficient data for environmental justice analysis."
+
+    per_group = sample_descriptions // 2
+    high_poverty_spills = high_poverty_desc.sample(min(per_group, len(high_poverty_desc))).tolist()
+    minority_spills = minority_desc.sample(min(per_group, len(minority_desc))).tolist()
+
+    # Put the group header in front of every description. Using the headers
+    # as join() separators left the first high-poverty item unlabelled, glued
+    # the first minority item to the last high-poverty one, and separated
+    # later minority items with the wrong header.
+    labelled = [f"---HIGH POVERTY AREA---\n{text}" for text in high_poverty_spills]
+    labelled += [f"---MINORITY COMMUNITY---\n{text}" for text in minority_spills]
+    combined_ej_text = "\n".join(labelled)
+
+    prompt = f"""
+    Analyze these spill incidents from high-poverty and minority communities for environmental justice concerns.
+    
+    Consider:
+    1. Severity of incidents in vulnerable communities
+    2. Response effectiveness and cleanup completion
+    3. Long-term environmental impacts
+    4. Patterns that might indicate disproportionate impacts
+    5. Regulatory compliance and enforcement patterns
+    
+    Spill descriptions:
+    {combined_ej_text}
+    
+    Provide an environmental justice assessment focusing on:
+    - Whether vulnerable communities face more severe incidents
+    - Quality of response and remediation
+    - Policy recommendations for equitable environmental protection
+    """
+
+    return query_ollama(prompt)
+
+def comprehensive_spill_analysis(csv_file):
+    """Load the spill CSV, run every analysis step and collect the results.
+
+    Args:
+        csv_file: Path passed to ``pd.read_csv``.
+
+    Returns:
+        Dict with summary statistics, demographic statistics, the
+        environmental-justice counts, root-cause tallies, demographic
+        patterns, and the two LLM-generated analyses. The date range is the
+        min and max of the ``Date of Discovery`` column as loaded; no date
+        parsing is applied here.
+    """
+    print("Loading spill data...")
+    df = pd.read_csv(csv_file)
+
+    incident_count = len(df)
+    print(f"Analyzing {incident_count} spill incidents...")
+
+    # Tabular analyses first: demographics, pre-categorized causes, patterns.
+    demo_stats, ej_analysis = analyze_spill_demographics(df)
+    cause_counts, root_causes = analyze_root_causes(df)
+    demo_patterns = demographic_spill_analysis(df)
+
+    # Then the two LLM passes over the free-text descriptions.
+    print("Running LLM analysis on spill descriptions...")
+    theme_analysis = analyze_spill_themes_llm(df, sample_size=100)
+
+    print("Analyzing environmental justice implications...")
+    ej_llm_analysis = analyze_environmental_justice(df, sample_descriptions=30)
+
+    discovery_dates = df['Date of Discovery']
+    summary_statistics = {
+        'total_incidents': incident_count,
+        'date_range': f"{discovery_dates.min()} to {discovery_dates.max()}",
+        'counties_affected': df['county'].nunique(),
+        'operators_involved': df['Operator'].nunique(),
+    }
+    root_cause_analysis = {
+        'cause_counts': cause_counts,
+        'top_root_causes': root_causes.to_dict(),
+    }
+
+    return {
+        'summary_statistics': summary_statistics,
+        'demographic_statistics': demo_stats,
+        'environmental_justice_analysis': ej_analysis,
+        'root_cause_analysis': root_cause_analysis,
+        'demographic_patterns': demo_patterns,
+        'llm_theme_analysis': theme_analysis,
+        'llm_environmental_justice': ej_llm_analysis,
+    }
+
+def generate_policy_report(results):
+    """Turn the analysis results into an LLM-written executive summary.
+
+    Args:
+        results: Dict shaped like the return value of
+            ``comprehensive_spill_analysis``; reads its summary, demographic,
+            environmental-justice, root-cause and LLM analysis entries.
+
+    Returns:
+        Whatever ``query_ollama`` returns for the policy prompt.
+    """
+    # Pull out the sub-dicts once so the template below stays readable.
+    summary = results['summary_statistics']
+    demographics = results['demographic_statistics']
+    ej_counts = results['environmental_justice_analysis']
+    causes = results['root_cause_analysis']['cause_counts']
+
+    # Plain-text digest of the numbers and the earlier LLM outputs.
+    summary_text = f"""
+    SPILL DATA ANALYSIS SUMMARY:
+    
+    Total Incidents: {summary['total_incidents']}
+    Date Range: {summary['date_range']}
+    
+    DEMOGRAPHIC PATTERNS:
+    - Average poverty rate in affected areas: {demographics['avg_poverty_rate']:.1f}%
+    - Average income: ${demographics['avg_median_income']:,.0f}
+    - Spills in high-poverty areas: {ej_counts['high_poverty_spills']}
+    - Spills in minority communities: {ej_counts['minority_community_spills']}
+    
+    ROOT CAUSES:
+    - Equipment failures: {causes['equipment_failure']}
+    - Human error: {causes['human_error']}
+    - Historical/unknown: {causes['historical_unknown']}
+    
+    THEME ANALYSIS:
+    {results['llm_theme_analysis']}
+    
+    ENVIRONMENTAL JUSTICE ANALYSIS:
+    {results['llm_environmental_justice']}
+    """
+
+    policy_prompt = f"""
+    Based on this comprehensive spill data analysis, create a policy-focused executive summary.
+    
+    Data Summary:
+    {summary_text}
+    
+    Provide:
+    1. Key findings on environmental justice impacts
+    2. Priority areas for regulatory attention
+    3. Specific policy recommendations for prevention
+    4. Recommendations for equitable enforcement
+    5. Suggested regulatory changes based on patterns identified
+    
+    Format as an executive summary suitable for regulatory decision-makers and policy researchers.
+    """
+
+    return query_ollama(policy_prompt)
+
+# Execute comprehensive analysis
+if __name__ == "__main__":
+    # Run the analysis
+    results = comprehensive_spill_analysis('spills_with_demographics.csv')
+
+    # Generate policy report
+    print("\nGenerating policy-focused summary...")
+    policy_report = generate_policy_report(results)
+    if policy_report is None:
+        # The LLM query yields None on failure; without this guard the
+        # write() and the preview slice below raise TypeError after all the
+        # analysis work has already been done.
+        policy_report = "Policy summary unavailable: the LLM query failed."
+
+    # Save all results
+    with open('comprehensive_spill_analysis.json', 'w', encoding='utf-8') as f:
+        # default=str stringifies values json cannot encode natively
+        # (e.g. numpy scalars coming out of pandas).
+        json.dump(results, f, indent=2, default=str)
+
+    # Explicit UTF-8: model output may contain characters the platform
+    # default encoding cannot represent.
+    with open('policy_executive_summary.txt', 'w', encoding='utf-8') as f:
+        f.write(policy_report)
+
+    # Print key findings
+    print("\n" + "="*60)
+    print("COMPREHENSIVE SPILL ANALYSIS COMPLETE")
+    print("="*60)
+
+    summary = results['summary_statistics']
+    ej_counts = results['environmental_justice_analysis']
+    print(f"\nTotal incidents analyzed: {summary['total_incidents']:,}")
+    print(f"Counties affected: {summary['counties_affected']}")
+    print(f"Average poverty rate in affected areas: {results['demographic_statistics']['avg_poverty_rate']:.1f}%")
+    print(f"Spills in high-poverty communities: {ej_counts['high_poverty_spills']:,}")
+    print(f"Spills in minority communities: {ej_counts['minority_community_spills']:,}")
+
+    print("\nRoot cause breakdown:")
+    for cause, count in results['root_cause_analysis']['cause_counts'].items():
+        print(f"  {cause.replace('_', ' ').title()}: {count:,}")
+
+    print("\nResults saved to:")
+    print("  - comprehensive_spill_analysis.json (detailed data)")
+    print("  - policy_executive_summary.txt (executive summary)")
+
+    print("\nPolicy Summary Preview:")
+    print("="*40)
+    # Only mark the preview as truncated when something was actually cut.
+    preview = policy_report[:500]
+    if len(policy_report) > 500:
+        preview += "..."
+    print(preview)