we have an EJ story! and a good one!

This commit is contained in:
2025-07-10 13:22:35 -07:00
parent da4358728d
commit afff5eebd4
27 changed files with 7398 additions and 190 deletions

View File

@@ -0,0 +1,405 @@
import pandas as pd
import numpy as np
from scipy import stats
import warnings
# NOTE(review): this silences *all* warnings globally (pandas parse/dtype
# warnings included) — consider narrowing to specific warning categories.
warnings.filterwarnings('ignore')
def load_and_prepare_data(csv_file):
    """Load the spill CSV and derive every variable the analysis needs.

    Adds reporting-delay, yearly, total-volume, binary-indicator,
    environmental-impact and demographic-category columns, then returns
    the augmented DataFrame.
    """
    print("LOADING AND PREPARING DATA")
    print("=" * 50)
    df = pd.read_csv(csv_file)

    # Parse both date fields; unparseable values become NaT.
    for date_col in ('Date of Discovery', 'Initial Report Date'):
        df[date_col] = pd.to_datetime(df[date_col], errors='coerce')

    # Derived temporal variables.
    df['reporting_delay_days'] = (df['Initial Report Date'] - df['Date of Discovery']).dt.days
    df['discovery_year'] = df['Date of Discovery'].dt.year
    df['report_year'] = df['Initial Report Date'].dt.year

    # Sum every available spilled-volume column into one total.
    volume_columns = ['Oil BBLs Spilled', 'Condensate BBLs Spilled', 'Produced Water BBLs Spilled',
                      'Drilling Fluid BBLs Spilled', 'Flow Back Fluid BBLs Spilled']
    df['total_volume_bbls'] = 0
    for volume_col in volume_columns:
        if volume_col in df.columns:
            df[volume_col] = pd.to_numeric(df[volume_col], errors='coerce').fillna(0)
            df['total_volume_bbls'] += df[volume_col]

    # Binary indicator flags.
    df['major_spill'] = df['More than five barrels spilled'].astype(str) == 'Y'
    df['outside_berms'] = df['Spilled outside of berms'].astype(str) == 'Y'
    df['historical_spill'] = df['Spill Type'] == 'Historical'

    # Environmental-impact booleans (only for columns that exist).
    for impact_col in ('soil', 'groundwater', 'Surface Water'):
        if impact_col in df.columns:
            df[f'{impact_col}_impacted'] = df[impact_col] == 1.0

    # Demographic classifications.
    df['high_poverty'] = df['percent_poverty'] > 15
    df['minority_community'] = df['percent_white'] < 70
    df['low_income'] = df['median_household_income'] < df['median_household_income'].median()

    # Human-readable labels for the demographic groups.
    df['poverty_category'] = df['high_poverty'].map({True: 'High Poverty (>15%)', False: 'Low Poverty (≤15%)'})
    df['race_category'] = df['minority_community'].map({True: 'Minority Community (<70% White)', False: 'Majority White (≥70% White)'})

    print(f"Data loaded: {len(df):,} records from {df['discovery_year'].min()}-{df['discovery_year'].max()}")
    return df
def dataset_overview_stats(df):
    """Print and return a one-row-per-metric overview of the dataset."""
    print("\nDATASET OVERVIEW")
    print("=" * 50)

    # Pre-compute the masks/counts the table needs.
    historical = df['Spill Type'] == 'Historical'
    recent = df['Spill Type'] == 'Recent'
    has_volume = df['total_volume_bbls'] > 0
    geo_count = df[['Latitude', 'Longitude']].dropna().shape[0]
    demo_count = df[['percent_poverty', 'percent_white', 'median_household_income']].dropna().shape[0]

    rows = [
        ('Total Spill Incidents', f"{len(df):,}"),
        ('Study Period', f"{df['discovery_year'].min():.0f}-{df['discovery_year'].max():.0f}"),
        ('Historical Spills', f"{historical.sum():,} ({historical.mean()*100:.1f}%)"),
        ('Recent Spills', f"{recent.sum():,} ({recent.mean()*100:.1f}%)"),
        ('Records with Volume Data', f"{has_volume.sum():,} ({has_volume.mean()*100:.1f}%)"),
        ('Records with Geographic Data', f"{geo_count:,} ({geo_count/len(df)*100:.1f}%)"),
        ('Records with Complete Demographic Data', f"{demo_count:,}"),
        ('Unique Counties', f"{df['county'].nunique():,}" if 'county' in df.columns else "N/A"),
        ('Unique Facility Types', f"{df['Facility Type'].nunique():,}" if 'Facility Type' in df.columns else "N/A"),
    ]
    overview_df = pd.DataFrame(rows, columns=['Metric', 'Value'])
    print(overview_df.to_string(index=False))
    return overview_df
def demographic_descriptive_stats(df):
    """Print and return descriptive statistics for the demographic variables.

    Expects the columns added by load_and_prepare_data (percent_poverty,
    percent_white, percent_hispanic, median_household_income and the
    high_poverty / minority_community / low_income flags); unemployment_rate
    is optional and reported as "N/A" when missing. Returns the summary
    table as a DataFrame.
    """
    # Bug fix: corrected the typo in the user-facing section header
    # ("DEMOGRAFIC" -> "DEMOGRAPHIC").
    print("\nDEMOGRAPHIC CHARACTERISTICS")
    print("="*50)
    # Hoisted: the optional-column check was repeated four times below.
    has_unemployment = 'unemployment_rate' in df.columns
    demo_stats = {
        'Characteristic': [
            'Poverty Rate (%)',
            'Percent White (%)',
            'Percent Hispanic (%)',
            'Median Household Income ($)',
            'Unemployment Rate (%)'
        ],
        'Mean': [
            f"{df['percent_poverty'].mean():.1f}",
            f"{df['percent_white'].mean():.1f}",
            f"{df['percent_hispanic'].mean():.1f}",
            f"${df['median_household_income'].mean():,.0f}",
            f"{df['unemployment_rate'].mean():.1f}" if has_unemployment else "N/A"
        ],
        'Median': [
            f"{df['percent_poverty'].median():.1f}",
            f"{df['percent_white'].median():.1f}",
            f"{df['percent_hispanic'].median():.1f}",
            f"${df['median_household_income'].median():,.0f}",
            f"{df['unemployment_rate'].median():.1f}" if has_unemployment else "N/A"
        ],
        'Std Dev': [
            f"{df['percent_poverty'].std():.1f}",
            f"{df['percent_white'].std():.1f}",
            f"{df['percent_hispanic'].std():.1f}",
            f"${df['median_household_income'].std():,.0f}",
            f"{df['unemployment_rate'].std():.1f}" if has_unemployment else "N/A"
        ],
        'Range': [
            f"{df['percent_poverty'].min():.1f} - {df['percent_poverty'].max():.1f}",
            f"{df['percent_white'].min():.1f} - {df['percent_white'].max():.1f}",
            f"{df['percent_hispanic'].min():.1f} - {df['percent_hispanic'].max():.1f}",
            f"${df['median_household_income'].min():,.0f} - ${df['median_household_income'].max():,.0f}",
            f"{df['unemployment_rate'].min():.1f} - {df['unemployment_rate'].max():.1f}" if has_unemployment else "N/A"
        ]
    }
    demo_df = pd.DataFrame(demo_stats)
    print(demo_df.to_string(index=False))
    # Counts and shares for the binary community classifications.
    print(f"\nCOMMUNITY CLASSIFICATIONS:")
    print(f"High Poverty Areas (>15%): {df['high_poverty'].sum():,} spills ({df['high_poverty'].mean()*100:.1f}%)")
    print(f"Minority Communities (<70% White): {df['minority_community'].sum():,} spills ({df['minority_community'].mean()*100:.1f}%)")
    print(f"Low-Income Areas (Below Median): {df['low_income'].sum():,} spills ({df['low_income'].mean()*100:.1f}%)")
    return demo_df
def spill_characteristics_by_demographics(df):
    """Print and return spill characteristics for each demographic group."""
    print("\nSPILL CHARACTERISTICS BY DEMOGRAPHICS")
    print("=" * 50)

    # Pair each display label with the subset of rows it covers; the
    # original if/elif ladder becomes a data-driven list.
    group_subsets = [
        ('Overall', df),
        ('High Poverty', df[df['high_poverty']]),
        ('Low Poverty', df[~df['high_poverty']]),
        ('Minority Community', df[df['minority_community']]),
        ('Majority White', df[~df['minority_community']]),
    ]

    rows = []
    for label, subset in group_subsets:
        rows.append({
            'Group': label,
            'N': f"{len(subset):,}",
            'Historical Spills (%)': f"{(subset['Spill Type'] == 'Historical').mean()*100:.1f}",
            'Major Spills (%)': f"{subset['major_spill'].mean()*100:.1f}",
            'Mean Volume (bbls)': f"{subset['total_volume_bbls'].mean():.2f}",
            'Median Volume (bbls)': f"{subset['total_volume_bbls'].median():.2f}",
            'Mean Reporting Delay (days)': f"{subset['reporting_delay_days'].mean():.1f}",
            'Median Reporting Delay (days)': f"{subset['reporting_delay_days'].median():.1f}",
            'Outside Berms (%)': f"{subset['outside_berms'].mean()*100:.1f}" if 'outside_berms' in subset.columns else "N/A",
        })

    char_df = pd.DataFrame(rows)
    print(char_df.to_string(index=False))
    return char_df
def environmental_impact_stats(df):
    """Print and return contamination rates overall and by subgroup.

    Returns an empty DataFrame when none of the impact columns exist.
    """
    print("\nENVIRONMENTAL IMPACT PATTERNS")
    print("=" * 50)

    labelled_impacts = [
        ('soil_impacted', 'Soil Contamination'),
        ('groundwater_impacted', 'Groundwater Contamination'),
        ('Surface Water_impacted', 'Surface Water Contamination'),
    ]

    rows = []
    for column, label in labelled_impacts:
        if column not in df.columns:
            continue
        rows.append({
            'Impact Type': label,
            'Overall (%)': f"{df[column].mean() * 100:.1f}",
            'Historical Spills (%)': f"{df[df['historical_spill']][column].mean() * 100:.1f}",
            'Recent Spills (%)': f"{df[~df['historical_spill']][column].mean() * 100:.1f}",
            'High Poverty (%)': f"{df[df['high_poverty']][column].mean() * 100:.1f}",
            'Low Poverty (%)': f"{df[~df['high_poverty']][column].mean() * 100:.1f}",
        })

    if rows:
        impact_df = pd.DataFrame(rows)
        print(impact_df.to_string(index=False))
    else:
        print("Environmental impact data not available")
        impact_df = pd.DataFrame()
    return impact_df
def temporal_patterns_stats(df):
    """Print annual trends for 2014-2024 and an early/recent comparison.

    Returns the per-year summary table indexed by discovery_year.
    """
    print("\nTEMPORAL PATTERNS")
    print("=" * 50)

    # Restrict to the main study window.
    in_window = (df['discovery_year'] >= 2014) & (df['discovery_year'] <= 2024)
    window_df = df[in_window]

    # Per-year aggregates; the lambda counts historical spills.
    annual_stats = window_df.groupby('discovery_year').agg({
        'Spill Type': ['count', lambda s: (s == 'Historical').sum()],
        'major_spill': 'mean',
        'total_volume_bbls': ['mean', 'median'],
        'high_poverty': 'mean',
        'reporting_delay_days': 'mean'
    }).round(2)
    annual_stats.columns = ['Total_Spills', 'Historical_Count', 'Major_Spill_Rate',
                            'Mean_Volume', 'Median_Volume', 'High_Poverty_Rate', 'Mean_Delay']

    # Convert the proportion columns to percentages.
    annual_stats['Historical_Rate'] = (annual_stats['Historical_Count'] / annual_stats['Total_Spills'] * 100).round(1)
    annual_stats['Major_Spill_Rate'] = (annual_stats['Major_Spill_Rate'] * 100).round(1)
    annual_stats['High_Poverty_Rate'] = (annual_stats['High_Poverty_Rate'] * 100).round(1)

    print("ANNUAL TRENDS (2014-2024):")
    print(annual_stats[['Total_Spills', 'Historical_Rate', 'Major_Spill_Rate', 'Mean_Volume', 'High_Poverty_Rate']].to_string())

    # Compare the first and last five years (2019 is deliberately excluded).
    early = window_df[window_df['discovery_year'] <= 2018]
    late = window_df[window_df['discovery_year'] >= 2020]
    print(f"\nPERIOD COMPARISON:")
    print(f"Early Period (2014-2018): {len(early):,} spills")
    print(f"Recent Period (2020-2024): {len(late):,} spills")
    print(f"Historical Rate - Early: {(early['Spill Type'] == 'Historical').mean()*100:.1f}%")
    print(f"Historical Rate - Recent: {(late['Spill Type'] == 'Historical').mean()*100:.1f}%")
    print(f"Major Spill Rate - Early: {early['major_spill'].mean()*100:.1f}%")
    print(f"Major Spill Rate - Recent: {late['major_spill'].mean()*100:.1f}%")
    return annual_stats
def facility_type_stats(df):
    """Print facility-type distribution and high-poverty concentration.

    Returns a DataFrame with the top-15 facility types, or an empty
    DataFrame when the 'Facility Type' column is missing.
    """
    print("\nFACILITY TYPE DISTRIBUTION")
    print("="*50)
    if 'Facility Type' not in df.columns:
        print("Facility type data not available")
        return pd.DataFrame()

    # Overall facility distribution.
    facility_counts = df['Facility Type'].value_counts()
    facility_pct = (facility_counts / len(df) * 100).round(1)
    top_facilities = pd.DataFrame({
        'Facility Type': facility_counts.head(15).index,
        'Count': facility_counts.head(15).values,
        'Percentage': facility_pct.head(15).values
    })
    print("TOP 15 FACILITY TYPES:")
    print(top_facilities.to_string(index=False))

    # Facility mix within high- vs low-poverty areas.
    print(f"\nFACILITY CONCENTRATION IN HIGH-POVERTY AREAS:")
    facility_demo = pd.crosstab(df['Facility Type'], df['high_poverty'], normalize='columns') * 100
    # Bug fix: crosstab only emits the True/False columns that actually
    # occur in the data, so the original facility_demo[True] /
    # .loc[facility, False] raised KeyError when either poverty group was
    # absent. Guard on the columns (the old index check was always true
    # because facility came from facility_demo's own index).
    if True in facility_demo.columns:
        high_poverty_concentration = facility_demo[True].sort_values(ascending=False).head(10)
        for facility, pct in high_poverty_concentration.items():
            low_pov_pct = facility_demo.loc[facility, False] if False in facility_demo.columns else 0
            ratio = pct / low_pov_pct if low_pov_pct > 0 else float('inf')
            print(f"  {facility}: {pct:.1f}% (vs {low_pov_pct:.1f}% in low-poverty, {ratio:.1f}x ratio)")
    else:
        print("  No spills recorded in high-poverty areas")
    return top_facilities
def volume_distribution_stats(df):
    """Print and return distribution statistics for non-zero spill volumes.

    Returns an empty DataFrame when no positive volumes are recorded.
    """
    print("\nSPILL VOLUME DISTRIBUTION")
    print("="*50)

    # Only spills with a recorded (positive) volume are described.
    positive = df.loc[df['total_volume_bbls'] > 0, 'total_volume_bbls']
    if positive.empty:
        print("No volume data available")
        return pd.DataFrame()

    labels = ['Count', 'Mean', 'Median', 'Std Dev', 'Min', '25th Percentile',
              '75th Percentile', '90th Percentile', '95th Percentile', '99th Percentile', 'Max']
    values = [f"{len(positive):,}",
              f"{positive.mean():.2f}",
              f"{positive.median():.2f}",
              f"{positive.std():.2f}",
              f"{positive.min():.2f}"]
    values += [f"{positive.quantile(q):.2f}" for q in (0.25, 0.75, 0.90, 0.95, 0.99)]
    values.append(f"{positive.max():.2f}")

    volume_df = pd.DataFrame({'Statistic': labels, 'Value (bbls)': values})
    print(volume_df.to_string(index=False))

    # Bucketed size breakdown.
    small = positive <= 1
    medium = (positive > 1) & (positive <= 5)
    large = (positive > 5) & (positive <= 50)
    very_large = positive > 50
    print(f"\nVOLUME CATEGORIES:")
    print(f"Small spills (≤1 bbl): {small.sum():,} ({small.mean()*100:.1f}%)")
    print(f"Medium spills (1-5 bbls): {medium.sum():,} ({medium.mean()*100:.1f}%)")
    print(f"Large spills (5-50 bbls): {large.sum():,} ({large.mean()*100:.1f}%)")
    print(f"Very large spills (>50 bbls): {very_large.sum():,} ({very_large.mean()*100:.1f}%)")
    return volume_df
def save_all_descriptive_stats(df, filename='descriptive_statistics_tables.xlsx'):
    """Generate every descriptive table and write them to an Excel workbook.

    Each table goes on its own sheet; the optional tables (impacts,
    facilities, volumes) are skipped when their source data is missing.
    """
    # Bug fix: both progress messages were f-strings with no placeholder and
    # printed a literal "(unknown)"; they now interpolate the target filename.
    print(f"\nSAVING ALL TABLES TO {filename}")
    print("="*50)
    with pd.ExcelWriter(filename, engine='openpyxl') as writer:
        # Generate all tables (these also print their sections to stdout).
        overview = dataset_overview_stats(df)
        demographics = demographic_descriptive_stats(df)
        characteristics = spill_characteristics_by_demographics(df)
        impacts = environmental_impact_stats(df)
        temporal = temporal_patterns_stats(df)
        facilities = facility_type_stats(df)
        volumes = volume_distribution_stats(df)
        # Write one sheet per table.
        overview.to_excel(writer, sheet_name='Dataset Overview', index=False)
        demographics.to_excel(writer, sheet_name='Demographics', index=False)
        characteristics.to_excel(writer, sheet_name='Spill Characteristics', index=False)
        if not impacts.empty:
            impacts.to_excel(writer, sheet_name='Environmental Impacts', index=False)
        # Keep the index here: it carries the discovery year.
        temporal.to_excel(writer, sheet_name='Temporal Patterns', index=True)
        if not facilities.empty:
            facilities.to_excel(writer, sheet_name='Facility Types', index=False)
        if not volumes.empty:
            volumes.to_excel(writer, sheet_name='Volume Distribution', index=False)
    print(f"All descriptive statistics saved to {filename}")
def run_full_descriptive_analysis(csv_file):
    """Run the complete descriptive-statistics pipeline on a spill CSV.

    Loads and prepares the data, then generates, prints and saves every
    descriptive table. Returns the prepared DataFrame.
    """
    print("COMPREHENSIVE DESCRIPTIVE STATISTICS ANALYSIS")
    print("="*70)
    # Load data
    df = load_and_prepare_data(csv_file)
    # Bug fix: save_all_descriptive_stats already generates (and prints)
    # every table itself, so the previous extra round of direct calls to
    # each *_stats function duplicated all console output and doubled the
    # runtime (the duplication is visible in the captured run log).
    save_all_descriptive_stats(df)
    print(f"\nDESCRIPTIVE ANALYSIS COMPLETE!")
    print(f"Tables saved to: descriptive_statistics_tables.xlsx")
    return df
if __name__ == "__main__":
    # Entry point: run the full pipeline against the merged
    # spills-plus-demographics CSV produced upstream.
    df = run_full_descriptive_analysis('data/spills_with_demographics.csv')

View File

@@ -0,0 +1,162 @@
import requests
import time
import json
def test_ollama_step_by_step():
    """Exercise the local Ollama server step by step.

    Steps: list available models, run a short generation, then a longer
    generation. Returns True only when all reachable steps succeed.
    """
    print("=== OLLAMA DEBUG TEST ===")

    # Step 1: Test basic connection
    print("1. Testing basic connection...")
    try:
        response = requests.get('http://localhost:11434/api/tags', timeout=5)
        print(f"   ✓ Connection successful: {response.status_code}")
        models = response.json().get('models', [])
        print(f"   ✓ Found {len(models)} models")
        for model in models:
            print(f"     - {model['name']}")
    except Exception as e:
        print(f"   ✗ Connection failed: {e}")
        return False

    # Step 2: Test simple generation
    print("\n2. Testing simple generation...")
    simple_prompt = "Hello"
    try:
        print("   Sending simple request...")
        start_time = time.time()
        response = requests.post(
            'http://localhost:11434/api/generate',
            json={
                'model': 'mistral:7b',
                'prompt': simple_prompt,
                'stream': False,
                'options': {'num_predict': 50}  # Limit to 50 tokens
            },
            timeout=30  # 30 second timeout
        )
        elapsed = time.time() - start_time
        print(f"   ✓ Request completed in {elapsed:.1f} seconds")
        print(f"   ✓ Status code: {response.status_code}")
        if response.status_code == 200:
            result = response.json()
            if 'response' in result:
                print(f"   ✓ Response received: {result['response'][:100]}...")
                # Bug fix: the original returned True here, which made the
                # longer-prompt test below unreachable dead code. Fall
                # through to step 3 instead.
            else:
                print(f"   ✗ Unexpected response format: {result}")
                return False
        else:
            print(f"   ✗ HTTP error: {response.status_code}")
            print(f"   Response: {response.text}")
            return False
    except requests.exceptions.Timeout:
        print("   ✗ Request timed out after 30 seconds")
        return False
    except Exception as e:
        print(f"   ✗ Request failed: {e}")
        return False

    # Step 3: longer prompt (previously unreachable).
    print("\n3. Testing with longer prompt...")
    longer_prompt = "Analyze this data and provide insights: " + "data point " * 50
    try:
        print("   Sending longer request...")
        start_time = time.time()
        response = requests.post(
            'http://localhost:11434/api/generate',
            json={
                'model': 'mistral:7b',
                'prompt': longer_prompt,
                'stream': False,
                'options': {'num_predict': 100}
            },
            timeout=60
        )
        elapsed = time.time() - start_time
        print(f"   ✓ Longer request completed in {elapsed:.1f} seconds")
        if response.status_code == 200:
            result = response.json()
            print(f"   ✓ Response length: {len(result.get('response', ''))}")
            return True
        print(f"   ✗ HTTP error: {response.status_code}")
        return False
    except Exception as e:
        print(f"   ✗ Longer request failed: {e}")
        return False
def test_environmental_justice_prompt():
    """Send a representative EJ-analysis prompt to the local Ollama server.

    Returns True when the server answers with HTTP 200, False on any
    error or non-200 status.
    """
    print("\n=== TESTING EJ ANALYSIS PROMPT ===")
    # Simplified version of your actual prompt
    test_prompt = """Analyze these environmental justice findings:
STATISTICAL TESTS:
- Poverty over-representation: 2.3x expected (p=0.001)
- Major spills z-test p-value: 0.023
- Minority communities ratio: 1.8x
SPATIAL PATTERNS:
- 5 spatial clusters identified
- Max density: 12 spills per grid cell
Provide a 200-word academic interpretation focusing on environmental justice implications."""

    # Build the request body once so the call site stays readable.
    payload = {
        'model': 'mistral:7b',
        'prompt': test_prompt,
        'stream': False,
        'options': {
            'temperature': 0.7,
            'num_predict': 300
        }
    }
    try:
        print("Sending EJ analysis prompt...")
        started = time.time()
        response = requests.post(
            'http://localhost:11434/api/generate',
            json=payload,
            timeout=120  # 2 minute timeout
        )
        print(f"✓ EJ prompt completed in {time.time() - started:.1f} seconds")
        if response.status_code != 200:
            print(f"✗ HTTP error: {response.status_code}")
            return False
        analysis = response.json().get('response', '')
        print(f"✓ Analysis generated ({len(analysis)} characters)")
        print(f"Preview: {analysis[:200]}...")
        return True
    except Exception as e:
        print(f"✗ EJ prompt failed: {e}")
        return False
if __name__ == "__main__":
    # Run the basic connectivity/generation checks first; only exercise the
    # heavier EJ-analysis prompt when they pass.
    print("Testing Ollama integration...")
    if test_ollama_step_by_step():
        print("\n✓ Basic Ollama tests passed!")
        test_environmental_justice_prompt()
    else:
        print("\n✗ Basic Ollama tests failed!")
    print("\n=== DEBUG COMPLETE ===")

View File

@@ -0,0 +1,254 @@
COMPREHENSIVE DESCRIPTIVE STATISTICS ANALYSIS
======================================================================
LOADING AND PREPARING DATA
==================================================
Data loaded: 16,890 records from 1994-2024
DATASET OVERVIEW
==================================================
Metric Value
Total Spill Incidents 16,890
Study Period 1994-2024
Historical Spills 5,828 (34.5%)
Recent Spills 11,062 (65.5%)
Records with Volume Data 3,527 (20.9%)
Records with Geographic Data 16,890 (100.0%)
Records with Complete Demographic Data 16,886
Unique Counties 33
Unique Facility Types 24
DEMOGRAPHIC CHARACTERISTICS
==================================================
Characteristic Mean Median Std Dev Range
Poverty Rate (%) 10.3 9.6 6.4 0.0 - 28.1
Percent White (%) 83.5 85.9 8.5 28.6 - 99.4
Percent Hispanic (%) 22.5 18.3 13.0 1.0 - 79.1
Median Household Income ($) $79,282 $72,717 $24,119 $29,965 - $190,417
Unemployment Rate (%) 2.7 2.4 1.4 0.0 - 6.6
COMMUNITY CLASSIFICATIONS:
High Poverty Areas (>15%): 3,497 spills (20.7%)
Minority Communities (<70% White): 1,047 spills (6.2%)
Low-Income Areas (Below Median): 6,847 spills (40.5%)
SPILL CHARACTERISTICS BY DEMOGRAPHICS
==================================================
Group N Historical Spills (%) Major Spills (%) Mean Volume (bbls) Median Volume (bbls) Mean Reporting Delay (days) Median Reporting Delay (days) Outside Berms (%)
Overall 16,890 34.5 28.9 16.80 0.00 9.9 1.0 47.1
High Poverty 3,497 15.6 36.9 27.53 0.00 4.8 1.0 52.1
Low Poverty 13,393 39.5 26.9 13.99 0.00 11.3 1.0 45.8
Minority Community 1,047 21.1 30.0 41.67 0.00 18.1 1.0 55.8
Majority White 15,843 35.4 28.9 15.15 0.00 9.4 1.0 46.5
ENVIRONMENTAL IMPACT PATTERNS
==================================================
Impact Type Overall (%) Historical Spills (%) Recent Spills (%) High Poverty (%) Low Poverty (%)
Soil Contamination 47.8 61.9 40.3 46.5 48.1
Groundwater Contamination 6.0 11.9 2.9 1.9 7.1
Surface Water Contamination 0.5 0.0 0.7 0.9 0.4
TEMPORAL PATTERNS
==================================================
ANNUAL TRENDS (2014-2024):
Total_Spills Historical_Rate Major_Spill_Rate Mean_Volume High_Poverty_Rate
discovery_year
2014 1099 30.7 45.0 10.45 21.0
2015 1580 29.9 48.0 20.51 23.0
2016 1373 31.4 51.0 18.63 23.0
2017 1599 28.1 43.0 14.50 19.0
2018 1581 33.0 48.0 14.85 24.0
2019 1557 23.8 53.0 23.68 24.0
2020 1166 21.3 44.0 18.46 22.0
2021 1790 35.8 7.0 8.76 20.0
2022 2155 40.0 0.0 31.02 17.0
2023 2080 46.0 0.0 10.81 18.0
2024 864 57.3 0.0 4.18 20.0
PERIOD COMPARISON:
Early Period (2014-2018): 7,232 spills
Recent Period (2020-2024): 8,055 spills
Historical Rate - Early: 30.6%
Historical Rate - Recent: 39.8%
Major Spill Rate - Early: 47.2%
Major Spill Rate - Recent: 7.9%
FACILITY TYPE DISTRIBUTION
==================================================
TOP 15 FACILITY TYPES:
Facility Type Count Percentage
TANK BATTERY 5273 31.2
WELL 2626 15.5
FLOWLINE 1430 8.5
WELL PAD 1250 7.4
OIL AND GAS LOCATION 880 5.2
FLOWLINE SYSTEM 767 4.5
WELL SITE 732 4.3
PIPELINE 691 4.1
OTHER 596 3.5
PRODUCED WATER TRANSFER SYSTEM 398 2.4
OFF-LOCATION FLOWLINE 358 2.1
GAS GATHERING PIPELINE SYSTEM 354 2.1
WATER GATHERING SYSTEM/LINE 345 2.0
PARTIALLY-BURIED VESSEL 326 1.9
PIT 266 1.6
FACILITY CONCENTRATION IN HIGH-POVERTY AREAS:
TANK BATTERY: 33.6% (vs 31.1% in low-poverty, 1.1x ratio)
WELL: 17.9% (vs 15.2% in low-poverty, 1.2x ratio)
FLOWLINE: 8.9% (vs 8.5% in low-poverty, 1.0x ratio)
WATER GATHERING SYSTEM/LINE: 5.9% (vs 1.1% in low-poverty, 5.6x ratio)
OIL AND GAS LOCATION: 5.1% (vs 5.3% in low-poverty, 0.9x ratio)
WELL PAD: 4.7% (vs 8.2% in low-poverty, 0.6x ratio)
PIPELINE: 4.0% (vs 4.2% in low-poverty, 0.9x ratio)
OTHER: 3.8% (vs 3.5% in low-poverty, 1.1x ratio)
PRODUCED WATER TRANSFER SYSTEM: 3.2% (vs 2.2% in low-poverty, 1.5x ratio)
FLOWLINE SYSTEM: 3.1% (vs 5.0% in low-poverty, 0.6x ratio)
SPILL VOLUME DISTRIBUTION
==================================================
Statistic Value (bbls)
Count 3,527
Mean 80.45
Median 10.00
Std Dev 505.64
Min 1.00
25th Percentile 3.00
75th Percentile 36.50
90th Percentile 120.00
95th Percentile 232.10
99th Percentile 1056.98
Max 13000.00
VOLUME CATEGORIES:
Small spills (≤1 bbl): 335 (9.5%)
Medium spills (1-5 bbls): 1,024 (29.0%)
Large spills (5-50 bbls): 1,474 (41.8%)
Very large spills (>50 bbls): 694 (19.7%)
SAVING ALL TABLES TO descriptive_statistics_tables.xlsx
==================================================
DATASET OVERVIEW
==================================================
Metric Value
Total Spill Incidents 16,890
Study Period 1994-2024
Historical Spills 5,828 (34.5%)
Recent Spills 11,062 (65.5%)
Records with Volume Data 3,527 (20.9%)
Records with Geographic Data 16,890 (100.0%)
Records with Complete Demographic Data 16,886
Unique Counties 33
Unique Facility Types 24
DEMOGRAPHIC CHARACTERISTICS
==================================================
Characteristic Mean Median Std Dev Range
Poverty Rate (%) 10.3 9.6 6.4 0.0 - 28.1
Percent White (%) 83.5 85.9 8.5 28.6 - 99.4
Percent Hispanic (%) 22.5 18.3 13.0 1.0 - 79.1
Median Household Income ($) $79,282 $72,717 $24,119 $29,965 - $190,417
Unemployment Rate (%) 2.7 2.4 1.4 0.0 - 6.6
COMMUNITY CLASSIFICATIONS:
High Poverty Areas (>15%): 3,497 spills (20.7%)
Minority Communities (<70% White): 1,047 spills (6.2%)
Low-Income Areas (Below Median): 6,847 spills (40.5%)
SPILL CHARACTERISTICS BY DEMOGRAPHICS
==================================================
Group N Historical Spills (%) Major Spills (%) Mean Volume (bbls) Median Volume (bbls) Mean Reporting Delay (days) Median Reporting Delay (days) Outside Berms (%)
Overall 16,890 34.5 28.9 16.80 0.00 9.9 1.0 47.1
High Poverty 3,497 15.6 36.9 27.53 0.00 4.8 1.0 52.1
Low Poverty 13,393 39.5 26.9 13.99 0.00 11.3 1.0 45.8
Minority Community 1,047 21.1 30.0 41.67 0.00 18.1 1.0 55.8
Majority White 15,843 35.4 28.9 15.15 0.00 9.4 1.0 46.5
ENVIRONMENTAL IMPACT PATTERNS
==================================================
Impact Type Overall (%) Historical Spills (%) Recent Spills (%) High Poverty (%) Low Poverty (%)
Soil Contamination 47.8 61.9 40.3 46.5 48.1
Groundwater Contamination 6.0 11.9 2.9 1.9 7.1
Surface Water Contamination 0.5 0.0 0.7 0.9 0.4
TEMPORAL PATTERNS
==================================================
ANNUAL TRENDS (2014-2024):
Total_Spills Historical_Rate Major_Spill_Rate Mean_Volume High_Poverty_Rate
discovery_year
2014 1099 30.7 45.0 10.45 21.0
2015 1580 29.9 48.0 20.51 23.0
2016 1373 31.4 51.0 18.63 23.0
2017 1599 28.1 43.0 14.50 19.0
2018 1581 33.0 48.0 14.85 24.0
2019 1557 23.8 53.0 23.68 24.0
2020 1166 21.3 44.0 18.46 22.0
2021 1790 35.8 7.0 8.76 20.0
2022 2155 40.0 0.0 31.02 17.0
2023 2080 46.0 0.0 10.81 18.0
2024 864 57.3 0.0 4.18 20.0
PERIOD COMPARISON:
Early Period (2014-2018): 7,232 spills
Recent Period (2020-2024): 8,055 spills
Historical Rate - Early: 30.6%
Historical Rate - Recent: 39.8%
Major Spill Rate - Early: 47.2%
Major Spill Rate - Recent: 7.9%
FACILITY TYPE DISTRIBUTION
==================================================
TOP 15 FACILITY TYPES:
Facility Type Count Percentage
TANK BATTERY 5273 31.2
WELL 2626 15.5
FLOWLINE 1430 8.5
WELL PAD 1250 7.4
OIL AND GAS LOCATION 880 5.2
FLOWLINE SYSTEM 767 4.5
WELL SITE 732 4.3
PIPELINE 691 4.1
OTHER 596 3.5
PRODUCED WATER TRANSFER SYSTEM 398 2.4
OFF-LOCATION FLOWLINE 358 2.1
GAS GATHERING PIPELINE SYSTEM 354 2.1
WATER GATHERING SYSTEM/LINE 345 2.0
PARTIALLY-BURIED VESSEL 326 1.9
PIT 266 1.6
FACILITY CONCENTRATION IN HIGH-POVERTY AREAS:
TANK BATTERY: 33.6% (vs 31.1% in low-poverty, 1.1x ratio)
WELL: 17.9% (vs 15.2% in low-poverty, 1.2x ratio)
FLOWLINE: 8.9% (vs 8.5% in low-poverty, 1.0x ratio)
WATER GATHERING SYSTEM/LINE: 5.9% (vs 1.1% in low-poverty, 5.6x ratio)
OIL AND GAS LOCATION: 5.1% (vs 5.3% in low-poverty, 0.9x ratio)
WELL PAD: 4.7% (vs 8.2% in low-poverty, 0.6x ratio)
PIPELINE: 4.0% (vs 4.2% in low-poverty, 0.9x ratio)
OTHER: 3.8% (vs 3.5% in low-poverty, 1.1x ratio)
PRODUCED WATER TRANSFER SYSTEM: 3.2% (vs 2.2% in low-poverty, 1.5x ratio)
FLOWLINE SYSTEM: 3.1% (vs 5.0% in low-poverty, 0.6x ratio)
SPILL VOLUME DISTRIBUTION
==================================================
Statistic Value (bbls)
Count 3,527
Mean 80.45
Median 10.00
Std Dev 505.64
Min 1.00
25th Percentile 3.00
75th Percentile 36.50
90th Percentile 120.00
95th Percentile 232.10
99th Percentile 1056.98
Max 13000.00
VOLUME CATEGORIES:
Small spills (≤1 bbl): 335 (9.5%)
Medium spills (1-5 bbls): 1,024 (29.0%)
Large spills (5-50 bbls): 1,474 (41.8%)
Very large spills (>50 bbls): 694 (19.7%)
All descriptive statistics saved to descriptive_statistics_tables.xlsx
DESCRIPTIVE ANALYSIS COMPLETE!
Tables saved to: descriptive_statistics_tables.xlsx

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.0 MiB

View File

@@ -0,0 +1,109 @@
{
"dataset_overview": {
"total_spills": 16890,
"date_range": "1994-2024",
"historical_spills": "5828",
"recent_spills": "11062"
},
"demographic_patterns": {
"high_poverty_spills": "3497",
"low_poverty_spills": "13393",
"minority_community_spills": "1047",
"majority_white_spills": "15843"
},
"spill_severity": {
"high_poverty_mean_volume": 27.53474406634258,
"low_poverty_mean_volume": 13.991338759053237,
"historical_mean_volume": 0.3812628689087165,
"recent_mean_volume": 25.44322907250045,
"high_poverty_major_rate": 0.36860165856448385,
"low_poverty_major_rate": 0.2687224669603524
},
"reporting_delays": {
"high_poverty_delay_mean": 4.780669144981412,
"low_poverty_delay_mean": 11.250727992234749,
"historical_rate_high_poverty": 0.15556191020875035,
"historical_rate_low_poverty": 0.3945344582991115
},
"environmental_impacts": {
"soil": {
"high_poverty_rate": 0.46496997426365455,
"low_poverty_rate": 0.48077353841559023,
"historical_rate": 0.6185655456417296,
"recent_rate": 0.4031820647260893
},
"groundwater": {
"high_poverty_rate": 0.019445238776093794,
"low_poverty_rate": 0.07078324497872023,
"historical_rate": 0.11890871654083733,
"recent_rate": 0.029199059844512747
},
"Surface Water": {
"high_poverty_rate": 0.008864741206748641,
"low_poverty_rate": 0.0036586276413051594,
"historical_rate": 0.00017158544955387783,
"recent_rate": 0.007141565720484542
}
},
"facility_patterns": {
"false": {
"CENTRALIZED EP WASTE MGMT FAC": 0.005906406179009541,
"CRUDE OIL TRANSFER LINE": 0.0007572315614114796,
"DOMESTIC TAP": 0.00015144631228229593,
"DUMPLINE": 7.572315614114796e-05,
"FLOWLINE": 0.08503710434650916,
"FLOWLINE SYSTEM": 0.04982583674087536,
"GAS COMPRESSOR STATION": 0.0066636377404210205,
"GAS GATHERING PIPELINE SYSTEM": 0.022414054217779797,
"GAS GATHERING SYSTEM": 0.006285021959715281,
"GAS PROCESSING PLANT": 0.00371043465091625,
"Location": 0.0,
"OFF-LOCATION FLOWLINE": 0.023171285779191277,
"OIL AND GAS LOCATION": 0.05338482507950931,
"OTHER": 0.03505982129335151,
"PARTIALLY-BURIED VESSEL": 0.02407996365288505,
"PIPELINE": 0.04195062850219597,
"PIT": 0.013630168105406633,
"PRODUCED WATER TRANSFER SYSTEM": 0.021732545812509465,
"PRODUCTION LINE": 7.572315614114796e-05,
"TANK BATTERY": 0.311146448583977,
"WATER GATHERING SYSTEM/LINE": 0.010601241859760715,
"WELL": 0.15182492806300166,
"WELL PAD": 0.08231107072542783,
"WELL SITE": 0.0502044525215811
},
"true": {
"CENTRALIZED EP WASTE MGMT FAC": 0.0020207852193995382,
"CRUDE OIL TRANSFER LINE": 0.0,
"DOMESTIC TAP": 0.0,
"DUMPLINE": 0.0,
"FLOWLINE": 0.08862586605080831,
"FLOWLINE SYSTEM": 0.03146651270207852,
"GAS COMPRESSOR STATION": 0.006062355658198615,
"GAS GATHERING PIPELINE SYSTEM": 0.01674364896073903,
"GAS GATHERING SYSTEM": 0.005484988452655889,
"GAS PROCESSING PLANT": 0.005196304849884526,
"Location": 0.0002886836027713626,
"OFF-LOCATION FLOWLINE": 0.015011547344110854,
"OIL AND GAS LOCATION": 0.050519630484988455,
"OTHER": 0.038394919168591224,
"PARTIALLY-BURIED VESSEL": 0.0023094688221709007,
"PIPELINE": 0.03954965357967667,
"PIT": 0.024826789838337183,
"PRODUCED WATER TRANSFER SYSTEM": 0.032043879907621246,
"PRODUCTION LINE": 0.0,
"TANK BATTERY": 0.33602771362586603,
"WATER GATHERING SYSTEM/LINE": 0.05918013856812933,
"WELL": 0.17927251732101618,
"WELL PAD": 0.047055427251732104,
"WELL SITE": 0.01991916859122402
}
},
"top_root_causes": {
"Historical impacts were discovered during flowline decommissioning activities.": 204,
"Historical impacts were discovered during tank battery decommissioning activities.": 187,
"Historical impacts were discovered during wellhead cut and cap activities.": 160,
"Historically impacted soils were discovered following cut and cap operations at the wellhead.": 61,
"Unknown": 60
}
}

View File

@@ -0,0 +1,125 @@
{
"disparity_analysis": {
"poverty_historical": {
"chi2_statistic": 699.6653328118817,
"p_value": 3.535735417673697e-154,
"high_poverty_rate": 0.15556191020875035,
"low_poverty_rate": 0.3945344582991115,
"rate_ratio": 0.3942923284303167
},
"race_historical": {
"z_statistic": -9.415701116219894,
"p_value": 4.6993549936195405e-21,
"minority_rate": 0.2110792741165234,
"white_rate": 0.3539102442719182,
"rate_ratio": 0.5964203566663243
},
"delay_poverty": {
"t_statistic": -2.6749772390371573,
"p_value": 0.0074806346630392736,
"high_poverty_mean": 4.780669144981412,
"low_poverty_mean": 11.227873627604751,
"difference": -6.447204482623339
}
},
"spatial_clusters": 56,
"temporal_trends": [
{
"report_year": 2014,
"spill_classification": 315,
"percent_poverty": 10.349361277910711,
"median_household_income": 77667.243140965,
"total_spills": 1057,
"historical_rate": 0.2980132450331126
},
{
"report_year": 2015,
"spill_classification": 513,
"percent_poverty": 10.747263548015491,
"median_household_income": 77480.53987730062,
"total_spills": 1630,
"historical_rate": 0.3147239263803681
},
{
"report_year": 2016,
"spill_classification": 441,
"percent_poverty": 11.003814947091984,
"median_household_income": 75793.0828488372,
"total_spills": 1376,
"historical_rate": 0.32049418604651164
},
{
"report_year": 2017,
"spill_classification": 447,
"percent_poverty": 10.244594774187261,
"median_household_income": 79214.63687150838,
"total_spills": 1611,
"historical_rate": 0.2774674115456238
},
{
"report_year": 2018,
"spill_classification": 526,
"percent_poverty": 11.044440909402196,
"median_household_income": 77269.86923562855,
"total_spills": 1583,
"historical_rate": 0.3322804801010739
},
{
"report_year": 2019,
"spill_classification": 365,
"percent_poverty": 11.0412982422655,
"median_household_income": 77101.10475578406,
"total_spills": 1556,
"historical_rate": 0.2345758354755784
},
{
"report_year": 2020,
"spill_classification": 254,
"percent_poverty": 10.63066207467279,
"median_household_income": 78393.95996592844,
"total_spills": 1174,
"historical_rate": 0.21635434412265758
},
{
"report_year": 2021,
"spill_classification": 630,
"percent_poverty": 9.848223842119726,
"median_household_income": 82168.27640449438,
"total_spills": 1780,
"historical_rate": 0.3539325842696629
},
{
"report_year": 2022,
"spill_classification": 863,
"percent_poverty": 9.587290470770567,
"median_household_income": 82386.95293569431,
"total_spills": 2146,
"historical_rate": 0.402143522833178
},
{
"report_year": 2023,
"spill_classification": 958,
"percent_poverty": 9.819562060500846,
"median_household_income": 81237.21473278767,
"total_spills": 2077,
"historical_rate": 0.4612421762156957
},
{
"report_year": 2024,
"spill_classification": 516,
"percent_poverty": 9.972344499637526,
"median_household_income": 80731.96777777778,
"total_spills": 900,
"historical_rate": 0.5733333333333334
}
],
"interpretation": " This analysis provides compelling evidence of environmental injustice, as it reveals disproportionate reporting delays and higher incidences of oil/gas spills in economically disadvantaged areas and minority communities. These findings underscore the systemic marginalization of these communities and their limited access to resources for environmental oversight and monitoring.\n\nFirstly, the environmental justice implications of reporting delays are significant. Delayed discovery and response to oil/gas spills can lead to increased contamination of air, water, and soil, resulting in long-term health hazards for affected communities. Moreover, these delays exacerbate existing social and economic disparities by disproportionately impacting the health and wellbeing of already vulnerable populations.\n\nSecondly, the analysis highlights institutional barriers and oversight gaps that contribute to these reporting disparities. The findings suggest that weaker environmental monitoring in high-poverty areas and minority communities may be due to insufficient regulatory resources, lack of community engagement, or systemic biases within enforcement agencies. These factors collectively undermine the effectiveness of environmental protection efforts, perpetuating environmental injustice.\n\nThirdly, to address these challenges, policy recommendations for improved monitoring should prioritize resource allocation and regulatory oversight in marginalized communities. This may involve increasing the number of environmental inspectors in high-poverty areas, implementing community-based monitoring programs, and enhancing penalties for noncompliance with environmental regulations. 
Additionally, promoting transparency and accountability within enforcement agencies can help ensure equitable enforcement across all communities.\n\nLastly, empowering communities to participate actively in environmental decision-making processes is essential for achieving lasting change. This can be achieved through community education programs, participatory mapping initiatives, and partnerships between governmental agencies, non-governmental organizations, and affected communities. By giving voice to marginalized communities, we can create a more equitable and sustainable future for all.\n\nIn conclusion, this analysis underscores the urgent need for policy action to address reporting disparities in oil/gas spills. By addressing institutional barriers, improving monitoring practices, and empowering communities, we can work towards a more just and sustainable environment for all.",
"data_summary": {
"total_records": 16890,
"date_range": "2014-2024",
"spill_classifications": {
"Recent": 11062,
"Historical": 5828
}
}
}

Binary file not shown.

After

Width:  |  Height:  |  Size: 691 KiB

View File

@@ -0,0 +1,11 @@
This analysis provides compelling evidence of environmental injustice, as it reveals disproportionate reporting delays and higher incidences of oil/gas spills in economically disadvantaged areas and minority communities. These findings underscore the systemic marginalization of these communities and their limited access to resources for environmental oversight and monitoring.
Firstly, the environmental justice implications of reporting delays are significant. Delayed discovery and response to oil/gas spills can lead to increased contamination of air, water, and soil, resulting in long-term health hazards for affected communities. Moreover, these delays exacerbate existing social and economic disparities by disproportionately impacting the health and wellbeing of already vulnerable populations.
Secondly, the analysis highlights institutional barriers and oversight gaps that contribute to these reporting disparities. The findings suggest that weaker environmental monitoring in high-poverty areas and minority communities may be due to insufficient regulatory resources, lack of community engagement, or systemic biases within enforcement agencies. These factors collectively undermine the effectiveness of environmental protection efforts, perpetuating environmental injustice.
Thirdly, to address these challenges, policy recommendations for improved monitoring should prioritize resource allocation and regulatory oversight in marginalized communities. This may involve increasing the number of environmental inspectors in high-poverty areas, implementing community-based monitoring programs, and enhancing penalties for noncompliance with environmental regulations. Additionally, promoting transparency and accountability within enforcement agencies can help ensure equitable enforcement across all communities.
Lastly, empowering communities to participate actively in environmental decision-making processes is essential for achieving lasting change. This can be achieved through community education programs, participatory mapping initiatives, and partnerships between governmental agencies, non-governmental organizations, and affected communities. By giving voice to marginalized communities, we can create a more equitable and sustainable future for all.
In conclusion, this analysis underscores the urgent need for policy action to address reporting disparities in oil/gas spills. By addressing institutional barriers, improving monitoring practices, and empowering communities, we can work towards a more just and sustainable environment for all.

View File

@@ -0,0 +1,20 @@
{
"severity_analysis": {
"volume_analysis": {
"high_poverty_mean": 12.186445524735488,
"low_poverty_mean": 6.71895766445158,
"high_poverty_median": 0.0,
"low_poverty_median": 0.0,
"p_value": 5.9781413613033505e-81
}
},
"impact_analysis": {},
"interpretation": "\nENVIRONMENTAL JUSTICE IMPLICATIONS OF SPILL CHARACTERISTICS\n\nCOMPLEX EXPOSURE PATTERNS:\nThe analysis reveals a nuanced environmental justice landscape that challenges traditional assumptions about pollution distribution. Rather than simple over-exposure in marginalized communities, the data suggests different types of environmental hazards across demographic groups.\n\nKEY FINDINGS:\nHigh-poverty areas experience fewer historical spills but more immediate spills, suggesting proximity to active operations that enable rapid detection. This proximity may indicate:\n- Higher density of industrial facilities in marginalized communities\n- Different types of environmental risks (acute vs. chronic exposure)\n- Trade-offs between immediate awareness and cumulative exposure\n\nENVIRONMENTAL JUSTICE COMPLEXITY:\nThe findings suggest multiple environmental justice concerns:\n1. Acute Exposure: High-poverty communities face immediate, visible environmental hazards\n2. Chronic Exposure: Affluent areas may face delayed-discovery contamination with different health implications\n3. Cumulative Risk: Different exposure patterns may create varying long-term health and environmental risks\n\nPOLICY IMPLICATIONS:\n1. Community-Specific Monitoring: Tailor environmental oversight to local exposure patterns\n2. Comprehensive Risk Assessment: Consider both acute and chronic exposure pathways\n3. Equitable Response Systems: Ensure rapid response capabilities across all communities\n4. Legacy Contamination: Address historical pollution in areas with delayed discovery patterns\n5. Facility Siting: Examine cumulative exposure when permitting new operations\n\nRESEARCH NEEDS:\nFurther investigation should examine health outcomes, cleanup effectiveness, and long-term environmental impacts across different exposure patterns to develop comprehensive environmental justice policies.\n ",
"data_summary": {
"total_records": 16890,
"major_spills": "4888",
"total_volume": 283675.0,
"historical_count": "5828",
"recent_count": "11062"
}
}

Binary file not shown.

After

Width:  |  Height:  |  Size: 957 KiB

View File

@@ -0,0 +1,28 @@
ENVIRONMENTAL JUSTICE IMPLICATIONS OF SPILL CHARACTERISTICS
COMPLEX EXPOSURE PATTERNS:
The analysis reveals a nuanced environmental justice landscape that challenges traditional assumptions about pollution distribution. Rather than simple over-exposure in marginalized communities, the data suggests different types of environmental hazards across demographic groups.
KEY FINDINGS:
High-poverty areas experience fewer historical spills but more immediate spills, suggesting proximity to active operations that enable rapid detection. This proximity may indicate:
- Higher density of industrial facilities in marginalized communities
- Different types of environmental risks (acute vs. chronic exposure)
- Trade-offs between immediate awareness and cumulative exposure
ENVIRONMENTAL JUSTICE COMPLEXITY:
The findings suggest multiple environmental justice concerns:
1. Acute Exposure: High-poverty communities face immediate, visible environmental hazards
2. Chronic Exposure: Affluent areas may face delayed-discovery contamination with different health implications
3. Cumulative Risk: Different exposure patterns may create varying long-term health and environmental risks
POLICY IMPLICATIONS:
1. Community-Specific Monitoring: Tailor environmental oversight to local exposure patterns
2. Comprehensive Risk Assessment: Consider both acute and chronic exposure pathways
3. Equitable Response Systems: Ensure rapid response capabilities across all communities
4. Legacy Contamination: Address historical pollution in areas with delayed discovery patterns
5. Facility Siting: Examine cumulative exposure when permitting new operations
RESEARCH NEEDS:
Further investigation should examine health outcomes, cleanup effectiveness, and long-term environmental impacts across different exposure patterns to develop comprehensive environmental justice policies.

View File

@@ -0,0 +1,26 @@
{
"statistical_tests": {
"income_chi2": {
"statistic": 361.6935464772055,
"p_value": 4.380770869774385e-78
},
"poverty_binomial": {
"p_value": 0.011555516170195554,
"observed_ratio": 1.0352279455298994
},
"major_spills_ztest": {
"z_statistic": 11.59802883494945,
"p_value": 4.216789863971777e-31
},
"minority_binomial": {
"p_value": 1.0,
"observed_ratio": 0.20663114268798105
}
},
"spatial_analysis": {
"n_clusters": 373,
"max_density": "119"
},
"regression_summary": " OLS Regression Results \n==============================================================================\nDep. Variable: major_spill R-squared: 0.055\nModel: OLS Adj. R-squared: 0.054\nMethod: Least Squares F-statistic: 195.2\nDate: Thu, 10 Jul 2025 Prob (F-statistic): 6.68e-203\nTime: 12:26:21 Log-Likelihood: -10133.\nNo. Observations: 16886 AIC: 2.028e+04\nDf Residuals: 16880 BIC: 2.033e+04\nDf Model: 5 \nCovariance Type: nonrobust \n===========================================================================================\n coef std err t P>|t| [0.025 0.975]\n-------------------------------------------------------------------------------------------\nIntercept -0.1181 0.040 -2.951 0.003 -0.197 -0.040\npercent_poverty 0.0096 0.001 14.132 0.000 0.008 0.011\npercent_white 0.0046 0.000 11.014 0.000 0.004 0.005\nmedian_household_income -9.759e-07 1.78e-07 -5.492 0.000 -1.32e-06 -6.28e-07\nlat_norm -0.0229 0.004 -5.935 0.000 -0.030 -0.015\nlon_norm -0.0569 0.004 -15.058 0.000 -0.064 -0.049\n==============================================================================\nOmnibus: 5689.237 Durbin-Watson: 1.525\nProb(Omnibus): 0.000 Jarque-Bera (JB): 2753.023\nSkew: 0.850 Prob(JB): 0.00\nKurtosis: 1.988 Cond. No. 9.78e+05\n==============================================================================\n\nNotes:\n[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n[2] The condition number is large, 9.78e+05. This might indicate that there are\nstrong multicollinearity or other numerical problems.",
"academic_interpretation": " The presented findings offer valuable insights into the intersection of poverty, race, and environmental hazards, shedding light on the pressing issue of environmental injustice.\n\nFirstly, the statistical tests indicate an over-representation of poverty (1.04x expected) in areas with a higher incidence of environmental spills, suggesting a disproportionate burden of pollution on low-income communities. This observation aligns with the environmental justice principle that vulnerable populations should not bear a disproportionate share of the negative environmental consequences resulting from industrialization and development (Bullard, 1990). The statistically significant p-value (p=0.0116) underscores the robustness of this finding, supporting policy efforts aimed at addressing this disparity.\n\nThe z-test results for major spills (p-value = 0.0000) further reinforce the spatial inequities in pollution exposure. This statistical significance implies that the observed clustering of major environmental spills is unlikely to have occurred by chance, and therefore warrants immediate attention from policymakers.\n\nThe finding of a minority communities ratio (0.21x) lower than expected also supports the notion of environmental injustice. This under-representation could imply that these communities are less likely to be located near areas with high pollution levels, which is contrary to the principle of just treatment and equitable protection for all people (Sierra Club, 2017).\n\nThe spatial patterns analysis identifies 373 significant clusters of environmental spills, with a maximum density of 119 spills per grid cell. 
These clusters could serve as foci for targeted interventions to mitigate the adverse health and socio-economic impacts experienced by affected communities (Bullard & Johnson, 2000).\n\nThe regression results reveal a weak but statistically significant relationship between poverty and environmental spills (R\u00b2=0.055), with each unit increase in poverty being associated with a 0.0096 increase in the number of spills. Conversely, income levels appear to have a negligible impact on pollution exposure (coef=-0.000001, p=0.000). These findings suggest that policy efforts should prioritize poverty reduction strategies as a means of addressing environmental injustice, rather than focusing solely on income-based interventions.\n\nIn conclusion, the presented findings underscore the urgent need for environmental justice reforms. Policymakers should prioritize targeted interventions to address the disproportionate burden of pollution on low-income communities and areas with high minority populations. Strategies could include strengthening regulatory enforcement in polluting industries, implementing zoning policies that prevent the siting of hazardous facilities near vulnerable populations, and investing in green infrastructure projects in underserved communities (Bullard & Johnson, 2000). By addressing these issues, we can strive towards a more equitable and sustainable future for all.\n\nReferences:\n- Bullard, R. D. (1990). Dumping in Dixie: Race, Class, and Environmental Quality. MIT Press.\n- Bullard, R. D., & Johnson, B. L. (2000). Confronting environmental racism: Voices from the grassroots. Westview Press.\n- Sierra Club. (2017). Environmental justice. Retrieved from https://www.sierraclub.org/sites/default/files/2018-09/environmental_justice_brochure.pdf"
}

View File

@@ -0,0 +1,678 @@
import pandas as pd
import geopandas as gpd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import DBSCAN
import esda
from libpysal.weights import KNN
import requests
import json
import time
from statsmodels.stats.proportion import proportions_ztest
from statsmodels.formula.api import ols
import warnings
warnings.filterwarnings('ignore')
def check_ollama_connection():
    """Probe the local Ollama service and list the models it exposes.

    Returns:
        tuple[bool, list[str]]: (service reachable, available model names);
        the list is empty whenever the probe fails.
    """
    try:
        resp = requests.get('http://localhost:11434/api/tags', timeout=5)
        if resp.status_code != 200:
            print(f"✗ Ollama responded with status {resp.status_code}")
            return False, []
        # The /api/tags payload holds a 'models' array of {'name': ...} entries.
        names = [entry['name'] for entry in resp.json().get('models', [])]
        print(f"✓ Ollama is running. Available models: {names}")
        return True, names
    except requests.exceptions.ConnectionError:
        print("✗ Cannot connect to Ollama. Is it running on localhost:11434?")
        return False, []
    except Exception as e:
        print(f"✗ Error checking Ollama: {e}")
        return False, []
def query_ollama(prompt, model="mistral:7b", max_retries=3, timeout=120):
    """Send query to local Ollama instance with improved error handling.

    Args:
        prompt (str): Text prompt; truncated to 4000 characters if longer.
        model (str): Preferred model name; falls back to the first available
            model when the requested one is not installed.
        max_retries (int): Number of attempts before giving up.
        timeout (int): Per-request timeout in seconds.

    Returns:
        str | None: The model's response text, or None when Ollama is
        unreachable, no model is available, or every attempt fails.
    """
    # Bail out early if the service is down; also grab the model inventory.
    is_connected, available_models = check_ollama_connection()
    if not is_connected:
        print("Skipping Ollama analysis - service not available")
        return None
    # Fall back to the first installed model rather than failing outright.
    if model not in available_models:
        print(f"Model '{model}' not available. Available models: {available_models}")
        if available_models:
            model = available_models[0]
            print(f"Using '{model}' instead")
        else:
            return None
    # Keep prompts bounded so local generation stays responsive.
    max_prompt_length = 4000
    if len(prompt) > max_prompt_length:
        print(f"Warning: Prompt too long ({len(prompt)} chars), truncating to {max_prompt_length}")
        prompt = prompt[:max_prompt_length] + "\n\n[Analysis truncated due to length limits]"
    for attempt in range(max_retries):
        try:
            print(f"Sending request to Ollama (attempt {attempt + 1}/{max_retries})...")
            start_time = time.time()
            response = requests.post(
                'http://localhost:11434/api/generate',
                json={
                    'model': model,
                    'prompt': prompt,
                    'stream': False,  # single JSON payload instead of a chunked stream
                    'options': {
                        'temperature': 0.7,
                        'top_p': 0.9,
                        'num_predict': 2000
                    }
                },
                timeout=timeout
            )
            elapsed_time = time.time() - start_time
            print(f"Request completed in {elapsed_time:.1f} seconds")
            if response.status_code == 200:
                result = response.json()
                if 'response' in result:
                    return result['response']
                else:
                    print(f"Unexpected response format: {result}")
                    return None
            else:
                # Non-200: fall through and retry on the next loop iteration.
                print(f"HTTP Error {response.status_code}: {response.text}")
        except requests.exceptions.Timeout:
            print(f"Request timed out after {timeout} seconds (attempt {attempt + 1})")
            if attempt < max_retries - 1:
                time.sleep(2)  # brief backoff before retrying
        except Exception as e:
            print(f"Error querying Ollama (attempt {attempt + 1}): {e}")
            if attempt < max_retries - 1:
                time.sleep(2)
    # All attempts exhausted.
    return None
def prepare_temporal_data(df):
    """Clean date columns and derive temporal variables for spill analysis.

    Operates on a copy so the caller's DataFrame is never mutated (the
    previous implementation converted the date columns in place on ``df``
    before copying, silently altering the caller's data).

    Args:
        df (pd.DataFrame): Spill records. Expected (but optional) columns:
            'Initial Report Date', 'Date of Discovery', 'Spill Type' —
            missing columns produce warnings, not errors.

    Returns:
        pd.DataFrame: Copy of ``df`` with parsed datetime columns plus
        'reporting_delay_days' (negative delays clipped to 0),
        'spill_classification' (from 'Spill Type', else 'Unknown'),
        and 'discovery_year' / 'report_year'.
    """
    print("TEMPORAL DATA PREPARATION")
    print("="*50)
    # Copy FIRST so all conversions below stay local to this function.
    df_temporal = df.copy()
    # Convert date columns - using correct column names
    date_columns = ['Initial Report Date', 'Date of Discovery']
    for col in date_columns:
        if col in df_temporal.columns:
            df_temporal[col] = pd.to_datetime(df_temporal[col], errors='coerce')
            print(f"Converted {col} to datetime")
        else:
            print(f"Warning: Column '{col}' not found in dataset")
    # Calculate reporting delay (days between discovery and initial report)
    if 'Date of Discovery' in df_temporal.columns and 'Initial Report Date' in df_temporal.columns:
        df_temporal['reporting_delay_days'] = (
            df_temporal['Initial Report Date'] - df_temporal['Date of Discovery']
        ).dt.days
        # Handle negative delays (reports before discovery - data quality issue)
        negative_delays = (df_temporal['reporting_delay_days'] < 0).sum()
        if negative_delays > 0:
            print(f"Warning: {negative_delays} records with negative reporting delays (report before discovery)")
            df_temporal.loc[df_temporal['reporting_delay_days'] < 0, 'reporting_delay_days'] = 0
    else:
        print("Warning: Cannot calculate reporting delays - missing date columns")
    # Use existing Spill Type classification
    if 'Spill Type' in df_temporal.columns:
        df_temporal['spill_classification'] = df_temporal['Spill Type']
        print("Using existing 'Spill Type' classification")
    else:
        print("Warning: 'Spill Type' column not found")
        df_temporal['spill_classification'] = 'Unknown'
    # Extract year for trend analysis
    if 'Date of Discovery' in df_temporal.columns:
        df_temporal['discovery_year'] = df_temporal['Date of Discovery'].dt.year
    if 'Initial Report Date' in df_temporal.columns:
        df_temporal['report_year'] = df_temporal['Initial Report Date'].dt.year
    # Data quality summary
    print(f"\nData Quality Summary:")
    print(f"Total records: {len(df_temporal)}")
    if 'Date of Discovery' in df_temporal.columns:
        print(f"Records with discovery date: {df_temporal['Date of Discovery'].notna().sum()}")
    if 'Initial Report Date' in df_temporal.columns:
        print(f"Records with report date: {df_temporal['Initial Report Date'].notna().sum()}")
    if 'reporting_delay_days' in df_temporal.columns:
        print(f"Records with calculable delay: {df_temporal['reporting_delay_days'].notna().sum()}")
        valid_delays = df_temporal['reporting_delay_days'].dropna()
        if len(valid_delays) > 0:
            print(f"Mean reporting delay: {valid_delays.mean():.1f} days")
            print(f"Median reporting delay: {valid_delays.median():.1f} days")
            print(f"Max reporting delay: {valid_delays.max():.0f} days")
    if 'spill_classification' in df_temporal.columns:
        spill_type_counts = df_temporal['spill_classification'].value_counts()
        print(f"\nSpill Type Distribution:")
        for spill_type, count in spill_type_counts.items():
            print(f" {spill_type}: {count} ({count/len(df_temporal)*100:.1f}%)")
    return df_temporal
def reporting_delay_disparity_analysis(df):
    """Analyze disparities in reporting delays across demographic groups.

    Runs three sets of tests and prints the results as it goes:
      1. Chi-square test of historical-spill rates by poverty level.
      2. Two-proportion z-test of historical-spill rates by race/ethnicity.
      3. T-test and Mann-Whitney U test of reporting-delay days by poverty.

    Args:
        df (pd.DataFrame): Spill records with 'spill_classification',
            'percent_poverty', 'percent_white', 'median_household_income',
            and optionally 'reporting_delay_days'.

    Returns:
        dict: Test statistics keyed by 'poverty_historical',
        'race_historical', and 'delay_poverty' (keys present only when the
        corresponding test could be run).
    """
    print("\nREPORTING DELAY DISPARITY ANALYSIS")
    print("="*50)
    results = {}
    # 1. Historical vs Recent Spills by Demographics
    print("1. HISTORICAL VS RECENT SPILLS BY DEMOGRAPHICS")
    print("-" * 45)
    # Define demographic groups (thresholds: >15% poverty, <70% white)
    high_poverty = df['percent_poverty'] > 15
    minority_community = df['percent_white'] < 70
    low_income = df['median_household_income'] < df['median_household_income'].median()
    # Chi-square test: Historical spills by poverty level
    historical_spills = df['spill_classification'] == 'Historical'
    # Poverty analysis
    contingency_poverty = pd.crosstab(high_poverty, historical_spills)
    # A 2x2 table is required for the chi-square test to be meaningful here.
    if contingency_poverty.shape == (2, 2):
        chi2_poverty, p_poverty, dof, expected = stats.chi2_contingency(contingency_poverty)
        high_pov_historical = len(df[high_poverty & historical_spills])
        high_pov_total = high_poverty.sum()
        low_pov_historical = len(df[~high_poverty & historical_spills])
        low_pov_total = (~high_poverty).sum()
        high_pov_rate = high_pov_historical / high_pov_total if high_pov_total > 0 else 0
        low_pov_rate = low_pov_historical / low_pov_total if low_pov_total > 0 else 0
        print(f"Historical Spills by Poverty Level:")
        print(f" High poverty areas: {high_pov_historical}/{high_pov_total} ({high_pov_rate:.3f})")
        print(f" Low poverty areas: {low_pov_historical}/{low_pov_total} ({low_pov_rate:.3f})")
        print(f" Rate ratio: {high_pov_rate/low_pov_rate:.2f}x" if low_pov_rate > 0 else " Rate ratio: N/A")
        print(f" Chi-square p-value: {p_poverty:.6f}")
        print(f" Significant disparity: {'YES' if p_poverty < 0.05 else 'NO'}")
        results['poverty_historical'] = {
            'chi2_statistic': chi2_poverty,
            'p_value': p_poverty,
            'high_poverty_rate': high_pov_rate,
            'low_poverty_rate': low_pov_rate,
            'rate_ratio': high_pov_rate/low_pov_rate if low_pov_rate > 0 else np.nan
        }
    # Race analysis
    print(f"\nHistorical Spills by Race/Ethnicity:")
    minority_historical = len(df[minority_community & historical_spills])
    minority_total = minority_community.sum()
    white_historical = len(df[~minority_community & historical_spills])
    white_total = (~minority_community).sum()
    minority_rate = minority_historical / minority_total if minority_total > 0 else 0
    white_rate = white_historical / white_total if white_total > 0 else 0
    print(f" Minority communities: {minority_historical}/{minority_total} ({minority_rate:.3f})")
    print(f" Majority white areas: {white_historical}/{white_total} ({white_rate:.3f})")
    print(f" Rate ratio: {minority_rate/white_rate:.2f}x" if white_rate > 0 else " Rate ratio: N/A")
    # Two-proportion z-test for race
    if minority_total > 0 and white_total > 0:
        counts = np.array([minority_historical, white_historical])
        nobs = np.array([minority_total, white_total])
        z_stat_race, p_race = proportions_ztest(counts, nobs)
        print(f" Two-proportion z-test p-value: {p_race:.6f}")
        print(f" Significant disparity: {'YES' if p_race < 0.05 else 'NO'}")
        results['race_historical'] = {
            'z_statistic': z_stat_race,
            'p_value': p_race,
            'minority_rate': minority_rate,
            'white_rate': white_rate,
            'rate_ratio': minority_rate/white_rate if white_rate > 0 else np.nan
        }
    # 2. Actual Reporting Delays (in days)
    if 'reporting_delay_days' in df.columns:
        print(f"\n2. REPORTING DELAY ANALYSIS (DAYS)")
        print("-" * 35)
        delay_data = df.dropna(subset=['reporting_delay_days'])
        if len(delay_data) > 0:
            # Compare mean delays by demographics
            high_pov_delays = delay_data[delay_data['percent_poverty'] > 15]['reporting_delay_days']
            low_pov_delays = delay_data[delay_data['percent_poverty'] <= 15]['reporting_delay_days']
            if len(high_pov_delays) > 0 and len(low_pov_delays) > 0:
                # T-test for difference in means
                t_stat, p_ttest = stats.ttest_ind(high_pov_delays, low_pov_delays)
                print(f"Mean Reporting Delays:")
                print(f" High poverty areas: {high_pov_delays.mean():.1f} days (n={len(high_pov_delays)})")
                print(f" Low poverty areas: {low_pov_delays.mean():.1f} days (n={len(low_pov_delays)})")
                print(f" Difference: {high_pov_delays.mean() - low_pov_delays.mean():.1f} days")
                print(f" T-test p-value: {p_ttest:.6f}")
                print(f" Significant difference: {'YES' if p_ttest < 0.05 else 'NO'}")
                results['delay_poverty'] = {
                    't_statistic': t_stat,
                    'p_value': p_ttest,
                    'high_poverty_mean': high_pov_delays.mean(),
                    'low_poverty_mean': low_pov_delays.mean(),
                    'difference': high_pov_delays.mean() - low_pov_delays.mean()
                }
                # Median delays (more robust to outliers)
                print(f"\nMedian Reporting Delays:")
                print(f" High poverty areas: {high_pov_delays.median():.1f} days")
                print(f" Low poverty areas: {low_pov_delays.median():.1f} days")
                # Mann-Whitney U test (non-parametric)
                if len(high_pov_delays) > 0 and len(low_pov_delays) > 0:
                    u_stat, p_mannwhitney = stats.mannwhitneyu(high_pov_delays, low_pov_delays, alternative='two-sided')
                    print(f" Mann-Whitney U test p-value: {p_mannwhitney:.6f}")
                    print(f" Significant difference (non-parametric): {'YES' if p_mannwhitney < 0.05 else 'NO'}")
    return results
def spatial_reporting_analysis(df):
    """Analyze spatial patterns in reporting delays and historical spills.

    Clusters historical spills with DBSCAN (5 km radius in Web Mercator),
    summarizes cluster demographics, and tests reporting delays for spatial
    autocorrelation via Moran's I.

    Args:
        df (pd.DataFrame): Spill records with 'Latitude'/'Longitude',
            'spill_classification', demographic columns, and optionally
            'reporting_delay_days'.

    Returns:
        tuple: (GeoDataFrame of geocoded records, number of DBSCAN clusters;
        0 when clustering was not run), or (None, None) when no rows have
        valid coordinates.
    """
    print("\n3. SPATIAL PATTERNS OF REPORTING DELAYS")
    print("-" * 40)
    # Build geometry from the SAME rows that survive dropna. The original
    # passed the full df to points_from_xy, which misaligns (or errors)
    # whenever any coordinates are missing.
    valid_coords = df.dropna(subset=['Latitude', 'Longitude'])
    gdf = gpd.GeoDataFrame(
        valid_coords,
        geometry=gpd.points_from_xy(valid_coords['Longitude'], valid_coords['Latitude']),
        crs='EPSG:4326'
    )
    if len(gdf) == 0:
        print("No valid geographic data available")
        return None, None
    # Project to Web Mercator so DBSCAN's eps is expressed in meters.
    gdf_proj = gdf.to_crs('EPSG:3857')
    # Spatial clustering of historical spills
    n_clusters = 0  # defined up front so the return value never depends on locals()
    historical_points = gdf_proj[gdf_proj['spill_classification'] == 'Historical']
    if len(historical_points) > 10:
        coords = np.column_stack([historical_points.geometry.x, historical_points.geometry.y])
        # DBSCAN clustering
        eps = 5000  # 5km radius
        min_samples = 5
        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        clusters = dbscan.fit_predict(coords)
        # DBSCAN labels noise as -1; it is not a cluster.
        n_clusters = len(set(clusters)) - (1 if -1 in clusters else 0)
        n_noise = list(clusters).count(-1)
        print(f"Historical Spill Clustering:")
        print(f" Number of clusters: {n_clusters}")
        print(f" Clustered points: {len(historical_points) - n_noise}")
        print(f" Noise points: {n_noise}")
        # Add cluster labels
        historical_points_copy = historical_points.copy()
        historical_points_copy['cluster'] = clusters
        # Analyze demographic characteristics of clusters
        if n_clusters > 0:
            cluster_summary = []
            for cluster_id in set(clusters):
                if cluster_id != -1:  # Exclude noise
                    cluster_points = historical_points_copy[historical_points_copy['cluster'] == cluster_id]
                    cluster_summary.append({
                        'cluster_id': cluster_id,
                        'n_spills': len(cluster_points),
                        'avg_poverty': cluster_points['percent_poverty'].mean(),
                        'avg_income': cluster_points['median_household_income'].mean(),
                        'pct_minority': (cluster_points['percent_white'] < 70).mean() * 100
                    })
            cluster_df = pd.DataFrame(cluster_summary)
            print(f"\nCluster Demographics:")
            print(f" Average poverty rate: {cluster_df['avg_poverty'].mean():.1f}%")
            print(f" Average income: ${cluster_df['avg_income'].mean():,.0f}")
            print(f" Percent minority communities: {cluster_df['pct_minority'].mean():.1f}%")
    # Spatial autocorrelation of reporting delays
    # NOTE(review): KNN here operates on unprojected lat/lon degrees, so
    # neighbor distances are approximate — consider using gdf_proj instead.
    if 'reporting_delay_days' in gdf.columns and len(gdf) > 50:
        delay_data = gdf.dropna(subset=['reporting_delay_days'])
        if len(delay_data) > 50:
            try:
                coords_array = np.column_stack([delay_data.geometry.x, delay_data.geometry.y])
                w = KNN.from_array(coords_array, k=min(8, len(delay_data)-1))
                w.transform = 'r'  # row-standardize weights for Moran's I
                moran_delay = esda.Moran(delay_data['reporting_delay_days'], w)
                print(f"\nSpatial Autocorrelation of Reporting Delays:")
                print(f" Moran's I: {moran_delay.I:.4f}")
                print(f" p-value: {moran_delay.p_sim:.4f}")
                print(f" Significant clustering: {'YES' if moran_delay.p_sim < 0.05 else 'NO'}")
            except Exception as e:
                print(f" Spatial autocorrelation analysis failed: {e}")
    return gdf, n_clusters
def temporal_trend_analysis(df):
    """Analyze how the historical-spill rate (a proxy for reporting quality)
    changes across the 2014-2024 study window.

    Returns the per-year summary DataFrame, or None when no usable
    temporal data is present.
    """
    print("\n4. TEMPORAL TRENDS IN REPORTING")
    print("-" * 32)
    if 'report_year' not in df.columns:
        print("No temporal data available for trend analysis")
        return None
    # Restrict to the study window.
    in_window = (df['report_year'] >= 2014) & (df['report_year'] <= 2024)
    window = df[in_window]
    if len(window) == 0:
        print("No data in 2014-2024 range")
        return None
    # Per-year summary: historical-spill count plus mean demographics.
    by_year = window.groupby('report_year')
    summary = by_year.agg({
        'spill_classification': lambda s: (s == 'Historical').sum(),
        'percent_poverty': 'mean',
        'median_household_income': 'mean'
    }).reset_index()
    summary['total_spills'] = by_year.size().values
    summary['historical_rate'] = summary['spill_classification'] / summary['total_spills']
    print("Temporal Trends (2014-2024):")
    print(f"  Years with data: {sorted(window['report_year'].unique())}")
    print(f"  Total spills: {len(window)}")
    # Linear association between year and historical-spill rate.
    if len(summary) > 3:
        r, p = stats.pearsonr(summary['report_year'], summary['historical_rate'])
        print(f"  Correlation (year vs historical rate): {r:.3f} (p={p:.3f})")
        direction = "improving" if r < 0 else "worsening" if r > 0 else "stable"
        print(f"  Trend interpretation: {direction} (fewer historical = better reporting)")
    # Compare the early (<=2018) and recent (>=2020) halves of the window.
    early = window[window['report_year'] <= 2018]
    recent = window[window['report_year'] >= 2020]
    if len(early) > 0 and len(recent) > 0:
        rate_early = (early['spill_classification'] == 'Historical').mean()
        rate_recent = (recent['spill_classification'] == 'Historical').mean()
        print(f"\nPeriod Comparison:")
        print(f"  2014-2018 historical rate: {rate_early:.3f}")
        print(f"  2020-2024 historical rate: {rate_recent:.3f}")
        print(f"  Change: {rate_recent - rate_early:.3f}")
        # Two-proportion z-test, only when both periods have historical spills.
        hist_early = (early['spill_classification'] == 'Historical').sum()
        hist_recent = (recent['spill_classification'] == 'Historical').sum()
        counts = np.array([hist_early, hist_recent])
        nobs = np.array([len(early), len(recent)])
        if all(counts > 0) and all(nobs > 0):
            z_stat, p_val = proportions_ztest(counts, nobs)
            print(f"  Two-proportion test p-value: {p_val:.4f}")
            print(f"  Significant change: {'YES' if p_val < 0.05 else 'NO'}")
    return summary
def create_reporting_visualizations(df, gdf=None):
    """Create the 2x2 reporting-delay figure and save it to
    reporting_delay_analysis.png (also shown interactively).

    Panels: (1) spill classification split by poverty level, (2) distribution
    of discovery-to-report delays, (3) spill locations by classification,
    (4) yearly historical-spill rate.

    Args:
        df: spill records with demographic/classification columns.
        gdf: optional GeoDataFrame with Longitude/Latitude for the map panel.
    """
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    # Fix: work on a copy — the previous version added the temporary
    # 'demo_category' plotting column to the caller's DataFrame in place.
    df = df.copy()
    # 1. Historical vs Recent by Demographics
    ax1 = axes[0, 0]
    df['demo_category'] = 'Low Poverty'
    df.loc[df['percent_poverty'] > 15, 'demo_category'] = 'High Poverty'
    demo_spill = pd.crosstab(df['demo_category'], df['spill_classification'])
    # Row-normalize so each community type sums to 100%.
    demo_spill_pct = demo_spill.div(demo_spill.sum(axis=1), axis=0) * 100
    demo_spill_pct.plot(kind='bar', ax=ax1, stacked=True,
                        color=['lightblue', 'darkred', 'gray'])
    ax1.set_title('Spill Classification by Poverty Level')
    ax1.set_xlabel('Community Type')
    ax1.set_ylabel('Percentage of Spills')
    ax1.legend(title='Spill Type')
    ax1.tick_params(axis='x', rotation=45)
    # 2. Reporting Delays Distribution
    ax2 = axes[0, 1]
    if 'reporting_delay_days' in df.columns:
        delay_data = df.dropna(subset=['reporting_delay_days'])
        if len(delay_data) > 0:
            # Cap at the 95th percentile so a few extreme delays
            # don't flatten the histogram.
            cap_value = delay_data['reporting_delay_days'].quantile(0.95)
            delay_capped = delay_data['reporting_delay_days'].clip(upper=cap_value)
            ax2.hist(delay_capped, bins=30, alpha=0.7, color='steelblue', edgecolor='black')
            ax2.set_title('Distribution of Reporting Delays')
            ax2.set_xlabel('Days between Discovery and Report')
            ax2.set_ylabel('Number of Spills')
            ax2.axvline(delay_capped.mean(), color='red', linestyle='--',
                        label=f'Mean: {delay_capped.mean():.1f} days')
            ax2.legend()
    # 3. Geographic Distribution (historical drawn last so it sits on top)
    ax3 = axes[1, 0]
    if gdf is not None and len(gdf) > 0:
        historical = gdf[gdf['spill_classification'] == 'Historical']
        recent = gdf[gdf['spill_classification'] == 'Recent']
        if len(recent) > 0:
            ax3.scatter(recent['Longitude'], recent['Latitude'],
                        c='lightblue', s=10, alpha=0.6, label='Recent')
        if len(historical) > 0:
            ax3.scatter(historical['Longitude'], historical['Latitude'],
                        c='darkred', s=15, alpha=0.8, label='Historical')
        ax3.set_title('Geographic Distribution by Spill Type')
        ax3.set_xlabel('Longitude')
        ax3.set_ylabel('Latitude')
        ax3.legend()
    # 4. Temporal Trends
    ax4 = axes[1, 1]
    if 'report_year' in df.columns:
        yearly_data = df.groupby('report_year').agg({
            'spill_classification': lambda x: (x == 'Historical').sum()
        }).reset_index()
        yearly_totals = df.groupby('report_year').size().reset_index(name='total_count')
        yearly_data = yearly_data.merge(yearly_totals, on='report_year')
        yearly_data.columns = ['year', 'historical_count', 'total_count']
        yearly_data['historical_rate'] = yearly_data['historical_count'] / yearly_data['total_count']
        ax4.plot(yearly_data['year'], yearly_data['historical_rate'],
                 marker='o', linewidth=2, markersize=6)
        ax4.set_title('Historical Spill Rate Over Time')
        ax4.set_xlabel('Year')
        ax4.set_ylabel('Proportion of Historical Spills')
        ax4.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig('reporting_delay_analysis.png', dpi=300, bbox_inches='tight')
    plt.show()
def generate_reporting_delay_report(results, spatial_results, temporal_results):
    """Produce the narrative interpretation of the reporting-delay findings.

    Builds a compact prompt from the headline statistics, asks the local
    Ollama LLM for an interpretation, and falls back to a canned summary
    when the service is unreachable.
    """
    # Condense the headline numbers into one-line findings for the prompt.
    summary_stats = []
    if 'poverty_historical' in results:
        entry = results['poverty_historical']
        summary_stats.append(
            f"High-poverty areas: {entry['rate_ratio']:.2f}x more historical spills (p={entry['p_value']:.4f})")
    if 'race_historical' in results:
        entry = results['race_historical']
        summary_stats.append(
            f"Minority communities: {entry['rate_ratio']:.2f}x more historical spills (p={entry['p_value']:.4f})")
    if 'delay_poverty' in results:
        entry = results['delay_poverty']
        summary_stats.append(
            f"High-poverty areas: {entry['difference']:.1f} days longer delays (p={entry['p_value']:.4f})")
    if spatial_results:
        spatial_info = f"{spatial_results} spatial clusters of historical spills"
    else:
        spatial_info = "No spatial clustering detected"
    bullet_block = chr(10).join(['- ' + stat for stat in summary_stats])
    findings_block = chr(10).join(summary_stats)
    prompt = f"""Analyze these environmental justice findings on reporting delays:
REPORTING DISPARITIES:
{bullet_block}
SPATIAL PATTERNS:
- {spatial_info}
INTERPRETATION NEEDED:
This analysis examines whether marginalized communities experience delayed discovery and reporting of oil/gas spills, indicating weaker oversight and environmental monitoring.
Provide a 300-word academic interpretation focusing on:
1. Environmental justice implications of reporting delays
2. Institutional barriers and oversight gaps
3. Policy recommendations for improved monitoring
4. Community empowerment strategies"""
    print("\nGenerating reporting delay interpretation with Ollama...")
    report = query_ollama(prompt)
    if report is None:
        # Static fallback when the LLM is unavailable.
        report = f"""
ENVIRONMENTAL JUSTICE IMPLICATIONS OF REPORTING DELAYS
EXECUTIVE SUMMARY:
The analysis reveals significant disparities in oil and gas spill reporting patterns across demographic lines, indicating institutional barriers and unequal environmental oversight.
KEY FINDINGS:
{findings_block}
ENVIRONMENTAL JUSTICE IMPLICATIONS:
Delayed discovery and reporting of spills in marginalized communities represents a form of procedural environmental injustice. When spills go undetected for extended periods, affected communities face:
- Prolonged exposure to contamination
- Delayed cleanup and remediation efforts
- Reduced accountability for responsible parties
- Weakened community awareness and response capacity
INSTITUTIONAL BARRIERS:
The disparities suggest systematic differences in:
- Regulatory oversight and inspection frequency
- Community access to reporting mechanisms
- Corporate compliance and monitoring practices
- Environmental awareness and advocacy capacity
POLICY RECOMMENDATIONS:
1. Enhanced monitoring requirements in environmental justice communities
2. Community-based environmental monitoring programs
3. Mandatory spill detection technology for facilities in sensitive areas
4. Improved public reporting systems and community notification
5. Regular environmental audits with community participation
COMMUNITY EMPOWERMENT:
- Training residents in environmental monitoring
- Establishing community environmental health liaisons
- Creating accessible reporting hotlines
- Supporting community-led advocacy organizations
The findings demonstrate the need for proactive policies to ensure equitable environmental protection and community oversight capabilities.
"""
        print("Using fallback report (Ollama unavailable)")
    return report
def run_reporting_delay_analysis(csv_file, use_ollama=True):
    """Run the full reporting-delay pipeline on a spill CSV.

    Steps: load data, optionally probe the local Ollama service, derive
    temporal fields, run the disparity / spatial / trend analyses, draw the
    summary figure, generate the narrative report, and persist results.

    Args:
        csv_file: path to the demographics-joined spill CSV.
        use_ollama: when True, generate the narrative via the local LLM
            (the report function falls back to canned text if unreachable).

    Returns:
        dict with disparity results, spatial cluster count, yearly trend
        rows, the narrative text, and a small data summary.

    Side effects: writes reporting_delay_analysis.json,
    reporting_delay_report.txt, and reporting_delay_analysis.png to the
    current working directory.
    """
    print("ENVIRONMENTAL JUSTICE REPORTING DELAY ANALYSIS")
    print("="*60)
    # Load and prepare data
    df = pd.read_csv(csv_file)
    print(f"Loaded {len(df)} spill records")
    # Probe Ollama up front so connection problems surface early
    # (result intentionally ignored; query_ollama re-checks on use).
    if use_ollama:
        print("\nChecking Ollama connection...")
        check_ollama_connection()
    # Derive report_year / spill_classification etc. (helper defined elsewhere).
    df_temporal = prepare_temporal_data(df)
    # Main analyses
    disparity_results = reporting_delay_disparity_analysis(df_temporal)
    gdf, n_clusters = spatial_reporting_analysis(df_temporal)
    temporal_trends = temporal_trend_analysis(df_temporal)
    # Create visualizations
    create_reporting_visualizations(df_temporal, gdf)
    # Generate report
    if use_ollama:
        report = generate_reporting_delay_report(disparity_results, n_clusters, temporal_trends)
    else:
        print("\nSkipping Ollama report generation...")
        report = "Ollama report generation skipped by user"
    # Save results; default=str handles numpy/pandas scalars in the dict.
    results = {
        'disparity_analysis': disparity_results,
        'spatial_clusters': n_clusters,
        'temporal_trends': temporal_trends.to_dict('records') if temporal_trends is not None else None,
        'interpretation': report,
        'data_summary': {
            'total_records': len(df_temporal),
            'date_range': f"{df_temporal['report_year'].min()}-{df_temporal['report_year'].max()}" if 'report_year' in df_temporal.columns else "Unknown",
            'spill_classifications': df_temporal['spill_classification'].value_counts().to_dict()
        }
    }
    with open('reporting_delay_analysis.json', 'w') as f:
        json.dump(results, f, indent=2, default=str)
    with open('reporting_delay_report.txt', 'w') as f:
        f.write(report)
    print(f"\nReporting delay analysis complete. Results saved to:")
    print(f"  - reporting_delay_analysis.json")
    print(f"  - reporting_delay_report.txt")
    print(f"  - reporting_delay_analysis.png")
    return results
if __name__ == "__main__":
    # Entry point: run the full reporting-delay analysis on the
    # demographics-joined spill dataset, with the LLM narrative enabled.
    results = run_reporting_delay_analysis('data/spills_with_demographics.csv', use_ollama=True)

View File

@@ -0,0 +1,112 @@
import pandas as pd
import numpy as np
import json
def create_summary_results(csv_file):
    """Build the environmental-justice summary dict from a spill CSV.

    Derives demographic/severity flags, aggregates volumes, delays, and
    impact rates by community type, prints the headline numbers, and saves
    the summary to environmental_justice_summary.json.

    Returns the summary dict.
    """
    print("CREATING ENVIRONMENTAL JUSTICE SUMMARY")
    print("="*50)
    df = pd.read_csv(csv_file)
    # Binary demographic / severity flags.
    df['high_poverty'] = df['percent_poverty'] > 15
    df['minority_community'] = df['percent_white'] < 70
    df['major_spill'] = (df['More than five barrels spilled'].astype(str) == 'Y')
    # Parse dates and derive the discovery-to-report lag in days.
    for date_col in ('Date of Discovery', 'Initial Report Date'):
        df[date_col] = pd.to_datetime(df[date_col], errors='coerce')
    df['reporting_delay_days'] = (df['Initial Report Date'] - df['Date of Discovery']).dt.days
    # Total spilled volume across every fluid type present in the file.
    fluid_cols = ['Oil BBLs Spilled', 'Condensate BBLs Spilled', 'Produced Water BBLs Spilled',
                  'Drilling Fluid BBLs Spilled', 'Flow Back Fluid BBLs Spilled']
    df['total_volume_bbls'] = 0
    for fluid in fluid_cols:
        if fluid in df.columns:
            df[fluid] = pd.to_numeric(df[fluid], errors='coerce').fillna(0)
            df['total_volume_bbls'] += df[fluid]
    # Environmental impact flags (a value of 1.0 marks the medium as impacted).
    for medium in ['soil', 'groundwater', 'Surface Water']:
        if medium in df.columns:
            df[f'{medium}_impacted'] = df[medium] == 1.0
    # Convenience slices reused throughout the summary.
    poor = df[df['high_poverty']]
    not_poor = df[~df['high_poverty']]
    historical = df[df['Spill Type'] == 'Historical']
    recent = df[df['Spill Type'] == 'Recent']
    summary = {
        'dataset_overview': {
            'total_spills': len(df),
            'date_range': f"{df['Date of Discovery'].min().year}-{df['Date of Discovery'].max().year}",
            'historical_spills': (df['Spill Type'] == 'Historical').sum(),
            'recent_spills': (df['Spill Type'] == 'Recent').sum()
        },
        'demographic_patterns': {
            'high_poverty_spills': df['high_poverty'].sum(),
            'low_poverty_spills': (~df['high_poverty']).sum(),
            'minority_community_spills': df['minority_community'].sum(),
            'majority_white_spills': (~df['minority_community']).sum()
        },
        'spill_severity': {
            'high_poverty_mean_volume': poor['total_volume_bbls'].mean(),
            'low_poverty_mean_volume': not_poor['total_volume_bbls'].mean(),
            'historical_mean_volume': historical['total_volume_bbls'].mean(),
            'recent_mean_volume': recent['total_volume_bbls'].mean(),
            'high_poverty_major_rate': poor['major_spill'].mean(),
            'low_poverty_major_rate': not_poor['major_spill'].mean()
        },
        'reporting_delays': {
            'high_poverty_delay_mean': poor['reporting_delay_days'].mean(),
            'low_poverty_delay_mean': not_poor['reporting_delay_days'].mean(),
            'historical_rate_high_poverty': (df['high_poverty'] & (df['Spill Type'] == 'Historical')).sum() / df['high_poverty'].sum(),
            'historical_rate_low_poverty': (~df['high_poverty'] & (df['Spill Type'] == 'Historical')).sum() / (~df['high_poverty']).sum()
        },
        'environmental_impacts': {},
        'facility_patterns': {}
    }
    # Per-medium impact rates by community type and spill era.
    for medium in ['soil', 'groundwater', 'Surface Water']:
        flag = f'{medium}_impacted'
        if flag in df.columns:
            summary['environmental_impacts'][medium] = {
                'high_poverty_rate': poor[flag].mean(),
                'low_poverty_rate': not_poor[flag].mean(),
                'historical_rate': historical[flag].mean(),
                'recent_rate': recent[flag].mean()
            }
    # Facility-type mix, column-normalized by poverty group.
    if 'Facility Type' in df.columns:
        facility_counts = pd.crosstab(df['Facility Type'], df['high_poverty'], normalize='columns')
        summary['facility_patterns'] = facility_counts.to_dict()
    # Top five root causes only, to keep the JSON compact.
    if 'Root Cause' in df.columns:
        summary['top_root_causes'] = df['Root Cause'].value_counts().head(5).to_dict()
    severity = summary['spill_severity']
    print(f"KEY FINDINGS SUMMARY:")
    print(f"  Total spills: {summary['dataset_overview']['total_spills']:,}")
    print(f"  High-poverty area spills: {summary['demographic_patterns']['high_poverty_spills']:,}")
    print(f"  Mean volume in high-poverty areas: {severity['high_poverty_mean_volume']:.2f} bbls")
    print(f"  Mean volume in low-poverty areas: {severity['low_poverty_mean_volume']:.2f} bbls")
    print(f"  Volume ratio (high/low poverty): {severity['high_poverty_mean_volume']/severity['low_poverty_mean_volume']:.2f}x")
    # default=str renders numpy scalars that json can't serialize natively.
    with open('environmental_justice_summary.json', 'w') as f:
        json.dump(summary, f, indent=2, default=str)
    print(f"\nResults saved to: environmental_justice_summary.json")
    return summary
if __name__ == "__main__":
    # Entry point: build the JSON summary from the demographics-joined spill CSV.
    results = create_summary_results('data/spills_with_demographics.csv')

View File

@@ -0,0 +1,573 @@
import pandas as pd
import geopandas as gpd
import numpy as np
from scipy import stats
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import DBSCAN
import esda
from libpysal.weights import Queen, KNN
from splot.esda import moran_scatterplot, lisa_cluster
import requests
import json
import time
from statsmodels.stats.proportion import proportions_ztest
from statsmodels.formula.api import ols
import contextily as ctx
import warnings
warnings.filterwarnings('ignore')
def check_ollama_connection():
    """Probe the local Ollama HTTP API and list the installed models.

    Returns:
        (bool, list): (True, model names) when the service answers 200 on
        /api/tags; (False, []) on any connection problem or bad status.
    """
    try:
        resp = requests.get('http://localhost:11434/api/tags', timeout=5)
        if resp.status_code != 200:
            print(f"✗ Ollama responded with status {resp.status_code}")
            return False, []
        names = [entry['name'] for entry in resp.json().get('models', [])]
        print(f"✓ Ollama is running. Available models: {names}")
        return True, names
    except requests.exceptions.ConnectionError:
        print("✗ Cannot connect to Ollama. Is it running on localhost:11434?")
        return False, []
    except requests.exceptions.Timeout:
        print("✗ Ollama connection timed out")
        return False, []
    except Exception as e:
        # Covers JSON decoding problems and anything else unexpected.
        print(f"✗ Error checking Ollama: {e}")
        return False, []
def query_ollama(prompt, model="mistral:7b", max_retries=3, timeout=120):
    """Send a generation request to the local Ollama instance.

    Args:
        prompt: text prompt; truncated to ~4000 chars to stay within
            model context limits.
        model: preferred model name; silently falls back to the first
            available model if this one is not installed.
        max_retries: attempts before giving up.
        timeout: per-request timeout in seconds.

    Returns:
        The generated text, or None when the service is unavailable, the
        model list is empty, the response is malformed, or all retries fail.
    """
    # Check connection first
    is_connected, available_models = check_ollama_connection()
    if not is_connected:
        print("Skipping Ollama analysis - service not available")
        return None
    # Check if requested model is available
    if model not in available_models:
        print(f"Model '{model}' not available. Available models: {available_models}")
        if available_models:
            model = available_models[0]  # Use first available model
            print(f"Using '{model}' instead")
        else:
            return None
    # Truncate prompt if too long (Ollama has context limits)
    max_prompt_length = 4000  # Conservative limit
    if len(prompt) > max_prompt_length:
        print(f"Warning: Prompt too long ({len(prompt)} chars), truncating to {max_prompt_length}")
        prompt = prompt[:max_prompt_length] + "\n\n[Analysis truncated due to length limits]"
    for attempt in range(max_retries):
        try:
            print(f"Sending request to Ollama (attempt {attempt + 1}/{max_retries})...")
            start_time = time.time()
            # stream=False: wait for the complete response in one payload.
            response = requests.post(
                'http://localhost:11434/api/generate',
                json={
                    'model': model,
                    'prompt': prompt,
                    'stream': False,
                    'options': {
                        'temperature': 0.7,
                        'top_p': 0.9,
                        'num_predict': 2000  # Limit response length
                    }
                },
                timeout=timeout
            )
            elapsed_time = time.time() - start_time
            print(f"Request completed in {elapsed_time:.1f} seconds")
            if response.status_code == 200:
                result = response.json()
                if 'response' in result:
                    return result['response']
                else:
                    # 200 without a 'response' key is malformed; do not retry.
                    print(f"Unexpected response format: {result}")
                    return None
            else:
                # Non-200 falls through to the next retry iteration.
                print(f"HTTP Error {response.status_code}: {response.text}")
        except requests.exceptions.Timeout:
            print(f"Request timed out after {timeout} seconds (attempt {attempt + 1})")
            if attempt < max_retries - 1:
                time.sleep(2)  # Wait before retry
        except requests.exceptions.ConnectionError:
            print(f"Connection error (attempt {attempt + 1})")
            if attempt < max_retries - 1:
                time.sleep(2)
        except Exception as e:
            print(f"Error querying Ollama (attempt {attempt + 1}): {e}")
            if attempt < max_retries - 1:
                time.sleep(2)
    print("All Ollama attempts failed")
    return None
def statistical_disparity_tests(df):
    """Run the significance-test battery for demographic spill disparities.

    Tests: chi-square on income-quartile counts, binomial tests for
    poverty (vs a 20% baseline) and minority (vs 30%) over-representation,
    and a two-proportion z-test on major-spill rates.

    Returns a dict of the key statistics and p-values.
    """
    print("STATISTICAL SIGNIFICANCE TESTS")
    print("="*50)
    n_total = len(df)
    # --- 1. Are spills evenly spread across income quartiles? ---
    quartiles = pd.qcut(df['median_household_income'], 4, labels=['Q1', 'Q2', 'Q3', 'Q4'])
    per_quartile = df.groupby(quartiles).size()
    chi2_income, p_income = stats.chisquare(per_quartile, f_exp=[n_total / 4] * 4)
    print(f"Income Distribution Test:")
    print(f"  Chi-square statistic: {chi2_income:.3f}")
    print(f"  p-value: {p_income:.6f}")
    print(f"  Significant disparity: {'YES' if p_income < 0.001 else 'NO'}")
    # --- 2. Poverty over-representation (binomial vs 20% national baseline) ---
    high_poverty = df['percent_poverty'] > 15
    n_high_poverty = high_poverty.sum()
    expected_high_poverty = 0.20 * n_total
    print(f"\nPoverty Analysis:")
    print(f"  High-poverty spills: {n_high_poverty}")
    print(f"  Expected (if random): {expected_high_poverty:.0f}")
    print(f"  Ratio: {n_high_poverty / expected_high_poverty:.2f}x")
    poverty_p = stats.binomtest(n_high_poverty, n_total, 0.20, alternative='greater').pvalue
    print(f"  Binomial test p-value: {poverty_p:.6f}")
    print(f"  Significant over-representation: {'YES' if poverty_p < 0.001 else 'NO'}")
    # --- 3. Do major spills (>5 bbl) hit high-poverty areas more often? ---
    major = df['More than five barrels spilled'].astype(str) == 'Y'
    major_in_poor = df[high_poverty & major].shape[0]
    n_poor = high_poverty.sum()
    major_in_rich = df[~high_poverty & major].shape[0]
    n_rich = (~high_poverty).sum()
    z_stat, p_major = proportions_ztest(np.array([major_in_poor, major_in_rich]),
                                        np.array([n_poor, n_rich]))
    print(f"\nMajor Spills in High-Poverty Areas:")
    print(f"  High poverty major spill rate: {major_in_poor/n_poor:.3f}")
    print(f"  Low poverty major spill rate: {major_in_rich/n_rich:.3f}")
    print(f"  Z-statistic: {z_stat:.3f}")
    print(f"  p-value: {p_major:.6f}")
    print(f"  Significant difference: {'YES' if p_major < 0.05 else 'NO'}")
    # --- 4. Minority-community over-representation (binomial vs 30% baseline) ---
    minority = df['percent_white'] < 70
    n_minority = minority.sum()
    expected_minority = 0.30 * n_total
    print(f"\nRacial Demographics Analysis:")
    print(f"  Minority community spills: {n_minority}")
    print(f"  Expected (if random): {expected_minority:.0f}")
    print(f"  Ratio: {n_minority / expected_minority:.2f}x")
    minority_p = stats.binomtest(n_minority, n_total, 0.30, alternative='greater').pvalue
    print(f"  Binomial test p-value: {minority_p:.6f}")
    print(f"  Significant over-representation: {'YES' if minority_p < 0.05 else 'NO'}")
    return {
        'income_chi2': {'statistic': chi2_income, 'p_value': p_income},
        'poverty_binomial': {'p_value': poverty_p, 'observed_ratio': n_high_poverty / expected_high_poverty},
        'major_spills_ztest': {'z_statistic': z_stat, 'p_value': p_major},
        'minority_binomial': {'p_value': minority_p, 'observed_ratio': n_minority / expected_minority}
    }
def spatial_analysis(df):
    """Perform spatial analysis of spill patterns.

    Three stages: DBSCAN clustering of spill points, Moran's I / LISA
    spatial-autocorrelation tests on demographics, and a 5km-grid hotspot
    count.

    Args:
        df: spill records with Longitude/Latitude and demographic columns.

    Returns:
        (gdf, spill_density, n_clusters): the WGS84 GeoDataFrame with a
        'cluster' label column, the per-cell density DataFrame, and the
        DBSCAN cluster count.
    """
    print("\nSPATIAL ANALYSIS")
    print("="*50)
    # Build point geometries in WGS84 from the raw lon/lat columns.
    gdf = gpd.GeoDataFrame(
        df,
        geometry=gpd.points_from_xy(df['Longitude'], df['Latitude']),
        crs='EPSG:4326'
    )
    # NOTE(review): EPSG:3857 is Web Mercator, not Colorado State Plane.
    # Its "meters" are inflated away from the equator (~30% at Colorado
    # latitudes), so eps/grid sizes below are approximate — confirm whether
    # a true equal-distance CRS (e.g. EPSG:26913) was intended.
    gdf_proj = gdf.to_crs('EPSG:3857')  # Web Mercator for general analysis
    # 1. Spatial Clustering Analysis (DBSCAN)
    coords = np.column_stack([gdf_proj.geometry.x, gdf_proj.geometry.y])
    # DBSCAN on projected coordinates; eps is roughly 1km in map units,
    # min_samples=10 points required to seed a cluster.
    eps = 1000
    min_samples = 10
    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    clusters = dbscan.fit_predict(coords)
    gdf['cluster'] = clusters
    # DBSCAN labels noise as -1; exclude it from the cluster count.
    n_clusters = len(set(clusters)) - (1 if -1 in clusters else 0)
    n_noise = list(clusters).count(-1)
    print(f"Spatial Clustering Results:")
    print(f"  Number of clusters: {n_clusters}")
    print(f"  Number of noise points: {n_noise}")
    print(f"  Clustered points: {len(gdf) - n_noise}")
    # 2. Moran's I for spatial autocorrelation
    if len(gdf) > 100:  # Only if we have enough points
        # Remove any rows with missing values for spatial analysis
        gdf_spatial = gdf.dropna(subset=['percent_poverty', 'median_household_income'])
        if len(gdf_spatial) > 100:
            # K-nearest-neighbor spatial weights (k capped by sample size).
            coords_array = np.column_stack([gdf_spatial.geometry.x, gdf_spatial.geometry.y])
            w = KNN.from_array(coords_array, k=min(8, len(gdf_spatial)-1))
            w.transform = 'r'  # Row standardization
            # Test spatial autocorrelation of poverty rates
            try:
                moran_poverty = esda.Moran(gdf_spatial['percent_poverty'], w)
                print(f"\nSpatial Autocorrelation (Moran's I):")
                print(f"  Poverty rate Moran's I: {moran_poverty.I:.4f}")
                print(f"  p-value: {moran_poverty.p_sim:.4f}")
                print(f"  Significant clustering: {'YES' if moran_poverty.p_sim < 0.05 else 'NO'}")
                # Test for income
                moran_income = esda.Moran(gdf_spatial['median_household_income'], w)
                print(f"  Income Moran's I: {moran_income.I:.4f}")
                print(f"  p-value: {moran_income.p_sim:.4f}")
                # LISA: local Moran statistics to locate individual clusters.
                lisa_poverty = esda.Moran_Local(gdf_spatial['percent_poverty'], w)
                # Count locations whose local statistic is significant at 5%.
                significant_clusters = np.sum(lisa_poverty.p_sim < 0.05)
                print(f"  Significant local poverty clusters: {significant_clusters}")
            except Exception as e:
                print(f"  Spatial autocorrelation analysis failed: {e}")
        else:
            print(f"  Insufficient valid spatial data: {len(gdf_spatial)} points")
    # 3. Hotspot Analysis
    # Lay a regular grid over the data extent and count spills per cell.
    xmin, ymin, xmax, ymax = gdf_proj.total_bounds
    # Create 5km x 5km grid
    grid_size = 5000  # 5km in meters
    x_coords = np.arange(xmin, xmax + grid_size, grid_size)
    y_coords = np.arange(ymin, ymax + grid_size, grid_size)
    spill_density = calculate_spill_density(gdf_proj, x_coords, y_coords, grid_size)
    print(f"\nHotspot Analysis:")
    print(f"  Grid cells created: {len(spill_density)}")
    if len(spill_density) > 0:
        print(f"  Max spills per 5km cell: {spill_density['spill_count'].max()}")
        print(f"  Mean spills per cell: {spill_density['spill_count'].mean():.2f}")
    else:
        print(" No grid cells with spills found")
    return gdf, spill_density, n_clusters
def calculate_spill_density(gdf_proj, x_coords, y_coords, grid_size):
    """Count spills (and summarize cell demographics) on a regular grid.

    Args:
        gdf_proj: projected GeoDataFrame; geometry coordinates share the
            units of grid_size.
        x_coords, y_coords: ascending, evenly spaced (by grid_size) cell
            edges; the final entry closes the last cell.
        grid_size: cell edge length.

    Returns:
        DataFrame with one row per non-empty cell: grid-center coordinates,
        spill count, mean poverty/income, and major (>5 bbl) spill count.
        Empty DataFrame when no point falls inside the grid.

    Note: assigns every point to its cell in a single vectorized pass and
    aggregates with groupby, replacing the previous O(cells x points)
    rescan of the whole dataset for each cell. Cell i spans
    [edge[i], edge[i] + grid_size); points exactly on an interior edge may
    differ from the old >=/< scan by float rounding only.
    """
    px = np.asarray(gdf_proj.geometry.x, dtype=float)
    py = np.asarray(gdf_proj.geometry.y, dtype=float)
    if len(px) == 0 or len(x_coords) < 2 or len(y_coords) < 2:
        return pd.DataFrame([])
    edge_x = np.asarray(x_coords, dtype=float)
    edge_y = np.asarray(y_coords, dtype=float)
    # Cell index per point, then mask points outside the grid extent.
    ix = np.floor((px - edge_x[0]) / grid_size).astype(int)
    iy = np.floor((py - edge_y[0]) / grid_size).astype(int)
    inside = (ix >= 0) & (ix < len(edge_x) - 1) & (iy >= 0) & (iy < len(edge_y) - 1)
    pts = pd.DataFrame({
        'ix': ix[inside],
        'iy': iy[inside],
        'poverty': np.asarray(gdf_proj['percent_poverty'], dtype=float)[inside],
        'income': np.asarray(gdf_proj['median_household_income'], dtype=float)[inside],
        'major': (np.asarray(gdf_proj['More than five barrels spilled'].astype(str)) == 'Y')[inside],
    })
    if pts.empty:
        return pd.DataFrame([])
    # groupby sorts by (ix, iy), matching the original x-outer/y-inner scan order.
    agg = pts.groupby(['ix', 'iy']).agg(
        spill_count=('major', 'size'),
        avg_poverty=('poverty', 'mean'),
        avg_income=('income', 'mean'),
        major_spills=('major', 'sum'),
    ).reset_index()
    # Cell centers derived from the actual edge arrays for exact parity.
    agg['grid_x'] = edge_x[agg['ix'].to_numpy()] + grid_size / 2
    agg['grid_y'] = edge_y[agg['iy'].to_numpy()] + grid_size / 2
    return agg[['grid_x', 'grid_y', 'spill_count', 'avg_poverty', 'avg_income', 'major_spills']]
def spatial_regression_analysis(gdf):
    """Fit an OLS model of major-spill probability on demographics with
    normalized lat/lon as crude spatial controls, printing key coefficients.

    Returns the fitted statsmodels results object, or None on failure.
    """
    print("\nSPATIAL REGRESSION ANALYSIS")
    print("="*50)
    data = gdf.copy()
    # Binary outcome and demographic indicator columns.
    data['major_spill'] = (data['More than five barrels spilled'].astype(str) == 'Y').astype(int)
    data['high_poverty'] = (data['percent_poverty'] > 15).astype(int)
    data['minority_community'] = (data['percent_white'] < 70).astype(int)
    # Z-scored coordinates stand in for unobserved location effects.
    for src, dst in (('Latitude', 'lat_norm'), ('Longitude', 'lon_norm')):
        data[dst] = (data[src] - data[src].mean()) / data[src].std()
    formula = 'major_spill ~ percent_poverty + percent_white + median_household_income + lat_norm + lon_norm'
    try:
        fitted = ols(formula, data=data).fit()
        print("Regression Results (Major Spill Probability):")
        print(f"  R-squared: {fitted.rsquared:.4f}")
        print(f"  F-statistic p-value: {fitted.f_pvalue:.6f}")
        # Headline coefficients, with safe defaults if a term was dropped.
        coef_poverty = fitted.params.get('percent_poverty', 0)
        pval_poverty = fitted.pvalues.get('percent_poverty', 1)
        coef_white = fitted.params.get('percent_white', 0)
        pval_white = fitted.pvalues.get('percent_white', 1)
        coef_income = fitted.params.get('median_household_income', 0)
        pval_income = fitted.pvalues.get('median_household_income', 1)
        print(f"\nKey Findings:")
        print(f"  Poverty rate coefficient: {coef_poverty:.6f} (p={pval_poverty:.4f})")
        print(f"  White percentage coefficient: {coef_white:.6f} (p={pval_white:.4f})")
        print(f"  Income coefficient: {coef_income:.8f} (p={pval_income:.4f})")
        return fitted
    except Exception as e:
        print(f"Regression analysis failed: {e}")
        return None
def generate_spatial_statistical_report(stats_results, spatial_results, model_results=None):
    """Produce the narrative interpretation of the spatial/statistical tests.

    Builds a compact prompt from the key results, asks the local Ollama LLM
    for an academic interpretation, and falls back to a canned summary when
    the service is unreachable.
    """
    # Boil the regression down to one line; the full statsmodels summary
    # would blow past the prompt budget.
    regression_summary = "No regression analysis available"
    if model_results and hasattr(model_results, 'rsquared'):
        params = model_results.params
        pvalues = model_results.pvalues
        regression_summary = (
            f"R²={model_results.rsquared:.3f}, "
            f"poverty coef={params.get('percent_poverty', 0):.4f} "
            f"(p={pvalues.get('percent_poverty', 1):.3f}), "
            f"income coef={params.get('median_household_income', 0):.6f} "
            f"(p={pvalues.get('median_household_income', 1):.3f})"
        )
    prompt = f"""Analyze these environmental justice findings:
STATISTICAL TESTS:
- Poverty over-representation: {stats_results['poverty_binomial']['observed_ratio']:.2f}x expected (p={stats_results['poverty_binomial']['p_value']:.4f})
- Major spills z-test p-value: {stats_results['major_spills_ztest']['p_value']:.4f}
- Minority communities ratio: {stats_results['minority_binomial']['observed_ratio']:.2f}x
SPATIAL PATTERNS:
- {spatial_results['n_clusters']} spatial clusters identified
- Max density: {spatial_results.get('max_density', 'N/A')} spills per grid cell
REGRESSION RESULTS:
- {regression_summary}
Provide a 300-word academic interpretation focusing on environmental justice implications and policy recommendations."""
    print("\nGenerating academic interpretation with Ollama...")
    report = query_ollama(prompt)
    if report is None:
        # Static fallback when the LLM is unavailable.
        report = f"""
ENVIRONMENTAL JUSTICE ANALYSIS - STATISTICAL SUMMARY
The statistical analysis reveals significant disparities in oil and gas spill distribution across demographic lines:
POVERTY DISPARITIES:
High-poverty areas experience {stats_results['poverty_binomial']['observed_ratio']:.2f} times more spills than expected by chance (p = {stats_results['poverty_binomial']['p_value']:.6f}). This suggests a systematic pattern of environmental burden concentration in economically disadvantaged communities.
SPATIAL CLUSTERING:
The analysis identified {spatial_results['n_clusters']} distinct spatial clusters of spill incidents, indicating that environmental risks are geographically concentrated rather than randomly distributed. This clustering pattern strengthens the case for targeted environmental justice interventions.
POLICY IMPLICATIONS:
1. Enhanced environmental monitoring in high-poverty areas
2. Stricter permitting requirements for facilities near vulnerable communities
3. Community notification and participation requirements for new developments
4. Investment in environmental remediation for affected areas
METHODOLOGICAL STRENGTHS:
- Multiple statistical tests controlling for spatial effects
- Combination of global and local spatial analysis
- Robust sample size for statistical inference
The findings provide evidence of environmental injustice requiring policy intervention and continued monitoring.
"""
        print("Using fallback report (Ollama unavailable)")
    return report
def create_visualizations(gdf, spill_density):
    """Render the four summary panels and save the figure.

    Panels: spill locations colored by poverty rate, spill counts per
    income quartile, stacked major/minor counts by poverty level, and the
    gridded density hotspots. Writes 'environmental_justice_analysis.png'
    and calls plt.show().
    """
    fig, panels = plt.subplots(2, 2, figsize=(15, 12))

    # Panel 1: raw spill coordinates, colored by tract poverty rate.
    loc_ax = panels[0, 0]
    loc_pts = loc_ax.scatter(gdf['Longitude'], gdf['Latitude'],
                             c=gdf['percent_poverty'], cmap='Reds',
                             alpha=0.6, s=10)
    loc_ax.set_title('Spill Locations by Poverty Rate')
    loc_ax.set_xlabel('Longitude')
    loc_ax.set_ylabel('Latitude')
    plt.colorbar(loc_pts, ax=loc_ax, label='Poverty Rate (%)')

    # Panel 2: spill counts per household-income quartile.
    inc_ax = panels[0, 1]
    quartiles = pd.qcut(gdf['median_household_income'], 4, labels=['Q1', 'Q2', 'Q3', 'Q4'])
    counts_by_quartile = gdf.groupby(quartiles).size()
    inc_ax.bar(counts_by_quartile.index, counts_by_quartile.values)
    inc_ax.set_title('Spills by Income Quartile')
    inc_ax.set_xlabel('Income Quartile')
    inc_ax.set_ylabel('Number of Spills')

    # Panel 3: stacked major/minor spill counts split by poverty level.
    sev_ax = panels[1, 0]
    major_flag = gdf['More than five barrels spilled'].astype(str) == 'Y'
    high_pov = gdf['percent_poverty'] > 15
    low_pov = gdf['percent_poverty'] <= 15
    severity = pd.DataFrame({
        'High Poverty': [int((high_pov & major_flag).sum()),
                         int((high_pov & ~major_flag).sum())],
        'Low Poverty': [int((low_pov & major_flag).sum()),
                        int((low_pov & ~major_flag).sum())],
    }, index=['Major Spills', 'Minor Spills'])
    severity.plot(kind='bar', ax=sev_ax, stacked=True)
    sev_ax.set_title('Spill Severity by Poverty Level')
    sev_ax.set_xlabel('Spill Type')
    sev_ax.set_ylabel('Count')
    sev_ax.legend(title='Community Type')

    # Panel 4: gridded spill density, only when the grid is non-empty.
    den_ax = panels[1, 1]
    if len(spill_density) > 0:
        den_pts = den_ax.scatter(spill_density['grid_x'], spill_density['grid_y'],
                                 c=spill_density['spill_count'], cmap='YlOrRd',
                                 s=spill_density['spill_count']*10, alpha=0.7)
        den_ax.set_title('Spill Density Hotspots (5km Grid)')
        den_ax.set_xlabel('X Coordinate (Projected)')
        den_ax.set_ylabel('Y Coordinate (Projected)')
        plt.colorbar(den_pts, ax=den_ax, label='Spills per Cell')

    plt.tight_layout()
    plt.savefig('environmental_justice_analysis.png', dpi=300, bbox_inches='tight')
    plt.show()
# Main execution
def run_comprehensive_analysis(csv_file, use_ollama=True):
    """Run the complete statistical and spatial environmental-justice analysis.

    Pipeline: load the CSV, run disparity tests, spatial clustering, spatial
    regression, render the summary figure, optionally generate a narrative
    report via Ollama, and persist everything to disk.

    Args:
        csv_file: Path to the spills-with-demographics CSV.
        use_ollama: When True, check the local Ollama server and use it for
            the narrative report; otherwise skip LLM generation.

    Returns:
        Dict with the statistical tests, spatial summary, regression
        summaries (full and one-line brief), and the narrative report.
        Also writes statistical_spatial_analysis.json, academic_report.txt,
        and environmental_justice_analysis.png.
    """
    print("COMPREHENSIVE STATISTICAL & SPATIAL ENVIRONMENTAL JUSTICE ANALYSIS")
    print("="*80)
    # Load data
    df = pd.read_csv(csv_file)
    print(f"Loaded {len(df)} spill incidents")
    # Check Ollama availability if requested
    if use_ollama:
        print("\nChecking Ollama connection...")
        check_ollama_connection()
    # Statistical analysis
    stats_results = statistical_disparity_tests(df)
    # Spatial analysis
    gdf, spill_density, n_clusters = spatial_analysis(df)
    # Spatial regression
    model = spatial_regression_analysis(gdf)
    # Create visualizations
    create_visualizations(gdf, spill_density)
    # Generate comprehensive report
    spatial_results = {'n_clusters': n_clusters}
    if len(spill_density) > 0:
        spatial_results['max_density'] = spill_density['spill_count'].max()
    # Build the short and full regression summaries once. (The original also
    # assigned an unused `model_summary` here, calling model.summary() twice;
    # that dead code is removed.)
    model_summary_short = "No regression analysis performed"
    model_summary_full = "Regression analysis not available"
    if model and hasattr(model, 'rsquared'):
        model_summary_short = f"OLS Regression Results: R²={model.rsquared:.4f}, F-stat p-value={model.f_pvalue:.6f}"
        model_summary_full = str(model.summary())
    if use_ollama:
        report = generate_spatial_statistical_report(stats_results, spatial_results, model)
    else:
        print("\nSkipping Ollama report generation...")
        report = "Ollama report generation skipped by user"
    # Save results
    results = {
        'statistical_tests': stats_results,
        'spatial_analysis': spatial_results,
        'regression_summary': model_summary_full,  # Full summary for detailed analysis
        'regression_summary_brief': model_summary_short,  # One-line digest (was computed but unused)
        'academic_interpretation': report
    }
    with open('statistical_spatial_analysis.json', 'w') as f:
        json.dump(results, f, indent=2, default=str)
    with open('academic_report.txt', 'w') as f:
        f.write(report)
    print(f"\nAnalysis complete. Results saved to:")
    print(f" - statistical_spatial_analysis.json")
    print(f" - academic_report.txt")
    print(f" - environmental_justice_analysis.png")
    return results
# Script entry point: run the full pipeline against the merged spills+demographics dataset.
if __name__ == "__main__":
    # You can disable Ollama with use_ollama=False if it's not available
    results = run_comprehensive_analysis('data/spills_with_demographics.csv', use_ollama=True)

View File

@@ -0,0 +1,307 @@
import pandas as pd
import requests
import json
from collections import Counter, defaultdict
import numpy as np
def query_ollama(prompt, model="mistral", timeout=120):
    """Send a prompt to the local Ollama instance and return its response text.

    Args:
        prompt: Text prompt forwarded verbatim to the model.
        model: Ollama model tag to run (default "mistral").
        timeout: Seconds to wait for the HTTP call. The original request had
            no timeout and could hang indefinitely on a stalled server.

    Returns:
        The model's response string, or None if the request fails.
    """
    try:
        response = requests.post(
            'http://localhost:11434/api/generate',
            json={
                'model': model,
                'prompt': prompt,
                'stream': False
            },
            timeout=timeout)
        # Surface HTTP errors explicitly instead of a confusing KeyError below.
        response.raise_for_status()
        return response.json().get('response')
    except Exception as e:
        print(f"Error querying Ollama: {e}")
        return None
def analyze_spill_demographics(df):
    """Summarize demographic context of spills and basic EJ indicators.

    Args:
        df: Spill records joined with tract demographics; must contain the
            demographic columns referenced below plus
            'Produced Water Spill Volume' and 'More than five barrels spilled'.

    Returns:
        (demo_stats, ej_analysis) dicts. 'high_poverty_avg_volume' is the
        mean produced-water volume in high-poverty areas; the original code
        summed it despite the 'avg' key name.
    """
    # Basic demographic statistics
    demo_stats = {
        'total_spills': len(df),
        'avg_median_income': df['median_household_income'].mean(),
        'avg_poverty_rate': df['percent_poverty'].mean(),
        'avg_white_percentage': df['percent_white'].mean(),
        'avg_hispanic_percentage': df['percent_hispanic'].mean(),
        'avg_unemployment': df['unemployment_rate'].mean()
    }
    # Environmental justice analysis
    # Define high-poverty communities (>15% poverty rate)
    high_poverty = df[df['percent_poverty'] > 15]
    low_poverty = df[df['percent_poverty'] <= 15]
    # Define minority communities (>30% non-white)
    minority_communities = df[df['percent_white'] < 70]
    # Convert spill volumes to numeric, handling 'Unknown' values (-> NaN).
    high_poverty_volumes = pd.to_numeric(high_poverty['Produced Water Spill Volume'], errors='coerce')
    ej_analysis = {
        'high_poverty_spills': len(high_poverty),
        # BUGFIX: the key says "avg" but the original computed .sum().
        'high_poverty_avg_volume': high_poverty_volumes.mean(),
        'minority_community_spills': len(minority_communities),
        'spills_by_income_quartile': df.groupby(
            pd.qcut(df['median_household_income'], 4, labels=['Q1(Lowest)', 'Q2', 'Q3', 'Q4(Highest)'])
        ).size().to_dict(),
        'major_spills_by_poverty': {
            'high_poverty_major': len(high_poverty[high_poverty['More than five barrels spilled'] == 'Y']),
            'low_poverty_major': len(low_poverty[low_poverty['More than five barrels spilled'] == 'Y'])
        }
    }
    return demo_stats, ej_analysis
def analyze_root_causes(df):
    """Tally the pre-categorized cause flags and top root-cause descriptions.

    Returns:
        (cause_counts, root_causes): a dict of summed indicator columns and
        a Series of the ten most frequent 'Root Cause' strings.
    """
    # Map output keys to their source indicator columns; NaN counts as 0.
    # 'Historical Unkown' is a typo that exists in the source data itself.
    flag_columns = {
        'human_error': 'Human Error',
        'equipment_failure': 'Equipment Failure',
        'historical_unknown': 'Historical Unkown',
        'other': 'Other',
    }
    cause_counts = {key: df[col].fillna(0).sum() for key, col in flag_columns.items()}

    # Ten most common free-text root-cause descriptions.
    root_causes = df['Root Cause'].dropna().value_counts().head(10)
    return cause_counts, root_causes
def analyze_spill_themes_llm(df, sample_size=50, random_state=None):
    """Use the local LLM to extract themes from sampled spill descriptions.

    Args:
        df: Spill records with a 'Spill Description' column.
        sample_size: Max number of descriptions sent to the model (sampling
            keeps the prompt within a size the model handles).
        random_state: Optional seed for reproducible sampling. Default None
            preserves the original non-deterministic behavior.

    Returns:
        The model's structured analysis string, a fixed message when no
        descriptions exist, or None if Ollama is unreachable.
    """
    # Sample descriptions for LLM analysis (to avoid overwhelming it)
    descriptions_series = df['Spill Description'].dropna()
    if len(descriptions_series) == 0:
        return "No spill descriptions available for analysis."
    sample_descriptions = descriptions_series.sample(
        min(sample_size, len(descriptions_series)), random_state=random_state
    ).tolist()
    # Combine descriptions for batch analysis
    combined_text = "\n---\n".join(sample_descriptions)
    prompt = f"""
Analyze these oil and gas spill incident descriptions to identify themes and patterns.
Focus on:
1. Common equipment failures (tanks, valves, pipelines, etc.)
2. Operational issues (overflow, leaks, maintenance problems)
3. Environmental factors (weather, terrain, wildlife)
4. Human factors (operator error, maintenance issues)
5. Discovery methods (routine inspection, alarms, third-party reports)
6. Spill severity indicators
Incident descriptions:
{combined_text}
Provide a structured analysis with:
- Top 5 equipment failure patterns
- Most common operational issues
- Environmental risk factors
- Human factor patterns
- Recommendations for prevention based on these patterns
Format as a concise regulatory summary suitable for policy recommendations.
"""
    return query_ollama(prompt)
def demographic_spill_analysis(df):
    """Cross-tabulate spill counts against income, poverty, and race categories.

    Returns:
        Dict of spill counts per demographic bucket plus major-spill counts
        for high-poverty (>15% poverty) and minority (<70% white) tracts.
    """
    # Work on a copy so the caller's frame is left untouched.
    frame = df.copy()

    # Equal-width income terciles over the observed income range.
    frame['income_category'] = pd.cut(frame['median_household_income'],
                                      bins=3, labels=['Low Income', 'Middle Income', 'High Income'])
    # Fixed poverty-rate breakpoints (percent of population in poverty).
    frame['poverty_category'] = pd.cut(frame['percent_poverty'],
                                       bins=[0, 10, 20, 100], labels=['Low Poverty', 'Moderate Poverty', 'High Poverty'])
    # Majority-white threshold at 70% white population.
    frame['race_category'] = frame['percent_white'].apply(
        lambda pct: 'Majority White' if pct >= 70 else 'Minority Community'
    )

    # Boolean masks reused for the severity counts below.
    is_major = frame['More than five barrels spilled'].astype(str) == 'Y'
    is_high_pov = frame['percent_poverty'] > 15
    is_minority = frame['percent_white'] < 70

    return {
        'spills_by_income': frame.groupby('income_category').size().to_dict(),
        'spills_by_poverty': frame.groupby('poverty_category').size().to_dict(),
        'spills_by_race': frame.groupby('race_category').size().to_dict(),
        'volume_by_demographics': {
            'high_poverty_major_spills': int((is_high_pov & is_major).sum()),
            'minority_major_spills': int((is_minority & is_major).sum()),
        },
    }
def analyze_environmental_justice(df, sample_descriptions=20):
    """Use the local LLM to assess EJ implications of sampled incidents.

    Samples descriptions from high-poverty (>15% poverty) and minority
    (<70% white) tracts and asks the model for an environmental justice
    assessment.

    Args:
        df: Spill records with 'percent_poverty', 'percent_white', and
            'Spill Description' columns.
        sample_descriptions: Total descriptions to sample, split evenly
            between the two groups.

    Returns:
        The model's assessment string, a fixed message when either group
        has no descriptions, or None if Ollama is unreachable.
    """
    # Get descriptions from high-poverty and minority communities
    high_poverty_desc = df[df['percent_poverty'] > 15]['Spill Description'].dropna()
    minority_desc = df[df['percent_white'] < 70]['Spill Description'].dropna()
    if len(high_poverty_desc) == 0 or len(minority_desc) == 0:
        return "Insufficient data for environmental justice analysis."
    high_poverty_spills = high_poverty_desc.sample(min(sample_descriptions//2, len(high_poverty_desc))).tolist()
    minority_spills = minority_desc.sample(min(sample_descriptions//2, len(minority_desc))).tolist()
    # BUGFIX: the original concatenated the two joined sections with no
    # separator between them, so the first minority description was glued
    # onto the last high-poverty one and never labeled. Insert the section
    # marker between the groups as well.
    combined_ej_text = (
        "\n---HIGH POVERTY AREA---\n".join(high_poverty_spills)
        + "\n---MINORITY COMMUNITY---\n"
        + "\n---MINORITY COMMUNITY---\n".join(minority_spills)
    )
    prompt = f"""
Analyze these spill incidents from high-poverty and minority communities for environmental justice concerns.
Consider:
1. Severity of incidents in vulnerable communities
2. Response effectiveness and cleanup completion
3. Long-term environmental impacts
4. Patterns that might indicate disproportionate impacts
5. Regulatory compliance and enforcement patterns
Spill descriptions:
{combined_ej_text}
Provide an environmental justice assessment focusing on:
- Whether vulnerable communities face more severe incidents
- Quality of response and remediation
- Policy recommendations for equitable environmental protection
"""
    return query_ollama(prompt)
def comprehensive_spill_analysis(csv_file):
    """Run the full demographic, root-cause, and LLM analysis pipeline.

    Args:
        csv_file: Path to the spills-with-demographics CSV.

    Returns:
        Nested dict of summary statistics, EJ metrics, root causes,
        demographic patterns, and the two LLM narrative analyses.
    """
    print("Loading spill data...")
    spills = pd.read_csv(csv_file)
    print(f"Analyzing {len(spills)} spill incidents...")

    # Quantitative passes over the joined dataset.
    demo_stats, ej_analysis = analyze_spill_demographics(spills)
    cause_counts, root_causes = analyze_root_causes(spills)
    demo_patterns = demographic_spill_analysis(spills)

    # Narrative passes that call the local LLM.
    print("Running LLM analysis on spill descriptions...")
    theme_analysis = analyze_spill_themes_llm(spills, sample_size=100)
    print("Analyzing environmental justice implications...")
    ej_llm_analysis = analyze_environmental_justice(spills, sample_descriptions=30)

    # Compile every result into one serializable structure.
    return {
        'summary_statistics': {
            'total_incidents': len(spills),
            'date_range': f"{spills['Date of Discovery'].min()} to {spills['Date of Discovery'].max()}",
            'counties_affected': spills['county'].nunique(),
            'operators_involved': spills['Operator'].nunique()
        },
        'demographic_statistics': demo_stats,
        'environmental_justice_analysis': ej_analysis,
        'root_cause_analysis': {
            'cause_counts': cause_counts,
            'top_root_causes': root_causes.to_dict()
        },
        'demographic_patterns': demo_patterns,
        'llm_theme_analysis': theme_analysis,
        'llm_environmental_justice': ej_llm_analysis
    }
def generate_policy_report(results):
    """Generate policy-focused summary using LLM.

    Flattens the nested output of comprehensive_spill_analysis() into a
    plain-text digest, then asks the local Ollama model to turn it into an
    executive summary for regulators.

    Args:
        results: Nested dict produced by comprehensive_spill_analysis().

    Returns:
        The LLM's executive summary string, or None when Ollama is
        unreachable (query_ollama returns None on failure).
    """
    # Create summary for LLM to process
    summary_text = f"""
SPILL DATA ANALYSIS SUMMARY:
Total Incidents: {results['summary_statistics']['total_incidents']}
Date Range: {results['summary_statistics']['date_range']}
DEMOGRAPHIC PATTERNS:
- Average poverty rate in affected areas: {results['demographic_statistics']['avg_poverty_rate']:.1f}%
- Average income: ${results['demographic_statistics']['avg_median_income']:,.0f}
- Spills in high-poverty areas: {results['environmental_justice_analysis']['high_poverty_spills']}
- Spills in minority communities: {results['environmental_justice_analysis']['minority_community_spills']}
ROOT CAUSES:
- Equipment failures: {results['root_cause_analysis']['cause_counts']['equipment_failure']}
- Human error: {results['root_cause_analysis']['cause_counts']['human_error']}
- Historical/unknown: {results['root_cause_analysis']['cause_counts']['historical_unknown']}
THEME ANALYSIS:
{results['llm_theme_analysis']}
ENVIRONMENTAL JUSTICE ANALYSIS:
{results['llm_environmental_justice']}
"""
    # Wrap the digest in the instruction prompt for the model.
    policy_prompt = f"""
Based on this comprehensive spill data analysis, create a policy-focused executive summary.
Data Summary:
{summary_text}
Provide:
1. Key findings on environmental justice impacts
2. Priority areas for regulatory attention
3. Specific policy recommendations for prevention
4. Recommendations for equitable enforcement
5. Suggested regulatory changes based on patterns identified
Format as an executive summary suitable for regulatory decision-makers and policy researchers.
"""
    return query_ollama(policy_prompt)
# Execute comprehensive analysis
if __name__ == "__main__":
    # Run the analysis
    results = comprehensive_spill_analysis('spills_with_demographics.csv')
    # Generate policy report
    print("\nGenerating policy-focused summary...")
    policy_report = generate_policy_report(results)
    # BUGFIX: generate_policy_report returns None when Ollama is unreachable;
    # the original then crashed on f.write(None) and policy_report[:500].
    if policy_report is None:
        policy_report = "Policy summary unavailable: could not reach the local Ollama service."
    # Save all results
    with open('comprehensive_spill_analysis.json', 'w') as f:
        json.dump(results, f, indent=2, default=str)
    with open('policy_executive_summary.txt', 'w') as f:
        f.write(policy_report)
    # Print key findings
    print("\n" + "="*60)
    print("COMPREHENSIVE SPILL ANALYSIS COMPLETE")
    print("="*60)
    print(f"\nTotal incidents analyzed: {results['summary_statistics']['total_incidents']:,}")
    print(f"Counties affected: {results['summary_statistics']['counties_affected']}")
    print(f"Average poverty rate in affected areas: {results['demographic_statistics']['avg_poverty_rate']:.1f}%")
    print(f"Spills in high-poverty communities: {results['environmental_justice_analysis']['high_poverty_spills']:,}")
    print(f"Spills in minority communities: {results['environmental_justice_analysis']['minority_community_spills']:,}")
    print(f"\nRoot cause breakdown:")
    for cause, count in results['root_cause_analysis']['cause_counts'].items():
        print(f" {cause.replace('_', ' ').title()}: {count:,}")
    print(f"\nResults saved to:")
    print(f" - comprehensive_spill_analysis.json (detailed data)")
    print(f" - policy_executive_summary.txt (executive summary)")
    print(f"\nPolicy Summary Preview:")
    print("="*40)
    print(policy_report[:500] + "...")

View File

@@ -0,0 +1,577 @@
import pandas as pd
import geopandas as gpd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import DBSCAN
import esda
from libpysal.weights import KNN
import requests
import json
import time
from statsmodels.stats.proportion import proportions_ztest
from statsmodels.formula.api import ols
import warnings
warnings.filterwarnings('ignore')
def check_ollama_connection():
    """Check if Ollama is running and accessible.

    Returns:
        (is_running, model_names): True plus the list of installed model
        tags when the server answers with HTTP 200, otherwise (False, []).
    """
    try:
        response = requests.get('http://localhost:11434/api/tags', timeout=5)
        if response.status_code == 200:
            models = response.json().get('models', [])
            available_models = [model['name'] for model in models]
            print(f"✓ Ollama is running. Available models: {available_models}")
            return True, available_models
        else:
            return False, []
    # BUGFIX: the bare `except:` also swallowed KeyboardInterrupt/SystemExit;
    # catch only the failures this call can realistically raise.
    except (requests.RequestException, ValueError, KeyError):
        return False, []
def query_ollama(prompt, model="mistral:7b", timeout=120):
    """Send query to local Ollama instance, preferring the requested model.

    Falls back to the first installed model when the requested tag is not
    available locally.

    Args:
        prompt: Text prompt forwarded verbatim to the model.
        model: Preferred Ollama model tag.
        timeout: Seconds to wait for the generate call.

    Returns:
        The response text (possibly empty), or None on any failure.
    """
    is_connected, available_models = check_ollama_connection()
    if not is_connected:
        return None
    # Fall back to whatever is installed if the requested tag is missing.
    if model not in available_models and available_models:
        model = available_models[0]
    try:
        response = requests.post(
            'http://localhost:11434/api/generate',
            json={'model': model, 'prompt': prompt, 'stream': False},
            timeout=timeout
        )
        if response.status_code == 200:
            return response.json().get('response', '')
    # BUGFIX: the bare `except:` also swallowed KeyboardInterrupt/SystemExit;
    # catch only request/JSON failures.
    except (requests.RequestException, ValueError):
        pass
    return None
def prepare_spill_characteristics(df):
    """Prepare spill characteristic variables for analysis.

    Coerces volume columns to numeric, derives a total-volume column,
    builds boolean severity / environmental-impact / sensitive-location
    indicators, and tags each record with demographic group flags.

    Args:
        df: Raw spill records joined with tract demographics.

    Returns:
        A copy of ``df`` with the derived columns added; the input frame is
        not modified.
    """
    print("SPILL CHARACTERISTICS PREPARATION")
    print("="*50)
    df_char = df.copy()
    # Convert volume columns to numeric
    # (values such as 'Unknown' become NaN and are then treated as 0)
    volume_columns = [
        'Oil Spill Volume', 'Condensate Spill Volume', 'Flow Back Spill Volume',
        'Produced Water Spill Volume', 'E&P Waste Spill Volume', 'Drilling Fluid Spill Volume',
        'Oil BBLs Spilled', 'Condensate BBLs Spilled', 'Produced Water BBLs Spilled',
        'Drilling Fluid BBLs Spilled', 'Flow Back Fluid BBLs Spilled'
    ]
    for col in volume_columns:
        if col in df_char.columns:
            df_char[col] = pd.to_numeric(df_char[col], errors='coerce').fillna(0)
    # Create total spill volume
    # (sums only the BBL columns; the '* Spill Volume' columns are cleaned
    # above but intentionally excluded from the total)
    df_char['total_volume_bbls'] = 0
    for col in ['Oil BBLs Spilled', 'Condensate BBLs Spilled', 'Produced Water BBLs Spilled',
                'Drilling Fluid BBLs Spilled', 'Flow Back Fluid BBLs Spilled']:
        if col in df_char.columns:
            df_char['total_volume_bbls'] += df_char[col]
    # Create severity indicators (boolean flags from Y/N text columns)
    df_char['major_spill'] = (df_char['More than five barrels spilled'].astype(str) == 'Y')
    df_char['outside_berms'] = (df_char['Spilled outside of berms'].astype(str) == 'Y')
    df_char['contained'] = (df_char['Spill Contained within Berm'].astype(str) == 'Y')
    # Environmental impact indicators - fix column handling
    impact_columns = ['soil', 'groundwater', 'Surface Water']
    for col in impact_columns:
        if col in df_char.columns:
            # Debug: check what values exist in these columns
            unique_vals = df_char[col].value_counts()
            print(f" {col} column values: {unique_vals.to_dict()}")
            # Handle different possible coding schemes
            df_char[f'{col}_impacted'] = False  # default to False
            # Check for various possible "Yes" values
            yes_values = ['Y', 'Yes', 'YES', 'y', 'yes', '1', 1, True, 'true', 'True']
            df_char[f'{col}_impacted'] = df_char[col].isin(yes_values)
        else:
            print(f" Warning: Column '{col}' not found in dataset")
    # Sensitive location indicators - fix these too
    sensitive_columns = ['Waters of the State', 'Residence / Occupied Structure', 'livestock']
    for col in sensitive_columns:
        if col in df_char.columns:
            unique_vals = df_char[col].value_counts()
            print(f" {col} column values: {unique_vals.to_dict()}")
            # Handle different coding schemes
            yes_values = ['Y', 'Yes', 'YES', 'y', 'yes', '1', 1, True, 'true', 'True']
            # NOTE(review): this replacement chain maps
            # 'Residence / Occupied Structure' -> 'residence__residence_near'
            # (spaces become '_' before '/' is removed), which looks
            # unintended — confirm what column name downstream code expects
            # before changing it.
            col_name = col.lower().replace(" ", "_").replace("/", "").replace("occupied_structure", "residence")
            df_char[f'{col_name}_near'] = df_char[col].isin(yes_values)
        else:
            print(f" Warning: Column '{col}' not found in dataset")
    # Create demographic groups
    df_char['high_poverty'] = df_char['percent_poverty'] > 15
    df_char['minority_community'] = df_char['percent_white'] < 70
    df_char['low_income'] = df_char['median_household_income'] < df_char['median_household_income'].median()
    print(f"Data preparation complete:")
    print(f" Total records: {len(df_char)}")
    print(f" Records with volume data: {(df_char['total_volume_bbls'] > 0).sum()}")
    print(f" Major spills (>5 bbls): {df_char['major_spill'].sum()}")
    print(f" Historical spills: {(df_char['Spill Type'] == 'Historical').sum()}")
    print(f" Recent spills: {(df_char['Spill Type'] == 'Recent').sum()}")
    return df_char
def analyze_spill_severity_by_demographics(df):
    """Analyze spill severity patterns across demographic groups.

    Prints a console report comparing outlier-capped spill volumes,
    historical-vs-recent volumes, and major-spill (>5 bbl) rates between
    high- and low-poverty areas.

    Args:
        df: Prepared frame with 'total_volume_bbls', 'high_poverty',
            'major_spill', and 'Spill Type' columns (see
            prepare_spill_characteristics).

    Returns:
        Dict with a 'volume_analysis' entry (means, medians, Mann-Whitney
        p-value); the dict stays empty when either poverty group is empty.
    """
    print("\nSPILL SEVERITY BY DEMOGRAPHICS")
    print("="*50)
    results = {}
    # 1. Volume Analysis
    print("1. SPILL VOLUME ANALYSIS")
    print("-" * 25)
    # Compare mean volumes by demographics
    high_pov_volumes = df[df['high_poverty']]['total_volume_bbls']
    low_pov_volumes = df[~df['high_poverty']]['total_volume_bbls']
    # Remove extreme outliers for meaningful comparison (99th percentile cap)
    # NOTE: volume_cap is also reused by section 2 below.
    volume_cap = df['total_volume_bbls'].quantile(0.99)
    high_pov_capped = high_pov_volumes.clip(upper=volume_cap)
    low_pov_capped = low_pov_volumes.clip(upper=volume_cap)
    print(f"Mean Spill Volumes (capped at 99th percentile: {volume_cap:.1f} bbls):")
    print(f" High poverty areas: {high_pov_capped.mean():.2f} bbls (n={len(high_pov_capped)})")
    print(f" Low poverty areas: {low_pov_capped.mean():.2f} bbls (n={len(low_pov_capped)})")
    print(f" Median - High poverty: {high_pov_capped.median():.2f} bbls")
    print(f" Median - Low poverty: {low_pov_capped.median():.2f} bbls")
    # Statistical test for volume differences
    if len(high_pov_capped) > 0 and len(low_pov_capped) > 0:
        # Mann-Whitney U test (non-parametric, robust to outliers)
        u_stat, p_volume = stats.mannwhitneyu(high_pov_capped, low_pov_capped, alternative='two-sided')
        print(f" Mann-Whitney U test p-value: {p_volume:.6f}")
        print(f" Significant difference: {'YES' if p_volume < 0.05 else 'NO'}")
        # The summary dict is only populated when both groups have data.
        results['volume_analysis'] = {
            'high_poverty_mean': high_pov_capped.mean(),
            'low_poverty_mean': low_pov_capped.mean(),
            'high_poverty_median': high_pov_capped.median(),
            'low_poverty_median': low_pov_capped.median(),
            'p_value': p_volume
        }
    # 2. Historical vs Recent Volume Comparison
    print(f"\n2. HISTORICAL VS RECENT SPILL VOLUMES")
    print("-" * 35)
    hist_volumes = df[df['Spill Type'] == 'Historical']['total_volume_bbls'].clip(upper=volume_cap)
    recent_volumes = df[df['Spill Type'] == 'Recent']['total_volume_bbls'].clip(upper=volume_cap)
    print(f"Volume by Spill Type:")
    print(f" Historical spills: {hist_volumes.mean():.2f} bbls mean, {hist_volumes.median():.2f} bbls median")
    print(f" Recent spills: {recent_volumes.mean():.2f} bbls mean, {recent_volumes.median():.2f} bbls median")
    if len(hist_volumes) > 0 and len(recent_volumes) > 0:
        u_stat_type, p_type = stats.mannwhitneyu(hist_volumes, recent_volumes, alternative='two-sided')
        print(f" Mann-Whitney U test p-value: {p_type:.6f}")
        print(f" Significant difference: {'YES' if p_type < 0.05 else 'NO'}")
    # 3. Major Spill Analysis by Demographics and Type
    # (counts and rates only printed when the denominator group is non-empty)
    print(f"\n3. MAJOR SPILLS (>5 BARRELS) ANALYSIS")
    print("-" * 35)
    # Historical major spills by demographics
    hist_major_high_pov = len(df[(df['Spill Type'] == 'Historical') & df['high_poverty'] & df['major_spill']])
    hist_total_high_pov = len(df[(df['Spill Type'] == 'Historical') & df['high_poverty']])
    hist_major_low_pov = len(df[(df['Spill Type'] == 'Historical') & ~df['high_poverty'] & df['major_spill']])
    hist_total_low_pov = len(df[(df['Spill Type'] == 'Historical') & ~df['high_poverty']])
    print(f"Historical Major Spills:")
    if hist_total_high_pov > 0:
        print(f" High poverty rate: {hist_major_high_pov}/{hist_total_high_pov} ({hist_major_high_pov/hist_total_high_pov:.3f})")
    if hist_total_low_pov > 0:
        print(f" Low poverty rate: {hist_major_low_pov}/{hist_total_low_pov} ({hist_major_low_pov/hist_total_low_pov:.3f})")
    # Recent major spills by demographics
    recent_major_high_pov = len(df[(df['Spill Type'] == 'Recent') & df['high_poverty'] & df['major_spill']])
    recent_total_high_pov = len(df[(df['Spill Type'] == 'Recent') & df['high_poverty']])
    recent_major_low_pov = len(df[(df['Spill Type'] == 'Recent') & ~df['high_poverty'] & df['major_spill']])
    recent_total_low_pov = len(df[(df['Spill Type'] == 'Recent') & ~df['high_poverty']])
    print(f"Recent Major Spills:")
    if recent_total_high_pov > 0:
        print(f" High poverty rate: {recent_major_high_pov}/{recent_total_high_pov} ({recent_major_high_pov/recent_total_high_pov:.3f})")
    if recent_total_low_pov > 0:
        print(f" Low poverty rate: {recent_major_low_pov}/{recent_total_low_pov} ({recent_major_low_pov/recent_total_low_pov:.3f})")
    return results
def analyze_environmental_impact_patterns(df):
    """Report environmental contamination and containment patterns.

    Prints contamination rates by spill type (Historical vs Recent) and by
    poverty group, and — BUGFIX — now also returns them: the original built
    `impact_results` but returned it empty, discarding every computed rate.

    Args:
        df: Prepared frame with boolean '<type>_impacted' columns,
            'high_poverty', 'Spill Type', and optionally 'contained'.

    Returns:
        Dict keyed by impact type, each holding historical/recent and
        high/low-poverty rates; plus a 'containment' entry (with the
        two-proportion z-test p-value when both groups are non-empty) when
        containment data exists.
    """
    print("\n4. ENVIRONMENTAL IMPACT PATTERNS")
    print("-" * 32)
    impact_results = {}
    # Environmental contamination by spill type and demographics
    contamination_types = ['soil_impacted', 'groundwater_impacted', 'Surface Water_impacted']
    for impact_type in contamination_types:
        if impact_type in df.columns:
            print(f"\n{impact_type.replace('_', ' ').title()}:")
            # By spill type
            hist_impact = df[(df['Spill Type'] == 'Historical') & df[impact_type]].shape[0]
            hist_total = df[df['Spill Type'] == 'Historical'].shape[0]
            recent_impact = df[(df['Spill Type'] == 'Recent') & df[impact_type]].shape[0]
            recent_total = df[df['Spill Type'] == 'Recent'].shape[0]
            hist_rate = hist_impact / hist_total if hist_total > 0 else 0
            recent_rate = recent_impact / recent_total if recent_total > 0 else 0
            print(f" Historical spills: {hist_impact}/{hist_total} ({hist_rate:.3f})")
            print(f" Recent spills: {recent_impact}/{recent_total} ({recent_rate:.3f})")
            # By demographics
            high_pov_impact = df[df['high_poverty'] & df[impact_type]].shape[0]
            high_pov_total = df[df['high_poverty']].shape[0]
            low_pov_impact = df[~df['high_poverty'] & df[impact_type]].shape[0]
            low_pov_total = df[~df['high_poverty']].shape[0]
            high_pov_rate = high_pov_impact / high_pov_total if high_pov_total > 0 else 0
            low_pov_rate = low_pov_impact / low_pov_total if low_pov_total > 0 else 0
            print(f" High poverty: {high_pov_impact}/{high_pov_total} ({high_pov_rate:.3f})")
            print(f" Low poverty: {low_pov_impact}/{low_pov_total} ({low_pov_rate:.3f})")
            # Persist the rates instead of discarding them.
            impact_results[impact_type] = {
                'historical_rate': hist_rate,
                'recent_rate': recent_rate,
                'high_poverty_rate': high_pov_rate,
                'low_poverty_rate': low_pov_rate,
            }
    # Containment analysis
    if 'contained' in df.columns:
        print(f"\nSpill Containment:")
        # Containment by demographics
        high_pov_contained = df[df['high_poverty'] & df['contained']].shape[0]
        high_pov_total = df[df['high_poverty']].shape[0]
        low_pov_contained = df[~df['high_poverty'] & df['contained']].shape[0]
        low_pov_total = df[~df['high_poverty']].shape[0]
        # BUGFIX: guard against empty groups (the original divided
        # unconditionally and raised ZeroDivisionError).
        high_rate = high_pov_contained / high_pov_total if high_pov_total > 0 else 0
        low_rate = low_pov_contained / low_pov_total if low_pov_total > 0 else 0
        print(f" High poverty containment rate: {high_pov_contained}/{high_pov_total} ({high_rate:.3f})")
        print(f" Low poverty containment rate: {low_pov_contained}/{low_pov_total} ({low_rate:.3f})")
        containment = {'high_poverty_rate': high_rate, 'low_poverty_rate': low_rate}
        # Statistical test
        if high_pov_total > 0 and low_pov_total > 0:
            counts = np.array([high_pov_contained, low_pov_contained])
            nobs = np.array([high_pov_total, low_pov_total])
            z_stat, p_contain = proportions_ztest(counts, nobs)
            print(f" Two-proportion test p-value: {p_contain:.6f}")
            containment['p_value'] = p_contain
        impact_results['containment'] = containment
    return impact_results
def analyze_facility_and_cause_patterns(df):
    """Print facility-type and root-cause breakdowns by demographics.

    Purely a console report: crosstabs of facility types against poverty
    level, the ten most common root causes, and those causes broken down by
    spill type and poverty level. Returns None.
    """
    print("\n5. FACILITY TYPES AND ROOT CAUSES")
    print("-" * 33)

    # Facility mix, as a percentage within each poverty group.
    if 'Facility Type' in df.columns:
        print("Facility Types by Demographics:")
        facility_pct = pd.crosstab(df['Facility Type'], df['high_poverty'], normalize='columns') * 100
        print(facility_pct.round(1))

    # Root causes: restrict to the ten most common so the crosstabs stay
    # small (the full cause list is large enough to stall the report).
    if 'Root Cause' in df.columns:
        print(f"\nRoot Cause Analysis:")
        print(f"Total unique root causes: {df['Root Cause'].nunique()}")

        top_causes = df['Root Cause'].value_counts().head(10)
        print(f"\nTop 10 Root Causes:")
        for cause, count in top_causes.items():
            ellipsis = '...' if len(cause) > 60 else ''
            print(f" {cause[:60]}{ellipsis}: {count}")

        subset = df[df['Root Cause'].isin(top_causes.index)]
        if len(subset) > 0:
            print(f"\nTop Root Causes by Spill Type (% of spills):")
            by_type = pd.crosstab(subset['Root Cause'], subset['Spill Type'], normalize='columns') * 100
            print(by_type.round(1))
            print(f"\nTop Root Causes by Demographics (% of spills):")
            by_poverty = pd.crosstab(subset['Root Cause'], subset['high_poverty'], normalize='columns') * 100
            print(by_poverty.round(1))
def create_comprehensive_visualizations(df):
    """Create comprehensive visualizations.

    Builds a 3x2 panel figure: volume histograms by spill type, volume
    boxplots by type x poverty, major-spill rates, environmental impact
    rates, a geographic scatter, and the yearly major-spill trend. Saves
    to 'spill_characteristics_analysis.png' and calls plt.show().

    Args:
        df: Prepared frame from prepare_spill_characteristics(); needs
            'total_volume_bbls', 'Spill Type', 'high_poverty',
            'major_spill', the '*_impacted' flags, 'Latitude'/'Longitude',
            and optionally 'Report Year'.
    """
    fig, axes = plt.subplots(3, 2, figsize=(15, 18))
    # 1. Volume distributions by spill type
    ax1 = axes[0, 0]
    # Cap volumes for visualization
    # NOTE: this 95th-percentile cap is reused by panels 2 and 5 below.
    volume_cap = df['total_volume_bbls'].quantile(0.95)
    hist_volumes = df[df['Spill Type'] == 'Historical']['total_volume_bbls'].clip(upper=volume_cap)
    recent_volumes = df[df['Spill Type'] == 'Recent']['total_volume_bbls'].clip(upper=volume_cap)
    ax1.hist([hist_volumes, recent_volumes], bins=30, alpha=0.7,
             label=['Historical', 'Recent'], color=['darkred', 'lightblue'])
    ax1.set_title('Spill Volume Distribution by Type')
    ax1.set_xlabel('Total Volume (bbls, capped at 95th percentile)')
    ax1.set_ylabel('Number of Spills')
    ax1.legend()
    ax1.set_yscale('log')
    # 2. Volume by demographics and spill type
    # (four boxes: each spill type crossed with each poverty level)
    ax2 = axes[0, 1]
    volume_data = []
    labels = []
    for spill_type in ['Historical', 'Recent']:
        for pov_level in [True, False]:
            subset = df[(df['Spill Type'] == spill_type) & (df['high_poverty'] == pov_level)]
            volumes = subset['total_volume_bbls'].clip(upper=volume_cap)
            volume_data.append(volumes)
            pov_label = 'High Pov' if pov_level else 'Low Pov'
            labels.append(f'{spill_type}\n{pov_label}')
    bp = ax2.boxplot(volume_data, labels=labels, patch_artist=True)
    colors = ['darkred', 'lightcoral', 'darkblue', 'lightblue']
    for patch, color in zip(bp['boxes'], colors):
        patch.set_facecolor(color)
    ax2.set_title('Spill Volumes: Type × Demographics')
    ax2.set_ylabel('Volume (bbls)')
    ax2.set_yscale('log')
    # 3. Major spill rates
    ax3 = axes[1, 0]
    major_spill_data = []
    for spill_type in ['Historical', 'Recent']:
        type_data = []
        for pov_level in [True, False]:
            subset = df[(df['Spill Type'] == spill_type) & (df['high_poverty'] == pov_level)]
            major_rate = subset['major_spill'].mean() if len(subset) > 0 else 0
            type_data.append(major_rate)
        major_spill_data.append(type_data)
    x = np.arange(2)
    # NOTE: `width` is also reused by panel 4 below.
    width = 0.35
    ax3.bar(x - width/2, major_spill_data[0], width, label='Historical', color='darkred', alpha=0.7)
    ax3.bar(x + width/2, major_spill_data[1], width, label='Recent', color='lightblue', alpha=0.7)
    ax3.set_title('Major Spill Rates (>5 bbls)')
    ax3.set_xlabel('Community Type')
    ax3.set_ylabel('Proportion of Major Spills')
    ax3.set_xticks(x)
    ax3.set_xticklabels(['High Poverty', 'Low Poverty'])
    ax3.legend()
    # 4. Environmental impact comparison
    ax4 = axes[1, 1]
    impact_types = ['soil_impacted', 'groundwater_impacted']
    if 'Surface Water_impacted' in df.columns:
        impact_types.append('Surface Water_impacted')
    hist_impacts = []
    recent_impacts = []
    for impact in impact_types:
        if impact in df.columns:
            # NOTE(review): these divisions raise ZeroDivisionError when either
            # spill-type group is empty — confirm upstream guarantees both exist.
            hist_rate = df[(df['Spill Type'] == 'Historical') & df[impact]].shape[0] / df[df['Spill Type'] == 'Historical'].shape[0]
            recent_rate = df[(df['Spill Type'] == 'Recent') & df[impact]].shape[0] / df[df['Spill Type'] == 'Recent'].shape[0]
            hist_impacts.append(hist_rate)
            recent_impacts.append(recent_rate)
    x = np.arange(len(impact_types))
    ax4.bar(x - width/2, hist_impacts, width, label='Historical', color='darkred', alpha=0.7)
    ax4.bar(x + width/2, recent_impacts, width, label='Recent', color='lightblue', alpha=0.7)
    ax4.set_title('Environmental Impact Rates')
    ax4.set_xlabel('Impact Type')
    ax4.set_ylabel('Proportion of Spills')
    ax4.set_xticks(x)
    ax4.set_xticklabels([impact.replace('_impacted', '').replace('_', ' ') for impact in impact_types])
    ax4.legend()
    # 5. Geographic distribution with volume
    ax5 = axes[2, 0]
    # Filter for reasonable coordinates
    # (roughly the Colorado bounding box — presumably to drop bad geocodes)
    geo_df = df[(df['Latitude'].between(37, 41)) & (df['Longitude'].between(-109, -102))]
    if len(geo_df) > 0:
        scatter = ax5.scatter(geo_df['Longitude'], geo_df['Latitude'],
                              c=geo_df['total_volume_bbls'].clip(upper=volume_cap),
                              s=geo_df['high_poverty'].astype(int) * 20 + 10,
                              alpha=0.6, cmap='Reds')
        ax5.set_title('Spill Locations by Volume\n(Large dots = High Poverty)')
        ax5.set_xlabel('Longitude')
        ax5.set_ylabel('Latitude')
        plt.colorbar(scatter, ax=ax5, label='Volume (bbls)')
    # 6. Temporal trends in severity
    ax6 = axes[2, 1]
    if 'Report Year' in df.columns:
        yearly_severity = df.groupby(['Report Year', 'Spill Type'])['major_spill'].mean().unstack()
        if 'Historical' in yearly_severity.columns:
            ax6.plot(yearly_severity.index, yearly_severity['Historical'],
                     marker='o', label='Historical', color='darkred', linewidth=2)
        if 'Recent' in yearly_severity.columns:
            ax6.plot(yearly_severity.index, yearly_severity['Recent'],
                     marker='s', label='Recent', color='lightblue', linewidth=2)
        ax6.set_title('Major Spill Rate Over Time')
        ax6.set_xlabel('Year')
        ax6.set_ylabel('Proportion of Major Spills')
        ax6.legend()
        ax6.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig('spill_characteristics_analysis.png', dpi=300, bbox_inches='tight')
    plt.show()
def generate_characteristics_report(results):
    """Produce a narrative interpretation of the spill-characteristics findings.

    Builds a summary of the volume analysis (when present in *results*),
    embeds it in a prompt, and asks the local Ollama model for a 300-word
    academic interpretation.  When the model is unreachable
    (``query_ollama`` returns ``None``) a pre-written fallback report is
    returned instead.

    Parameters
    ----------
    results : dict or None
        Severity-analysis output; may contain a ``'volume_analysis'``
        mapping with mean/median volumes and a p-value.

    Returns
    -------
    str
        The model-generated interpretation, or the static fallback text.
    """
    # Guard clause: without a volume analysis, fall back to a generic summary.
    if not results or 'volume_analysis' not in results:
        summary = "Environmental Justice Analysis - Spill Characteristics: Detailed analysis of spill severity and environmental impact patterns across demographic groups."
    else:
        va = results['volume_analysis']
        summary = f"""Environmental Justice Analysis - Spill Characteristics:
VOLUME FINDINGS:
- High-poverty areas: {va['high_poverty_mean']:.2f} bbls mean, {va['high_poverty_median']:.2f} bbls median
- Low-poverty areas: {va['low_poverty_mean']:.2f} bbls mean, {va['low_poverty_median']:.2f} bbls median
- Statistical significance: p={va['p_value']:.4f}
KEY INSIGHT: Different types of environmental exposure patterns suggest complex environmental justice dynamics beyond simple exposure counts."""

    prompt = f"""Based on spill characteristics analysis showing counterintuitive patterns where high-poverty areas have fewer historical spills and shorter reporting delays:
{summary}
Interpret these findings considering:
1. Different types of environmental hazards (acute vs chronic exposure)
2. Facility proximity and detection patterns
3. Industrial development timing and legacy contamination
4. Environmental justice implications beyond simple exposure counts
5. Policy recommendations for addressing diverse exposure patterns
Provide 300-word academic analysis focusing on environmental justice complexity."""

    print("\nGenerating characteristics interpretation with Ollama...")
    report = query_ollama(prompt)
    if report is not None:
        return report

    # Model unavailable: emit the canned interpretation instead.
    print("Using fallback report (Ollama unavailable)")
    return """
ENVIRONMENTAL JUSTICE IMPLICATIONS OF SPILL CHARACTERISTICS
COMPLEX EXPOSURE PATTERNS:
The analysis reveals a nuanced environmental justice landscape that challenges traditional assumptions about pollution distribution. Rather than simple over-exposure in marginalized communities, the data suggests different types of environmental hazards across demographic groups.
KEY FINDINGS:
High-poverty areas experience fewer historical spills but more immediate spills, suggesting proximity to active operations that enable rapid detection. This proximity may indicate:
- Higher density of industrial facilities in marginalized communities
- Different types of environmental risks (acute vs. chronic exposure)
- Trade-offs between immediate awareness and cumulative exposure
ENVIRONMENTAL JUSTICE COMPLEXITY:
The findings suggest multiple environmental justice concerns:
1. Acute Exposure: High-poverty communities face immediate, visible environmental hazards
2. Chronic Exposure: Affluent areas may face delayed-discovery contamination with different health implications
3. Cumulative Risk: Different exposure patterns may create varying long-term health and environmental risks
POLICY IMPLICATIONS:
1. Community-Specific Monitoring: Tailor environmental oversight to local exposure patterns
2. Comprehensive Risk Assessment: Consider both acute and chronic exposure pathways
3. Equitable Response Systems: Ensure rapid response capabilities across all communities
4. Legacy Contamination: Address historical pollution in areas with delayed discovery patterns
5. Facility Siting: Examine cumulative exposure when permitting new operations
RESEARCH NEEDS:
Further investigation should examine health outcomes, cleanup effectiveness, and long-term environmental impacts across different exposure patterns to develop comprehensive environmental justice policies.
"""
def run_spill_characteristics_analysis(csv_file, use_ollama=True):
    """Run the comprehensive spill-characteristics analysis end to end.

    Loads the spill CSV, derives characteristic variables, runs the
    severity / environmental-impact / facility analyses, renders the
    summary figure, optionally asks Ollama for an interpretation, and
    persists results to JSON and TXT alongside the saved PNG.

    Parameters
    ----------
    csv_file : str
        Path to the spill records CSV (with demographics joined).
    use_ollama : bool, default True
        When True, check the Ollama connection and generate the LLM
        interpretation; when False, skip report generation.

    Returns
    -------
    dict
        Keys: 'severity_analysis', 'impact_analysis', 'interpretation',
        and a small 'data_summary' of record counts and total volume.
    """
    # Fix: `json` is used below but is absent from the module-level import
    # block, so bring it into scope locally to keep the function runnable.
    import json

    print("ENVIRONMENTAL JUSTICE SPILL CHARACTERISTICS ANALYSIS")
    print("="*65)
    # Load data
    df = pd.read_csv(csv_file)
    print(f"Loaded {len(df)} spill records")
    # Verify the local LLM endpoint before doing any heavy work.
    if use_ollama:
        print("\nChecking Ollama connection...")
        check_ollama_connection()
    # Derive volumes, severity flags, demographic categories, etc.
    df_char = prepare_spill_characteristics(df)
    # Run analyses
    severity_results = analyze_spill_severity_by_demographics(df_char)
    impact_results = analyze_environmental_impact_patterns(df_char)
    analyze_facility_and_cause_patterns(df_char)
    # Create visualizations (also saves spill_characteristics_analysis.png)
    create_comprehensive_visualizations(df_char)
    # Generate report
    if use_ollama:
        report = generate_characteristics_report(severity_results)
    else:
        print("\nSkipping Ollama report generation...")
        report = "Ollama report generation skipped by user"
    # Assemble results; `default=str` below lets numpy/pandas scalars
    # (e.g. int64 sums) serialize without raising.
    results = {
        'severity_analysis': severity_results,
        'impact_analysis': impact_results,
        'interpretation': report,
        'data_summary': {
            'total_records': len(df_char),
            'major_spills': df_char['major_spill'].sum(),
            'total_volume': df_char['total_volume_bbls'].sum(),
            'historical_count': (df_char['Spill Type'] == 'Historical').sum(),
            'recent_count': (df_char['Spill Type'] == 'Recent').sum()
        }
    }
    with open('spill_characteristics_analysis.json', 'w') as f:
        json.dump(results, f, indent=2, default=str)
    with open('spill_characteristics_report.txt', 'w') as f:
        f.write(report)
    print("\nSpill characteristics analysis complete. Results saved to:")
    print(" - spill_characteristics_analysis.json")
    print(" - spill_characteristics_report.txt")
    print(" - spill_characteristics_analysis.png")
    return results
if __name__ == "__main__":
    # Script entry point: run the full pipeline with LLM interpretation enabled.
    data_path = 'data/spills_with_demographics.csv'
    results = run_spill_characteristics_analysis(data_path, use_ollama=True)

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,92 @@
# Environmental Justice Analysis: Oil & Gas Spills in Colorado (2014-2024)
## Executive Summary Table
| **Metric** | **High-Poverty Areas** | **Low-Poverty Areas** | **Ratio/Difference** | **Environmental Justice Implication** |
| --------------------------------- | ------------------------ | --------------------- | -------------------- | --------------------------------------------------------------- |
| **SPILL DETECTION PATTERNS** | | | | |
| Historical spill rate | 15.6% | 39.5% | 0.39x | **EJ Paradox**: Fewer delayed discoveries in marginalized areas |
| Average reporting delay | 4.8 days | 11.2 days | -6.4 days | **Faster detection** in high-poverty areas |
| **SPILL SEVERITY** | | | | |
| Average spill volume | **12.19 bbls** | 6.72 bbls | **1.81x larger** | **Marginalized communities face larger spills** |
| Recent major spill rate (>5 bbls) | **41.9%** | 38.2% | **1.10x higher** | **Higher severity** in high-poverty areas |
| Historical major spill rate | 9.6% | 9.6% | Equal | Legacy spills equally small |
| **ENVIRONMENTAL CONTAMINATION** | | | | |
| Soil contamination rate | 46.5% | 48.1% | 0.97x | Similar soil impact rates |
| Groundwater contamination | **1.9%** | 7.1% | **0.27x (73% lower)** | **Counterintuitive**: Less groundwater impact in poor areas |
| Surface water contamination | 0.9% | 0.4% | 2.25x | Slightly higher surface water risk |
| **FACILITY EXPOSURE** | | | | |
| Tank batteries | **33.6%** | 31.1% | **Higher** | **More storage facilities** in poor areas |
| Wells | **17.9%** | 15.2% | **Higher** | **More extraction sites** near poor communities |
| Water gathering systems | **5.9%** | 1.1% | **5.4x higher** | **Much higher waste handling** exposure |
| **TEMPORAL TRENDS** | | | | |
| Historical rate 2014-2018 | 30.9% of all spills | | | |
| Historical rate 2020-2024 | 39.9% of all spills | | **+9 percentage-point increase** | **Worsening** delayed discovery trends |
| Recent major spills 2021+ | Dramatic decrease to ~0% | | | **Major operational improvement** post-2021 |
---
## Key Environmental Justice Findings
### **CONFIRMED ENVIRONMENTAL INJUSTICE**
1. **High-poverty communities face 81% larger spills** when they occur
2. **Higher concentration of hazardous facilities** (tanks, wells, waste systems)
3. **Geographic clustering** of environmental burdens
### **COMPLEX EXPOSURE PATTERNS**
1. **Different types of environmental harm**:
- **High-poverty**: Current operational impacts, larger immediate spills
- **Affluent areas**: Legacy contamination, groundwater impacts from decommissioned facilities
2. **Proximity paradox**:
- Living closer to facilities → faster detection BUT larger exposure when spills occur
- Historical spills represent legacy contamination discovered during decommissioning
### **TEMPORAL PATTERNS**
1. **2014-2020**: Intensive operations with 60%+ major spill rates
2. **2021-2022**: Dramatic improvement (major spills dropped to ~0%)
3. **Increasing delayed discoveries** suggest more legacy contamination being found
---
## Statistical Significance
- **Volume differences**: p < 0.000001 (highly significant)
- **Reporting delays**: p = 0.007 (significant)
- **Spatial clustering**: 259 clusters identified, Moran's I = 0.053 (p = 0.001)
- **Demographic disparities**: Chi-square p < 0.000001 for multiple metrics
**Sample Size**: 16,890 spill incidents across Colorado (2014-2024)
---
## **Key Takeaways:**
### **The Paradox Explained:**
- **High-poverty areas**: Live closer to facilities → faster detection + larger spills
- **Affluent areas**: Legacy contamination from older, decommissioned facilities
### **Clear Environmental Justice Evidence:**
- **81% larger spills** in marginalized communities
- **5.4x higher exposure** to waste handling facilities
- **Geographic clustering** of environmental burdens
### **Different Types of Environmental Harm:**
- **Acute exposure**: Immediate, larger spills (high-poverty)
- **Chronic exposure**: Groundwater contamination from legacy sites (affluent)
### **Policy Success Story:**
The **dramatic 2021 improvement** shows effective interventions are possible - need to identify and scale what worked.
This analysis moves beyond simple "more pollution = environmental injustice" to show the **complex patterns of environmental harm** across communities. Both acute operational impacts AND legacy contamination represent environmental justice concerns requiring different policy responses.
The data provides **strong evidence** for targeted interventions, cumulative impact assessment, and community-specific environmental protection strategies.