#!/usr/bin/env python3
"""
MLB Statcast Advanced Stats - Baseball Savant
Fetches xBA, xSLG, xwOBA, Barrel Rate, Hard Hit %, Sprint Speed
"""
import requests
import psycopg2
import csv
import io
from datetime import datetime, timezone

SAVANT_BASE = 'https://baseballsavant.mlb.com/leaderboard'

def load_db_url():
    with open('/var/www/html/eventheodds/.env', 'r') as f:
        for line in f:
            if line.startswith('SPORTS_DATABASE_URL='):
                return line.split('=', 1)[1].strip().split('?')[0]
    return ''

def fetch_statcast_batting(season=2024, min_pa=50):
    """Fetch Statcast batting leaderboard"""
    url = f'{SAVANT_BASE}/statcast'
    params = {
        'type': 'batter',
        'year': season,
        'position': '',
        'team': '',
        'min': min_pa,
        'csv': 'true'
    }
    
    print(f'Fetching Statcast batting for {season}...')
    try:
        resp = requests.get(url, params=params, timeout=60)
        if resp.status_code != 200:
            print(f'  Status: {resp.status_code}')
            return []
        
        # Remove BOM if present and parse CSV properly
        content = resp.text.lstrip('\ufeff')
        reader = csv.DictReader(io.StringIO(content))
        data = list(reader)
        print(f'  Found {len(data)} batters')
        return data
    except Exception as e:
        print(f'Error: {e}')
        return []

def fetch_statcast_pitching(season=2024, min_pa=50):
    """Fetch Statcast pitching leaderboard"""
    url = f'{SAVANT_BASE}/statcast'
    params = {
        'type': 'pitcher',
        'year': season,
        'position': '',
        'team': '',
        'min': min_pa,
        'csv': 'true'
    }
    
    print(f'Fetching Statcast pitching for {season}...')
    try:
        resp = requests.get(url, params=params, timeout=60)
        if resp.status_code != 200:
            print(f'  Status: {resp.status_code}')
            return []
        
        content = resp.text.lstrip('\ufeff')
        reader = csv.DictReader(io.StringIO(content))
        data = list(reader)
        print(f'  Found {len(data)} pitchers')
        return data
    except Exception as e:
        print(f'Error: {e}')
        return []

def fetch_expected_stats(season=2024, min_pa=25):
    """Fetch expected stats (xBA, xSLG, xwOBA) leaderboard"""
    url = f'{SAVANT_BASE}/expected_statistics'
    params = {
        'type': 'batter',
        'year': season,
        'position': '',
        'team': '',
        'min': min_pa,
        'csv': 'true'
    }
    
    print(f'Fetching Expected Stats for {season}...')
    try:
        resp = requests.get(url, params=params, timeout=60)
        if resp.status_code != 200:
            print(f'  Status: {resp.status_code}')
            return []
        
        content = resp.text.lstrip('\ufeff')
        reader = csv.DictReader(io.StringIO(content))
        data = list(reader)
        print(f'  Found {len(data)} batters')
        return data
    except Exception as e:
        print(f'Error: {e}')
        return []

def fetch_sprint_speed(season=2024):
    """Fetch sprint speed leaderboard"""
    url = f'{SAVANT_BASE}/sprint_speed'
    params = {
        'year': season,
        'position': '',
        'team': '',
        'min': 10,
        'csv': 'true'
    }
    
    print(f'Fetching sprint speed for {season}...')
    try:
        resp = requests.get(url, params=params, timeout=60)
        if resp.status_code != 200:
            print(f'  Status: {resp.status_code}')
            return []
        
        content = resp.text.lstrip('\ufeff')
        reader = csv.DictReader(io.StringIO(content))
        data = list(reader)
        print(f'  Found {len(data)} players')
        return data
    except Exception as e:
        print(f'Error: {e}')
        return []

def safe_float(val):
    """Safely convert to float"""
    try:
        if val is None or val == '' or val == 'null' or val == 'NA':
            return None
        # Remove any whitespace
        val = str(val).strip()
        return float(val)
    except:
        return None

def normalize_key(key):
    """Normalize column names to consistent format"""
    # Remove quotes, lowercase, replace spaces with underscores
    return key.strip('"').strip().lower().replace(' ', '_').replace(',', '')

def ingest_mlb_statcast(season=2024):
    """Main ingest function"""
    db_url = load_db_url()
    if not db_url:
        raise RuntimeError('SPORTS_DATABASE_URL not set')
    
    conn = psycopg2.connect(db_url)
    cur = conn.cursor()
    stats_added = 0
    
    game_key = f'{season}_MLB_SEASON'
    game_date = datetime(season, 4, 1, tzinfo=timezone.utc)
    
    sql_player = '''
        INSERT INTO "PlayerGameMetric"
        (league, season, "gameKey", "gameDate", "playerExternalId", "playerName", team, "statKey", value)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
        ON CONFLICT (league, season, "gameKey", "playerExternalId", "statKey")
        DO UPDATE SET value = EXCLUDED.value
    '''
    
    # 1. Statcast Batting (exit velo, launch angle, barrels)
    batting = fetch_statcast_batting(season)
    bat_count = 0
    for row in batting:
        # Normalize keys
        row = {normalize_key(k): v for k, v in row.items()}
        
        player_id = row.get('player_id', '')
        # Handle "last_name, first_name" format
        player_name = row.get('last_name_first_name', row.get('name', ''))
        if not player_name:
            # Try to construct from separate fields
            last = row.get('last_name', '')
            first = row.get('first_name', '')
            if last and first:
                player_name = f"{last}, {first}"
        
        if not player_id or not player_name:
            continue
        
        # Map Statcast columns to our metrics
        metrics = [
            ('mlb_avg_exit_velo', row.get('avg_hit_speed')),
            ('mlb_max_exit_velo', row.get('max_hit_speed')),
            ('mlb_avg_launch_angle', row.get('avg_hit_angle')),
            ('mlb_barrel_pct', row.get('brl_percent', row.get('barrel_batted_rate'))),
            ('mlb_hard_hit_pct', row.get('ev95percent', row.get('hard_hit_percent'))),
            ('mlb_sweet_spot_pct', row.get('anglesweetspotpercent')),
            ('mlb_avg_distance', row.get('avg_distance')),
            ('mlb_max_distance', row.get('max_distance')),
            ('mlb_gb_pct', row.get('gb', row.get('gb_percent'))),
            ('mlb_fb_ld_pct', row.get('fbld')),
            ('mlb_ev95plus', row.get('ev95plus')),
            ('mlb_ev50', row.get('ev50')),
        ]
        
        for stat_key, val in metrics:
            fval = safe_float(val)
            if fval is not None:
                try:
                    cur.execute(sql_player, (
                        'mlb', season, game_key, game_date,
                        str(player_id), player_name, '',
                        stat_key, fval
                    ))
                    stats_added += 1
                    bat_count += 1
                except Exception as e:
                    conn.rollback()
    
    conn.commit()
    print(f'  Batting Statcast added: {bat_count}')
    
    # 2. Expected Stats (xBA, xSLG, xwOBA)
    expected = fetch_expected_stats(season)
    exp_count = 0
    for row in expected:
        row = {normalize_key(k): v for k, v in row.items()}
        
        player_id = row.get('player_id', '')
        player_name = row.get('last_name_first_name', row.get('name', ''))
        
        if not player_id:
            continue
        
        metrics = [
            ('mlb_xba', row.get('est_ba', row.get('xba'))),
            ('mlb_xslg', row.get('est_slg', row.get('xslg'))),
            ('mlb_xwoba', row.get('est_woba', row.get('xwoba'))),
            ('mlb_ba', row.get('ba')),
            ('mlb_slg', row.get('slg')),
            ('mlb_woba', row.get('woba')),
            ('mlb_ba_minus_xba', row.get('ba_minus_xba', row.get('ba-xba'))),
            ('mlb_slg_minus_xslg', row.get('slg_minus_xslg', row.get('slg-xslg'))),
            ('mlb_woba_minus_xwoba', row.get('woba_minus_xwoba', row.get('woba-xwoba'))),
            ('mlb_pa', row.get('pa')),
            ('mlb_bip', row.get('bip')),  # Balls in play
        ]
        
        for stat_key, val in metrics:
            fval = safe_float(val)
            if fval is not None:
                try:
                    cur.execute(sql_player, (
                        'mlb', season, game_key, game_date,
                        str(player_id), player_name, '',
                        stat_key, fval
                    ))
                    stats_added += 1
                    exp_count += 1
                except:
                    conn.rollback()
    
    conn.commit()
    print(f'  Expected stats added: {exp_count}')
    
    # 3. Statcast Pitching
    pitching = fetch_statcast_pitching(season)
    pitch_count = 0
    for row in pitching:
        row = {normalize_key(k): v for k, v in row.items()}
        
        player_id = row.get('player_id', '')
        player_name = row.get('last_name_first_name', row.get('name', ''))
        
        if not player_id:
            continue
        
        metrics = [
            ('mlb_p_avg_exit_velo', row.get('avg_hit_speed')),
            ('mlb_p_max_exit_velo', row.get('max_hit_speed')),
            ('mlb_p_barrel_pct', row.get('brl_percent', row.get('barrel_batted_rate'))),
            ('mlb_p_hard_hit_pct', row.get('ev95percent', row.get('hard_hit_percent'))),
            ('mlb_p_sweet_spot_pct', row.get('anglesweetspotpercent')),
            ('mlb_p_gb_pct', row.get('gb', row.get('gb_percent'))),
        ]
        
        for stat_key, val in metrics:
            fval = safe_float(val)
            if fval is not None:
                try:
                    cur.execute(sql_player, (
                        'mlb', season, game_key, game_date,
                        str(player_id), player_name, '',
                        stat_key, fval
                    ))
                    stats_added += 1
                    pitch_count += 1
                except:
                    conn.rollback()
    
    conn.commit()
    print(f'  Pitching Statcast added: {pitch_count}')
    
    # 4. Sprint Speed
    sprint = fetch_sprint_speed(season)
    sprint_count = 0
    for row in sprint:
        row = {normalize_key(k): v for k, v in row.items()}
        
        player_name = row.get('last_name_first_name', row.get('name_display_last_first', ''))
        player_id = row.get('player_id', player_name)
        
        if not player_name:
            continue
        
        metrics = [
            ('mlb_sprint_speed', row.get('sprint_speed', row.get('r_sprint_speed_top50percent'))),
            ('mlb_hp_to_1b', row.get('hp_to_1b')),
            ('mlb_competitive_runs', row.get('competitive_runs')),
            ('mlb_bolts', row.get('bolts')),  # 30+ ft/s sprints
        ]
        
        for stat_key, val in metrics:
            fval = safe_float(val)
            if fval is not None:
                try:
                    cur.execute(sql_player, (
                        'mlb', season, game_key, game_date,
                        str(player_id), player_name, '',
                        stat_key, fval
                    ))
                    stats_added += 1
                    sprint_count += 1
                except:
                    conn.rollback()
    
    conn.commit()
    print(f'  Sprint speed added: {sprint_count}')
    
    cur.close()
    conn.close()
    
    print(f'')
    print(f'✅ MLB Statcast ingested: {stats_added} total metrics')
    return {'success': True, 'stats_added': stats_added}

if __name__ == '__main__':
    import sys
    season = int(sys.argv[1]) if len(sys.argv) > 1 else 2024
    ingest_mlb_statcast(season)
