#!/usr/bin/env python3
"""
Backfill MLB PlayerGameMetric from MLB Stats API Boxscores
Fetches game-by-game player stats for historical games.

Usage:
    python scripts/backfill_mlb_player_metrics.py --start 2024-03-28 --end 2024-10-31
    python scripts/backfill_mlb_player_metrics.py --days 30  # Last 30 days
"""
import os
import sys
import argparse
import requests
import psycopg2
from psycopg2.extras import execute_values
from datetime import datetime, timedelta
from collections import defaultdict
import time

# Load environment
def load_db_url():
    env_path = '/var/www/html/eventheodds/.env'
    try:
        with open(env_path, 'r') as f:
            for line in f:
                if line.startswith('SPORTS_DATABASE_URL='):
                    return line.split('=', 1)[1].strip().split('?')[0]
    except FileNotFoundError:
        pass
    return os.environ.get('SPORTS_DATABASE_URL', '').split('?')[0]

DB_URL = load_db_url()
MLB_API_BASE = "https://statsapi.mlb.com/api/v1"
HEADERS = {'User-Agent': 'EvenTheOdds/1.0'}

# Team name mapping (abbreviation -> possible API names)
TEAM_ABBREV_MAP = {
    'ARI': ['Arizona Diamondbacks', 'D-backs'],
    'ATL': ['Atlanta Braves'],
    'BAL': ['Baltimore Orioles'],
    'BOS': ['Boston Red Sox'],
    'CHC': ['Chicago Cubs'],
    'CHW': ['Chicago White Sox'],
    'CIN': ['Cincinnati Reds'],
    'CLE': ['Cleveland Guardians'],
    'COL': ['Colorado Rockies'],
    'DET': ['Detroit Tigers'],
    'HOU': ['Houston Astros'],
    'KC': ['Kansas City Royals'],
    'LAA': ['Los Angeles Angels'],
    'LAD': ['Los Angeles Dodgers'],
    'MIA': ['Miami Marlins'],
    'MIL': ['Milwaukee Brewers'],
    'MIN': ['Minnesota Twins'],
    'NYM': ['New York Mets'],
    'NYY': ['New York Yankees'],
    'OAK': ['Oakland Athletics'],
    'PHI': ['Philadelphia Phillies'],
    'PIT': ['Pittsburgh Pirates'],
    'SD': ['San Diego Padres'],
    'SF': ['San Francisco Giants'],
    'SEA': ['Seattle Mariners'],
    'STL': ['St. Louis Cardinals'],
    'TB': ['Tampa Bay Rays'],
    'TEX': ['Texas Rangers'],
    'TOR': ['Toronto Blue Jays'],
    'WSH': ['Washington Nationals'],
}

# Reverse mapping for lookup
TEAM_NAME_TO_ABBREV = {}
for abbrev, names in TEAM_ABBREV_MAP.items():
    for name in names:
        TEAM_NAME_TO_ABBREV[name.lower()] = abbrev

# Stats to extract from boxscore (batting)
BATTING_STATS = [
    ('atBats', 'mlb_at_bats'),
    ('runs', 'mlb_runs'),
    ('hits', 'mlb_hits'),
    ('doubles', 'mlb_doubles'),
    ('triples', 'mlb_triples'),
    ('homeRuns', 'mlb_home_runs'),
    ('rbi', 'mlb_rbi'),
    ('baseOnBalls', 'mlb_walks'),
    ('strikeOuts', 'mlb_strikeouts'),
    ('stolenBases', 'mlb_stolen_bases'),
    ('caughtStealing', 'mlb_caught_stealing'),
    ('totalBases', 'mlb_total_bases'),
]

# Stats to extract from boxscore (pitching)
PITCHING_STATS = [
    ('inningsPitched', 'mlb_innings_pitched'),
    ('hits', 'mlb_hits_allowed'),
    ('runs', 'mlb_runs_allowed'),
    ('earnedRuns', 'mlb_earned_runs'),
    ('baseOnBalls', 'mlb_walks_allowed'),
    ('strikeOuts', 'mlb_pitcher_strikeouts'),
    ('homeRuns', 'mlb_home_runs_allowed'),
    ('pitchesThrown', 'mlb_pitches'),
    ('strikes', 'mlb_strikes'),
]


def fetch_mlb_schedule(date_str):
    """Fetch MLB games for a specific date"""
    url = f"{MLB_API_BASE}/schedule?sportId=1&date={date_str}"
    try:
        resp = requests.get(url, headers=HEADERS, timeout=30)
        if resp.status_code != 200:
            return []
        data = resp.json()
        games = []
        for date_entry in data.get('dates', []):
            for game in date_entry.get('games', []):
                status = game.get('status', {}).get('detailedState', '')
                if status in ['Final', 'Completed Early']:
                    games.append({
                        'gamePk': game.get('gamePk'),
                        'homeTeam': game.get('teams', {}).get('home', {}).get('team', {}).get('name'),
                        'awayTeam': game.get('teams', {}).get('away', {}).get('team', {}).get('name'),
                        'homeScore': game.get('teams', {}).get('home', {}).get('score'),
                        'awayScore': game.get('teams', {}).get('away', {}).get('score'),
                    })
        return games
    except Exception as e:
        print(f"  Error fetching schedule: {e}")
        return []


def fetch_boxscore(game_pk):
    """Fetch detailed boxscore for a game"""
    url = f"{MLB_API_BASE}/game/{game_pk}/boxscore"
    try:
        resp = requests.get(url, headers=HEADERS, timeout=30)
        if resp.status_code != 200:
            return None
        return resp.json()
    except Exception as e:
        print(f"  Error fetching boxscore {game_pk}: {e}")
        return None


def parse_boxscore_stats(boxscore, game_date, game_pk):
    """Parse boxscore into PlayerGameMetric records"""
    metrics = []
    season = game_date.year
    game_key = f"mlb_{game_pk}"

    for side in ['home', 'away']:
        team_data = boxscore.get('teams', {}).get(side, {})
        team_name = team_data.get('team', {}).get('name', '')
        players = team_data.get('players', {})

        for player_key, player_data in players.items():
            player_id = player_data.get('person', {}).get('id')
            player_name = player_data.get('person', {}).get('fullName')
            position = player_data.get('position', {}).get('abbreviation', '')

            if not player_id or not player_name:
                continue

            stats = player_data.get('stats', {})

            # Batting stats
            batting = stats.get('batting', {})
            if batting:
                for api_key, stat_key in BATTING_STATS:
                    value = batting.get(api_key)
                    if value is not None:
                        try:
                            metrics.append({
                                'league': 'mlb',
                                'season': season,
                                'gameKey': game_key,
                                'gameDate': game_date,
                                'playerExternalId': str(player_id),
                                'playerName': player_name,
                                'position': position,
                                'team': team_name,
                                'statKey': stat_key,
                                'value': float(value) if not isinstance(value, str) else parse_innings(value),
                            })
                        except (ValueError, TypeError):
                            pass

            # Pitching stats
            pitching = stats.get('pitching', {})
            if pitching and pitching.get('inningsPitched'):
                for api_key, stat_key in PITCHING_STATS:
                    value = pitching.get(api_key)
                    if value is not None:
                        try:
                            metrics.append({
                                'league': 'mlb',
                                'season': season,
                                'gameKey': game_key,
                                'gameDate': game_date,
                                'playerExternalId': str(player_id),
                                'playerName': player_name,
                                'position': position,
                                'team': team_name,
                                'statKey': stat_key,
                                'value': float(value) if not isinstance(value, str) else parse_innings(value),
                            })
                        except (ValueError, TypeError):
                            pass

    return metrics


def parse_innings(ip_str):
    """Parse innings pitched string like '6.2' to float"""
    if isinstance(ip_str, (int, float)):
        return float(ip_str)
    try:
        # MLB format: 6.2 means 6 and 2/3 innings
        parts = str(ip_str).split('.')
        if len(parts) == 2:
            whole = int(parts[0])
            thirds = int(parts[1])
            return whole + (thirds / 3.0)
        return float(ip_str)
    except:
        return 0.0


def insert_metrics(conn, metrics):
    """Insert metrics into PlayerGameMetric with upsert"""
    if not metrics:
        return 0

    cur = conn.cursor()

    # Deduplicate by unique key (last value wins)
    unique_metrics = {}
    for m in metrics:
        key = (m['league'], m['season'], m['gameKey'], m['playerExternalId'], m['statKey'])
        unique_metrics[key] = m

    # Use upsert
    sql = '''
        INSERT INTO "PlayerGameMetric"
        (league, season, "gameKey", "gameDate", "playerExternalId", "playerName",
         position, team, "statKey", value, "createdAt")
        VALUES %s
        ON CONFLICT (league, season, "gameKey", "playerExternalId", "statKey")
        DO UPDATE SET value = EXCLUDED.value
    '''

    values = [
        (m['league'], m['season'], m['gameKey'], m['gameDate'], m['playerExternalId'],
         m['playerName'], m['position'], m['team'], m['statKey'], m['value'], datetime.now())
        for m in unique_metrics.values()
    ]

    execute_values(cur, sql, values, page_size=500)
    conn.commit()
    cur.close()

    return len(values)


def update_player_mapping(conn, metrics):
    """Update PlayerMapping with MLB player IDs"""
    cur = conn.cursor()

    players_seen = set()
    for m in metrics:
        key = (m['playerExternalId'], m['playerName'])
        if key not in players_seen:
            players_seen.add(key)

            # Upsert player mapping
            try:
                normalized = m['playerName'].lower().replace(' ', '_').replace('.', '').replace("'", '')
                cur.execute('''
                    INSERT INTO "PlayerMapping" (league, "playerName", "normalizedName", "sgoId", "nbaComId", "createdAt", "updatedAt")
                    VALUES ('mlb', %s, %s, %s, %s, NOW(), NOW())
                    ON CONFLICT (league, "normalizedName") DO UPDATE SET
                        "nbaComId" = COALESCE(EXCLUDED."nbaComId", "PlayerMapping"."nbaComId"),
                        "updatedAt" = NOW()
                ''', (m['playerName'], normalized, m['playerExternalId'], m['playerExternalId']))
            except Exception as e:
                pass  # Ignore errors

    conn.commit()
    cur.close()


def backfill_date_range(start_date, end_date):
    """Backfill MLB stats for a date range"""
    conn = psycopg2.connect(DB_URL)

    current = start_date
    total_metrics = 0
    total_games = 0

    print(f"\n{'='*60}")
    print(f"BACKFILLING MLB PLAYER METRICS")
    print(f"Date range: {start_date.date()} to {end_date.date()}")
    print(f"{'='*60}\n")

    while current <= end_date:
        date_str = current.strftime('%Y-%m-%d')
        print(f"\n[{date_str}] Fetching schedule...")

        games = fetch_mlb_schedule(date_str)
        if not games:
            print(f"  No completed games")
            current += timedelta(days=1)
            continue

        print(f"  Found {len(games)} completed games")

        for game in games:
            game_pk = game['gamePk']
            home = game['homeTeam'][:20]
            away = game['awayTeam'][:20]

            print(f"    {away} @ {home} (pk={game_pk})...", end=' ')

            boxscore = fetch_boxscore(game_pk)
            if not boxscore:
                print("FAILED")
                continue

            metrics = parse_boxscore_stats(boxscore, current, game_pk)
            if metrics:
                inserted = insert_metrics(conn, metrics)
                update_player_mapping(conn, metrics)
                total_metrics += inserted
                total_games += 1
                print(f"{inserted} stats")
            else:
                print("no stats")

            # Rate limit
            time.sleep(0.2)

        current += timedelta(days=1)

    conn.close()

    print(f"\n{'='*60}")
    print(f"BACKFILL COMPLETE")
    print(f"Games processed: {total_games}")
    print(f"Stats inserted: {total_metrics}")
    print(f"{'='*60}")

    return total_metrics


def main():
    parser = argparse.ArgumentParser(description='Backfill MLB PlayerGameMetric')
    parser.add_argument('--start', help='Start date (YYYY-MM-DD)')
    parser.add_argument('--end', help='End date (YYYY-MM-DD)')
    parser.add_argument('--days', type=int, help='Days back from today')
    parser.add_argument('--season', type=int, help='Full season year (e.g., 2024)')

    args = parser.parse_args()

    if args.season:
        # Full season
        start_date = datetime(args.season, 3, 20)  # Spring training/opening day
        end_date = datetime(args.season, 10, 31)   # End of World Series
    elif args.days:
        end_date = datetime.now()
        start_date = end_date - timedelta(days=args.days)
    elif args.start and args.end:
        start_date = datetime.strptime(args.start, '%Y-%m-%d')
        end_date = datetime.strptime(args.end, '%Y-%m-%d')
    else:
        # Default: 2024 season
        start_date = datetime(2024, 3, 20)
        end_date = datetime(2024, 10, 31)

    backfill_date_range(start_date, end_date)


if __name__ == '__main__':
    main()
