#!/usr/bin/env python3
"""
Backfill Historical Bookmaker Odds from Kaggle Datasets
Imports historical spreads, totals, and moneylines into BookmakerOdds table.

Sources:
  - NBA: data/kaggle/nba_2008-2025.csv (2007-2025)
  - NHL: data/kaggle/nhl_data_extensive.csv (2004-2025)
  - NFL: data/kaggle/tobycrabtree__nfl-scores-and-betting-data/spreadspoke_scores.csv (1966-2025)
  - MLB: data/kaggle/oddsDataMLB.csv (2012-2025)

Usage:
    python scripts/backfill_historical_odds_kaggle.py --league nba
    python scripts/backfill_historical_odds_kaggle.py --league nhl
    python scripts/backfill_historical_odds_kaggle.py --league nfl
    python scripts/backfill_historical_odds_kaggle.py --league mlb
    python scripts/backfill_historical_odds_kaggle.py --all
"""
import os
import csv
import argparse
import psycopg2
from psycopg2.extras import execute_values
from datetime import datetime
from pathlib import Path

BASE_DIR = Path('/var/www/html/eventheodds/data/kaggle')

# Load DB URL
def load_db_url():
    env_path = '/var/www/html/eventheodds/.env'
    try:
        with open(env_path, 'r') as f:
            for line in f:
                if line.startswith('SPORTS_DATABASE_URL='):
                    return line.split('=', 1)[1].strip().split('?')[0]
    except FileNotFoundError:
        pass
    return os.environ.get('SPORTS_DATABASE_URL', '').split('?')[0]

DB_URL = load_db_url()

# NBA team code mapping
NBA_TEAM_MAP = {
    'atl': 'ATL', 'bkn': 'BKN', 'brk': 'BKN', 'nj': 'BKN', 'njn': 'BKN',
    'bos': 'BOS', 'cha': 'CHA', 'cho': 'CHA', 'chi': 'CHI', 'cle': 'CLE',
    'dal': 'DAL', 'den': 'DEN', 'det': 'DET', 'gs': 'GSW', 'gsw': 'GSW',
    'hou': 'HOU', 'ind': 'IND', 'lac': 'LAC', 'lal': 'LAL', 'mem': 'MEM',
    'mia': 'MIA', 'mil': 'MIL', 'min': 'MIN', 'no': 'NOP', 'nop': 'NOP',
    'noh': 'NOP', 'nok': 'NOP', 'ny': 'NYK', 'nyk': 'NYK',
    'okc': 'OKC', 'orl': 'ORL', 'phi': 'PHI', 'phx': 'PHX', 'pho': 'PHX',
    'por': 'POR', 'sa': 'SAS', 'sas': 'SAS', 'sac': 'SAC',
    'tor': 'TOR', 'utah': 'UTA', 'uta': 'UTA', 'wsh': 'WAS', 'was': 'WAS',
    'sea': 'SEA',  # Seattle SuperSonics (historical)
}

# NHL team mapping
NHL_TEAM_MAP = {
    'anaheim ducks': 'ANA', 'arizona coyotes': 'ARI', 'boston bruins': 'BOS',
    'buffalo sabres': 'BUF', 'calgary flames': 'CGY', 'carolina hurricanes': 'CAR',
    'chicago blackhawks': 'CHI', 'colorado avalanche': 'COL', 'columbus blue jackets': 'CBJ',
    'dallas stars': 'DAL', 'detroit red wings': 'DET', 'edmonton oilers': 'EDM',
    'florida panthers': 'FLA', 'los angeles kings': 'LAK', 'minnesota wild': 'MIN',
    'montreal canadiens': 'MTL', 'nashville predators': 'NSH', 'new jersey devils': 'NJD',
    'new york islanders': 'NYI', 'new york rangers': 'NYR', 'ottawa senators': 'OTT',
    'philadelphia flyers': 'PHI', 'pittsburgh penguins': 'PIT', 'san jose sharks': 'SJS',
    'seattle kraken': 'SEA', 'st. louis blues': 'STL', 'tampa bay lightning': 'TBL',
    'toronto maple leafs': 'TOR', 'vancouver canucks': 'VAN', 'vegas golden knights': 'VGK',
    'washington capitals': 'WSH', 'winnipeg jets': 'WPG',
    # Historical
    'phoenix coyotes': 'ARI', 'atlanta thrashers': 'WPG', 'hartford whalers': 'CAR',
}

# NFL team mapping (full name -> abbreviation)
NFL_TEAM_MAP = {
    'arizona cardinals': 'ARI', 'atlanta falcons': 'ATL', 'baltimore ravens': 'BAL',
    'buffalo bills': 'BUF', 'carolina panthers': 'CAR', 'chicago bears': 'CHI',
    'cincinnati bengals': 'CIN', 'cleveland browns': 'CLE', 'dallas cowboys': 'DAL',
    'denver broncos': 'DEN', 'detroit lions': 'DET', 'green bay packers': 'GB',
    'houston texans': 'HOU', 'indianapolis colts': 'IND', 'jacksonville jaguars': 'JAX',
    'kansas city chiefs': 'KC', 'las vegas raiders': 'LV', 'los angeles chargers': 'LAC',
    'los angeles rams': 'LAR', 'miami dolphins': 'MIA', 'minnesota vikings': 'MIN',
    'new england patriots': 'NE', 'new orleans saints': 'NO', 'new york giants': 'NYG',
    'new york jets': 'NYJ', 'philadelphia eagles': 'PHI', 'pittsburgh steelers': 'PIT',
    'san francisco 49ers': 'SF', 'seattle seahawks': 'SEA', 'tampa bay buccaneers': 'TB',
    'tennessee titans': 'TEN', 'washington commanders': 'WAS',
    # Historical names
    'oakland raiders': 'LV', 'san diego chargers': 'LAC', 'st. louis rams': 'LAR',
    'houston oilers': 'TEN', 'tennessee oilers': 'TEN', 'washington redskins': 'WAS',
    'washington football team': 'WAS', 'phoenix cardinals': 'ARI', 'st. louis cardinals': 'ARI',
    'baltimore colts': 'IND', 'boston patriots': 'NE',
}

# MLB team abbreviation normalization
MLB_TEAM_MAP = {
    'ARI': 'ARI', 'ATL': 'ATL', 'BAL': 'BAL', 'BOS': 'BOS', 'CHC': 'CHC',
    'CHW': 'CHW', 'CIN': 'CIN', 'CLE': 'CLE', 'COL': 'COL', 'DET': 'DET',
    'HOU': 'HOU', 'KC': 'KC', 'LAA': 'LAA', 'LAD': 'LAD', 'MIA': 'MIA',
    'MIL': 'MIL', 'MIN': 'MIN', 'NYM': 'NYM', 'NYY': 'NYY', 'OAK': 'OAK',
    'PHI': 'PHI', 'PIT': 'PIT', 'SD': 'SD', 'SF': 'SF', 'SEA': 'SEA',
    'STL': 'STL', 'TB': 'TB', 'TEX': 'TEX', 'TOR': 'TOR', 'WSH': 'WSH',
    # Alternate codes
    'WAS': 'WSH', 'CWS': 'CHW', 'CHI': 'CHC', 'ANA': 'LAA', 'FLA': 'MIA',
    'MON': 'WSH',  # Montreal Expos -> Washington Nationals
}


def to_float(x):
    if x is None:
        return None
    s = str(x).strip()
    if s == '' or s.lower() == 'nan' or s.lower() == 'none':
        return None
    try:
        return float(s)
    except:
        return None


def to_int(x):
    if x is None:
        return None
    s = str(x).strip()
    if s == '' or s.lower() == 'nan' or s.lower() == 'none':
        return None
    try:
        return int(float(s))
    except:
        return None


def parse_date(date_str):
    """Parse various date formats"""
    if not date_str:
        return None
    date_str = str(date_str).strip()

    # Try common formats
    for fmt in ['%Y-%m-%d', '%Y-%m-%d %H:%M:%S', '%Y-%m-%d %H:%M:%S%z', '%m/%d/%Y']:
        try:
            return datetime.strptime(date_str[:19], fmt[:len(fmt)-1] if '%z' in fmt else fmt)
        except:
            continue
    return None


def import_nba_odds(conn, start_year=2023):
    """Import NBA historical odds from Kaggle"""
    src_file = BASE_DIR / 'nba_2008-2025.csv'
    if not src_file.exists():
        print(f"ERROR: {src_file} not found")
        return 0

    print(f"\n{'='*60}")
    print(f"IMPORTING NBA HISTORICAL ODDS (from {start_year})")
    print(f"Source: {src_file}")
    print(f"{'='*60}\n")

    cur = conn.cursor()
    records = []
    skipped = 0

    with open(src_file, 'r') as f:
        reader = csv.DictReader(f)
        for row in reader:
            try:
                season = to_int(row.get('season'))
                if season and season < start_year:
                    skipped += 1
                    continue

                date = parse_date(row.get('date'))
                if not date:
                    continue

                home = NBA_TEAM_MAP.get(row.get('home', '').lower().strip(), row.get('home', '').upper())
                away = NBA_TEAM_MAP.get(row.get('away', '').lower().strip(), row.get('away', '').upper())

                if not home or not away:
                    continue

                spread = to_float(row.get('spread'))
                total = to_float(row.get('total'))
                ml_home = to_int(row.get('moneyline_home'))
                ml_away = to_int(row.get('moneyline_away'))

                # Create spread record
                if spread is not None:
                    records.append({
                        'league': 'nba',
                        'gameDate': date,
                        'homeTeam': home,
                        'awayTeam': away,
                        'bookmaker': 'consensus',
                        'market': 'spread',
                        'lineValue': spread,
                        'homeOdds': -110,  # Standard juice
                        'awayOdds': -110,
                        'source': 'kaggle',
                    })

                # Create total record
                if total is not None:
                    records.append({
                        'league': 'nba',
                        'gameDate': date,
                        'homeTeam': home,
                        'awayTeam': away,
                        'bookmaker': 'consensus',
                        'market': 'total',
                        'lineValue': total,
                        'homeOdds': -110,  # Over
                        'awayOdds': -110,  # Under
                        'source': 'kaggle',
                    })

                # Create moneyline record
                if ml_home is not None and ml_away is not None:
                    records.append({
                        'league': 'nba',
                        'gameDate': date,
                        'homeTeam': home,
                        'awayTeam': away,
                        'bookmaker': 'consensus',
                        'market': 'moneyline',
                        'lineValue': None,
                        'homeOdds': ml_home,
                        'awayOdds': ml_away,
                        'source': 'kaggle',
                    })

            except Exception as e:
                continue

    print(f"Parsed {len(records)} records (skipped {skipped} pre-{start_year})")

    # Batch insert
    if records:
        inserted = insert_bookmaker_odds(conn, records)
        print(f"Inserted {inserted} BookmakerOdds records")
        return inserted
    return 0


def import_nhl_odds(conn, start_year=2023):
    """Import NHL historical odds from Kaggle"""
    src_file = BASE_DIR / 'nhl_data_extensive.csv'
    if not src_file.exists():
        print(f"ERROR: {src_file} not found")
        return 0

    print(f"\n{'='*60}")
    print(f"IMPORTING NHL HISTORICAL ODDS (from {start_year})")
    print(f"Source: {src_file}")
    print(f"{'='*60}\n")

    cur = conn.cursor()
    records = []
    skipped = 0
    seen_games = set()  # Track unique games (date, home, away)

    with open(src_file, 'r') as f:
        reader = csv.DictReader(f)
        for row in reader:
            try:
                season = to_int(row.get('season'))
                if season and season < start_year:
                    skipped += 1
                    continue

                date = parse_date(row.get('date'))
                if not date:
                    continue

                # Only process home team rows to avoid duplicates
                is_home = row.get('is_home', '').strip()
                if is_home != '1' and is_home.lower() != 'true':
                    continue

                team_name = row.get('team_name', '').strip().lower()
                opp_name = row.get('opp_team_name', '').strip().lower()

                home = NHL_TEAM_MAP.get(team_name, team_name[:3].upper())
                away = NHL_TEAM_MAP.get(opp_name, opp_name[:3].upper())

                game_key = (date.date(), home, away)
                if game_key in seen_games:
                    continue
                seen_games.add(game_key)

                spread = to_float(row.get('spread'))
                total = to_float(row.get('over_under'))
                ml_fav = to_int(row.get('favorite_moneyline'))

                # Create spread record (NHL uses puck line, typically 1.5)
                if spread is not None:
                    records.append({
                        'league': 'nhl',
                        'gameDate': date,
                        'homeTeam': home,
                        'awayTeam': away,
                        'bookmaker': 'consensus',
                        'market': 'spread',
                        'lineValue': spread,
                        'homeOdds': -110,
                        'awayOdds': -110,
                        'source': 'kaggle',
                    })

                # Create total record
                if total is not None:
                    records.append({
                        'league': 'nhl',
                        'gameDate': date,
                        'homeTeam': home,
                        'awayTeam': away,
                        'bookmaker': 'consensus',
                        'market': 'total',
                        'lineValue': total,
                        'homeOdds': -110,
                        'awayOdds': -110,
                        'source': 'kaggle',
                    })

                # Create moneyline record (favorite ML given, estimate underdog)
                if ml_fav is not None:
                    # Estimate underdog ML from favorite
                    if ml_fav < 0:
                        ml_dog = int(abs(ml_fav) * 0.8)  # Rough estimate
                    else:
                        ml_dog = -int(ml_fav * 1.2)

                    records.append({
                        'league': 'nhl',
                        'gameDate': date,
                        'homeTeam': home,
                        'awayTeam': away,
                        'bookmaker': 'consensus',
                        'market': 'moneyline',
                        'lineValue': None,
                        'homeOdds': ml_fav if spread and spread < 0 else ml_dog,
                        'awayOdds': ml_dog if spread and spread < 0 else ml_fav,
                        'source': 'kaggle',
                    })

            except Exception as e:
                continue

    print(f"Parsed {len(records)} records (skipped {skipped} pre-{start_year})")

    if records:
        inserted = insert_bookmaker_odds(conn, records)
        print(f"Inserted {inserted} BookmakerOdds records")
        return inserted
    return 0


def import_nfl_odds(conn, start_year=2023):
    """Import NFL historical odds from Kaggle spreadspoke data"""
    src_file = BASE_DIR / 'tobycrabtree__nfl-scores-and-betting-data' / 'spreadspoke_scores.csv'
    if not src_file.exists():
        print(f"ERROR: {src_file} not found")
        return 0

    print(f"\n{'='*60}")
    print(f"IMPORTING NFL HISTORICAL ODDS (from {start_year})")
    print(f"Source: {src_file}")
    print(f"{'='*60}\n")

    records = []
    skipped = 0

    with open(src_file, 'r') as f:
        reader = csv.DictReader(f)
        for row in reader:
            try:
                season = to_int(row.get('schedule_season'))
                if season and season < start_year:
                    skipped += 1
                    continue

                # Parse date (format: M/D/YYYY)
                date_str = row.get('schedule_date', '')
                date = parse_date(date_str)
                if not date:
                    continue

                home_name = row.get('team_home', '').strip().lower()
                away_name = row.get('team_away', '').strip().lower()

                home = NFL_TEAM_MAP.get(home_name, home_name[:3].upper())
                away = NFL_TEAM_MAP.get(away_name, away_name[:3].upper())

                if not home or not away:
                    continue

                spread = to_float(row.get('spread_favorite'))
                total = to_float(row.get('over_under_line'))
                favorite_id = row.get('team_favorite_id', '').strip()

                # Skip games with no odds data
                if spread is None and total is None:
                    continue

                # Create spread record (spread is from favorite's perspective)
                if spread is not None and favorite_id:
                    # Determine if home team is the favorite (favorite_id is abbreviation like 'KC')
                    home_is_favorite = favorite_id.upper() == home
                    home_spread = spread if home_is_favorite else -spread
                    records.append({
                        'league': 'nfl',
                        'gameDate': date,
                        'homeTeam': home,
                        'awayTeam': away,
                        'bookmaker': 'consensus',
                        'market': 'spread',
                        'lineValue': home_spread,
                        'homeOdds': -110,
                        'awayOdds': -110,
                        'source': 'kaggle',
                    })

                # Create total record
                if total is not None:
                    records.append({
                        'league': 'nfl',
                        'gameDate': date,
                        'homeTeam': home,
                        'awayTeam': away,
                        'bookmaker': 'consensus',
                        'market': 'total',
                        'lineValue': total,
                        'homeOdds': -110,
                        'awayOdds': -110,
                        'source': 'kaggle',
                    })

            except Exception as e:
                continue

    print(f"Parsed {len(records)} records (skipped {skipped} pre-{start_year})")

    if records:
        inserted = insert_bookmaker_odds(conn, records)
        print(f"Inserted {inserted} BookmakerOdds records")
        return inserted
    return 0


def import_mlb_odds(conn, start_year=2023):
    """Import MLB historical odds from Kaggle"""
    src_file = BASE_DIR / 'oddsDataMLB.csv'
    if not src_file.exists():
        print(f"ERROR: {src_file} not found")
        return 0

    print(f"\n{'='*60}")
    print(f"IMPORTING MLB HISTORICAL ODDS (from {start_year})")
    print(f"Source: {src_file}")
    print(f"{'='*60}\n")

    records = []
    skipped = 0
    seen_games = set()  # Track unique games to avoid duplicates (data has both teams' perspective)

    with open(src_file, 'r') as f:
        reader = csv.DictReader(f)
        for row in reader:
            try:
                season = to_int(row.get('season'))
                if season and season < start_year:
                    skipped += 1
                    continue

                date = parse_date(row.get('date'))
                if not date:
                    continue

                team = MLB_TEAM_MAP.get(row.get('team', '').strip().upper(), row.get('team', '').strip().upper())
                opponent = MLB_TEAM_MAP.get(row.get('opponent', '').strip().upper(), row.get('opponent', '').strip().upper())

                if not team or not opponent:
                    continue

                # Data has each game twice (once per team). Use lexicographic order to dedupe
                game_key = (date.date(), tuple(sorted([team, opponent])))
                if game_key in seen_games:
                    continue
                seen_games.add(game_key)

                # Determine home/away (we'll assume first team alphabetically is "home" for consistency)
                if team < opponent:
                    home, away = team, opponent
                    ml_home = to_int(row.get('moneyLine'))
                    ml_away = to_int(row.get('oppMoneyLine'))
                    run_line = to_float(row.get('runLine'))
                else:
                    home, away = opponent, team
                    ml_home = to_int(row.get('oppMoneyLine'))
                    ml_away = to_int(row.get('moneyLine'))
                    run_line = to_float(row.get('oppRunLine'))

                total = to_float(row.get('total'))
                over_odds = to_int(row.get('overOdds'))
                under_odds = to_int(row.get('underOdds'))

                # Create spread (run line) record
                if run_line is not None:
                    records.append({
                        'league': 'mlb',
                        'gameDate': date,
                        'homeTeam': home,
                        'awayTeam': away,
                        'bookmaker': 'consensus',
                        'market': 'spread',
                        'lineValue': run_line,
                        'homeOdds': -110,
                        'awayOdds': -110,
                        'source': 'kaggle',
                    })

                # Create total record
                if total is not None:
                    records.append({
                        'league': 'mlb',
                        'gameDate': date,
                        'homeTeam': home,
                        'awayTeam': away,
                        'bookmaker': 'consensus',
                        'market': 'total',
                        'lineValue': total,
                        'homeOdds': over_odds if over_odds else -110,
                        'awayOdds': under_odds if under_odds else -110,
                        'source': 'kaggle',
                    })

                # Create moneyline record
                if ml_home is not None and ml_away is not None:
                    records.append({
                        'league': 'mlb',
                        'gameDate': date,
                        'homeTeam': home,
                        'awayTeam': away,
                        'bookmaker': 'consensus',
                        'market': 'moneyline',
                        'lineValue': None,
                        'homeOdds': ml_home,
                        'awayOdds': ml_away,
                        'source': 'kaggle',
                    })

            except Exception as e:
                continue

    print(f"Parsed {len(records)} records (skipped {skipped} pre-{start_year})")

    if records:
        inserted = insert_bookmaker_odds(conn, records)
        print(f"Inserted {inserted} BookmakerOdds records")
        return inserted
    return 0


def generate_game_id(league, game_date, home_team, away_team):
    """Generate a unique numeric gameId from game details"""
    import hashlib
    # Create a deterministic hash from game details
    key = f"{league}:{game_date}:{home_team}:{away_team}"
    hash_bytes = hashlib.md5(key.encode()).digest()
    # Convert first 8 bytes to a positive integer
    return int.from_bytes(hash_bytes[:8], 'big') % (2**31)  # Keep it in int32 range


def insert_bookmaker_odds(conn, records):
    """Insert records into BookmakerOdds with upsert"""
    if not records:
        return 0

    cur = conn.cursor()

    # Dedupe by unique key
    unique_records = {}
    for r in records:
        key = (r['league'], r['gameDate'].date(), r['homeTeam'], r['awayTeam'], r['bookmaker'], r['market'], r['lineValue'])
        unique_records[key] = r

    sql_alt = '''
        INSERT INTO "BookmakerOdds"
        (league, "gameDate", "homeTeam", "awayTeam", bookmaker, market, "lineValue",
         "homeOdds", "awayOdds", source, "fetchedAt", "createdAt", "gameId")
        VALUES %s
        ON CONFLICT ("gameId", bookmaker, market, "lineValue") DO NOTHING
    '''

    values = []
    for r in unique_records.values():
        game_id = generate_game_id(r['league'], r['gameDate'].date(), r['homeTeam'], r['awayTeam'])
        values.append((
            r['league'],
            r['gameDate'],
            r['homeTeam'],
            r['awayTeam'],
            r['bookmaker'],
            r['market'],
            r['lineValue'],
            r['homeOdds'],
            r['awayOdds'],
            r['source'],
            datetime.now(),
            datetime.now(),
            game_id,
        ))

    try:
        execute_values(cur, sql_alt, values, page_size=1000)
        conn.commit()
    except Exception as e:
        print(f"Bulk insert error: {e}")
        conn.rollback()
        # Fall back to individual inserts
        inserted = 0
        for v in values:
            try:
                cur.execute('''
                    INSERT INTO "BookmakerOdds"
                    (league, "gameDate", "homeTeam", "awayTeam", bookmaker, market, "lineValue",
                     "homeOdds", "awayOdds", source, "fetchedAt", "createdAt", "gameId")
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                    ON CONFLICT DO NOTHING
                ''', v)
                inserted += 1
            except:
                pass
        conn.commit()
        return inserted

    cur.close()
    return len(values)


def main():
    parser = argparse.ArgumentParser(description='Backfill historical odds from Kaggle')
    parser.add_argument('--league', choices=['nba', 'nhl', 'nfl', 'mlb', 'all'], default='all')
    parser.add_argument('--start-year', type=int, default=2023, help='Start year for import')

    args = parser.parse_args()

    conn = psycopg2.connect(DB_URL)
    total = 0

    try:
        if args.league in ['nba', 'all']:
            total += import_nba_odds(conn, args.start_year)

        if args.league in ['nhl', 'all']:
            total += import_nhl_odds(conn, args.start_year)

        if args.league in ['nfl', 'all']:
            total += import_nfl_odds(conn, args.start_year)

        if args.league in ['mlb', 'all']:
            total += import_mlb_odds(conn, args.start_year)

        print(f"\n{'='*60}")
        print(f"TOTAL IMPORTED: {total} records")
        print(f"{'='*60}")

    finally:
        conn.close()


if __name__ == '__main__':
    main()
