#!/usr/bin/env python3
"""
Fetch sports data from BallDontLie API and store it in the betting database format.

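This script fetches game data for the seasons listed in SPORT_CONFIG
(currently 2024 and 2025) and converts it to the format expected by
sportsBettingData.ts for backtesting.

Usage:
    python scripts/fetch_balldontlie_data.py [sport] [--require-real-odds | -r]

    sport: nba, nfl, nhl, mlb, or all (default: all)
    --require-real-odds, -r: only keep games that have real sportsbook odds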

Environment:
    BALLDONTLIE_API_KEY: API key for BallDontLie (required)
"""

import os
import sys
import json
import time
import hashlib
from datetime import datetime, timedelta
from pathlib import Path

# Try to import requests, fall back to urllib if not available
try:
    import requests
    USE_REQUESTS = True
except ImportError:
    import urllib.request
    import urllib.error
    USE_REQUESTS = False

# Configuration
API_BASE_URL = "https://api.balldontlie.io"
# Both directories live under <two levels above this file>/data.
DATA_DIR = Path(__file__).parent.parent / "data" / "betting"
CACHE_DIR = Path(__file__).parent.parent / "data" / "balldontlie_cache"

# API key from environment. There is deliberately no fallback value: a secret
# must never be committed to source control. An empty key means "not
# configured" and requests sent with it are expected to be rejected.
# NOTE(review): a literal key used to be hard-coded here as the default; treat
# that key as leaked and rotate it.
API_KEY = os.environ.get("BALLDONTLIE_API_KEY", "")

# Sport configurations
#   api_prefix:  path prefix appended to API_BASE_URL for this sport
#   seasons:     season years to fetch
#   output_file: file name (inside DATA_DIR) the converted games are saved to
SPORT_CONFIG = {
    "nba": {
        "api_prefix": "/v1",
        "seasons": [2024, 2025],  # Fetch both 2024 and 2025 seasons
        "output_file": "nba_historical.json"
    },
    "nfl": {
        "api_prefix": "/nfl/v1",
        "seasons": [2024, 2025],
        "output_file": "nfl_historical.json"
    },
    "nhl": {
        "api_prefix": "/nhl/v1",
        "seasons": [2024, 2025],
        "output_file": "nhl_historical.json"
    },
    "mlb": {
        "api_prefix": "/mlb/v1",
        "seasons": [2024],  # MLB 2025 not started yet
        "output_file": "mlb_historical.json"
    }
}


def make_request(url: str) -> dict:
    """Send an authenticated GET to BallDontLie and return the decoded JSON.

    The API key is passed in the ``Authorization`` header. ``requests`` is
    used when it was importable; otherwise the call falls back to ``urllib``.
    Both paths use a 30 second timeout and raise on an HTTP error status
    (``requests`` via ``raise_for_status``, ``urllib`` via ``urlopen``).

    Args:
        url: Fully built request URL, including any query string.

    Returns:
        The parsed JSON body of the response.
    """
    auth_headers = {"Authorization": API_KEY}

    if not USE_REQUESTS:
        # Standard-library fallback: read the raw bytes, then decode and parse.
        request = urllib.request.Request(url, headers=auth_headers)
        with urllib.request.urlopen(request, timeout=30) as reply:
            raw_body = reply.read()
        return json.loads(raw_body.decode())

    reply = requests.get(url, headers=auth_headers, timeout=30)
    reply.raise_for_status()
    return reply.json()


def fetch_all_games(sport: str, season: int, per_page: int = 100, max_pages: int = 50) -> list:
    """Fetch ALL games for a given sport and season, following pagination.

    Both pagination schemes are supported: cursor-based (``meta.next_cursor``
    in the response, sent back as ``cursor=``) and page-based
    (``meta.next_page``, sent as ``page=``). Looking only at ``next_page``
    stops after the first page whenever the API answers with a cursor.

    Args:
        sport: Key into SPORT_CONFIG (e.g. "nba").
        season: Season year passed as the ``seasons[]`` filter.
        per_page: Page size requested from the API.
        max_pages: Upper bound on the number of requests issued.

    Returns:
        A list of raw game dicts. Fetching is best-effort: if a request fails,
        the games collected so far are returned (an empty list if the very
        first request failed or the sport is unknown).
    """
    config = SPORT_CONFIG.get(sport)
    if not config:
        print(f"Unknown sport: {sport}")
        return []

    base_url = (
        f"{API_BASE_URL}{config['api_prefix']}/games"
        f"?seasons[]={season}&per_page={per_page}"
    )
    all_games = []
    page = 1        # request counter; also the page number for page-based APIs
    cursor = None   # set once the API hands out a cursor

    print(f"Fetching {sport.upper()} games for {season} season...")

    while page <= max_pages:
        if cursor is not None:
            url = f"{base_url}&cursor={cursor}"
        else:
            url = f"{base_url}&page={page}"
        print(f"  Page {page}...", end=" ", flush=True)

        try:
            data = make_request(url)
            games = data.get("data") or []

            if not games:
                print("done (no more data)")
                break

            all_games.extend(games)
            print(f"{len(games)} games (total: {len(all_games)})")

            # `or {}` also covers a response that carries "meta": null.
            meta = data.get("meta") or {}
            next_cursor = meta.get("next_cursor")
            next_page = meta.get("next_page")
            if next_cursor is None and next_page is None:
                print("  ✓ Completed - all pages fetched")
                break

            cursor = next_cursor  # stays None for page-based responses
            page += 1
            time.sleep(0.3)  # Rate limiting

        except Exception as e:
            # Best-effort: keep whatever was collected before the failure.
            print(f"error: {e}")
            break
    else:
        # Loop ended because of the page cap, not because the API ran dry.
        print(f"  Warning: stopped after {max_pages} pages; results may be incomplete")

    return all_games


def fetch_real_odds(sport: str, season: int, per_page: int = 100, max_pages: int = 50) -> dict:
    """Fetch REAL odds from BallDontLie API for a given sport and season.

    Pagination follows either ``meta.next_cursor`` (sent back as ``cursor=``)
    or ``meta.next_page`` (sent as ``page=``), whichever the response carries.

    Args:
        sport: Key into SPORT_CONFIG (e.g. "nfl").
        season: Season year. Only the NFL request is filtered by season; for
            every other sport the odds endpoint is queried without it.
            NOTE(review): confirm which filters each sport's odds endpoint
            actually expects.
        per_page: Page size requested from the API.
        max_pages: Upper bound on the number of requests issued.

    Returns:
        A dict mapping game_id -> {"moneyline": ..., "spread": ..., "total": ...}
        where each value is the raw odds entry of that type or None. When a
        game has several entries of the same type, the last one seen wins.
        Fetching is best-effort: on a request error the odds collected so far
        are returned. An unknown sport yields an empty dict.
    """
    config = SPORT_CONFIG.get(sport)
    if not config:
        return {}

    # Maps the API's "type" value to the slot it fills in the per-game record.
    slot_by_type = {
        "2way": "moneyline",
        "moneyline": "moneyline",
        "spread": "spread",
        "over_under": "total",
        "total": "total",
        "totals": "total",
    }

    season_filter = f"season={season}&" if sport == "nfl" else ""
    base_url = (
        f"{API_BASE_URL}{config['api_prefix']}/odds"
        f"?{season_filter}per_page={per_page}"
    )

    all_odds = {}
    page = 1        # request counter; also the page number for page-based APIs
    cursor = None   # set once the API hands out a cursor

    print(f"Fetching {sport.upper()} REAL odds for {season} season...")

    while page <= max_pages:
        if cursor is not None:
            url = f"{base_url}&cursor={cursor}"
        else:
            url = f"{base_url}&page={page}"

        print(f"  Odds page {page}...", end=" ", flush=True)

        try:
            data = make_request(url)
            odds_list = data.get("data") or []

            if not odds_list:
                print("done (no more data)")
                break

            # Index by game_id
            for odds_entry in odds_list:
                game_id = odds_entry.get("game_id")
                if game_id is None:
                    continue

                game_record = all_odds.setdefault(
                    game_id, {"moneyline": None, "spread": None, "total": None}
                )
                # `or ""` keeps a null "type" from aborting the whole fetch.
                odds_type = (odds_entry.get("type") or "").lower()
                slot = slot_by_type.get(odds_type)
                if slot is not None:
                    game_record[slot] = odds_entry

            print(f"{len(odds_list)} entries (total games with odds: {len(all_odds)})")

            # `or {}` also covers a response that carries "meta": null.
            meta = data.get("meta") or {}
            next_cursor = meta.get("next_cursor")
            next_page = meta.get("next_page")
            if next_cursor is None and next_page is None:
                print("  ✓ Completed - all odds pages fetched")
                break

            cursor = next_cursor  # stays None for page-based responses
            page += 1
            time.sleep(0.3)  # Rate limiting

        except Exception as e:
            # Best-effort: keep whatever was collected before the failure.
            print(f"error: {e}")
            break
    else:
        # Loop ended because of the page cap, not because the API ran dry.
        print(f"  Warning: stopped after {max_pages} odds pages; results may be incomplete")

    return all_odds


# Moneyline tiers for estimate_odds. Each tier is
#   (abs_margin_greater_than, winner_base, winner_slope, loser_base, loser_slope)
# and tiers are checked top to bottom; the first whose threshold is exceeded
# by |margin| applies. A line is computed as base + slope * |margin|.
_ESTIMATE_TIERS_BY_SPORT = {
    "nfl": (
        (14, -400, 0, 350, 0),      # blowouts are pinned to -400 / +350
        (7, -200, 8, 180, -6),
        (0, -150, 7, 130, -5),
    ),
    "mlb": (
        (5, -250, 20, 220, -15),
        (2, -180, 15, 160, -12),
        (0, -130, 10, 110, -8),
    ),
    "nhl": (
        (3, -200, 30, 180, -25),
        (1, -150, 25, 130, -20),
        (0, -120, 15, 100, -10),
    ),
}

# Used for "nba" and for any sport code without its own table.
_ESTIMATE_TIERS_DEFAULT = (
    (10, -200, 10, 180, -8),
    (0, -150, 5, 130, -5),
)


def estimate_odds(margin: int, total_points: int, sport: str) -> dict:
    """
    DEPRECATED: Estimate odds based on final score.
    WARNING: This creates information leakage - odds derived from outcomes are NOT valid for backtesting!
    Only use this for display purposes when real odds are unavailable.

    Args:
        margin: Home score minus away score.
        total_points: Combined final score; reused verbatim as the total line.
        sport: "nfl", "mlb" or "nhl" select sport-specific numbers; any other
            value uses the NBA numbers.

    Returns:
        A dict with moneylineHome/moneylineAway (clamped to [-500, 500]),
        spreadHome/spreadAway, totalLine and source. The source is always
        "estimated_DO_NOT_BACKTEST" so these outcome-derived numbers can never
        be mistaken for real sportsbook odds.
    """
    abs_margin = abs(margin)

    # Tie: both sides get the same price.
    winner_line = loser_line = -110
    for threshold, win_base, win_slope, lose_base, lose_slope in \
            _ESTIMATE_TIERS_BY_SPORT.get(sport, _ESTIMATE_TIERS_DEFAULT):
        if abs_margin > threshold:
            winner_line = win_base + abs_margin * win_slope
            loser_line = lose_base + abs_margin * lose_slope
            break

    # The formulas are symmetric: whichever side won gets the "winner" line.
    if margin >= 0:
        ml_home, ml_away = winner_line, loser_line
    else:
        ml_home, ml_away = loser_line, winner_line

    if sport == "nfl":
        # Half the (integer-rounded) margin, i.e. a multiple of 0.5.
        spread_home = round(-margin / 2 * 2) / 2
    elif sport in ("mlb", "nhl"):
        # Run line / puck line is fixed at 1.5; the home side lays it only
        # when it won.
        spread_home = -1.5 if margin > 0 else 1.5
    else:
        spread_home = round(-margin / 2, 1)

    return {
        "moneylineHome": int(max(min(ml_home, 500), -500)),
        "moneylineAway": int(max(min(ml_away, 500), -500)),
        "spreadHome": spread_home,
        # 0.0 - x instead of -x so a zero spread serializes as 0.0, not -0.0.
        "spreadAway": 0.0 - spread_home,
        "totalLine": round(total_points, 0),
        "source": "estimated_DO_NOT_BACKTEST"
    }


def convert_game(game: dict, sport: str, season: int, real_odds: dict = None) -> dict:
    """Convert a BallDontLie game to our betting format.

    Args:
        game: Game data from BallDontLie API
        sport: Sport code (nba, nfl, etc.)
        season: Season year
        real_odds: Dict with real odds data from BallDontLie odds endpoint
                   Keys: 'moneyline', 'spread', 'total'

    Returns:
        The converted game dict, or None when both scores are 0 (treated as
        "no score available"). When no real moneyline or spread is supplied,
        the odds come from estimate_odds and are tagged
        "estimated_DO_NOT_BACKTEST".
    """
    # `or {}` / `or 0` also cover keys that are present but null in the payload.
    home_team = game.get("home_team") or {}

    # Handle different API response structures per sport
    if sport == "nhl":
        visitor_team = game.get("away_team") or {}
        home_score = game.get("home_score") or 0
        away_score = game.get("away_score") or 0
        raw_date = game.get("game_date") or game.get("date") or ""
        home_name = home_team.get("full_name", home_team.get("name", "Unknown"))
        away_name = visitor_team.get("full_name", visitor_team.get("name", "Unknown"))

    elif sport == "mlb":
        visitor_team = game.get("away_team") or {}
        # MLB stores runs in home_team_data/away_team_data
        home_score = (game.get("home_team_data") or {}).get("runs") or 0
        away_score = (game.get("away_team_data") or {}).get("runs") or 0
        raw_date = game.get("date") or ""
        home_name = home_team.get("display_name", home_team.get("name", game.get("home_team_name", "Unknown")))
        away_name = visitor_team.get("display_name", visitor_team.get("name", game.get("away_team_name", "Unknown")))

    else:  # NBA, NFL
        visitor_team = game.get("visitor_team") or {}
        home_score = game.get("home_team_score") or 0
        away_score = game.get("visitor_team_score") or 0
        raw_date = game.get("date") or ""
        home_name = home_team.get("name", home_team.get("full_name", "Unknown"))
        away_name = visitor_team.get("name", visitor_team.get("full_name", "Unknown"))

    # Skip games with no scores
    if home_score == 0 and away_score == 0:
        return None

    game_date = raw_date[:10]  # keep only the YYYY-MM-DD prefix
    total_points = home_score + away_score
    margin = home_score - away_score

    # Determine winner
    if margin > 0:
        winner = "home"
    elif margin < 0:
        winner = "away"
    else:
        winner = "draw"

    # Short stable ID derived from sport, API game id and date (not a
    # security use of MD5).
    game_id = hashlib.md5(
        f"{sport}-{game.get('id', '')}-{game_date}".encode()
    ).hexdigest()[:8]

    # Use REAL odds if available, otherwise mark as estimated (DO NOT USE FOR BACKTESTING)
    has_real_odds = False
    ml_home = None
    ml_away = None
    spread_home = None
    spread_away = None
    spread_odds_home = -110
    spread_odds_away = -110
    total_line = None

    if real_odds:
        # Extract moneyline odds
        ml_data = real_odds.get("moneyline")
        if ml_data:
            ml_home = ml_data.get("odds_american_home") or ml_data.get("home_odds")
            ml_away = ml_data.get("odds_american_visitor") or ml_data.get("away_odds")
            if ml_home is not None and ml_away is not None:
                has_real_odds = True

        # Extract spread odds
        spread_data = real_odds.get("spread")
        if spread_data:
            # away_spread field contains the away team's spread (e.g., +6.5)
            away_spread_val = spread_data.get("away_spread")
            if away_spread_val is not None:
                spread_away = float(away_spread_val)
                spread_home = -spread_away  # Home spread is opposite
                spread_odds_home = spread_data.get("odds_american_home", -110)
                spread_odds_away = spread_data.get("odds_american_visitor", -110)
                has_real_odds = True

        # Extract total/over-under
        total_data = real_odds.get("total")
        if total_data:
            total_line = total_data.get("over_under") or total_data.get("total_line")
            if total_line is not None:
                total_line = float(total_line)

    # Build odds object
    if has_real_odds:
        odds = {
            "moneylineHome": int(ml_home) if ml_home else None,
            "moneylineAway": int(ml_away) if ml_away else None,
            "spreadHome": spread_home,
            "spreadAway": spread_away,
            "spreadOddsHome": int(spread_odds_home) if spread_odds_home else -110,
            "spreadOddsAway": int(spread_odds_away) if spread_odds_away else -110,
            "totalLine": total_line,
            "source": "live"  # Real odds from BallDontLie
        }
    else:
        # Use estimated odds but CLEARLY MARK THEM - not suitable for backtesting!
        odds = estimate_odds(margin, total_points, sport)
        odds["source"] = "estimated_DO_NOT_BACKTEST"

    # Determine spread coverage (only if we have spread data). The home side
    # covers when its margin plus its handicap is positive: a -6.5 favourite
    # must win by 7+, a +6.5 underdog may lose by up to 6.
    if spread_home is None:
        spread_covered = None
    else:
        adjusted_margin = margin + spread_home
        if adjusted_margin > 0:
            spread_covered = "home"
        elif adjusted_margin < 0:
            spread_covered = "away"
        else:
            spread_covered = "push"

    # Determine over/under (only if we have total line)
    if total_line is None:
        total_result = None
    elif total_points > total_line:
        total_result = "over"
    elif total_points < total_line:
        total_result = "under"
    else:
        total_result = "push"

    return {
        "id": game_id,
        "bdl_game_id": game.get("id"),  # Keep BallDontLie ID for reference
        "sport": sport,
        "date": game_date,
        "season": season,
        "homeTeam": home_name,
        "awayTeam": away_name,
        "scores": {
            "homeScore": home_score,
            "awayScore": away_score,
            # Per-period scores are not populated by this converter.
            "homeQ1": None,
            "homeQ2": None,
            "homeQ3": None,
            "homeQ4": None,
            "awayQ1": None,
            "awayQ2": None,
            "awayQ3": None,
            "awayQ4": None
        },
        "odds": odds,
        "hasRealOdds": has_real_odds,
        "result": {
            "winner": winner,
            "spreadCovered": spread_covered,
            "totalResult": total_result,
            "margin": margin,
            "totalPoints": total_points
        }
    }


def process_sport(sport: str, seasons_to_keep: list = None, require_real_odds: bool = False):
    """Process a single sport - fetch, convert, and save.

    Games and odds are fetched for every season in the sport's SPORT_CONFIG
    entry, converted with convert_game, merged into the sport's existing
    output file (records already in the file win over freshly fetched ones
    with the same id), sorted by date and written back.

    Args:
        sport: Sport code (nba, nfl, etc.)
        seasons_to_keep: List of season years to keep from the existing file
            (defaults to [2024]); existing games from other seasons are dropped
        require_real_odds: If True, only include games with real odds (recommended for backtesting)

    Returns:
        A summary dict (sport, total_games, games_with_real_odds,
        games_with_estimated_odds, seasons), or None when the sport is unknown
        or no games could be fetched.
    """
    if seasons_to_keep is None:
        seasons_to_keep = [2024]

    config = SPORT_CONFIG.get(sport)
    if not config:
        print(f"Unknown sport: {sport}")
        return None

    print(f"\n{'=' * 60}")
    print(f"Processing {sport.upper()}")
    print(f"{'=' * 60}")

    # Fetch ALL games for ALL configured seasons
    seasons = config.get("seasons", [2024])
    raw_games = []
    real_odds_by_game = {}

    for season in seasons:
        print(f"\n📅 Fetching {sport.upper()} season {season}...")
        season_games = fetch_all_games(sport, season, per_page=100, max_pages=100)
        if season_games:
            raw_games.extend(season_games)
            print(f"  Got {len(season_games)} games for {season}")

        # Fetch odds for this season
        real_odds_by_game.update(fetch_real_odds(sport, season, per_page=100, max_pages=100))

    if not raw_games:
        print(f"No {sport.upper()} games fetched. Check API access.")
        return None

    print(f"\n📊 Total: {len(raw_games)} games, real odds for {len(real_odds_by_game)} games")

    # Convert to betting format
    print(f"\nConverting {len(raw_games)} games to betting format...")
    betting_games = []
    skipped = 0
    skipped_no_odds = 0
    fallback_season = seasons[-1] if seasons else 2024

    for game in raw_games:
        try:
            game_odds = real_odds_by_game.get(game.get("id"))

            # Get season from game or use the most recent configured season
            game_season = game.get("season", fallback_season)

            converted = convert_game(game, sport, game_season, real_odds=game_odds)
            if not converted:
                skipped += 1
            elif require_real_odds and not converted.get("hasRealOdds"):
                skipped_no_odds += 1
            else:
                betting_games.append(converted)
        except Exception as e:
            # One malformed game must not abort the whole import.
            print(f"  Warning: Failed to convert game {game.get('id')}: {e}")
            skipped += 1

    games_with_real_odds = sum(1 for g in betting_games if g.get("hasRealOdds"))
    print(f"Converted {len(betting_games)} games (skipped {skipped} incomplete, {skipped_no_odds} without real odds)")
    print(f"  📈 Games with REAL odds: {games_with_real_odds}")
    print(f"  ⚠️  Games with estimated odds: {len(betting_games) - games_with_real_odds}")

    # Load existing data
    output_file = DATA_DIR / config["output_file"]
    existing_games = []

    if output_file.exists():
        try:
            with open(output_file, "r", encoding="utf-8") as f:
                existing_data = json.load(f)

            # Filter to keep only specified seasons
            existing_games = [
                game for game in existing_data
                if game.get("season") in seasons_to_keep
            ]

            removed = len(existing_data) - len(existing_games)
            if removed > 0:
                print(f"Removed {removed} games from old seasons")
        except Exception as e:
            print(f"Warning: Could not read existing data: {e}")

    # Merge games (deduplicate by ID, first occurrence wins, existing first)
    games_by_id = {}
    for game in existing_games:
        games_by_id.setdefault(game.get("id"), game)

    new_count = 0
    for game in betting_games:
        if game.get("id") not in games_by_id:
            games_by_id[game.get("id")] = game
            new_count += 1

    # Sort by date; `or ""` keeps a null date from breaking the comparison.
    all_games = sorted(games_by_id.values(), key=lambda g: g.get("date") or "")

    print(f"\nTotal games in database: {len(all_games)} (new: {new_count})")

    # Save to file. Write to a sibling temp file first and then swap it in, so
    # an interrupted run cannot leave a truncated database behind.
    output_file.parent.mkdir(parents=True, exist_ok=True)
    temp_file = output_file.with_suffix(".json.tmp")
    with open(temp_file, "w", encoding="utf-8") as f:
        json.dump(all_games, f, indent=2)
    temp_file.replace(output_file)

    print(f"Saved to {output_file}")

    # Show sample
    if all_games:
        print(f"\nSample {sport.upper()} games:")
        for game in all_games[:3]:
            print(f"  {game['date']}: {game['awayTeam']} @ {game['homeTeam']} ({game['scores']['awayScore']}-{game['scores']['homeScore']})")
        if len(all_games) > 3:
            print(f"  ... and {len(all_games) - 3} more games")

    def odds_source(game):
        # `or {}` tolerates records whose "odds" is null.
        return (game.get("odds") or {}).get("source")

    return {
        "sport": sport,
        "total_games": len(all_games),
        "games_with_real_odds": sum(
            1 for g in all_games if g.get("hasRealOdds") or odds_source(g) == "live"
        ),
        "games_with_estimated_odds": sum(
            1 for g in all_games if odds_source(g) == "estimated_DO_NOT_BACKTEST"
        ),
        "seasons": seasons
    }


def main():
    """Main entry point.

    Command line: ``[sport] [--require-real-odds | -r]`` in any order, where
    sport is one of the SPORT_CONFIG keys or "all" (the default). Processes
    each selected sport, writes ``metadata.json`` into DATA_DIR and prints a
    summary. Exits with status 1 on an unknown sport or a missing API key.
    """
    print("=" * 60)
    print("BallDontLie Sports Data Fetcher - All Major Sports")
    print("=" * 60)

    if not API_KEY:
        print("Error: BALLDONTLIE_API_KEY is not set (required).")
        sys.exit(1)

    # Ensure directories exist
    DATA_DIR.mkdir(parents=True, exist_ok=True)
    CACHE_DIR.mkdir(parents=True, exist_ok=True)

    # Separate flags from positional arguments so the flag may come first
    # (or be the only argument) without being mistaken for a sport name.
    real_odds_flags = ("--require-real-odds", "-r")
    cli_args = sys.argv[1:]
    require_real_odds = any(arg in real_odds_flags for arg in cli_args)
    positional_args = [arg for arg in cli_args if arg not in real_odds_flags]

    # Determine which sports to process
    all_sports = ["nba", "nfl", "nhl", "mlb"]
    sports_to_process = all_sports

    if positional_args:
        arg = positional_args[0].lower()
        if arg == "all":
            sports_to_process = all_sports
        elif arg in SPORT_CONFIG:
            sports_to_process = [arg]
        else:
            print(f"Unknown sport: {arg}")
            print(f"Available: {', '.join(SPORT_CONFIG.keys())}, all")
            sys.exit(1)

    # Check if we should require real odds (recommended for backtesting)
    if require_real_odds:
        print("\n⚠️  REQUIRE_REAL_ODDS mode: Only games with verified sportsbook odds will be included")

    # Process each sport (keep both 2024 and 2025 seasons)
    results = []
    for sport in sports_to_process:
        result = process_sport(sport, seasons_to_keep=[2024, 2025], require_real_odds=require_real_odds)
        if result:
            results.append(result)

    # Update metadata
    total_real_odds = sum(r.get("games_with_real_odds", 0) for r in results)
    total_estimated = sum(r.get("games_with_estimated_odds", 0) for r in results)

    metadata = {
        "importedAt": datetime.now().isoformat(),
        "source": "BallDontLie API",
        "totalGames": sum(r["total_games"] for r in results),
        "gamesWithRealOdds": total_real_odds,
        "gamesWithEstimatedOdds": total_estimated,
        "sports": [r["sport"] for r in results],
        # Sorted so the file content is stable from run to run.
        "seasons": sorted({s for r in results for s in r.get("seasons", [])}),
        "dateRange": "2024-2025",
        "warnings": ["Games with 'estimated_DO_NOT_BACKTEST' odds source should NOT be used for backtesting"] if total_estimated > 0 else []
    }

    metadata_file = DATA_DIR / "metadata.json"
    with open(metadata_file, "w", encoding="utf-8") as f:
        json.dump(metadata, f, indent=2)

    print(f"\n{'=' * 60}")
    print("SUMMARY")
    print(f"{'=' * 60}")
    for r in results:
        real = r.get('games_with_real_odds', 0)
        est = r.get('games_with_estimated_odds', 0)
        print(f"  {r['sport'].upper()}: {r['total_games']} games (📈 {real} real odds, ⚠️ {est} estimated)")
    print(f"  {'─' * 30}")
    print(f"  TOTAL: {metadata['totalGames']} games across {len(results)} sports")
    print(f"  📈 Games with REAL odds (safe for backtesting): {total_real_odds}")
    if total_estimated > 0:
        print(f"  ⚠️  Games with ESTIMATED odds (DO NOT BACKTEST): {total_estimated}")
    print(f"\nMetadata saved to: {metadata_file}")
    print("\n✅ Done!")


if __name__ == "__main__":
    main()
