#!/usr/bin/env python3
"""
Deduplicate betting datasets by removing duplicate games.

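A game is considered a duplicate if it has the same:
- date (only the first 10 characters, i.e. the date part of the value, are compared)
- homeTeam (compared case-insensitively, ignoring surrounding whitespace)
- awayTeam (compared case-insensitively, ignoring surrounding whitespace)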

This script:
1. Loads each sport's historical JSON file
2. Removes duplicates (keeps the first entry with real odds if available, otherwise the first entry)
3. Sorts the remaining games by date and saves them back to the same file

Usage:
    python scripts/dedupe_betting_data.py [sport]
    
    sport: nba, nfl, nhl, mlb, or all (default: all)
"""

import os
import sys
import json
from datetime import datetime
from pathlib import Path
from collections import defaultdict

# Directory holding the per-sport JSON files:
# <parent of this script's directory>/data/betting
DATA_DIR = Path(__file__).parent.parent / "data" / "betting"

# Sports this tool accepts on the command line; each one maps to the file
# DATA_DIR / "<sport>_historical.json".
SPORTS = ["nba", "nfl", "nhl", "mlb"]


def load_dataset(sport: str) -> list:
    """Load a sport's betting dataset from ``DATA_DIR``.

    Args:
        sport: Sport identifier; the file read is
            ``DATA_DIR / "<sport>_historical.json"``.

    Returns:
        The parsed JSON content of the file, or an empty list (after
        printing a warning) when the file does not exist.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
            This is deliberately not swallowed so a corrupt file is noticed.
    """
    filepath = DATA_DIR / f"{sport}_historical.json"

    if not filepath.exists():
        print(f"  ⚠️  File not found: {filepath}")
        return []

    # Decode explicitly as UTF-8: the platform default encoding is
    # locale-dependent and can fail on non-ASCII team names.
    with open(filepath, "r", encoding="utf-8") as f:
        return json.load(f)


def save_dataset(sport: str, data: list):
    """Save a sport's betting dataset to ``DATA_DIR``.

    The data is first written to a temporary sibling file and then moved
    over ``DATA_DIR / "<sport>_historical.json"`` with ``os.replace``, so a
    crash or serialization error part-way through cannot leave the real
    dataset truncated.

    Args:
        sport: Sport identifier used to build the target filename.
        data: JSON-serializable list of game records.

    Raises:
        TypeError: If ``data`` contains values ``json.dump`` cannot serialize.
        OSError: If the file cannot be written or replaced.
    """
    filename = f"{sport}_historical.json"
    filepath = DATA_DIR / filename
    # Same directory as the target so the final rename stays on one filesystem.
    tmp_path = filepath.with_name(filename + ".tmp")

    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)
        os.replace(tmp_path, filepath)
    finally:
        # After a successful replace the temp file is gone; it only still
        # exists here when writing or replacing failed.
        if tmp_path.exists():
            tmp_path.unlink()

    print(f"  ✅ Saved {len(data)} games to {filepath}")


def get_game_key(game: dict) -> tuple:
    """Build the deduplication key ``(date, home, away)`` for a game.

    Args:
        game: Game record; the ``date``, ``homeTeam`` and ``awayTeam``
            fields are read.

    Returns:
        A tuple of the first 10 characters of ``date`` and the stripped,
        lower-cased ``homeTeam`` and ``awayTeam``. A field that is missing
        or null contributes an empty string.
    """
    # `or ""` also covers keys that are present with a JSON null value,
    # which a `.get(key, "")` default does not.
    date = (game.get("date") or "")[:10]  # Just date part
    home = (game.get("homeTeam") or "").strip().lower()
    away = (game.get("awayTeam") or "").strip().lower()
    return (date, home, away)


def has_real_odds(game: dict) -> bool:
    """Return True if a game has real (not estimated) odds.

    The decision is based on ``game["odds"]``:

    * ``source == "live"`` -> real.
    * ``source`` containing "estimated" (any case, e.g.
      "estimated_DO_NOT_BACKTEST") -> not real, even if moneylines exist.
    * otherwise real only when both ``moneylineHome`` and ``moneylineAway``
      are present and truthy.

    A missing, null or non-dict ``odds`` value, and a null or non-string
    ``source``, are treated as "no information" instead of raising.

    Args:
        game: Game record, optionally carrying an ``odds`` dict.

    Returns:
        True for real odds, False otherwise.
    """
    odds = game.get("odds")
    if not isinstance(odds, dict):
        return False

    source = odds.get("source")
    if not isinstance(source, str):
        source = ""

    # "live" is real odds, "estimated_DO_NOT_BACKTEST" is fake
    if source == "live":
        return True
    if "estimated" in source.lower():
        return False

    # No decisive source tag: fall back to the presence of both moneylines.
    return bool(odds.get("moneylineHome") and odds.get("moneylineAway"))


def dedupe_sport(sport: str) -> dict:
    """Deduplicate a single sport's dataset and write it back to disk.

    Games are grouped by ``get_game_key``. One game is kept per group: the
    first with real odds (per ``has_real_odds``) if there is one, otherwise
    the first in file order. The survivors are sorted by their ``date``
    field and saved via ``save_dataset``.

    Args:
        sport: Sport identifier used to locate the dataset (e.g. "nba").

    Returns:
        A summary dict with keys ``sport``, ``original``, ``deduplicated``,
        ``removed`` and ``real_odds``. When the dataset is missing or empty
        every count is 0 and nothing is written.
    """
    print(f"\nProcessing {sport.upper()}...")

    data = load_dataset(sport)
    if not data:
        # Same keys as the normal return so callers can index uniformly.
        return {
            "sport": sport,
            "original": 0,
            "deduplicated": 0,
            "removed": 0,
            "real_odds": 0,
        }

    original_count = len(data)
    print(f"  Original games: {original_count}")

    # Group games by key (dict insertion order keeps first-seen order).
    # NOTE(review): records whose date/team fields are all empty share one
    # key and therefore collapse into a single record - confirm intended.
    games_by_key = defaultdict(list)
    for game in data:
        games_by_key[get_game_key(game)].append(game)

    # For each key keep the first game with real odds, else the first game.
    deduplicated = [
        next((g for g in games if has_real_odds(g)), games[0])
        for games in games_by_key.values()
    ]
    # One survivor per key, so everything else was a duplicate.
    duplicates_found = original_count - len(deduplicated)

    # Sort by date; `or ""` keeps null dates comparable with string dates.
    deduplicated.sort(key=lambda g: g.get("date") or "")

    print(f"  Duplicates found: {duplicates_found}")
    print(f"  After deduplication: {len(deduplicated)}")

    # Count games with real odds
    real_odds_count = sum(1 for g in deduplicated if has_real_odds(g))
    print(f"  Games with real odds: {real_odds_count}")
    print(f"  Games with estimated odds: {len(deduplicated) - real_odds_count}")

    # Save
    save_dataset(sport, deduplicated)

    return {
        "sport": sport,
        "original": original_count,
        "deduplicated": len(deduplicated),
        "removed": original_count - len(deduplicated),
        "real_odds": real_odds_count,
    }


def main():
    """Command-line driver: choose the sports, dedupe each, print a summary.

    The optional first command-line argument selects a single sport from
    ``SPORTS`` or "all"; with no argument every sport is processed. An
    unrecognised value prints the available choices and exits with status 1.
    """
    rule = "=" * 60
    print(rule)
    print("Betting Data Deduplication Tool")
    print(rule)

    # Default to every sport; narrow only when a specific one is requested.
    selected = SPORTS
    cli_args = sys.argv[1:]
    if cli_args:
        choice = cli_args[0].lower()
        if choice != "all" and choice not in SPORTS:
            print(f"Unknown sport: {choice}")
            print(f"Available: {', '.join(SPORTS)}, all")
            sys.exit(1)
        if choice != "all":
            selected = [choice]

    # Run the deduplication for each selected sport, in order.
    results = [dedupe_sport(name) for name in selected]

    print("\n" + rule)
    print("SUMMARY")
    print(rule)

    for row in results:
        print(f"  {row['sport'].upper()}: {row['original']} → {row['deduplicated']} (-{row['removed']} duplicates)")

    # Grand totals across all processed sports.
    grand_original = sum(row["original"] for row in results)
    grand_deduped = sum(row["deduplicated"] for row in results)
    grand_removed = sum(row["removed"] for row in results)
    # "real_odds" may be absent from a result, so default it to 0.
    grand_real = sum(row.get("real_odds", 0) for row in results)

    print(f"  {'─' * 40}")
    print(f"  TOTAL: {grand_original} → {grand_deduped} (-{grand_removed} duplicates)")
    print(f"  📈 Games with real odds: {grand_real}")
    print(f"  ⚠️  Games with estimated odds: {grand_deduped - grand_real}")

    print("\n✅ Done!")


# Run the tool only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
