#!/usr/bin/env python3
"""
Data Deduplicator for EvenTheOdds SportsGame Table
Run this periodically to clean up any duplicate entries.
"""
import psycopg2
import os
import sys
from datetime import datetime

def load_db_url():
    env_paths = [
        '/var/www/html/eventheodds/.env',
        os.path.join(os.path.dirname(__file__), '..', '.env'),
    ]
    for env_path in env_paths:
        try:
            with open(env_path, 'r') as f:
                for line in f:
                    line = line.strip()
                    if line.startswith('SPORTS_DATABASE_URL='):
                        url = line.split('=', 1)[1].strip()
                        if '?' in url:
                            url = url.split('?')[0]
                        return url
        except FileNotFoundError:
            continue
    return os.environ.get('SPORTS_DATABASE_URL', '').split('?')[0]

def deduplicate_games(conn, league, dry_run=False):
    """Remove duplicate games, keeping the best quality record."""
    cur = conn.cursor()
    
    # Simple deduplication SQL that doesn't require complex CASE statements
    # Group by date and find games where team names match (considering abbrev vs full)
    find_sql = """
    WITH game_groups AS (
        SELECT 
            id,
            league,
            "homeTeam",
            "awayTeam",
            "gameDate"::date as game_date,
            "homeScore",
            "externalGameId",
            -- Priority: real scores > full names > tsdb source > older id
            ROW_NUMBER() OVER (
                PARTITION BY league, "gameDate"::date, 
                    CASE 
                        WHEN LENGTH("homeTeam") <= 3 THEN UPPER("homeTeam")
                        ELSE UPPER(SPLIT_PART("homeTeam", ' ', -1))
                    END
                ORDER BY 
                    CASE WHEN "homeScore" IS NOT NULL AND "homeScore" != 107 THEN 0 ELSE 1 END,
                    CASE WHEN LENGTH("homeTeam") > 3 THEN 0 ELSE 1 END,
                    CASE WHEN "externalGameId" LIKE 'tsdb_%%' THEN 0 ELSE 1 END,
                    id
            ) as rn
        FROM "SportsGame"
        WHERE league = %s
    )
    SELECT id, "homeTeam", "awayTeam", game_date
    FROM game_groups
    WHERE rn > 1
    """
    
    cur.execute(find_sql, (league,))
    duplicates = cur.fetchall()
    
    if not duplicates:
        print(f"[{league.upper()}] No duplicates found")
        return 0
    
    print(f"[{league.upper()}] Found {len(duplicates)} potential duplicates")
    
    if dry_run:
        for dup in duplicates[:5]:
            print(f"  Would delete: ID {dup[0]} - {dup[1]} vs {dup[2]} on {dup[3]}")
        if len(duplicates) > 5:
            print(f"  ... and {len(duplicates) - 5} more")
        return len(duplicates)
    
    # Delete duplicates
    ids_to_delete = [d[0] for d in duplicates]
    for id_to_del in ids_to_delete:
        cur.execute('DELETE FROM "SportsGame" WHERE id = %s', (id_to_del,))
    
    print(f"[{league.upper()}] Deleted {len(ids_to_delete)} duplicate records")
    return len(ids_to_delete)

def normalize_team_names(conn, league, dry_run=False):
    """Update abbreviations to full team names."""
    cur = conn.cursor()
    
    # NBA team mappings
    nba_teams = {
        'ATL': 'Atlanta Hawks', 'BOS': 'Boston Celtics', 'BKN': 'Brooklyn Nets',
        'CHA': 'Charlotte Hornets', 'CHI': 'Chicago Bulls', 'CLE': 'Cleveland Cavaliers',
        'DAL': 'Dallas Mavericks', 'DEN': 'Denver Nuggets', 'DET': 'Detroit Pistons',
        'GSW': 'Golden State Warriors', 'HOU': 'Houston Rockets', 'IND': 'Indiana Pacers',
        'LAC': 'Los Angeles Clippers', 'LAL': 'Los Angeles Lakers', 'MEM': 'Memphis Grizzlies',
        'MIA': 'Miami Heat', 'MIL': 'Milwaukee Bucks', 'MIN': 'Minnesota Timberwolves',
        'NOP': 'New Orleans Pelicans', 'NYK': 'New York Knicks', 'OKC': 'Oklahoma City Thunder',
        'ORL': 'Orlando Magic', 'PHI': 'Philadelphia 76ers', 'PHX': 'Phoenix Suns',
        'POR': 'Portland Trail Blazers', 'SAC': 'Sacramento Kings', 'SAS': 'San Antonio Spurs',
        'TOR': 'Toronto Raptors', 'UTA': 'Utah Jazz', 'WAS': 'Washington Wizards',
    }
    
    team_map = nba_teams if league == 'nba' else {}
    updates = 0
    
    for abbr, full_name in team_map.items():
        if not dry_run:
            cur.execute(
                'UPDATE "SportsGame" SET "homeTeam" = %s WHERE league = %s AND "homeTeam" = %s',
                (full_name, league, abbr)
            )
            updates += cur.rowcount
            cur.execute(
                'UPDATE "SportsGame" SET "awayTeam" = %s WHERE league = %s AND "awayTeam" = %s',
                (full_name, league, abbr)
            )
            updates += cur.rowcount
        else:
            cur.execute(
                'SELECT COUNT(*) FROM "SportsGame" WHERE league = %s AND ("homeTeam" = %s OR "awayTeam" = %s)',
                (league, abbr, abbr)
            )
            count = cur.fetchone()[0]
            if count > 0:
                print(f"  Would normalize {count} '{abbr}' -> '{full_name}'")
                updates += count
    
    if updates > 0 and not dry_run:
        print(f"[{league.upper()}] Normalized {updates} team name references")
    return updates

def clear_placeholder_scores(conn, league, dry_run=False):
    """Clear placeholder 107-79 scores."""
    cur = conn.cursor()
    
    if dry_run:
        cur.execute(
            'SELECT COUNT(*) FROM "SportsGame" WHERE league = %s AND "homeScore" = 107 AND "awayScore" = 79',
            (league,)
        )
        count = cur.fetchone()[0]
        if count > 0:
            print(f"[{league.upper()}] Would clear {count} placeholder scores")
        return count
    
    cur.execute(
        'UPDATE "SportsGame" SET "homeScore" = NULL, "awayScore" = NULL WHERE league = %s AND "homeScore" = 107 AND "awayScore" = 79',
        (league,)
    )
    if cur.rowcount > 0:
        print(f"[{league.upper()}] Cleared {cur.rowcount} placeholder scores")
    return cur.rowcount

def main():
    import argparse
    parser = argparse.ArgumentParser(description='Deduplicate SportsGame records')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done')
    parser.add_argument('--league', default='all', help='League to process (nba, nfl, nhl, mlb, or all)')
    args = parser.parse_args()
    
    db_url = load_db_url()
    if not db_url:
        print('ERROR: SPORTS_DATABASE_URL not found')
        sys.exit(1)
    
    print(f"=== Data Deduplicator - {datetime.now().isoformat()} ===")
    print(f"Mode: {'DRY RUN' if args.dry_run else 'LIVE'}")
    
    conn = psycopg2.connect(db_url)
    
    try:
        leagues = ['nba', 'nfl', 'nhl', 'mlb'] if args.league == 'all' else [args.league]
        
        total_deduped = 0
        total_normalized = 0
        total_scores = 0
        
        for league in leagues:
            print(f"\n--- {league.upper()} ---")
            total_deduped += deduplicate_games(conn, league, args.dry_run)
            total_normalized += normalize_team_names(conn, league, args.dry_run)
            total_scores += clear_placeholder_scores(conn, league, args.dry_run)
        
        if not args.dry_run:
            conn.commit()
        
        print(f"\n=== Summary ===")
        print(f"Duplicates removed: {total_deduped}")
        print(f"Team names normalized: {total_normalized}")
        print(f"Placeholder scores cleared: {total_scores}")
            
    finally:
        conn.close()

if __name__ == '__main__':
    main()
