#!/usr/bin/env python3
"""
NHL Data Clean & Backfill Script
=================================
Fixes dirty NHL data in SportsGame and builds nhl_historical.json.

Phases:
  1 - Team name normalization + season fix
  2 - Fix broken records (final w/no scores, NULL status)
  3 - Backfill missing 2024-25 season games from NHL API
  4 - Build nhl_historical.json for backtesting

Usage:
  python3 nhl_clean_and_backfill.py --dry-run          # Preview all phases
  python3 nhl_clean_and_backfill.py                     # Run all phases
  python3 nhl_clean_and_backfill.py --phase 1           # Run only phase 1
  python3 nhl_clean_and_backfill.py --phase 3 --dry-run # Preview phase 3
"""

import argparse
import json
import os
import sys
import time
import requests
import psycopg2
from datetime import datetime, date, timedelta
from pathlib import Path

# ---------------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------------
BASE_DIR = Path('/var/www/html/eventheodds')
DATA_DIR = BASE_DIR / 'data'
OUT_FILE = DATA_DIR / 'betting' / 'nhl_historical.json'

NHL_API_BASE = 'https://api-web.nhle.com/v1'
NHL_API_DELAY = 0.35  # seconds between API calls

# Abbreviations found in SportsGame rows -> full name
# (These are the short/variant names that slipped in from various sources)
ABBREV_TO_FULL = {
    'ANA': 'Anaheim Ducks',
    'CAR': 'Carolina Hurricanes',
    'CGY': 'Calgary Flames',
    'COL': 'Colorado Avalanche',
    'DAL': 'Dallas Stars',
    'LA': 'Los Angeles Kings',
    'NJ': 'New Jersey Devils',
    'NSH': 'Nashville Predators',
    'NYR': 'New York Rangers',
    'SJ': 'San Jose Sharks',
    'TB': 'Tampa Bay Lightning',
    'VAN': 'Vancouver Canucks',
    'VGK': 'Vegas Golden Knights',
    'WPG': 'Winnipeg Jets',
    # Additional abbreviations that might appear
    'BOS': 'Boston Bruins',
    'BUF': 'Buffalo Sabres',
    'CHI': 'Chicago Blackhawks',
    'CBJ': 'Columbus Blue Jackets',
    'DET': 'Detroit Red Wings',
    'EDM': 'Edmonton Oilers',
    'FLA': 'Florida Panthers',
    'LAK': 'Los Angeles Kings',
    'MIN': 'Minnesota Wild',
    'MTL': 'Montreal Canadiens',
    'NJD': 'New Jersey Devils',
    'NYI': 'New York Islanders',
    'OTT': 'Ottawa Senators',
    'PHI': 'Philadelphia Flyers',
    'PIT': 'Pittsburgh Penguins',
    'SJS': 'San Jose Sharks',
    'SEA': 'Seattle Kraken',
    'STL': 'St. Louis Blues',
    'TBL': 'Tampa Bay Lightning',
    'TOR': 'Toronto Maple Leafs',
    'WSH': 'Washington Capitals',
    'ARI': 'Arizona Coyotes',
}

# NHL API team abbreviation -> full name (current 32 teams for 2024-25)
NHL_API_TO_FULL = {
    'ANA': 'Anaheim Ducks',
    'BOS': 'Boston Bruins',
    'BUF': 'Buffalo Sabres',
    'CGY': 'Calgary Flames',
    'CAR': 'Carolina Hurricanes',
    'CHI': 'Chicago Blackhawks',
    'COL': 'Colorado Avalanche',
    'CBJ': 'Columbus Blue Jackets',
    'DAL': 'Dallas Stars',
    'DET': 'Detroit Red Wings',
    'EDM': 'Edmonton Oilers',
    'FLA': 'Florida Panthers',
    'LAK': 'Los Angeles Kings',
    'MIN': 'Minnesota Wild',
    'MTL': 'Montreal Canadiens',  # also 'MON' sometimes
    'NSH': 'Nashville Predators',
    'NJD': 'New Jersey Devils',
    'NYI': 'New York Islanders',
    'NYR': 'New York Rangers',
    'OTT': 'Ottawa Senators',
    'PHI': 'Philadelphia Flyers',
    'PIT': 'Pittsburgh Penguins',
    'SJS': 'San Jose Sharks',
    'SEA': 'Seattle Kraken',
    'STL': 'St. Louis Blues',
    'TBL': 'Tampa Bay Lightning',
    'TOR': 'Toronto Maple Leafs',
    'UTA': 'Utah Hockey Club',
    'VAN': 'Vancouver Canucks',
    'VGK': 'Vegas Golden Knights',
    'WPG': 'Winnipeg Jets',
    'WSH': 'Washington Capitals',
}

# Canonical full team names (32 current teams)
CANONICAL_TEAMS = set(NHL_API_TO_FULL.values())


def load_db_url():
    env_path = BASE_DIR / '.env'
    if env_path.exists():
        with open(env_path, 'r') as f:
            for line in f:
                if line.startswith('SPORTS_DATABASE_URL='):
                    return line.split('=', 1)[1].strip().split('?')[0]
    return os.environ.get('SPORTS_DATABASE_URL', '').split('?')[0]


def get_connection():
    db_url = load_db_url()
    if not db_url:
        print("ERROR: SPORTS_DATABASE_URL not found in .env")
        sys.exit(1)
    return psycopg2.connect(db_url)


def nhl_api_get(path, retries=3):
    """Fetch from NHL API with rate limiting and retries."""
    url = f"{NHL_API_BASE}{path}"
    for attempt in range(retries):
        try:
            time.sleep(NHL_API_DELAY)
            resp = requests.get(url, timeout=15)
            if resp.status_code == 200:
                return resp.json()
            if resp.status_code == 404:
                return None
            print(f"  NHL API {resp.status_code} for {path}, retry {attempt+1}")
        except requests.RequestException as e:
            print(f"  NHL API error for {path}: {e}, retry {attempt+1}")
        time.sleep(1)
    return None


def resolve_team_name(abbrev_or_name):
    """Resolve a team name that might be an abbreviation to its full canonical name."""
    if not abbrev_or_name:
        return abbrev_or_name
    name = abbrev_or_name.strip()
    # Already a canonical full name
    if name in CANONICAL_TEAMS or name == 'Arizona Coyotes':
        return name
    # Check abbreviation maps
    upper = name.upper()
    if upper in ABBREV_TO_FULL:
        return ABBREV_TO_FULL[upper]
    return name


def is_abbreviated(name):
    """Check if a team name looks like an abbreviation (not a full name)."""
    if not name:
        return False
    name = name.strip()
    # Full names have spaces and are longer
    if ' ' in name and len(name) > 5:
        return False
    # Short all-caps names are abbreviations
    if len(name) <= 4 and name == name.upper():
        return True
    return False


# ---------------------------------------------------------------------------
# Phase 1: Team Name Normalization + Season Fix
# ---------------------------------------------------------------------------
def phase1(conn, dry_run):
    print("\n" + "=" * 60)
    print("PHASE 1: Team Name Normalization + Season Fix")
    print("=" * 60)

    cur = conn.cursor()

    # 1a. Find games with abbreviated team names
    print("\n--- 1a: Fix abbreviated team names ---")
    cur.execute('''
        SELECT id, "homeTeam", "awayTeam", "gameDate", season,
               "homeScore", "awayScore", "moneylineHome", "moneylineAway",
               "spreadHome", "spreadAway", "total", status
        FROM "SportsGame"
        WHERE league = 'nhl'
        AND (
            LENGTH("homeTeam") <= 4 OR LENGTH("awayTeam") <= 4
            OR "homeTeam" IN ('ANA','CAR','CGY','COL','DAL','LA','NJ','NSH','NYR','SJ','TB','VAN','VGK','WPG')
            OR "awayTeam" IN ('ANA','CAR','CGY','COL','DAL','LA','NJ','NSH','NYR','SJ','TB','VAN','VGK','WPG')
        )
    ''')
    abbrev_games = cur.fetchall()
    print(f"  Found {len(abbrev_games)} games with potentially abbreviated names")

    fixed_names = 0
    deleted_dupes = 0
    for row in abbrev_games:
        gid, home, away, gdate, season, hscore, ascore, mlh, mla, sph, spa, total, status = row
        new_home = resolve_team_name(home) if is_abbreviated(home) else home
        new_away = resolve_team_name(away) if is_abbreviated(away) else away

        if new_home == home and new_away == away:
            continue  # nothing to fix

        # Check for duplicate with full names
        cur.execute('''
            SELECT id, "homeScore", "awayScore", status
            FROM "SportsGame"
            WHERE league = 'nhl' AND season = %s
            AND DATE("gameDate") = DATE(%s)
            AND "homeTeam" = %s AND "awayTeam" = %s
            AND id != %s
        ''', (season, gdate, new_home, new_away, gid))
        dupe = cur.fetchone()

        if dupe:
            dupe_id, dupe_hscore, dupe_ascore, dupe_status = dupe
            if dupe_hscore is not None:
                # Duplicate has scores — delete the abbreviated row
                print(f"  DELETE abbrev row {gid} ({home} vs {away}), dupe {dupe_id} has scores")
                if not dry_run:
                    cur.execute('DELETE FROM "SportsGame" WHERE id = %s', (gid,))
                deleted_dupes += 1
            else:
                # Duplicate exists without scores — copy data from abbreviated row, delete it
                print(f"  MERGE abbrev row {gid} into dupe {dupe_id} ({new_home} vs {new_away})")
                if not dry_run:
                    cur.execute('''
                        UPDATE "SportsGame"
                        SET "homeScore" = COALESCE("homeScore", %s),
                            "awayScore" = COALESCE("awayScore", %s),
                            "moneylineHome" = COALESCE("moneylineHome", %s),
                            "moneylineAway" = COALESCE("moneylineAway", %s),
                            "spreadHome" = COALESCE("spreadHome", %s),
                            "spreadAway" = COALESCE("spreadAway", %s),
                            "total" = COALESCE("total", %s),
                            status = COALESCE(status, %s),
                            "updatedAt" = NOW()
                        WHERE id = %s
                    ''', (hscore, ascore, mlh, mla, sph, spa, total, status, dupe_id))
                    cur.execute('DELETE FROM "SportsGame" WHERE id = %s', (gid,))
                deleted_dupes += 1
        else:
            # No duplicate — just rename
            print(f"  RENAME {gid}: {home} → {new_home}, {away} → {new_away}")
            if not dry_run:
                cur.execute('''
                    UPDATE "SportsGame"
                    SET "homeTeam" = %s, "awayTeam" = %s, "updatedAt" = NOW()
                    WHERE id = %s
                ''', (new_home, new_away, gid))
            fixed_names += 1

    print(f"  Renamed: {fixed_names}, Deleted dupes: {deleted_dupes}")

    # 1b. Fix "Utah Mammoth" -> "Utah Hockey Club"
    print("\n--- 1b: Fix 'Utah Mammoth' → 'Utah Hockey Club' ---")
    for col in ('homeTeam', 'awayTeam'):
        cur.execute(f'''
            SELECT COUNT(*) FROM "SportsGame"
            WHERE league = 'nhl' AND "{col}" = 'Utah Mammoth'
        ''')
        count = cur.fetchone()[0]
        if count > 0:
            print(f"  Fixing {count} rows with {col}='Utah Mammoth'")
            if not dry_run:
                cur.execute(f'''
                    UPDATE "SportsGame"
                    SET "{col}" = 'Utah Hockey Club', "updatedAt" = NOW()
                    WHERE league = 'nhl' AND "{col}" = 'Utah Mammoth'
                ''')

    # 1c. Fix season mistagging (season=2025 for Oct 2024 – Jun 2025 games)
    print("\n--- 1c: Fix season mistagging (2025 → 2024) ---")
    cur.execute('''
        SELECT id, "gameDate", "homeTeam", "awayTeam"
        FROM "SportsGame"
        WHERE league = 'nhl' AND season = 2025
        AND "gameDate" >= '2024-10-01' AND "gameDate" < '2025-07-01'
    ''')
    mistagged = cur.fetchall()
    print(f"  Found {len(mistagged)} games with season=2025 in 2024-25 range")

    conflict_count = 0
    fixed_season = 0
    for gid, gdate, home, away in mistagged:
        # Check for unique constraint conflict
        cur.execute('''
            SELECT id FROM "SportsGame"
            WHERE league = 'nhl' AND season = 2024
            AND DATE("gameDate") = DATE(%s)
            AND "homeTeam" = %s AND "awayTeam" = %s
            AND id != %s
        ''', (gdate, home, away, gid))
        conflict = cur.fetchone()
        if conflict:
            print(f"  CONFLICT: game {gid} ({home} vs {away} on {gdate}) - row {conflict[0]} already has season=2024")
            # Delete the mistagged duplicate
            if not dry_run:
                cur.execute('DELETE FROM "SportsGame" WHERE id = %s', (gid,))
            conflict_count += 1
        else:
            if not dry_run:
                cur.execute('''
                    UPDATE "SportsGame"
                    SET season = 2024, "updatedAt" = NOW()
                    WHERE id = %s
                ''', (gid,))
            fixed_season += 1

    print(f"  Fixed season: {fixed_season}, Deleted conflicts: {conflict_count}")

    if not dry_run:
        conn.commit()
    print("\nPhase 1 complete.")


# ---------------------------------------------------------------------------
# Phase 2: Fix Broken Records
# ---------------------------------------------------------------------------
def fetch_nhl_schedule_day(date_str):
    """Fetch NHL schedule for a specific date. Returns list of games."""
    data = nhl_api_get(f'/schedule/{date_str}')
    if not data:
        return []
    games = []
    for week in data.get('gameWeek', []):
        game_date = week.get('date', '')
        for game in week.get('games', []):
            home_abbr = game.get('homeTeam', {}).get('abbrev', '')
            away_abbr = game.get('awayTeam', {}).get('abbrev', '')
            home_score = game.get('homeTeam', {}).get('score')
            away_score = game.get('awayTeam', {}).get('score')
            games.append({
                'id': game.get('id'),
                'date': game_date,
                'gameType': game.get('gameType'),
                'gameState': game.get('gameState', ''),
                'homeAbbr': home_abbr,
                'awayAbbr': away_abbr,
                'homeTeam': NHL_API_TO_FULL.get(home_abbr, home_abbr),
                'awayTeam': NHL_API_TO_FULL.get(away_abbr, away_abbr),
                'homeScore': home_score,
                'awayScore': away_score,
            })
    return games


def match_api_game_to_db(api_games, home_team, away_team, game_date_str):
    """Find matching NHL API game for a DB row by team names and date."""
    for ag in api_games:
        if ag['date'] != game_date_str:
            continue
        # Try matching by full name
        if ag['homeTeam'] == home_team and ag['awayTeam'] == away_team:
            return ag
        # Try matching by resolved abbreviation
        if (resolve_team_name(ag['homeAbbr']) == home_team and
                resolve_team_name(ag['awayAbbr']) == away_team):
            return ag
    return None


def phase2(conn, dry_run):
    print("\n" + "=" * 60)
    print("PHASE 2: Fix Broken Records")
    print("=" * 60)

    cur = conn.cursor()
    today = date.today()

    # 2a. Fix 'final' games with no scores
    print("\n--- 2a: Fix 'final' games with no scores ---")
    cur.execute('''
        SELECT id, "homeTeam", "awayTeam", DATE("gameDate") as gd, "gameDate"
        FROM "SportsGame"
        WHERE league = 'nhl'
        AND LOWER(status) = 'final'
        AND ("homeScore" IS NULL OR "awayScore" IS NULL)
    ''')
    no_score_finals = cur.fetchall()
    print(f"  Found {len(no_score_finals)} 'final' games with no scores")

    fixed_scores = 0
    dates_fetched = {}  # cache API calls by date
    for gid, home, away, gd, gdate in no_score_finals:
        gd_str = str(gd)
        if gd_str not in dates_fetched:
            dates_fetched[gd_str] = fetch_nhl_schedule_day(gd_str)
        api_games = dates_fetched[gd_str]
        match = match_api_game_to_db(api_games, home, away, gd_str)
        if match and match['homeScore'] is not None and match['awayScore'] is not None:
            print(f"  FIX scores: {gid} ({away} @ {home} on {gd_str}) → {match['homeScore']}-{match['awayScore']}")
            if not dry_run:
                cur.execute('''
                    UPDATE "SportsGame"
                    SET "homeScore" = %s, "awayScore" = %s, "updatedAt" = NOW()
                    WHERE id = %s
                ''', (match['homeScore'], match['awayScore'], gid))
            fixed_scores += 1
        else:
            print(f"  SKIP: no API match for {gid} ({away} @ {home} on {gd_str})")

    print(f"  Fixed scores: {fixed_scores}")

    # 2b. Fix NULL-status games
    print("\n--- 2b: Fix NULL-status games ---")
    cur.execute('''
        SELECT id, "homeTeam", "awayTeam", DATE("gameDate") as gd, "gameDate",
               "homeScore", "awayScore"
        FROM "SportsGame"
        WHERE league = 'nhl' AND status IS NULL
        ORDER BY "gameDate"
    ''')
    null_status = cur.fetchall()
    print(f"  Found {len(null_status)} NULL-status games")

    fixed_status = 0
    deleted_null = 0
    skipped_future = 0
    for gid, home, away, gd, gdate, hscore, ascore in null_status:
        gd_str = str(gd)

        # Future games: leave as-is
        if gd > today:
            skipped_future += 1
            continue

        # Check for duplicate with same date+teams that already has a status
        cur.execute('''
            SELECT id, status FROM "SportsGame"
            WHERE league = 'nhl'
            AND DATE("gameDate") = DATE(%s)
            AND "homeTeam" = %s AND "awayTeam" = %s
            AND id != %s AND status IS NOT NULL
        ''', (gdate, home, away, gid))
        dupe = cur.fetchone()
        if dupe:
            print(f"  DELETE dupe NULL-status {gid} ({away} @ {home} on {gd_str}), existing row {dupe[0]}")
            if not dry_run:
                cur.execute('DELETE FROM "SportsGame" WHERE id = %s', (gid,))
            deleted_null += 1
            continue

        # Past games: fetch from NHL API
        if gd_str not in dates_fetched:
            dates_fetched[gd_str] = fetch_nhl_schedule_day(gd_str)
        api_games = dates_fetched[gd_str]
        match = match_api_game_to_db(api_games, home, away, gd_str)
        if match and match['gameState'] in ('OFF', 'FINAL'):
            hs = match['homeScore']
            as_ = match['awayScore']
            if hs is not None and as_ is not None:
                print(f"  FIX: {gid} ({away} @ {home} on {gd_str}) → final {hs}-{as_}")
                if not dry_run:
                    cur.execute('''
                        UPDATE "SportsGame"
                        SET status = 'final', "homeScore" = %s, "awayScore" = %s, "updatedAt" = NOW()
                        WHERE id = %s
                    ''', (hs, as_, gid))
                fixed_status += 1
            else:
                print(f"  SKIP: API match but no scores for {gid} on {gd_str}")
        else:
            # No API match — might be a bad row. Leave it for now.
            if match:
                print(f"  SKIP: game {gid} on {gd_str} state={match['gameState']}")
            else:
                print(f"  NO MATCH: {gid} ({away} @ {home} on {gd_str})")

    print(f"  Fixed status: {fixed_status}, Deleted dupes: {deleted_null}, Skipped future: {skipped_future}")

    if not dry_run:
        conn.commit()
    print("\nPhase 2 complete.")


# ---------------------------------------------------------------------------
# Phase 3: Backfill Missing 2024-25 Season
# ---------------------------------------------------------------------------
def phase3(conn, dry_run):
    print("\n" + "=" * 60)
    print("PHASE 3: Backfill Missing 2024-25 Season")
    print("=" * 60)

    cur = conn.cursor()

    # Fetch the full 2024-25 schedule by paginating weekly
    start = date(2024, 10, 1)
    end = date(2025, 6, 30)
    current = start

    all_api_games = []
    print(f"  Fetching NHL schedule from {start} to {end}...")
    while current <= end:
        day_games = fetch_nhl_schedule_day(current.strftime('%Y-%m-%d'))
        # Filter to regular season (gameType=2) only
        reg = [g for g in day_games if g.get('gameType') == 2]
        all_api_games.extend(reg)
        current += timedelta(days=7)

    # Deduplicate by game ID
    seen_ids = set()
    unique_games = []
    for g in all_api_games:
        if g['id'] not in seen_ids:
            seen_ids.add(g['id'])
            unique_games.append(g)
    all_api_games = unique_games

    print(f"  Found {len(all_api_games)} regular-season games from NHL API")

    inserted = 0
    updated_scores = 0
    skipped_existing = 0

    for g in all_api_games:
        home = g['homeTeam']
        away = g['awayTeam']
        gdate = g['date']
        game_id = g['id']
        state = g['gameState']
        hs = g['homeScore']
        as_ = g['awayScore']

        # Check for existing row
        cur.execute('''
            SELECT id, "homeScore", "awayScore", status
            FROM "SportsGame"
            WHERE league = 'nhl' AND season = 2024
            AND DATE("gameDate") = %s
            AND "homeTeam" = %s AND "awayTeam" = %s
        ''', (gdate, home, away))
        existing = cur.fetchone()

        if existing:
            ex_id, ex_hs, ex_as, ex_status = existing
            # If existing row is missing scores but API shows final
            if (ex_hs is None or ex_as is None) and state in ('OFF', 'FINAL') and hs is not None:
                print(f"  UPDATE scores: {ex_id} ({away} @ {home} on {gdate}) → {hs}-{as_}")
                if not dry_run:
                    cur.execute('''
                        UPDATE "SportsGame"
                        SET "homeScore" = %s, "awayScore" = %s,
                            status = 'final', "updatedAt" = NOW()
                        WHERE id = %s
                    ''', (hs, as_, ex_id))
                updated_scores += 1
            else:
                skipped_existing += 1
        else:
            # Insert new game
            ext_id = f"nhl:{game_id}"
            is_final = state in ('OFF', 'FINAL')
            status = 'final' if is_final else None
            score_h = hs if is_final else None
            score_a = as_ if is_final else None

            if not dry_run:
                try:
                    cur.execute('''
                        INSERT INTO "SportsGame"
                        (league, season, "gameDate", "homeTeam", "awayTeam",
                         "homeScore", "awayScore", status, "externalGameId", "updatedAt")
                        VALUES ('nhl', 2024, %s, %s, %s, %s, %s, %s, %s, NOW())
                    ''', (gdate, home, away, score_h, score_a, status, ext_id))
                except psycopg2.errors.UniqueViolation:
                    conn.rollback()
                    print(f"  SKIP (unique violation): {ext_id} {away} @ {home} on {gdate}")
                    continue
            inserted += 1

        # Commit every 100 games to avoid huge transactions
        if not dry_run and (inserted + updated_scores) % 100 == 0 and (inserted + updated_scores) > 0:
            conn.commit()

    if not dry_run:
        conn.commit()

    print(f"\n  Inserted: {inserted}")
    print(f"  Updated scores: {updated_scores}")
    print(f"  Skipped (already exist): {skipped_existing}")
    print("\nPhase 3 complete.")


# ---------------------------------------------------------------------------
# Phase 4: Build nhl_historical.json
# ---------------------------------------------------------------------------
def phase4(conn, dry_run):
    print("\n" + "=" * 60)
    print("PHASE 4: Build nhl_historical.json")
    print("=" * 60)

    cur = conn.cursor()

    cur.execute('''
        SELECT id, league, season, "gameDate", "homeTeam", "awayTeam",
               "homeScore", "awayScore", status, "externalGameId",
               "moneylineHome", "moneylineAway",
               "spreadHome", "spreadAway", "total", "oddsSource"
        FROM "SportsGame"
        WHERE league = 'nhl'
        AND LOWER(COALESCE(status, '')) = 'final'
        AND "homeScore" IS NOT NULL AND "awayScore" IS NOT NULL
        ORDER BY "gameDate"
    ''')
    rows = cur.fetchall()
    print(f"  Found {len(rows)} completed NHL games with scores")

    records = []
    for row in rows:
        (gid, league, season, gdate, home, away, hs, as_, status, ext_id,
         ml_h, ml_a, sp_h, sp_a, total, odds_src) = row

        date_str = gdate.strftime('%Y-%m-%d') if hasattr(gdate, 'strftime') else str(gdate)[:10]

        # Determine winner
        if hs > as_:
            winner = 'home'
        elif as_ > hs:
            winner = 'away'
        else:
            winner = 'draw'

        margin = hs - as_
        total_points = hs + as_

        # Spread covered
        spread_covered = None
        if sp_h is not None:
            adj = margin + sp_h
            if adj > 0:
                spread_covered = 'home'
            elif adj < 0:
                spread_covered = 'away'
            else:
                spread_covered = 'push'

        # Over/under result
        total_result = None
        if total is not None:
            if total_points > total:
                total_result = 'over'
            elif total_points < total:
                total_result = 'under'
            else:
                total_result = 'push'

        has_real = any(v is not None for v in (ml_h, ml_a, sp_h, total))

        # Build ID
        rec_id = ext_id or f"nhl-{date_str}-{gid}"

        records.append({
            'id': rec_id,
            'bdl_game_id': ext_id,
            'sport': 'nhl',
            'date': date_str,
            'season': season,
            'homeTeam': home,
            'awayTeam': away,
            'scores': {
                'homeScore': hs,
                'awayScore': as_,
            },
            'odds': {
                'moneylineHome': ml_h,
                'moneylineAway': ml_a,
                'spreadHome': float(sp_h) if sp_h is not None else None,
                'spreadAway': float(sp_a) if sp_a is not None else None,
                'totalLine': float(total) if total is not None else None,
                'source': odds_src or 'unknown',
            },
            'hasRealOdds': has_real,
            'result': {
                'winner': winner,
                'spreadCovered': spread_covered,
                'totalResult': total_result,
                'margin': margin,
                'totalPoints': total_points,
            },
        })

    print(f"  Built {len(records)} records")
    real_odds = sum(1 for r in records if r['hasRealOdds'])
    print(f"  With real odds: {real_odds}")

    if dry_run:
        print(f"  [DRY RUN] Would write to {OUT_FILE}")
        if records:
            print(f"  Sample record: {json.dumps(records[0], indent=2)[:500]}")
    else:
        OUT_FILE.parent.mkdir(parents=True, exist_ok=True)
        with open(OUT_FILE, 'w') as f:
            json.dump(records, f, indent=2)
        print(f"  Wrote {OUT_FILE} ({len(records)} records)")

    # Summary by season
    season_counts = {}
    for r in records:
        s = r['season']
        if s not in season_counts:
            season_counts[s] = {'total': 0, 'with_odds': 0}
        season_counts[s]['total'] += 1
        if r['hasRealOdds']:
            season_counts[s]['with_odds'] += 1
    print("\n  Season breakdown:")
    for s in sorted(season_counts):
        c = season_counts[s]
        print(f"    {s}: {c['total']} games, {c['with_odds']} with odds")

    print("\nPhase 4 complete.")


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def print_pre_summary(conn):
    """Print current state before running phases."""
    cur = conn.cursor()
    print("\n" + "=" * 60)
    print("NHL DATA CURRENT STATE")
    print("=" * 60)

    cur.execute('SELECT COUNT(*) FROM "SportsGame" WHERE league = %s', ('nhl',))
    print(f"  Total NHL games: {cur.fetchone()[0]}")

    cur.execute('SELECT COUNT(*) FROM "SportsGame" WHERE league = %s AND "homeScore" IS NOT NULL', ('nhl',))
    print(f"  With scores: {cur.fetchone()[0]}")

    cur.execute('SELECT COUNT(*) FROM "SportsGame" WHERE league = %s AND "moneylineHome" IS NOT NULL', ('nhl',))
    print(f"  With odds: {cur.fetchone()[0]}")

    cur.execute('SELECT COUNT(*) FROM "SportsGame" WHERE league = %s AND status IS NULL', ('nhl',))
    print(f"  NULL status: {cur.fetchone()[0]}")

    cur.execute('''
        SELECT COUNT(*) FROM "SportsGame"
        WHERE league = 'nhl' AND season = 2024
        AND "gameDate" >= '2024-10-01' AND "gameDate" < '2025-07-01'
    ''')
    print(f"  2024-25 season (season=2024): {cur.fetchone()[0]}")

    cur.execute('''
        SELECT COUNT(*) FROM "SportsGame"
        WHERE league = 'nhl' AND season = 2025
        AND "gameDate" >= '2024-10-01' AND "gameDate" < '2025-07-01'
    ''')
    print(f"  Mistagged (season=2025, 2024-25 range): {cur.fetchone()[0]}")

    cur.execute('''
        SELECT COUNT(*) FROM (
            SELECT "homeTeam" AS team FROM "SportsGame" WHERE league = 'nhl'
            UNION
            SELECT "awayTeam" AS team FROM "SportsGame" WHERE league = 'nhl'
        ) t
    ''')
    print(f"  Unique team names: {cur.fetchone()[0]}")

    cur.close()


def main():
    parser = argparse.ArgumentParser(description='NHL Data Clean & Backfill')
    parser.add_argument('--dry-run', action='store_true', help='Preview changes without modifying DB')
    parser.add_argument('--phase', type=int, choices=[1, 2, 3, 4], help='Run only specific phase')
    args = parser.parse_args()

    print(f"NHL Clean & Backfill — {datetime.now().isoformat()}")
    if args.dry_run:
        print("*** DRY RUN MODE — no changes will be made ***")
    if args.phase:
        print(f"Running phase {args.phase} only")

    conn = get_connection()

    print_pre_summary(conn)

    phases = {
        1: phase1,
        2: phase2,
        3: phase3,
        4: phase4,
    }

    if args.phase:
        phases[args.phase](conn, args.dry_run)
    else:
        for p in [1, 2, 3, 4]:
            phases[p](conn, args.dry_run)

    # Post-run summary
    if not args.dry_run:
        print("\n" + "=" * 60)
        print("POST-RUN STATE")
        print_pre_summary(conn)

    conn.close()
    print("\nDone.")


if __name__ == '__main__':
    main()
