#!/usr/bin/env python3
"""
Fetch Expected Goals (xG) Data from Understat
Understat provides free xG data and is scrape-friendly.

Supported leagues:
- EPL, La Liga, Serie A, Bundesliga, Ligue 1, RFPL (Russian)

Run: Daily after matches complete
"""
import requests
import psycopg2
import os
import time
import re
import json
from datetime import datetime, timezone
from bs4 import BeautifulSoup
import argparse

UNDERSTAT_BASE = 'https://understat.com'
REQUEST_DELAY = 2  # Be respectful

# League IDs on Understat
LEAGUE_MAP = {
    'epl': 'EPL',
    'laliga': 'La_liga',
    'seriea': 'Serie_A',
    'bundesliga': 'Bundesliga',
    'ligue1': 'Ligue_1',
    'rfpl': 'RFPL',
}


def load_db_url():
    env_path = '/var/www/html/eventheodds/.env'
    try:
        with open(env_path, 'r') as f:
            for line in f:
                if line.startswith('SPORTS_DATABASE_URL='):
                    return line.split('=', 1)[1].strip().split('?')[0]
    except FileNotFoundError:
        pass
    return os.environ.get('SPORTS_DATABASE_URL', '').split('?')[0]


def ensure_xg_table(cur):
    """Create SoccerXG table if needed"""
    cur.execute('''
        CREATE TABLE IF NOT EXISTS "SoccerXG" (
            id BIGSERIAL PRIMARY KEY,
            league VARCHAR(50) NOT NULL,
            "gameId" BIGINT,
            "gameDate" DATE,
            "homeTeam" VARCHAR(100),
            "awayTeam" VARCHAR(100),
            "homeScore" INT,
            "awayScore" INT,
            "homeXG" FLOAT,
            "awayXG" FLOAT,
            "xgDiff" FLOAT,
            "homeXGPerformance" FLOAT,
            "awayXGPerformance" FLOAT,
            source VARCHAR(50) DEFAULT 'understat',
            "understatMatchId" VARCHAR(50),
            season INT,
            "createdAt" TIMESTAMP DEFAULT NOW(),
            UNIQUE(league, "gameDate", "homeTeam", "awayTeam")
        )
    ''')
    cur.execute('CREATE INDEX IF NOT EXISTS "SoccerXG_league_date_idx" ON "SoccerXG" (league, "gameDate")')
    cur.execute('CREATE INDEX IF NOT EXISTS "SoccerXG_xgDiff_idx" ON "SoccerXG" ("xgDiff")')


def fetch_understat_matches(league, season=2025):
    """Fetch match data with xG from Understat"""
    if league not in LEAGUE_MAP:
        print(f"  Unknown league: {league}")
        return []

    understat_league = LEAGUE_MAP[league]
    url = f'{UNDERSTAT_BASE}/league/{understat_league}/{season}'

    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
    }

    try:
        resp = requests.get(url, headers=headers, timeout=30)
        if resp.status_code != 200:
            print(f"  Understat returned {resp.status_code}")
            return []

        # Understat embeds JSON data in script tags
        soup = BeautifulSoup(resp.text, 'html.parser')

        # Find the script containing datesData
        matches = []

        for script in soup.find_all('script'):
            script_text = script.string
            if not script_text:
                continue

            # Look for datesData JSON
            if 'datesData' in script_text:
                # Extract JSON from: var datesData = JSON.parse('...')
                match = re.search(r"var\s+datesData\s*=\s*JSON\.parse\('(.+?)'\)", script_text)
                if match:
                    json_str = match.group(1)
                    # Unescape the string
                    json_str = json_str.encode().decode('unicode_escape')
                    dates_data = json.loads(json_str)

                    # datesData is an object with dates as keys
                    for date_key, day_matches in dates_data.items():
                        for m in day_matches:
                            try:
                                matches.append({
                                    'id': m.get('id'),
                                    'date': datetime.strptime(m.get('datetime', '')[:10], '%Y-%m-%d').date() if m.get('datetime') else None,
                                    'home_team': m.get('h', {}).get('title', ''),
                                    'away_team': m.get('a', {}).get('title', ''),
                                    'home_score': int(m.get('goals', {}).get('h', 0)) if m.get('goals', {}).get('h') else None,
                                    'away_score': int(m.get('goals', {}).get('a', 0)) if m.get('goals', {}).get('a') else None,
                                    'home_xg': float(m.get('xG', {}).get('h', 0)) if m.get('xG', {}).get('h') else None,
                                    'away_xg': float(m.get('xG', {}).get('a', 0)) if m.get('xG', {}).get('a') else None,
                                    'is_result': m.get('isResult', False),
                                })
                            except Exception as e:
                                continue
                    break

        return matches
    except Exception as e:
        print(f"  Error fetching Understat: {e}")
        import traceback
        traceback.print_exc()
        return []


def fetch_xg_for_league(league, season=2025):
    """Fetch xG data for a league"""
    db_url = load_db_url()
    conn = psycopg2.connect(db_url)
    cur = conn.cursor()

    ensure_xg_table(cur)
    conn.commit()

    print(f"\n{league.upper()}: Fetching xG from Understat ({season})...")
    time.sleep(REQUEST_DELAY)

    matches = fetch_understat_matches(league, season)
    print(f"  Found {len(matches)} matches")

    inserted = 0
    with_xg = 0

    for match in matches:
        if not match['date'] or not match['home_team']:
            continue

        if match['home_xg'] is not None:
            with_xg += 1

        xg_diff = None
        home_xg_perf = None
        away_xg_perf = None

        if match['home_xg'] is not None and match['away_xg'] is not None:
            xg_diff = match['home_xg'] - match['away_xg']

            # xG performance = actual goals - expected goals
            if match['home_score'] is not None:
                home_xg_perf = match['home_score'] - match['home_xg']
            if match['away_score'] is not None:
                away_xg_perf = match['away_score'] - match['away_xg']

        try:
            cur.execute('''
                INSERT INTO "SoccerXG" (
                    league, "gameDate", "homeTeam", "awayTeam",
                    "homeScore", "awayScore", "homeXG", "awayXG", "xgDiff",
                    "homeXGPerformance", "awayXGPerformance",
                    "understatMatchId", season, source
                )
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, 'understat')
                ON CONFLICT (league, "gameDate", "homeTeam", "awayTeam") DO UPDATE SET
                    "homeScore" = COALESCE(EXCLUDED."homeScore", "SoccerXG"."homeScore"),
                    "awayScore" = COALESCE(EXCLUDED."awayScore", "SoccerXG"."awayScore"),
                    "homeXG" = COALESCE(EXCLUDED."homeXG", "SoccerXG"."homeXG"),
                    "awayXG" = COALESCE(EXCLUDED."awayXG", "SoccerXG"."awayXG"),
                    "xgDiff" = COALESCE(EXCLUDED."xgDiff", "SoccerXG"."xgDiff"),
                    "homeXGPerformance" = COALESCE(EXCLUDED."homeXGPerformance", "SoccerXG"."homeXGPerformance"),
                    "awayXGPerformance" = COALESCE(EXCLUDED."awayXGPerformance", "SoccerXG"."awayXGPerformance")
            ''', (
                league, match['date'], match['home_team'], match['away_team'],
                match['home_score'], match['away_score'],
                match['home_xg'], match['away_xg'], xg_diff,
                home_xg_perf, away_xg_perf,
                str(match['id']) if match['id'] else None, season
            ))
            inserted += 1
        except Exception as e:
            pass

    conn.commit()

    # Also try previous season
    if season == 2025:
        print(f"  Also fetching {season-1} season...")
        time.sleep(REQUEST_DELAY)
        prev_matches = fetch_understat_matches(league, season - 1)
        for match in prev_matches:
            if not match['date'] or not match['home_team']:
                continue
            if match['home_xg'] is not None:
                with_xg += 1
            xg_diff = None
            if match['home_xg'] is not None and match['away_xg'] is not None:
                xg_diff = match['home_xg'] - match['away_xg']
            try:
                cur.execute('''
                    INSERT INTO "SoccerXG" (
                        league, "gameDate", "homeTeam", "awayTeam",
                        "homeScore", "awayScore", "homeXG", "awayXG", "xgDiff",
                        "understatMatchId", season, source
                    )
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, 'understat')
                    ON CONFLICT (league, "gameDate", "homeTeam", "awayTeam") DO NOTHING
                ''', (
                    league, match['date'], match['home_team'], match['away_team'],
                    match['home_score'], match['away_score'],
                    match['home_xg'], match['away_xg'], xg_diff,
                    str(match['id']) if match['id'] else None, season - 1
                ))
                inserted += 1
            except:
                pass
        conn.commit()

    cur.close()
    conn.close()

    print(f"  {league}: Inserted/updated {inserted} matches, {with_xg} have xG data")
    return {'matches': inserted, 'with_xg': with_xg}


def main():
    parser = argparse.ArgumentParser(description='Fetch xG data from Understat')
    parser.add_argument('--leagues', type=str, default='epl,laliga,seriea,bundesliga,ligue1',
                        help='Comma-separated leagues')
    parser.add_argument('--season', type=int, default=2025,
                        help='Season year')
    args = parser.parse_args()

    print("=" * 60)
    print("FETCH SOCCER xG DATA FROM UNDERSTAT")
    print(f"Time: {datetime.now(timezone.utc).isoformat()}")
    print("=" * 60)

    leagues = [l.strip().lower() for l in args.leagues.split(',')]

    total_matches = 0
    total_xg = 0

    for league in leagues:
        result = fetch_xg_for_league(league, args.season)
        total_matches += result['matches']
        total_xg += result['with_xg']

    print(f"\n{'='*60}")
    print(f"TOTAL: {total_matches} matches, {total_xg} with xG data")
    print("=" * 60)


if __name__ == '__main__':
    main()
