#!/usr/bin/env python3
"""
Fetch Expected Goals (xG) Data from FBref
Scrapes xG, xGA, and other advanced stats for soccer matches.

FBref provides free xG data for major leagues:
- EPL, La Liga, Serie A, Bundesliga, Ligue 1, MLS, etc.

Run: Daily after matches complete
"""
import psycopg2
import os
import time
import re
from datetime import datetime, timezone, timedelta
from bs4 import BeautifulSoup
import argparse

# Import proxy utilities for blocked sites
try:
    from proxy_config import make_request, get_proxy, get_headers
    USE_PROXY = True
except ImportError:
    import requests
    USE_PROXY = False

FBREF_BASE = 'https://fbref.com'
REQUEST_DELAY = 3  # FBref rate limits aggressively

# League URLs on FBref
LEAGUE_URLS = {
    'epl': '/en/comps/9/schedule/Premier-League-Scores-and-Fixtures',
    'laliga': '/en/comps/12/schedule/La-Liga-Scores-and-Fixtures',
    'seriea': '/en/comps/11/schedule/Serie-A-Scores-and-Fixtures',
    'bundesliga': '/en/comps/20/schedule/Bundesliga-Scores-and-Fixtures',
    'ligue1': '/en/comps/13/schedule/Ligue-1-Scores-and-Fixtures',
    'mls': '/en/comps/22/schedule/Major-League-Soccer-Scores-and-Fixtures',
}


def load_db_url():
    """Resolve the Postgres URL for the sports database.

    Prefers SPORTS_DATABASE_URL from the project .env file; falls back to
    the process environment. Any '?...' query suffix is stripped in both
    cases (psycopg2 gets the bare URL).
    """
    env_path = '/var/www/html/eventheodds/.env'
    key_prefix = 'SPORTS_DATABASE_URL='
    try:
        with open(env_path, 'r') as env_file:
            for raw_line in env_file:
                if raw_line.startswith(key_prefix):
                    value = raw_line.split('=', 1)[1].strip()
                    return value.split('?')[0]
    except FileNotFoundError:
        pass  # no .env on this host; use the environment instead
    return os.environ.get('SPORTS_DATABASE_URL', '').split('?')[0]


def ensure_xg_table(cur):
    """Create the SoccerXG table and its league/date index if absent.

    Idempotent (IF NOT EXISTS on both statements); the caller owns the
    cursor and is responsible for committing.
    """
    table_ddl = '''
        CREATE TABLE IF NOT EXISTS "SoccerXG" (
            id BIGSERIAL PRIMARY KEY,
            league VARCHAR(50) NOT NULL,
            "gameId" BIGINT,
            "gameDate" DATE,
            "homeTeam" VARCHAR(100),
            "awayTeam" VARCHAR(100),
            "homeScore" INT,
            "awayScore" INT,
            "homeXG" FLOAT,
            "awayXG" FLOAT,
            "xgDiff" FLOAT,
            source VARCHAR(50) DEFAULT 'fbref',
            "fbrefMatchId" VARCHAR(50),
            "createdAt" TIMESTAMP DEFAULT NOW(),
            UNIQUE(league, "gameDate", "homeTeam", "awayTeam")
        )
    '''
    index_ddl = 'CREATE INDEX IF NOT EXISTS "SoccerXG_league_date_idx" ON "SoccerXG" (league, "gameDate")'
    cur.execute(table_ddl)
    cur.execute(index_ddl)


def _cell_float(cell):
    """Parse a stat cell's text as float; None when the cell is missing or non-numeric."""
    if cell is None:
        return None
    try:
        return float(cell.get_text(strip=True))
    except ValueError:
        return None


def _parse_match_row(row):
    """Turn one <tr> of the FBref schedule table into a match dict.

    Returns None for header/spacer rows (no date/team cells) or rows whose
    date is not ISO-formatted.
    """
    date_cell = row.find('td', {'data-stat': 'date'})
    home_cell = row.find('td', {'data-stat': 'home_team'})
    away_cell = row.find('td', {'data-stat': 'away_team'})
    if not all([date_cell, home_cell, away_cell]):
        return None

    try:
        game_date = datetime.strptime(date_cell.get_text(strip=True), '%Y-%m-%d').date()
    except ValueError:  # blank or non-ISO date (e.g. spacer rows)
        return None

    # Score, when the match has been played ("2–1"; FBref uses an en dash,
    # but accept a plain hyphen too).
    home_score = None
    away_score = None
    score_cell = row.find('td', {'data-stat': 'score'})
    if score_cell:
        score_match = re.match(r'(\d+)[–-](\d+)', score_cell.get_text(strip=True))
        if score_match:
            home_score = int(score_match.group(1))
            away_score = int(score_match.group(2))

    # FBref match id is the path segment right after /matches/ in the report link.
    match_id = None
    match_link = row.find('a', href=re.compile('/matches/'))
    if match_link:
        href = match_link.get('href', '')
        if '/matches/' in href:
            match_id = href.split('/matches/')[-1].split('/')[0]

    return {
        'date': game_date,
        'home_team': home_cell.get_text(strip=True),
        'away_team': away_cell.get_text(strip=True),
        'home_score': home_score,
        'away_score': away_score,
        'home_xg': _cell_float(row.find('td', {'data-stat': 'home_xg'})),
        'away_xg': _cell_float(row.find('td', {'data-stat': 'away_xg'})),
        'match_id': match_id,
    }


def fetch_fbref_schedule(league):
    """Fetch the scores/fixtures table (with xG) for *league* from FBref.

    Returns a list of match dicts (keys: date, home_team, away_team,
    home_score, away_score, home_xg, away_xg, match_id). Best-effort:
    returns [] on unknown league, request failure, or parse failure.
    """
    if league not in LEAGUE_URLS:
        print(f"  Unknown league: {league}")
        return []

    url = FBREF_BASE + LEAGUE_URLS[league]

    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Connection': 'keep-alive',
    }

    try:
        if USE_PROXY:
            # Use residential proxy for FBref (blocks datacenter IPs)
            resp = make_request(
                url,
                proxy_type='residential',
                retry_with_residential=False,  # Already using residential
                timeout=30,
                headers=headers,
                delay_range=None  # We handle delay separately
            )
        else:
            import requests
            resp = requests.get(url, headers=headers, timeout=30)

        if resp is None:
            print(f"  FBref request failed")
            return []
        if resp.status_code != 200:
            print(f"  FBref returned {resp.status_code}")
            return []

        soup = BeautifulSoup(resp.text, 'html.parser')

        # Schedule table id varies by season/league; fall back to the
        # generic stats_table class if the id pattern is not found.
        table = soup.find('table', {'id': re.compile('sched.*_all')})
        if not table:
            table = soup.find('table', class_='stats_table')
        if not table:
            print("  Could not find schedule table")
            return []

        matches = []
        for row in table.find_all('tr'):
            parsed = _parse_match_row(row)
            if parsed is not None:
                matches.append(parsed)
        return matches
    except Exception as e:
        # Network/scrape layer is best-effort: log and return empty so the
        # nightly run continues with other leagues.
        print(f"  Error fetching FBref: {e}")
        return []


def fetch_xg_for_league(league):
    """Fetch xG data for one league from FBref and upsert it into SoccerXG.

    Returns {'matches': <rows upserted>, 'with_xg': <rows carrying xG>}.
    Failed rows are skipped individually (via savepoints) so one bad row
    cannot abort the whole batch.
    """
    db_url = load_db_url()
    conn = psycopg2.connect(db_url)
    try:
        cur = conn.cursor()

        ensure_xg_table(cur)
        conn.commit()

        print(f"\n{league.upper()}: Fetching xG from FBref...")
        time.sleep(REQUEST_DELAY)  # stay under FBref's aggressive rate limit

        matches = fetch_fbref_schedule(league)
        print(f"  Found {len(matches)} matches")

        inserted = 0
        with_xg = 0

        for match in matches:
            if match['home_xg'] is not None:
                with_xg += 1

            xg_diff = None
            if match['home_xg'] is not None and match['away_xg'] is not None:
                xg_diff = match['home_xg'] - match['away_xg']

            # Savepoint per row: a failed INSERT aborts the enclosing
            # transaction in Postgres, which previously poisoned every
            # subsequent insert AND the final commit (all silently
            # swallowed). Rolling back to the savepoint keeps earlier
            # rows staged.
            cur.execute('SAVEPOINT xg_row')
            try:
                cur.execute('''
                    INSERT INTO "SoccerXG" (
                        league, "gameDate", "homeTeam", "awayTeam",
                        "homeScore", "awayScore", "homeXG", "awayXG", "xgDiff",
                        "fbrefMatchId"
                    )
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                    ON CONFLICT (league, "gameDate", "homeTeam", "awayTeam") DO UPDATE SET
                        "homeScore" = COALESCE(EXCLUDED."homeScore", "SoccerXG"."homeScore"),
                        "awayScore" = COALESCE(EXCLUDED."awayScore", "SoccerXG"."awayScore"),
                        "homeXG" = COALESCE(EXCLUDED."homeXG", "SoccerXG"."homeXG"),
                        "awayXG" = COALESCE(EXCLUDED."awayXG", "SoccerXG"."awayXG"),
                        "xgDiff" = COALESCE(EXCLUDED."xgDiff", "SoccerXG"."xgDiff")
                ''', (
                    league, match['date'], match['home_team'], match['away_team'],
                    match['home_score'], match['away_score'],
                    match['home_xg'], match['away_xg'], xg_diff,
                    match['match_id']
                ))
                cur.execute('RELEASE SAVEPOINT xg_row')
                inserted += 1
            except Exception as e:
                cur.execute('ROLLBACK TO SAVEPOINT xg_row')
                # Log instead of silently dropping the row.
                print(f"  Skipped {match['home_team']} vs {match['away_team']}: {e}")

        conn.commit()
        cur.close()
    finally:
        conn.close()  # always release the connection, even on errors

    print(f"  {league}: Inserted/updated {inserted} matches, {with_xg} have xG data")
    return {'matches': inserted, 'with_xg': with_xg}


def main():
    """CLI entry point: fetch xG for each requested league and print totals."""
    parser = argparse.ArgumentParser(description='Fetch xG data from FBref')
    parser.add_argument('--leagues', type=str, default='epl,laliga,seriea,bundesliga',
                        help='Comma-separated leagues')
    args = parser.parse_args()

    banner = "=" * 60
    print(banner)
    print("FETCH SOCCER xG DATA FROM FBREF")
    print(f"Time: {datetime.now(timezone.utc).isoformat()}")
    print(banner)

    league_names = [name.strip().lower() for name in args.leagues.split(',')]

    total_matches = 0
    total_xg = 0
    for league in league_names:
        result = fetch_xg_for_league(league)
        total_matches += result['matches']
        total_xg += result['with_xg']

    print(f"\n{'='*60}")
    print(f"TOTAL: {total_matches} matches, {total_xg} with xG data")
    print("=" * 60)


if __name__ == '__main__':
    main()
