#!/usr/bin/env python3
"""
Fetch Expected Goals (xG) Data from FBref using Playwright
Uses headless browser to bypass anti-bot protection.

FBref provides free xG data for major leagues:
- EPL, La Liga, Serie A, Bundesliga, Ligue 1, MLS, etc.

Run: Daily after matches complete
Requires: .venv-scraper with playwright installed
"""
import os
import sys
import time
import re
from datetime import datetime, timezone
import argparse

# Add the scraper virtualenv's site-packages to the import path so this
# script also works under the system interpreter (e.g. from cron) and can
# find psycopg2 / playwright / bs4 installed in the venv.
VENV_PATH = '/var/www/html/eventheodds/.venv-scraper/lib/python3.12/site-packages'
if VENV_PATH not in sys.path:
    sys.path.insert(0, VENV_PATH)

import psycopg2
from playwright.sync_api import sync_playwright
from bs4 import BeautifulSoup

FBREF_BASE = 'https://fbref.com'
REQUEST_DELAY = 5  # Seconds to sleep between leagues to avoid rate limits

# Schedule-page paths on FBref, relative to FBREF_BASE.
# Keys are the league names accepted by the --leagues CLI flag.
LEAGUE_URLS = {
    'epl': '/en/comps/9/schedule/Premier-League-Scores-and-Fixtures',
    'laliga': '/en/comps/12/schedule/La-Liga-Scores-and-Fixtures',
    'seriea': '/en/comps/11/schedule/Serie-A-Scores-and-Fixtures',
    'bundesliga': '/en/comps/20/schedule/Bundesliga-Scores-and-Fixtures',
    'ligue1': '/en/comps/13/schedule/Ligue-1-Scores-and-Fixtures',
    'mls': '/en/comps/22/schedule/Major-League-Soccer-Scores-and-Fixtures',
}


def load_db_url():
    """Return the sports database connection URL with any query string removed.

    Reads SPORTS_DATABASE_URL from the project .env file first; if the file
    is missing or the key is absent, falls back to the environment variable
    of the same name (empty string when unset).
    """
    env_path = '/var/www/html/eventheodds/.env'
    prefix = 'SPORTS_DATABASE_URL='
    try:
        with open(env_path, 'r') as fh:
            for raw in fh:
                if raw.startswith(prefix):
                    value = raw.split('=', 1)[1].strip()
                    return value.split('?')[0]
    except FileNotFoundError:
        pass  # no .env on this host; use the process environment instead
    fallback = os.environ.get('SPORTS_DATABASE_URL', '')
    return fallback.split('?')[0]


def ensure_xg_table(cur):
    """Create the "SoccerXG" table and its (league, gameDate) index if absent.

    Idempotent: both statements use IF NOT EXISTS, so it is safe to run on
    every scrape. The caller owns the cursor and is responsible for commit.
    """
    table_ddl = '''
        CREATE TABLE IF NOT EXISTS "SoccerXG" (
            id BIGSERIAL PRIMARY KEY,
            league VARCHAR(50) NOT NULL,
            "gameId" BIGINT,
            "gameDate" DATE,
            "homeTeam" VARCHAR(100),
            "awayTeam" VARCHAR(100),
            "homeScore" INT,
            "awayScore" INT,
            "homeXG" FLOAT,
            "awayXG" FLOAT,
            "xgDiff" FLOAT,
            source VARCHAR(50) DEFAULT 'fbref',
            "fbrefMatchId" VARCHAR(50),
            "createdAt" TIMESTAMP DEFAULT NOW(),
            UNIQUE(league, "gameDate", "homeTeam", "awayTeam")
        )
    '''
    index_ddl = 'CREATE INDEX IF NOT EXISTS "SoccerXG_league_date_idx" ON "SoccerXG" (league, "gameDate")'
    for statement in (table_ddl, index_ddl):
        cur.execute(statement)


def _parse_xg_cell(cell):
    """Best-effort float from an xG table cell; None when the cell is
    missing, empty, or non-numeric (e.g. for fixtures not yet played)."""
    if cell is None:
        return None
    text = cell.get_text(strip=True)
    if not text:
        return None
    try:
        return float(text)
    except ValueError:
        return None


def parse_schedule_page(html, league):
    """Parse an FBref Scores-and-Fixtures page into a list of match dicts.

    Args:
        html: Raw page HTML of the league schedule page.
        league: League key (kept for interface compatibility; the FBref
            markup is league-agnostic so it is not used in parsing).

    Returns:
        List of dicts with keys: date (datetime.date), home_team, away_team,
        home_score, away_score (int or None), home_xg, away_xg (float or
        None), match_id (FBref match slug or None).
    """
    soup = BeautifulSoup(html, 'html.parser')

    # FBref names the schedule table id like "sched_<season>_all"; fall back
    # to the generic stats_table class if the id pattern changes.
    table = soup.find('table', {'id': re.compile('sched.*_all')})
    if not table:
        table = soup.find('table', class_='stats_table')

    if not table:
        print("  Could not find schedule table")
        return []

    matches = []
    for row in table.find_all('tr'):
        # Skip header/spacer rows that cannot carry a full fixture.
        if len(row.find_all(['td', 'th'])) < 5:
            continue

        # FBref tags every data cell with a stable data-stat attribute.
        date_cell = row.find('td', {'data-stat': 'date'})
        home_cell = row.find('td', {'data-stat': 'home_team'})
        away_cell = row.find('td', {'data-stat': 'away_team'})
        score_cell = row.find('td', {'data-stat': 'score'})

        if not all([date_cell, home_cell, away_cell]):
            continue

        # Parse date; narrow to ValueError so real errors are not swallowed
        # (the original bare except also hid KeyboardInterrupt/SystemExit).
        date_str = date_cell.get_text(strip=True)
        try:
            game_date = datetime.strptime(date_str, '%Y-%m-%d').date()
        except ValueError:
            continue  # blank or non-ISO date on divider rows

        home_team = home_cell.get_text(strip=True)
        away_team = away_cell.get_text(strip=True)

        # Skip empty rows
        if not home_team or not away_team:
            continue

        # Completed matches render the score like "2–1" (en dash or hyphen).
        home_score = None
        away_score = None
        if score_cell:
            score_text = score_cell.get_text(strip=True)
            score_match = re.match(r'(\d+)[–-](\d+)', score_text)
            if score_match:
                home_score = int(score_match.group(1))
                away_score = int(score_match.group(2))

        home_xg = _parse_xg_cell(row.find('td', {'data-stat': 'home_xg'}))
        away_xg = _parse_xg_cell(row.find('td', {'data-stat': 'away_xg'}))

        # FBref match id is the slug segment after /matches/ in the link.
        match_id = None
        match_link = row.find('a', href=re.compile('/matches/'))
        if match_link:
            href = match_link.get('href', '')
            match_id = href.split('/matches/')[-1].split('/')[0] if '/matches/' in href else None

        matches.append({
            'date': game_date,
            'home_team': home_team,
            'away_team': away_team,
            'home_score': home_score,
            'away_score': away_score,
            'home_xg': home_xg,
            'away_xg': away_xg,
            'match_id': match_id
        })

    return matches


def scrape_league(page, league, conn):
    """Scrape one league's FBref schedule page and upsert xG rows to the DB.

    Args:
        page: Playwright page object (created and owned by the caller).
        league: League key; must exist in LEAGUE_URLS.
        conn: Open psycopg2 connection.

    Returns:
        Dict with 'matches' (rows inserted/updated) and 'with_xg' (rows
        that carried a home xG value). Both 0 on unknown league or error.
    """
    if league not in LEAGUE_URLS:
        print(f"  Unknown league: {league}")
        return {'matches': 0, 'with_xg': 0}

    url = FBREF_BASE + LEAGUE_URLS[league]
    print(f"\n{league.upper()}: Navigating to FBref...")

    try:
        page.goto(url, wait_until='domcontentloaded', timeout=45000)
        time.sleep(5)  # Wait for JS to fully render

        html = page.content()

        # Debug: Check page title and content length
        title = page.title()
        print(f"  Page title: {title}")
        print(f"  Content length: {len(html)} chars")

        # Check for blocking indicators (lowercase once, not per check)
        lowered = html.lower()
        if 'captcha' in lowered or 'blocked' in lowered or 'access denied' in lowered:
            print("  WARNING: Page may be blocked/captcha")

        matches = parse_schedule_page(html, league)
        print(f"  Found {len(matches)} matches")

        cur = conn.cursor()
        ensure_xg_table(cur)
        conn.commit()

        inserted = 0
        with_xg = 0

        for match in matches:
            if match['home_xg'] is not None:
                with_xg += 1

            xg_diff = None
            if match['home_xg'] is not None and match['away_xg'] is not None:
                xg_diff = match['home_xg'] - match['away_xg']

            # Wrap each row in a savepoint: in psycopg2 a failed INSERT
            # aborts the whole transaction, so the previous silent
            # `except: pass` made every subsequent INSERT fail silently too.
            cur.execute('SAVEPOINT xg_row')
            try:
                cur.execute('''
                    INSERT INTO "SoccerXG" (
                        league, "gameDate", "homeTeam", "awayTeam",
                        "homeScore", "awayScore", "homeXG", "awayXG", "xgDiff",
                        "fbrefMatchId"
                    )
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                    ON CONFLICT (league, "gameDate", "homeTeam", "awayTeam") DO UPDATE SET
                        "homeScore" = COALESCE(EXCLUDED."homeScore", "SoccerXG"."homeScore"),
                        "awayScore" = COALESCE(EXCLUDED."awayScore", "SoccerXG"."awayScore"),
                        "homeXG" = COALESCE(EXCLUDED."homeXG", "SoccerXG"."homeXG"),
                        "awayXG" = COALESCE(EXCLUDED."awayXG", "SoccerXG"."awayXG"),
                        "xgDiff" = COALESCE(EXCLUDED."xgDiff", "SoccerXG"."xgDiff")
                ''', (
                    league, match['date'], match['home_team'], match['away_team'],
                    match['home_score'], match['away_score'],
                    match['home_xg'], match['away_xg'], xg_diff,
                    match['match_id']
                ))
                cur.execute('RELEASE SAVEPOINT xg_row')
                inserted += 1
            except Exception as e:
                # Roll back only this row; keep earlier rows in the batch.
                cur.execute('ROLLBACK TO SAVEPOINT xg_row')
                print(f"  Row failed ({match['home_team']} vs {match['away_team']}): {e}")

        conn.commit()
        cur.close()

        print(f"  {league}: Inserted/updated {inserted} matches, {with_xg} have xG data")
        return {'matches': inserted, 'with_xg': with_xg}

    except Exception as e:
        print(f"  Error scraping {league}: {e}")
        return {'matches': 0, 'with_xg': 0}


def main():
    """CLI entry point: scrape xG for each requested league, print totals."""
    arg_parser = argparse.ArgumentParser(description='Fetch xG data from FBref using Playwright')
    arg_parser.add_argument('--leagues', type=str, default='epl,laliga,seriea,bundesliga',
                            help='Comma-separated leagues')
    opts = arg_parser.parse_args()

    banner = "=" * 60
    print(banner)
    print("FETCH SOCCER xG DATA FROM FBREF (Playwright)")
    print(f"Time: {datetime.now(timezone.utc).isoformat()}")
    print(banner)

    db_url = load_db_url()
    if not db_url:
        print("ERROR: No database URL found")
        return

    conn = psycopg2.connect(db_url)
    leagues = [name.strip().lower() for name in opts.leagues.split(',')]

    total_matches = 0
    total_xg = 0

    with sync_playwright() as pw:
        # Launch without proxy - let Playwright handle the browser
        browser = pw.chromium.launch(headless=True)
        context = browser.new_context(
            user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            viewport={'width': 1920, 'height': 1080}
        )
        page = context.new_page()

        # Pause between leagues (but not before the first) to stay under
        # FBref's rate limits.
        first = True
        for league in leagues:
            if not first:
                time.sleep(REQUEST_DELAY)
            first = False
            stats = scrape_league(page, league, conn)
            total_matches += stats['matches']
            total_xg += stats['with_xg']

        browser.close()

    conn.close()

    print(f"\n{'='*60}")
    print(f"TOTAL: {total_matches} matches, {total_xg} with xG data")
    print("=" * 60)


# Run only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()
