#!/usr/bin/env python3
"""
FBref Soccer Stats Scraper
Fetches standings, player stats, and upcoming fixtures from FBref.com
Covers: Premier League, La Liga, Serie A, Bundesliga, Ligue 1
"""
import requests
from bs4 import BeautifulSoup, Comment
import json
import os
import time
from datetime import datetime, timezone

REQUEST_DELAY = 3.0  # seconds between requests (FBref rate-limits scrapers)

# A descriptive User-Agent; FBref tends to reject default library agents
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (compatible; EvenTheOdds/1.0; sports research)',
    'Accept': 'text/html,application/xhtml+xml',
}

# League URLs on FBref; the comps/<id> path segment is FBref's competition id
LEAGUE_URLS = {
    'epl': {
        'name': 'Premier League',
        'stats_url': 'https://fbref.com/en/comps/9/Premier-League-Stats',
        'schedule_url': 'https://fbref.com/en/comps/9/schedule/Premier-League-Scores-and-Fixtures',
    },
    'laliga': {
        'name': 'La Liga',
        'stats_url': 'https://fbref.com/en/comps/12/La-Liga-Stats',
        'schedule_url': 'https://fbref.com/en/comps/12/schedule/La-Liga-Scores-and-Fixtures',
    },
    'seriea': {
        'name': 'Serie A',
        'stats_url': 'https://fbref.com/en/comps/11/Serie-A-Stats',
        'schedule_url': 'https://fbref.com/en/comps/11/schedule/Serie-A-Scores-and-Fixtures',
    },
    'bundesliga': {
        'name': 'Bundesliga',
        'stats_url': 'https://fbref.com/en/comps/20/Bundesliga-Stats',
        'schedule_url': 'https://fbref.com/en/comps/20/schedule/Bundesliga-Scores-and-Fixtures',
    },
    'ligue1': {
        'name': 'Ligue 1',
        'stats_url': 'https://fbref.com/en/comps/13/Ligue-1-Stats',
        'schedule_url': 'https://fbref.com/en/comps/13/schedule/Ligue-1-Scores-and-Fixtures',
    },
}


def fetch_page(url, retries=3):
    """Fetch a page with retries and rate limiting"""
    for attempt in range(retries):
        try:
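            # Throttle every attempt, including retries, to stay under
            # FBref's request limits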
            time.sleep(REQUEST_DELAY)
            resp = requests.get(url, headers=HEADERS, timeout=30)
            if resp.status_code == 200:
                return resp.text
            elif resp.status_code == 429:
                print(f"  Rate limited, waiting 60s...")
                time.sleep(60)
                continue
            else:
                print(f"  Status {resp.status_code} for {url}")
        except Exception as e:
            print(f"  Error: {e}")
            time.sleep(5)
    return None
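

def safe_int(cell):
    """Parse a table cell as an int; accepts signed values like '-3' or '+5'"""
    try:
        return int(cell.text.strip())
    except ValueError:
        return None


def safe_float(cell):
    """Parse a table cell as a float; blank cells yield None"""
    try:
        return float(cell.text.strip())
    except ValueError:
        return None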


def scrape_league_standings(league_key):
    """Scrape league standings from FBref"""
    config = LEAGUE_URLS.get(league_key)
    if not config:
        return []

    print(f"\n[FBref] Fetching {config['name']} standings...")
    html = fetch_page(config['stats_url'])
    if not html:
        return []

    soup = BeautifulSoup(html, 'html.parser')

    # Find the main standings table
    table = soup.find('table', {'id': lambda x: x and 'results' in str(x).lower()})
    if not table:
        # Try alternative IDs
        table = soup.find('table', {'class': 'stats_table'})

    if not table:
        print(f"  Could not find standings table")
        return []

    standings = []
    tbody = table.find('tbody')
    if not tbody:
        return standings

    for row in tbody.find_all('tr'):
        cells = row.find_all(['th', 'td'])
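        # Rows with fewer than 8 cells are spacers or header rows; skip them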
        if len(cells) < 8:
            continue

        try:
            # Find team name
            team_cell = None
            for cell in cells:
                if cell.find('a'):
                    team_cell = cell
                    break

            if not team_cell:
                continue

            team_link = team_cell.find('a')
            team_name = team_link.text.strip() if team_link else team_cell.text.strip()

            # Parse standings row by position; indices assume FBref's
            # current overall-table column order
            standing = {
                'team': team_name,
                'league': league_key,
                'gamesPlayed': safe_int(cells[2]) if len(cells) > 2 else None,
                'wins': safe_int(cells[3]) if len(cells) > 3 else None,
                'draws': safe_int(cells[4]) if len(cells) > 4 else None,
                'losses': safe_int(cells[5]) if len(cells) > 5 else None,
                'goalsFor': safe_int(cells[6]) if len(cells) > 6 else None,
                'goalsAgainst': safe_int(cells[7]) if len(cells) > 7 else None,
                'goalDiff': safe_int(cells[8]) if len(cells) > 8 else None,
                'points': safe_int(cells[9]) if len(cells) > 9 else None,
                'xG': safe_float(cells[10]) if len(cells) > 10 else None,
                'xGA': safe_float(cells[11]) if len(cells) > 11 else None,
                'source': 'fbref',
            }

            if standing['team']:
                standings.append(standing)

        except Exception:
            continue

    print(f"  Found {len(standings)} teams")
    return standings
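

def find_commented_table(soup, id_substring):
    """Find a table that FBref serves inside an HTML comment

    FBref hides some tables in HTML comments and reveals them with
    JavaScript, so a plain find() misses them; this re-parses any
    comment containing a <table> and matches ids the same way.
    """
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        if '<table' not in comment:
            continue
        table = BeautifulSoup(comment, 'html.parser').find(
            'table', {'id': lambda x: x and id_substring in str(x).lower()})
        if table:
            return table
    return None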


def scrape_league_player_stats(league_key):
    """Scrape player stats from FBref"""
    config = LEAGUE_URLS.get(league_key)
    if not config:
        return []

    print(f"\n[FBref] Fetching {config['name']} player stats...")
    html = fetch_page(config['stats_url'])
    if not html:
        return []

    soup = BeautifulSoup(html, 'html.parser')

    # Find the standard player stats table; FBref sometimes ships it
    # inside an HTML comment, so fall back to scanning comments
    table = soup.find('table', {'id': lambda x: x and 'stats_standard' in str(x).lower()})
    if not table:
        table = find_commented_table(soup, 'stats_standard')
    if not table:
        print("  Could not find player stats table")
        return []

    players = []
    tbody = table.find('tbody')
    if not tbody:
        return players

    for row in tbody.find_all('tr'):
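        # FBref repeats header rows inside tbody (marked class 'thead'); skip them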
        if row.get('class') and 'thead' in row.get('class'):
            continue

        cells = row.find_all(['th', 'td'])
        if len(cells) < 15:
            continue

        try:
            player_cell = cells[0]
            player_link = player_cell.find('a')
            player_name = player_link.text.strip() if player_link else player_cell.text.strip()

            if not player_name or player_name.lower() == 'player':
                continue

            # Positional parse of FBref's standard-stats table; indices
            # assume the current column layout

            player_data = {
                'playerName': player_name,
                'league': league_key,
                'nation': cells[1].text.strip() if len(cells) > 1 else None,
                'position': cells[2].text.strip() if len(cells) > 2 else None,
                'team': cells[3].text.strip() if len(cells) > 3 else None,
                'age': cells[4].text.strip() if len(cells) > 4 else None,
                'gamesPlayed': safe_int(cells[6]) if len(cells) > 6 else None,
                'gamesStarted': safe_int(cells[7]) if len(cells) > 7 else None,
                'minutes': safe_int(cells[8]) if len(cells) > 8 else None,
                'goals': safe_int(cells[10]) if len(cells) > 10 else None,
                'assists': safe_int(cells[11]) if len(cells) > 11 else None,
                'nonPenaltyGoals': safe_int(cells[12]) if len(cells) > 12 else None,
                'penaltyGoals': safe_int(cells[13]) if len(cells) > 13 else None,
                'yellowCards': safe_int(cells[15]) if len(cells) > 15 else None,
                'redCards': safe_int(cells[16]) if len(cells) > 16 else None,
                'xG': safe_float(cells[17]) if len(cells) > 17 else None,
                'xAG': safe_float(cells[19]) if len(cells) > 19 else None,
                'source': 'fbref',
            }

            if player_data['playerName'] and player_data['gamesPlayed']:
                players.append(player_data)

        except Exception:
            continue

    print(f"  Found {len(players)} players")
    return players


def scrape_upcoming_fixtures(league_key):
    """Scrape upcoming fixtures from FBref"""
    config = LEAGUE_URLS.get(league_key)
    if not config:
        return []

    print(f"\n[FBref] Fetching {config['name']} fixtures...")
    html = fetch_page(config['schedule_url'])
    if not html:
        return []

    soup = BeautifulSoup(html, 'html.parser')

    # Find fixtures table
    table = soup.find('table', {'class': 'stats_table'})
    if not table:
        print(f"  Could not find fixtures table")
        return []

    fixtures = []
    tbody = table.find('tbody')
    if not tbody:
        return fixtures

    for row in tbody.find_all('tr'):
        cells = row.find_all(['th', 'td'])
        if len(cells) < 8:
            continue

        try:
            # Extract date (positional indices assume FBref's current
            # schedule column order)
            date_cell = cells[1] if len(cells) > 1 else None
            if not date_cell:
                continue

            date_text = date_cell.text.strip()

            # Get teams
            home_cell = cells[3] if len(cells) > 3 else None
            score_cell = cells[4] if len(cells) > 4 else None
            away_cell = cells[5] if len(cells) > 5 else None

            if not home_cell or not away_cell:
                continue

            home_team = home_cell.text.strip()
            away_team = away_cell.text.strip()
            score = score_cell.text.strip() if score_cell else ''

            # Upcoming fixtures have an empty score cell; a bare en dash
            # with no digits also means the match is unplayed
            is_upcoming = not score or ('–' in score and not any(c.isdigit() for c in score))

            fixture = {
                'date': date_text,
                'homeTeam': home_team,
                'awayTeam': away_team,
                'score': score if score else None,
                'league': league_key,
                'isUpcoming': is_upcoming,
                'venue': cells[7].text.strip() if len(cells) > 7 else None,
                'source': 'fbref',
            }

            fixtures.append(fixture)

        except Exception:
            continue

    # Filter for upcoming only
    upcoming = [f for f in fixtures if f.get('isUpcoming')]
    print(f"  Found {len(upcoming)} upcoming fixtures")
    return upcoming


def main():
    print("=" * 60)
    print("FBREF SOCCER STATS SCRAPER")
    print(f"Time: {datetime.now(timezone.utc).isoformat()}")
    print("=" * 60)

    all_data = {
        'timestamp': datetime.now(timezone.utc).isoformat(),
        'leagues': {},
    }

    total_standings = 0
    total_players = 0
    total_fixtures = 0

    for league_key in LEAGUE_URLS.keys():
        print(f"\n{'='*40}")
        print(f"Processing {LEAGUE_URLS[league_key]['name']}...")

        standings = scrape_league_standings(league_key)
        players = scrape_league_player_stats(league_key)
        fixtures = scrape_upcoming_fixtures(league_key)

        all_data['leagues'][league_key] = {
            'name': LEAGUE_URLS[league_key]['name'],
            'standings': standings,
            'players': players,
            'fixtures': fixtures,
        }

        total_standings += len(standings)
        total_players += len(players)
        total_fixtures += len(fixtures)

    # Save to JSON
    output_dir = '/var/www/html/eventheodds/data'
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, 'fbref_soccer_stats.json')

    with open(output_path, 'w') as f:
        json.dump(all_data, f, indent=2, default=str)

    print(f"\n{'='*60}")
    print("SUMMARY:")
    print(f"  Total standings:  {total_standings} teams")
    print(f"  Total players:    {total_players}")
    print(f"  Total fixtures:   {total_fixtures}")
    print(f"  Saved to: {output_path}")
    print("=" * 60)


if __name__ == '__main__':
    main()
