#!/usr/bin/env python3
"""
Sports-Reference Player Stats Scraper
Fetches player stats from Basketball-Reference, Pro-Football-Reference,
Hockey-Reference, and Baseball-Reference for dynamic prop questions.
"""
import requests
from bs4 import BeautifulSoup
import psycopg2
import json
import time
import re
from datetime import datetime, timezone
from urllib.parse import urljoin

# Rate limiting - be respectful
REQUEST_DELAY = 2.0  # seconds between requests

# Identify ourselves honestly; sports-reference sites block default
# library user agents, so a descriptive UA string is required.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (compatible; EvenTheOdds/1.0; sports research)',
    'Accept': 'text/html,application/xhtml+xml',
}

def load_db_url():
    """Return the sports database URL with any query string removed.

    Searches the known .env locations for a SPORTS_DATABASE_URL entry
    first, then falls back to the process environment.  Returns '' when
    the variable is not set anywhere.
    """
    import os

    env_paths = [
        '/var/www/html/eventheodds/.env',
        '/home/hazypiff/Work/eventheodds/eventheodds_backup/.env',
    ]
    for env_path in env_paths:
        try:
            with open(env_path, 'r') as f:
                for line in f:
                    if line.startswith('SPORTS_DATABASE_URL='):
                        # split('=', 1) preserves any '=' inside the URL;
                        # drop the '?...' suffix (connection options).
                        return line.split('=', 1)[1].strip().split('?')[0]
        except OSError:
            # Missing or unreadable .env file -- try the next candidate.
            continue
    return os.environ.get('SPORTS_DATABASE_URL', '').split('?')[0]


def fetch_page(url, retries=3):
    """Fetch a page with retries and rate limiting"""
    attempts_left = retries
    while attempts_left > 0:
        attempts_left -= 1
        try:
            # Throttle every request, including retries.
            time.sleep(REQUEST_DELAY)
            resp = requests.get(url, headers=HEADERS, timeout=30)
        except Exception as e:
            print(f"  Error fetching {url}: {e}")
            time.sleep(5)
            continue
        if resp.status_code == 200:
            return resp.text
        if resp.status_code == 429:  # Rate limited
            # Back off hard before the next attempt.
            time.sleep(30)
            continue
        print(f"  Status {resp.status_code} for {url}")
    # All attempts exhausted.
    return None


def scrape_basketball_reference_player_stats(season='2025'):
    """Scrape NBA per-game player stats from Basketball-Reference.

    Args:
        season: Season label used in the B-R URL (e.g. '2025' for 2024-25).

    Returns:
        list[dict]: One record per player row that has a points-per-game
        value; empty list when the page or stats table cannot be fetched.
    """
    print(f"\n[Basketball-Reference] Fetching NBA player stats for {season}...")

    # Per-game stats
    url = f"https://www.basketball-reference.com/leagues/NBA_{season}_per_game.html"
    html = fetch_page(url)
    if not html:
        return []

    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('table', {'id': 'per_game_stats'})
    if not table:
        print("  Could not find stats table")
        return []

    players = []
    tbody = table.find('tbody')
    if not tbody:
        return players

    # Hoisted out of the row loop (was redefined per row): parse a cell's
    # text as float, or None when empty/unparseable.
    def safe_float(cell):
        try:
            return float(cell.text.strip()) if cell and cell.text.strip() else None
        except (ValueError, AttributeError):
            return None

    for row in tbody.find_all('tr'):
        # Repeated header rows inside tbody carry class="thead".
        if row.get('class') and 'thead' in row.get('class'):
            continue

        cells = row.find_all(['th', 'td'])
        if len(cells) < 20:
            continue

        try:
            # len(cells) >= 20 is guaranteed above, so cells[1] exists.
            player_cell = cells[1]
            player_link = player_cell.find('a')
            player_name = player_link.text if player_link else player_cell.text
            # Player slug from the href, e.g. /players/j/jamesle01.html
            player_id = player_link.get('href', '').split('/')[-1].replace('.html', '') if player_link else None

            # Cell indices assume the current B-R per-game column layout --
            # TODO confirm against a fetched page if columns ever shift.
            player_data = {
                'playerId': player_id,
                'playerName': player_name.strip(),
                'league': 'nba',
                'season': season,
                'team': cells[4].text.strip() if len(cells) > 4 else None,
                'position': cells[3].text.strip() if len(cells) > 3 else None,
                'gamesPlayed': safe_float(cells[5]),
                'gamesStarted': safe_float(cells[6]),
                'minutesPerGame': safe_float(cells[7]),
                'pointsPerGame': safe_float(cells[29]) if len(cells) > 29 else None,
                'reboundsPerGame': safe_float(cells[23]) if len(cells) > 23 else None,
                'assistsPerGame': safe_float(cells[24]) if len(cells) > 24 else None,
                'stealsPerGame': safe_float(cells[25]) if len(cells) > 25 else None,
                'blocksPerGame': safe_float(cells[26]) if len(cells) > 26 else None,
                'turnoversPerGame': safe_float(cells[27]) if len(cells) > 27 else None,
                'fgPercent': safe_float(cells[11]) if len(cells) > 11 else None,
                'fg3Percent': safe_float(cells[14]) if len(cells) > 14 else None,
                'ftPercent': safe_float(cells[17]) if len(cells) > 17 else None,
                'source': 'basketball-reference',
            }

            # Require a PPG value so league-total / placeholder rows drop out.
            if player_data['playerName'] and player_data['pointsPerGame'] is not None:
                players.append(player_data)

        except Exception:
            # Malformed row -- skip it and keep scraping.
            continue

    print(f"  Found {len(players)} NBA players with stats")
    return players


def scrape_hockey_reference_player_stats(season='2025'):
    """Scrape NHL skater stats from Hockey-Reference.

    Args:
        season: Season label used in the H-R URL (e.g. '2025' for 2024-25).

    Returns:
        list[dict]: One record per skater with a truthy games-played
        count; empty list when the page or stats table cannot be fetched.
    """
    print(f"\n[Hockey-Reference] Fetching NHL player stats for {season}...")

    url = f"https://www.hockey-reference.com/leagues/NHL_{season}_skaters.html"
    html = fetch_page(url)
    if not html:
        return []

    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('table', {'id': 'stats'})
    if not table:
        print("  Could not find stats table")
        return []

    players = []
    tbody = table.find('tbody')
    if not tbody:
        return players

    # Hoisted out of the row loop (was redefined per row): parse a cell's
    # text, or None when empty/unparseable.
    def safe_float(cell):
        try:
            return float(cell.text.strip()) if cell and cell.text.strip() else None
        except (ValueError, AttributeError):
            return None

    def safe_int(cell):
        try:
            return int(cell.text.strip()) if cell and cell.text.strip() else None
        except (ValueError, AttributeError):
            return None

    for row in tbody.find_all('tr'):
        # Repeated header rows inside tbody carry class="thead".
        if row.get('class') and 'thead' in row.get('class'):
            continue

        cells = row.find_all(['th', 'td'])
        if len(cells) < 15:
            continue

        try:
            # len(cells) >= 15 is guaranteed above, so cells[1] exists.
            player_cell = cells[1]
            player_link = player_cell.find('a')
            player_name = player_link.text if player_link else player_cell.text

            # Cell indices assume the current H-R skaters column layout --
            # TODO confirm against a fetched page if columns ever shift.
            player_data = {
                'playerName': player_name.strip(),
                'league': 'nhl',
                'season': season,
                'team': cells[3].text.strip() if len(cells) > 3 else None,
                'position': cells[4].text.strip() if len(cells) > 4 else None,
                'gamesPlayed': safe_int(cells[5]),
                'goals': safe_int(cells[6]),
                'assists': safe_int(cells[7]),
                'points': safe_int(cells[8]),
                'plusMinus': safe_int(cells[9]) if len(cells) > 9 else None,
                'penaltyMinutes': safe_int(cells[10]) if len(cells) > 10 else None,
                'powerPlayGoals': safe_int(cells[12]) if len(cells) > 12 else None,
                'powerPlayPoints': safe_int(cells[13]) if len(cells) > 13 else None,
                'shots': safe_int(cells[16]) if len(cells) > 16 else None,
                'shootingPercent': safe_float(cells[17]) if len(cells) > 17 else None,
                'timeOnIce': safe_float(cells[19]) if len(cells) > 19 else None,
                'source': 'hockey-reference',
            }

            # Require games played so placeholder rows drop out.
            if player_data['playerName'] and player_data['gamesPlayed']:
                players.append(player_data)

        except Exception:
            # Malformed row -- skip it and keep scraping.
            continue

    print(f"  Found {len(players)} NHL players with stats")
    return players


def scrape_baseball_reference_player_stats(season='2025'):
    """Scrape MLB standard batting stats from Baseball-Reference.

    Args:
        season: Season year used in the B-R URL (e.g. '2024').

    Returns:
        list[dict]: One record per batter with a truthy games-played
        count; empty list when the page or stats table cannot be fetched.
    """
    print(f"\n[Baseball-Reference] Fetching MLB player stats for {season}...")

    # Batting stats
    url = f"https://www.baseball-reference.com/leagues/majors/{season}-standard-batting.shtml"
    html = fetch_page(url)
    if not html:
        return []

    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('table', {'id': 'players_standard_batting'})
    if not table:
        print("  Could not find batting stats table")
        return []

    players = []
    tbody = table.find('tbody')
    if not tbody:
        return players

    # Hoisted out of the row loop (was redefined per row): parse a cell's
    # text, or None when empty/unparseable.
    def safe_float(cell):
        try:
            return float(cell.text.strip()) if cell and cell.text.strip() else None
        except (ValueError, AttributeError):
            return None

    def safe_int(cell):
        try:
            return int(cell.text.strip()) if cell and cell.text.strip() else None
        except (ValueError, AttributeError):
            return None

    for row in tbody.find_all('tr'):
        # Repeated header rows inside tbody carry class="thead".
        if row.get('class') and 'thead' in row.get('class'):
            continue

        cells = row.find_all(['th', 'td'])
        if len(cells) < 20:
            continue

        try:
            # len(cells) >= 20 is guaranteed above, so cells[1] exists.
            player_cell = cells[1]
            player_link = player_cell.find('a')
            player_name = player_link.text if player_link else player_cell.text

            # Cell indices assume the current B-R batting column layout --
            # TODO confirm against a fetched page if columns ever shift.
            player_data = {
                'playerName': player_name.strip(),
                'league': 'mlb',
                'season': season,
                'team': cells[3].text.strip() if len(cells) > 3 else None,
                'gamesPlayed': safe_int(cells[5]),
                'plateAppearances': safe_int(cells[6]),
                'atBats': safe_int(cells[7]),
                'runs': safe_int(cells[8]),
                'hits': safe_int(cells[9]),
                'doubles': safe_int(cells[10]),
                'triples': safe_int(cells[11]),
                'homeRuns': safe_int(cells[12]),
                'rbi': safe_int(cells[13]),
                'stolenBases': safe_int(cells[14]),
                'walks': safe_int(cells[16]),
                'strikeouts': safe_int(cells[17]),
                'battingAverage': safe_float(cells[18]),
                'onBasePercent': safe_float(cells[19]),
                # Bug fix: the len >= 20 guard above only makes indices 0-19
                # safe; cells[20] needs its own check or a 20-cell row would
                # raise IndexError and be dropped entirely.
                'sluggingPercent': safe_float(cells[20]) if len(cells) > 20 else None,
                'ops': safe_float(cells[21]) if len(cells) > 21 else None,
                'source': 'baseball-reference',
            }

            # Require games played so placeholder rows drop out.
            if player_data['playerName'] and player_data['gamesPlayed']:
                players.append(player_data)

        except Exception:
            # Malformed row -- skip it and keep scraping.
            continue

    print(f"  Found {len(players)} MLB batters with stats")
    return players


def scrape_profootball_reference_player_stats(season='2025'):
    """Scrape NFL player stats (passing then rushing) from Pro-Football-Reference.

    Args:
        season: Season year used in the P-F-R URL (e.g. '2024').

    Returns:
        list[dict]: QB passing records followed by rushing records, each
        requiring a truthy games-played count; empty list when neither
        page could be fetched.
    """
    print(f"\n[Pro-Football-Reference] Fetching NFL player stats for {season}...")

    # Parsing helpers, hoisted to function scope (they were redefined on
    # every row of both loops below).
    def safe_float(cell):
        try:
            # P-F-R percentage cells include a trailing '%'.
            val = cell.text.strip().replace('%', '')
            return float(val) if val else None
        except (ValueError, AttributeError):
            return None

    def safe_int(cell):
        try:
            return int(cell.text.strip()) if cell and cell.text.strip() else None
        except (ValueError, AttributeError):
            return None

    players = []

    # Passing stats
    url = f"https://www.pro-football-reference.com/years/{season}/passing.htm"
    html = fetch_page(url)

    if html:
        soup = BeautifulSoup(html, 'html.parser')
        table = soup.find('table', {'id': 'passing'})
        tbody = table.find('tbody') if table else None
        if tbody:
            for row in tbody.find_all('tr'):
                # Repeated header rows inside tbody carry class="thead".
                if row.get('class') and 'thead' in row.get('class'):
                    continue

                cells = row.find_all(['th', 'td'])
                if len(cells) < 15:
                    continue

                try:
                    player_cell = cells[1]
                    player_link = player_cell.find('a')
                    player_name = player_link.text if player_link else player_cell.text

                    # Cell indices assume the current P-F-R passing layout --
                    # TODO confirm against a fetched page if columns shift.
                    player_data = {
                        'playerName': player_name.strip(),
                        'league': 'nfl',
                        'season': season,
                        'position': 'QB',
                        'team': cells[3].text.strip() if len(cells) > 3 else None,
                        'gamesPlayed': safe_int(cells[5]),
                        'gamesStarted': safe_int(cells[6]),
                        'passCompletions': safe_int(cells[8]),
                        'passAttempts': safe_int(cells[9]),
                        'passYards': safe_int(cells[11]),
                        'passTouchdowns': safe_int(cells[12]),
                        'interceptions': safe_int(cells[14]),
                        'passerRating': safe_float(cells[22]) if len(cells) > 22 else None,
                        'source': 'pro-football-reference',
                    }

                    if player_data['playerName'] and player_data['gamesPlayed']:
                        players.append(player_data)

                except Exception:
                    # Malformed row -- skip it and keep scraping.
                    continue

    print(f"  Found {len(players)} NFL QBs with stats")

    # Rushing stats.  No extra sleep here: fetch_page already enforces
    # REQUEST_DELAY before every request (the old explicit sleep doubled
    # the delay for no benefit).
    url = f"https://www.pro-football-reference.com/years/{season}/rushing.htm"
    html = fetch_page(url)

    if html:
        soup = BeautifulSoup(html, 'html.parser')
        table = soup.find('table', {'id': 'rushing'})
        tbody = table.find('tbody') if table else None
        if tbody:
            for row in tbody.find_all('tr'):
                if row.get('class') and 'thead' in row.get('class'):
                    continue

                cells = row.find_all(['th', 'td'])
                if len(cells) < 10:
                    continue

                try:
                    player_cell = cells[1]
                    player_link = player_cell.find('a')
                    player_name = player_link.text if player_link else player_cell.text

                    # Cell indices assume the current P-F-R rushing layout --
                    # TODO confirm against a fetched page if columns shift.
                    player_data = {
                        'playerName': player_name.strip(),
                        'league': 'nfl',
                        'season': season,
                        'position': cells[4].text.strip() if len(cells) > 4 else 'RB',
                        'team': cells[3].text.strip() if len(cells) > 3 else None,
                        'gamesPlayed': safe_int(cells[5]),
                        'rushAttempts': safe_int(cells[7]),
                        'rushYards': safe_int(cells[8]),
                        'rushTouchdowns': safe_int(cells[9]),
                        'source': 'pro-football-reference',
                    }

                    if player_data['playerName'] and player_data['gamesPlayed']:
                        players.append(player_data)

                except Exception:
                    # Malformed row -- skip it and keep scraping.
                    continue

    print(f"  Total {len(players)} NFL players with stats")
    return players


def save_player_stats(conn, players):
    """Persist scraped season stats as PlayerGameMetric rows.

    Args:
        conn: Open psycopg2 connection (unused when players is empty).
        players: List of player stat dicts produced by the scrapers.

    Returns:
        int: Number of rows actually inserted.  Rows skipped by
        ON CONFLICT DO NOTHING are no longer counted (cur.rowcount is 0
        for them), so the count reflects real writes.
    """
    if not players:
        return 0

    saved = 0
    cur = conn.cursor()
    try:
        for player in players:
            try:
                # Upsert into PlayerSeasonStats or a new PlayerMetrics table
                cur.execute('''
                    INSERT INTO "PlayerGameMetric" (
                        "playerId", "league", "gameDate", "metricType", "metricValue", "raw"
                    )
                    VALUES (%s, %s, %s, %s, %s, %s)
                    ON CONFLICT DO NOTHING
                ''', (
                    # Derive a stable-ish id from the name; scrapers don't
                    # always supply a playerId.
                    player.get('playerName', '').lower().replace(' ', '_'),
                    player.get('league'),
                    datetime.now(timezone.utc),
                    'season_stats',
                    json.dumps(player),
                    json.dumps(player)
                ))
                saved += cur.rowcount
            except Exception:
                # One bad record must not abort the batch: roll back the
                # failed statement and continue with the next player.
                conn.rollback()
                continue
        conn.commit()
    finally:
        # Close the cursor even if commit raises.
        cur.close()
    return saved


def main():
    """Run every scraper, dump the results to JSON, and upsert to the DB."""
    print("=" * 60)
    print("SPORTS-REFERENCE PLAYER STATS SCRAPER")
    print(f"Time: {datetime.now(timezone.utc).isoformat()}")
    print("=" * 60)

    db_url = load_db_url()

    # (scraper, season) pairs, run in order; MLB/NFL trail by a year.
    jobs = [
        (scrape_basketball_reference_player_stats, '2025'),
        (scrape_hockey_reference_player_stats, '2025'),
        (scrape_baseball_reference_player_stats, '2024'),  # 2024 season data
        (scrape_profootball_reference_player_stats, '2024'),  # 2024-25 season
    ]
    all_players = []
    for scraper, season in jobs:
        all_players.extend(scraper(season))

    print(f"\n{'='*60}")
    print(f"TOTAL PLAYERS SCRAPED: {len(all_players)}")

    # Save to JSON for now
    output_path = '/var/www/html/eventheodds/data/sportsreference_stats.json'
    try:
        import os
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        payload = {
            'players': all_players,
            'count': len(all_players),
            'timestamp': datetime.now(timezone.utc).isoformat(),
            'sources': ['basketball-reference', 'hockey-reference', 'baseball-reference', 'pro-football-reference']
        }
        with open(output_path, 'w') as f:
            json.dump(payload, f, indent=2)
        print(f"Saved to {output_path}")
    except Exception as e:
        print(f"Error saving: {e}")

    # Save to database if available
    if db_url:
        try:
            conn = psycopg2.connect(db_url)
            saved = save_player_stats(conn, all_players)
            conn.close()
            print(f"Saved {saved} records to database")
        except Exception as e:
            print(f"Database error: {e}")

    print("=" * 60)


# Script entry point: run all scrapers and persist the results.
if __name__ == '__main__':
    main()
