#!/usr/bin/env python3
"""
Scrape OddsPortal Half-Time Odds with Playwright
Extracts 1H/2H spread and total lines from OddsPortal.

OddsPortal uses JavaScript to load odds data dynamically,
requiring a headless browser for proper scraping.

Run: Every 4-6 hours via cron
"""
import json
import os
import sys
import time
import re
import random
import psycopg2
from datetime import datetime, timezone, timedelta
from dotenv import load_dotenv

# Add the scraper's virtualenv site-packages directory to the import path.
# It is inserted at the front of sys.path (only if not already present) so the
# third-party imports that follow are looked up there first.
# NOTE(review): the path is hard-coded to a python3.12 venv layout - confirm it
# still matches the deployed interpreter version.
VENV_PATH = '/var/www/html/eventheodds/.venv-scraper/lib/python3.12/site-packages'
if VENV_PATH not in sys.path:
    sys.path.insert(0, VENV_PATH)

from playwright.sync_api import sync_playwright

# Proxy configuration (residential for better success).
# main() passes this dict unchanged as the `proxy` argument of
# browser.new_context(), so the single browser context used for the whole run
# is routed through this SOCKS5 endpoint.
# NOTE(review): the address is hard-coded - confirm it is still the intended proxy.
PROXY_CONFIG = {
    'server': 'socks5://54.38.19.233:3028'
}

# League URLs on OddsPortal.
# Keys are the league codes stored in the database `league` column; values are
# the league landing pages from which individual game links are harvested.
LEAGUE_URLS = {
    'nba': 'https://www.oddsportal.com/basketball/usa/nba/',
    'nfl': 'https://www.oddsportal.com/american-football/usa/nfl/',
    'nhl': 'https://www.oddsportal.com/hockey/usa/nhl/',
    'ncaab': 'https://www.oddsportal.com/basketball/usa/ncaa/',
}

# User agents for rotation.
# main() picks one entry at random when it creates the browser context, so the
# user agent varies between runs of the script, not between requests.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0',
]


def load_db_url():
    """Return the sports database URL with any query string removed.

    Loads the project's .env file into the environment, reads
    SPORTS_DATABASE_URL (empty string when unset) and drops everything from
    the first '?' onwards.
    """
    load_dotenv('/var/www/html/eventheodds/.env')
    raw_url = os.environ.get('SPORTS_DATABASE_URL', '')
    # partition() yields the whole string as the head when no '?' is present.
    base_url, _sep, _query = raw_url.partition('?')
    return base_url


def random_delay(min_sec=2, max_sec=5):
    """Sleep for a random duration to avoid detection.

    The pause length is drawn uniformly from [min_sec, max_sec] seconds.
    """
    pause_seconds = random.uniform(min_sec, max_sec)
    time.sleep(pause_seconds)


def extract_game_urls(page, league_url):
    """Collect the individual game URLs linked from a league page.

    Navigates ``page`` to ``league_url``, waits a few seconds, then scans the
    rendered HTML with a regex (more reliable than selectors) and returns the
    absolute game URLs, de-duplicated, in order of first appearance.
    """
    print("  Loading league page...")
    page.goto(league_url, wait_until='domcontentloaded', timeout=60000)
    random_delay(3, 5)

    html = page.content()

    # Pattern: /sport/country/league/team1-team2-GAMEID/ where GAMEID is 8 alphanumeric chars
    # Example: /basketball/usa/nba/philadelphia-76ers-indiana-pacers-8UorYn6D/
    game_pattern = r'/(?:basketball|hockey|american-football)/[a-z]+/[a-z-]+/[a-z0-9-]+-[A-Za-z0-9]{8}/'

    absolute_urls = (
        'https://www.oddsportal.com' + path
        for path in re.findall(game_pattern, html)
    )
    # dict.fromkeys keeps the first occurrence of each URL and preserves order.
    return list(dict.fromkeys(absolute_urls))


def extract_half_time_odds(page, game_url, league):
    """Extract half-time odds from a single game page.

    Loads ``game_url`` in ``page``, reads the team names from the page title,
    tries to open a half-time market tab, and then gathers odds from two
    sources: the embedded ``window["__STATE__"]`` JSON and the visible odds
    cells in the DOM.

    Args:
        page: Playwright page used for navigation and DOM queries.
        game_url: Absolute URL of the game page.
        league: League code, forwarded to the extraction helpers.

    Returns:
        A list of odds dicts, each tagged with ``home_team``, ``away_team``
        and ``game_url``. The list is empty when the teams cannot be
        determined or the page fails to load; errors are printed, never raised.
    """
    results = []

    try:
        print(f"    Fetching: {game_url.split('/')[-2][:40]}...")
        page.goto(game_url, wait_until='networkidle', timeout=45000)
        random_delay(2, 3)

        # Extract team names from page
        teams = extract_teams_from_title(page.title())
        if not teams:
            return results

        home_team, away_team = teams

        # Look for half-time tab/section
        # OddsPortal typically has tabs for different markets
        half_time_selectors = [
            'text=1st Half',
            'text=1st half',
            'text=Half Time',
            '[data-market*="half"]',
            'a[href*="#1h"]',
        ]

        for selector in half_time_selectors:
            try:
                element = page.query_selector(selector)
                if element:
                    element.click()
                    random_delay(1, 2)
                    break
            except Exception:
                # Best effort: a selector that fails to resolve or click is
                # skipped. Catching Exception (not a bare except) keeps
                # Ctrl-C / SystemExit working.
                continue

        # Try to extract odds from the page
        page_content = page.content()

        # Look for odds data in embedded JSON. The object is decoded with
        # raw_decode(), which stops at the end of the JSON document, instead of
        # a non-greedy regex that would cut it short at the first "};" found
        # inside a string value.
        state_match = re.search(r'window\["__STATE__"\]\s*=\s*(?=\{)', page_content)
        if state_match:
            try:
                state_data, _end = json.JSONDecoder().raw_decode(
                    page_content[state_match.end():]
                )
                odds_data = extract_odds_from_state(state_data, league)
            except Exception as e:
                # Keep going with the DOM fallback, but leave a trace.
                print(f"      State parse failed: {str(e)[:50]}")
            else:
                for od in odds_data:
                    od['home_team'] = home_team
                    od['away_team'] = away_team
                    od['game_url'] = game_url
                results.extend(odds_data)

        # Also try extracting from visible odds cells
        odds_cells = page.query_selector_all('[class*="odds-"], [class*="table-odds"] td')
        visible_odds = extract_visible_odds(odds_cells, home_team, away_team, league)
        for od in visible_odds:
            # Tag DOM-derived rows the same way as state-derived rows.
            od.setdefault('game_url', game_url)
        results.extend(visible_odds)

    except Exception as e:
        print(f"      Error: {str(e)[:50]}")

    return results


def extract_teams_from_title(title):
    """Extract team names from a page title.

    Expected title shape: "Team A - Team B | Date | OddsPortal". Only the part
    before the first "|" is considered. The first name is treated as the away
    team and the second as the home team.

    The separator is preferably a hyphen surrounded by whitespace, so that
    hyphenated names such as "Arkansas-Pine Bluff" stay intact. Titles without
    a spaced separator fall back to splitting on the first bare hyphen.

    Args:
        title: Page title string (may be empty or None).

    Returns:
        ``(home, away)`` tuple of stripped names, or ``None`` when two
        non-empty names cannot be found.
    """
    if not title:
        return None

    head = title.split('|', 1)[0]

    # Preferred form: "Away - Home" with whitespace around the separator.
    parts = re.split(r'\s+-\s+', head)
    if len(parts) >= 2:
        away, home = parts[0].strip(), parts[1].strip()
    else:
        # Fallback for compact titles like "Away-Home".
        match = re.match(r'([^-|]+)\s*-\s*([^-|]+)', head)
        if not match:
            return None
        away, home = match.group(1).strip(), match.group(2).strip()

    if not home or not away:
        return None
    return (home, away)


def extract_odds_from_state(state_data, league):
    """Extract half-time market entries from OddsPortal's __STATE__ object.

    Walks ``state_data`` depth-first (dicts, and dicts directly inside lists)
    down to a nesting depth of 10. Every dict value whose key contains "odds"
    or "market" and whose string form mentions "1h" or "half" is reported.

    Returns:
        A list of ``{'period': '1h', 'market': 'spread'|'total', 'data': ...}``
        dicts in traversal order. ``league`` is accepted but not used.
    """
    def iter_half_markets(node, level=0):
        if level > 10 or not isinstance(node, dict):
            return

        for name, child in node.items():
            if isinstance(child, dict):
                lowered_name = name.lower()
                if 'odds' in lowered_name or 'market' in lowered_name:
                    rendered = str(child).lower()
                    if '1h' in rendered or 'half' in rendered:
                        # Found half-time market
                        market = 'spread' if 'spread' in lowered_name else 'total'
                        yield {
                            'period': '1h',
                            'market': market,
                            'data': child,
                        }
                yield from iter_half_markets(child, level + 1)
            elif isinstance(child, list):
                for entry in child:
                    if isinstance(entry, dict):
                        yield from iter_half_markets(entry, level + 1)

    return list(iter_half_markets(state_data))


def extract_visible_odds(odds_cells, home_team, away_team, league):
    """Extract half-time odds from visible DOM elements.

    For each cell, the first odds-looking token in its text is taken (decimal
    such as "2.50", or American such as "+150" / "-110"). The cell is kept
    only when the text of its enclosing table row mentions "half" or "1h";
    the row text also decides whether the market is "spread" or "total".

    Args:
        odds_cells: Iterable of element handles exposing ``inner_text()`` and
            ``evaluate(js)``.
        home_team: Home team name copied into every result.
        away_team: Away team name copied into every result.
        league: League code; accepted but not used.

    Returns:
        A list of dicts with keys ``period``, ``market``, ``odds`` (the matched
        text), ``home_team`` and ``away_team``. Cells that raise while being
        read are skipped.
    """
    # Decimal odds are tried first so "2.50" is captured whole rather than as
    # its trailing "50"; plain 2-4 digit (optionally signed) values follow.
    odds_pattern = re.compile(r'[+-]?\d+\.\d+|[+-]?\d{2,4}')
    results = []

    for cell in odds_cells:
        try:
            odds_match = odds_pattern.search(cell.inner_text())
            if not odds_match:
                continue
            # Try to determine market type and side from the enclosing row
            row_text = cell.evaluate('el => el.closest("tr")?.innerText || ""').lower()
        except Exception:
            # A detached or unreadable cell must not abort the other cells.
            continue

        if 'half' not in row_text and '1h' not in row_text:
            continue

        results.append({
            'period': '1h',
            'market': 'spread' if 'spread' in row_text else 'total',
            'odds': odds_match.group(0),
            'home_team': home_team,
            'away_team': away_team,
        })

    return results


def store_half_lines(conn, league, odds_data):
    """Store extracted half-line odds in the "GameHalfLine" table.

    Each entry is inserted inside its own savepoint, so a row that fails does
    not abort the transaction for the remaining rows. Rows skipped by
    ``ON CONFLICT DO NOTHING`` are not counted. The transaction is committed
    once at the end and the cursor is always closed.

    Args:
        conn: Open DB-API connection (psycopg2 in this script).
        league: League code written to the ``league`` column.
        odds_data: Iterable of odds dicts. Keys read: ``home_team``,
            ``away_team``, ``period``, ``market``, ``line_value``, ``odds``;
            missing keys fall back to defaults / NULL.

    Returns:
        Number of rows actually inserted.
    """
    # No game date is available from the scraped data, so every row in this
    # batch is stamped with "now + 1 day" (UTC).
    game_date = datetime.now(timezone.utc) + timedelta(days=1)
    stored = 0

    cur = conn.cursor()
    try:
        for od in odds_data:
            cur.execute('SAVEPOINT half_line_row')
            try:
                cur.execute('''
                INSERT INTO "GameHalfLine"
                (league, "gameDate", "homeTeam", "awayTeam",
                 period, market, "lineValue", "bookOdds", bookmaker,
                 "createdAt", "updatedAt")
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, 'oddsportal', NOW(), NOW())
                ON CONFLICT DO NOTHING
            ''', (
                    league,
                    game_date,
                    od.get('home_team', 'Unknown'),
                    od.get('away_team', 'Unknown'),
                    od.get('period', '1h'),
                    od.get('market', 'spread'),
                    od.get('line_value'),
                    od.get('odds'),
                ))
                # rowcount is 0 when ON CONFLICT DO NOTHING skipped the row.
                inserted = cur.rowcount
            except Exception as e:
                # Undo only this row; earlier inserts in the batch survive.
                cur.execute('ROLLBACK TO SAVEPOINT half_line_row')
                cur.execute('RELEASE SAVEPOINT half_line_row')
                print(f"      Skipped row: {str(e)[:80]}")
                continue

            cur.execute('RELEASE SAVEPOINT half_line_row')
            if inserted > 0:
                stored += inserted

        conn.commit()
    finally:
        cur.close()

    return stored


def main():
    """Run one scrape pass over every configured league.

    Connects to the database, launches a headless Chromium context through the
    configured proxy, and for each league visits up to 10 game pages, storing
    any half-time odds found. A failure in one league is printed and does not
    stop the remaining leagues. The browser and the database connection are
    released even when an unexpected error escapes.
    """
    print("=" * 60)
    print("SCRAPE ODDSPORTAL HALF-TIME ODDS (Playwright)")
    print(f"Time: {datetime.now(timezone.utc).isoformat()}")
    print("=" * 60)

    db_url = load_db_url()
    if not db_url:
        print("ERROR: No database URL found")
        return

    conn = psycopg2.connect(db_url)
    total_stored = 0

    try:
        with sync_playwright() as p:
            # Launch browser with proxy
            browser = p.chromium.launch(
                headless=True,
                args=[
                    '--disable-blink-features=AutomationControlled',
                    '--disable-dev-shm-usage',
                    '--no-sandbox',
                ]
            )

            try:
                context = browser.new_context(
                    viewport={'width': 1920, 'height': 1080},
                    user_agent=random.choice(USER_AGENTS),
                    proxy=PROXY_CONFIG,
                )

                # Disable webdriver detection
                context.add_init_script("""
            Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
        """)

                page = context.new_page()

                for league, league_url in LEAGUE_URLS.items():
                    print(f"\n{league.upper()}: Scraping OddsPortal...")

                    try:
                        # Get game URLs
                        game_urls = extract_game_urls(page, league_url)
                        print(f"  Found {len(game_urls)} games")

                        # Process each game (limit to avoid rate limiting)
                        for game_url in game_urls[:10]:
                            odds_data = extract_half_time_odds(page, game_url, league)

                            if odds_data:
                                stored = store_half_lines(conn, league, odds_data)
                                total_stored += stored
                                print(f"      Stored {stored} half-time odds")

                            random_delay(2, 4)

                    except Exception as e:
                        print(f"  Error: {e}")

                    random_delay(3, 5)
            finally:
                # Close the browser even if context setup or a league loop
                # raised something the per-league handler does not cover.
                browser.close()
    finally:
        # Never leave the database connection open on an error path.
        conn.close()

    print("\n" + "=" * 60)
    print(f"TOTAL: {total_stored} half-time odds scraped from OddsPortal")
    print("=" * 60)


# Script entry point: run the scraper only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
