#!/usr/bin/env python3
"""
Scrape DraftKings Half/Quarter Lines with Playwright
Uses headless browser to bypass API blocking.

Run: Every 4 hours during betting hours
Requires: .venv-scraper with playwright installed
"""
import json
import os
import sys
import time
import re
from datetime import datetime, timezone

# Make the scraper's dedicated virtualenv importable even when run by the
# system Python (e.g. from cron): psycopg2 and playwright are installed
# there, not in the system site-packages.
VENV_PATH = '/var/www/html/eventheodds/.venv-scraper/lib/python3.12/site-packages'
if VENV_PATH not in sys.path:
    sys.path.insert(0, VENV_PATH)

import psycopg2
from playwright.sync_api import sync_playwright

# DraftKings league landing pages.  Keys are the league codes stored in the
# `league` column of "GameHalfLine".
SPORT_URLS = {
    'nba': 'https://sportsbook.draftkings.com/leagues/basketball/nba',
    'nfl': 'https://sportsbook.draftkings.com/leagues/football/nfl',
    'nhl': 'https://sportsbook.draftkings.com/leagues/hockey/nhl',
    'ncaab': 'https://sportsbook.draftkings.com/leagues/basketball/ncaab',
}


def load_db_url():
    """Return the sports database connection URL, stripped of query params.

    Prefers the SPORTS_DATABASE_URL entry in the project .env file and
    falls back to the environment variable of the same name.  Returns an
    empty string when neither source provides a value.
    """
    env_path = '/var/www/html/eventheodds/.env'
    key_prefix = 'SPORTS_DATABASE_URL='
    try:
        with open(env_path, 'r') as env_file:
            for raw_line in env_file:
                if raw_line.startswith(key_prefix):
                    value = raw_line.split('=', 1)[1].strip()
                    # Drop any "?sslmode=..." style query suffix.
                    return value.split('?')[0]
    except FileNotFoundError:
        pass
    fallback = os.environ.get('SPORTS_DATABASE_URL', '')
    return fallback.split('?')[0]


def ensure_table(cur):
    """Create the "GameHalfLine" table if it does not already exist.

    Called once at startup with a live psycopg2 cursor; the caller is
    responsible for committing the transaction.

    NOTE(review): the INSERT in scrape_dk_league uses ON CONFLICT DO
    NOTHING, but this table defines no UNIQUE constraint beyond the
    serial primary key, so that clause can never match and duplicate
    rows accumulate on every run — confirm whether a unique index over
    (league, teams, period, market, side, bookmaker) is intended.
    """
    # CREATE TABLE IF NOT EXISTS will NOT alter an existing table; schema
    # changes require a manual migration.
    cur.execute('''
        CREATE TABLE IF NOT EXISTS "GameHalfLine" (
            id BIGSERIAL PRIMARY KEY,
            league VARCHAR(50) NOT NULL,
            "gameId" BIGINT,
            "gameDate" TIMESTAMP,
            "homeTeam" VARCHAR(100),
            "awayTeam" VARCHAR(100),
            period VARCHAR(10) NOT NULL,
            market VARCHAR(50) NOT NULL,
            side VARCHAR(50),
            "lineValue" FLOAT,
            "bookOdds" INT,
            bookmaker VARCHAR(50) DEFAULT 'draftkings',
            "createdAt" TIMESTAMP DEFAULT NOW(),
            "updatedAt" TIMESTAMP DEFAULT NOW()
        )
    ''')


def scrape_dk_league(page, league, conn):
    """Scrape half/quarter lines for one league using Playwright.

    Navigates to the league landing page, collects unique event links,
    then visits each event (capped at 15 per run to limit rate) and
    inserts any half/quarter markets found into "GameHalfLine".

    Args:
        page:   an open Playwright page object.
        league: league code; must be a key of SPORT_URLS.
        conn:   open psycopg2 connection (committed per event here).

    Returns:
        Number of rows actually inserted (ON CONFLICT skips excluded).
    """
    url = SPORT_URLS.get(league)
    if not url:
        print(f"  Unknown league: {league}")
        return 0

    print(f"\n{league.upper()}: Navigating to {url}")

    try:
        # Navigate to the league page and let client-side JS settle.
        page.goto(url, wait_until='networkidle', timeout=60000)
        time.sleep(3)  # Wait for JS to fully render

        # Collect unique event URLs, preserving discovery order.
        event_urls = []
        for event in page.query_selector_all('a[href*="/event/"]'):
            href = event.get_attribute('href')
            if href and '/event/' in href:
                if not href.startswith('http'):
                    href = 'https://sportsbook.draftkings.com' + href
                if href not in event_urls:
                    event_urls.append(href)

        print(f"  Found {len(event_urls)} events")

        total_lines = 0
        cur = conn.cursor()
        try:
            # Process each event (limit to avoid rate limiting).
            for event_url in event_urls[:15]:
                try:
                    print(f"  Scraping: {event_url.split('/')[-1][:50]}...")
                    page.goto(event_url, wait_until='networkidle', timeout=30000)
                    time.sleep(2)

                    content = page.content()

                    # Teams come from the <title>; fall back to DOM cells.
                    teams = extract_teams(page.title())
                    if not teams:
                        team_elements = page.query_selector_all('[class*="event-cell__name"]')
                        if len(team_elements) >= 2:
                            teams = (team_elements[0].inner_text(), team_elements[1].inner_text())
                    if not teams:
                        continue

                    home_team, away_team = teams

                    # Look for half/quarter tabs and markets.
                    half_quarter_data = extract_half_quarter_markets(page, content)

                    inserted = 0
                    insert_failed = False
                    for market_data in half_quarter_data:
                        try:
                            cur.execute('''
                                INSERT INTO "GameHalfLine"
                                (league, "gameDate", "homeTeam", "awayTeam",
                                 period, market, side, "lineValue", "bookOdds",
                                 bookmaker, "createdAt", "updatedAt")
                                VALUES (%s, NOW(), %s, %s, %s, %s, %s, %s, %s,
                                        'draftkings', NOW(), NOW())
                                ON CONFLICT DO NOTHING
                            ''', (
                                league,
                                home_team,
                                away_team,
                                market_data['period'],
                                market_data['market'],
                                market_data['side'],
                                market_data['line_value'],
                                market_data['odds'],
                            ))
                            # rowcount is 0 when ON CONFLICT skipped the row,
                            # so count rows actually written.
                            inserted += max(cur.rowcount, 0)
                        except Exception as e:
                            # A failed INSERT aborts the whole psycopg2
                            # transaction (the old silent `pass` left every
                            # later insert failing too) — roll back and move
                            # on to the next event.
                            conn.rollback()
                            insert_failed = True
                            print(f"    Insert failed: {str(e)[:50]}")
                            break

                    if not insert_failed:
                        conn.commit()
                        total_lines += inserted

                    time.sleep(1)  # Rate limiting

                except Exception as e:
                    print(f"    Error: {str(e)[:50]}")
                    continue
        finally:
            # Always release the cursor, even if an event blows up.
            cur.close()

        print(f"  Stored {total_lines} half/quarter lines for {league}")
        return total_lines

    except Exception as e:
        print(f"  Error scraping {league}: {e}")
        return 0


def extract_teams(title):
    """Extract a (home, away) team pair from a DK event page title.

    Handles "Away @ Home - ..." (DK lists the home side second) and
    "Team A vs Team B - ..." patterns.  Names are truncated to 50 chars
    to fit the VARCHAR columns.  Returns None when no separator is found.
    """
    # Case-insensitive ' vs ' split that preserves the original casing.
    # (The previous code split title.lower(), so this branch returned
    # lower-cased names while the '@' branch kept the original case.)
    vs_parts = re.split(r' vs ', title, flags=re.IGNORECASE)
    if len(vs_parts) >= 2:
        return (vs_parts[0].strip()[:50], vs_parts[1].split(' -')[0].strip()[:50])
    if ' @ ' in title:
        parts = title.split(' @ ')
        # "Away @ Home - ..." -> (home, away)
        return (parts[1].split(' -')[0].strip()[:50], parts[0].strip()[:50])
    return None


def extract_half_quarter_markets(page, content):
    """Extract half/quarter/period market rows from an event page.

    Two sources are tried:
      1. JSON embedded in <script> tags (__NEXT_DATA__ / eventGroup blobs),
         delegated to parse_dk_json_data().
      2. Visible odds cells in the DOM, classified by the period named in
         their enclosing market/subcategory element.

    `content` (raw page HTML) is accepted for interface compatibility but
    is currently unused.

    Returns:
        List of dicts with keys: period, market, side, line_value, odds.
    """
    markets = []

    # Regexes mapping human-readable period labels to our period codes.
    period_patterns = {
        '1h': [r'1st\s*half', r'first\s*half'],
        '2h': [r'2nd\s*half', r'second\s*half'],
        '1q': [r'1st\s*quarter', r'first\s*quarter', r'1st\s*qtr'],
        '2q': [r'2nd\s*quarter', r'second\s*quarter', r'2nd\s*qtr'],
        '3q': [r'3rd\s*quarter', r'third\s*quarter', r'3rd\s*qtr'],
        '4q': [r'4th\s*quarter', r'fourth\s*quarter', r'4th\s*qtr'],
        'p1': [r'1st\s*period', r'first\s*period'],
        'p2': [r'2nd\s*period', r'second\s*period'],
        'p3': [r'3rd\s*period', r'third\s*period'],
    }

    # Source 1: JSON blobs DraftKings embeds in script tags
    # (__NEXT_DATA__ or objects mentioning "eventGroup").
    for script in page.query_selector_all('script'):
        try:
            script_content = script.inner_text()
            if '__NEXT_DATA__' in script_content or 'eventGroup' in script_content:
                json_match = re.search(r'\{[\s\S]*"eventGroup"[\s\S]*\}', script_content)
                if json_match:
                    try:
                        data = json.loads(json_match.group())
                        markets.extend(parse_dk_json_data(data))
                    except Exception:
                        # Malformed/partial JSON: fall through to the DOM pass.
                        pass
        except Exception:
            # Detached node or similar; skip this script tag.
            continue

    # Source 2: visible DOM odds cells with period context.
    odds_cells = page.query_selector_all('[class*="odds-cell"], [class*="outcome"], [class*="sportsbook-outcome"]')

    # Period context is sticky: a cell whose parent names no period inherits
    # the period of the previous matching cell (cells are grouped under one
    # period heading).
    current_period = None

    for cell in odds_cells:
        try:
            parent_text = cell.evaluate('el => el.closest("[class*=subcategory], [class*=market]")?.innerText || ""')

            # Determine period from the parent market/subcategory text.
            for period, patterns in period_patterns.items():
                if any(re.search(pat, parent_text, re.IGNORECASE) for pat in patterns):
                    current_period = period

            if not current_period:
                continue

            cell_text = cell.inner_text()

            # American odds: 3+ digits, but never the integer part of a
            # decimal line value (the old pattern read 220 out of "220.5").
            odds_match = re.search(r'([+-]?\d{3,})(?!\.?\d)', cell_text)
            odds = int(odds_match.group(1)) if odds_match else None

            # The first number in the cell is taken as the line value
            # (e.g. -3.5 spread or 220.5 total).
            line_match = re.search(r'([+-]?\d+\.?\d*)', cell_text)
            line_value = float(line_match.group(1)) if line_match else None

            # Classify the market type and side from the surrounding text.
            market_type = 'unknown'
            side = 'unknown'

            if 'spread' in parent_text.lower() or 'handicap' in parent_text.lower():
                market_type = 'spread'
                side = 'home' if '+' in cell_text[:5] else 'away'
            elif 'total' in parent_text.lower() or 'over' in cell_text.lower() or 'under' in cell_text.lower():
                market_type = 'total'
                side = 'over' if 'over' in cell_text.lower() or cell_text.startswith('O') else 'under'
            elif 'money' in parent_text.lower() or 'winner' in parent_text.lower():
                market_type = 'moneyline'

            if odds and market_type != 'unknown':
                markets.append({
                    'period': current_period,
                    'market': market_type,
                    'side': side,
                    'line_value': line_value,
                    'odds': odds,
                })

        except Exception:
            continue

    return markets


def parse_dk_json_data(data):
    """Recursively walk a DraftKings JSON blob for half/quarter lines.

    Looks for dicts whose 'name' mentions a half/quarter/period and that
    carry an 'outcomes' list; each outcome with an American odds value
    becomes one market row.  The period is inherited down the tree once
    detected.

    Returns:
        List of dicts with keys: period, market, side, line_value, odds.
    """
    markets = []

    def extract_from_dict(d, period=None):
        if not isinstance(d, dict):
            return

        # `d.get('name', '')` only covers a *missing* key; an explicit
        # null name would crash .lower(), so coalesce None to ''.
        name = (d.get('name') or '').lower()
        if '1st half' in name or 'first half' in name:
            period = '1h'
        elif '2nd half' in name or 'second half' in name:
            period = '2h'
        elif '1st quarter' in name or '1st qtr' in name:
            period = '1q'
        elif '2nd quarter' in name or '2nd qtr' in name:
            period = '2q'
        elif '3rd quarter' in name or '3rd qtr' in name:
            period = '3q'
        elif '4th quarter' in name or '4th qtr' in name:
            period = '4q'
        elif '1st period' in name:
            period = 'p1'
        elif '2nd period' in name:
            period = 'p2'
        elif '3rd period' in name:
            period = 'p3'

        # Collect outcomes/selections once a period context is known.
        if period and isinstance(d.get('outcomes'), list):
            for outcome in d['outcomes']:
                if not isinstance(outcome, dict):
                    continue
                odds_am = outcome.get('oddsAmerican')
                if odds_am:
                    try:
                        markets.append({
                            'period': period,
                            # Market type comes from the containing node's name.
                            'market': 'spread' if 'spread' in name else ('total' if 'total' in name else 'moneyline'),
                            'side': outcome.get('label', 'unknown'),
                            'line_value': outcome.get('line'),
                            # oddsAmerican may be "+120"/"-110" (str) or already
                            # an int; the old str-only .replace() silently
                            # dropped int-valued odds.
                            'odds': int(str(odds_am).replace('+', '')),
                        })
                    except (TypeError, ValueError):
                        # Unparseable odds: skip this outcome only.
                        pass

        # Recurse into nested dicts and lists of dicts.
        for value in d.values():
            if isinstance(value, dict):
                extract_from_dict(value, period)
            elif isinstance(value, list):
                for item in value:
                    if isinstance(item, dict):
                        extract_from_dict(item, period)

    extract_from_dict(data)
    return markets


# Proxy configuration for the Playwright browser launch.
# SECURITY NOTE(review): these credentials were hard-coded in source; they
# remain as fallback defaults for backward compatibility, but they should be
# rotated and supplied via environment variables instead.
PROXY_CONFIG = {
    'server': os.environ.get('DK_PROXY_SERVER', 'socks5://82.163.172.166:50101'),
    'username': os.environ.get('DK_PROXY_USERNAME', 'goDRdyJf'),
    'password': os.environ.get('DK_PROXY_PASSWORD', '1aIFz9LNOW'),
}


def main():
    """Entry point: ensure the table exists, then scrape every league.

    Opens one DB connection and one headless browser for the whole run;
    both are now released via try/finally even when a league raises
    (previously they leaked on any exception).
    """
    print("=" * 60)
    print("SCRAPE DRAFTKINGS HALF/QUARTER LINES (Playwright)")
    print(f"Time: {datetime.now(timezone.utc).isoformat()}")
    print("=" * 60)

    db_url = load_db_url()
    if not db_url:
        print("ERROR: No database URL found")
        return

    conn = psycopg2.connect(db_url)
    try:
        # Make sure the destination table exists before scraping.
        cur = conn.cursor()
        try:
            ensure_table(cur)
            conn.commit()
        finally:
            cur.close()

        total_lines = 0

        with sync_playwright() as p:
            # Headless Chromium behind the SOCKS proxy; the blink flag hides
            # the navigator.webdriver automation hint from basic bot checks.
            browser = p.chromium.launch(
                headless=True,
                args=['--disable-blink-features=AutomationControlled'],
                proxy=PROXY_CONFIG,
            )
            try:
                # Realistic viewport + UA to look like a normal desktop session.
                context = browser.new_context(
                    viewport={'width': 1920, 'height': 1080},
                    user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                )
                page = context.new_page()

                # Scrape each league with a pause between them.
                for league in ['nba', 'nfl', 'nhl', 'ncaab']:
                    try:
                        total_lines += scrape_dk_league(page, league, conn)
                    except Exception as e:
                        print(f"Error with {league}: {e}")
                    time.sleep(2)
            finally:
                browser.close()
    finally:
        conn.close()

    print("\n" + "=" * 60)
    print(f"TOTAL: {total_lines} half/quarter lines scraped")
    print("=" * 60)


# Script entry point (intended to be run directly, e.g. from cron).
if __name__ == '__main__':
    main()
