#!/usr/bin/env python3
"""
MMA/UFC Injuries Scraper
Fetches injury data from MMA news sources and validates against known fighters.
Writes to SportsDB PlayerInjury table.
"""
import requests
import json
import re
import os
import time
import psycopg2
from datetime import datetime, timezone
from typing import List, Dict, Set, Optional


def load_db_url() -> str:
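    """Read SPORTS_DATABASE_URL from a .env file or the environment.

    Query params (e.g. a Prisma-style '?schema=public') are stripped because
    libpq rejects connection options it does not recognize.
    """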
    env_paths = [
        '/var/www/html/eventheodds/.env',
        os.path.join(os.path.dirname(__file__), '..', '.env'),
    ]
    for env_path in env_paths:
        try:
            with open(env_path, 'r') as f:
                for line in f:
                    if line.startswith('SPORTS_DATABASE_URL='):
                        return line.split('=', 1)[1].strip().split('?')[0]
        except FileNotFoundError:
            continue
    return os.environ.get('SPORTS_DATABASE_URL', '').split('?')[0]


HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
}
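
# Shape of the injury dicts produced by the fetchers below (injuryType and
# description are optional; ingest_to_sportsdb() supplies defaults):
#   {'playerName': 'Jon Jones', 'league': 'mma', 'status': 'Out',
#    'injuryType': 'Unknown', 'description': '...', 'source': 'espn'}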

# Known UFC fighters cache (loaded from DB)
KNOWN_FIGHTERS: Set[str] = set()


def load_known_fighters() -> Set[str]:
    """Load known UFC fighter names from database for validation"""
    global KNOWN_FIGHTERS
    if KNOWN_FIGHTERS:
        return KNOWN_FIGHTERS

    db_url = load_db_url()
    if not db_url:
        return set()

    try:
        conn = psycopg2.connect(db_url)
        cur = conn.cursor()
        cur.execute('SELECT LOWER(name) FROM "UfcFighter"')
        KNOWN_FIGHTERS = {row[0] for row in cur.fetchall()}
        cur.close()
        conn.close()
        print(f"  Loaded {len(KNOWN_FIGHTERS)} known UFC fighters for validation")
        return KNOWN_FIGHTERS
    except Exception as e:
        print(f"  Warning: Could not load fighters from DB: {e}")
        return set()


def is_valid_fighter_name(name: str) -> bool:
    """
    Validate that a name looks like a real fighter name.
    - Must have first and last name (2+ words)
    - Each word must be capitalized properly
    - No common non-name words
    - Optionally validate against known fighters DB
    """
    if not name or len(name) < 5:
        return False

    # Must have at least 2 words
    words = name.split()
    if len(words) < 2:
        return False

    # Each word should start with uppercase
    for word in words:
        if not word[0].isupper():
            return False
        # Should be mostly letters
        if not re.match(r'^[A-Za-z\'-]+$', word):
            return False

    # Blacklist common false positive words
    blacklist = {
        'the', 'and', 'for', 'with', 'from', 'this', 'that', 'will', 'has',
        'have', 'been', 'was', 'were', 'are', 'out', 'due', 'after', 'before',
        'fight', 'bout', 'card', 'event', 'ufc', 'news', 'breaking', 'report',
        'injury', 'injured', 'pulled', 'pulls', 'withdrew', 'withdrawal',
        'main', 'co-main', 'preliminary', 'early', 'late', 'title', 'interim',
        'champion', 'contender', 'former', 'current', 'next', 'last', 'first',
        'all', 'calling', 'blaine', 'freaks', 'called', 'him', 'spells',
        'johnson', 'shut', 'gaethje',  # Known garbage from previous scrapes
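        # NOTE: 'johnson' and 'gaethje' are real UFC surnames too; keeping them
        # here suppresses old garbage but also rejects legitimate fighters
        # with those names.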
    }

    name_lower = name.lower()
    for word in words:
        if word.lower() in blacklist:
            return False

    # If we have known fighters, validate against that
    known = load_known_fighters()
    if known:
        # Check if this name (or close variant) exists in DB
        if name_lower in known:
            return True
        # Also check without middle names
        first_last = f"{words[0]} {words[-1]}".lower()
        if first_last in known:
            return True
        # Check partial matches (first name + any last name)
        first_name = words[0].lower()
        matching = [f for f in known if f.startswith(first_name + ' ')]
        if matching:
            # Permissive fuzzy match: accept if the first name alone matches a
            # known fighter (trades precision for recall on name variants)
            return True
        # If we have a fighters DB and name isn't found, reject it
        # This prevents garbage data
        return False

    # No DB validation available, use heuristics
    # Name should be reasonable length
    if len(name) > 40:
        return False

    return True


def extract_fighter_names_from_text(text: str) -> List[str]:
    """
    Extract potential fighter names from text using proper patterns.
    Returns validated fighter names only.
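
    Example (heuristic path, when no fighters DB is available):
        extract_fighter_names_from_text("Jon Jones is out of UFC 295")
        -> ["Jon Jones"]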
    """
    names = []

    # Pattern 1: "FirstName LastName" at start of sentence (common in headlines)
    # Must be proper capitalization
    pattern1 = re.findall(r'(?:^|[.!?]\s+)([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)', text)
    names.extend(pattern1)

    # Pattern 2: Names followed by injury keywords
    pattern2 = re.findall(
        r'([A-Z][a-z]+\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)\s+(?:is\s+)?'
        r'(?:out|injured|suffers?|sustained?|underwent|sidelined|forced out|withdraws?|pulled)',
        text
    )
    names.extend(pattern2)

    # Pattern 3: Names mentioned with "vs" or "versus" (fight context)
    pattern3 = re.findall(
        r'([A-Z][a-z]+\s+[A-Z][a-z]+)\s+(?:vs\.?|versus)\s+([A-Z][a-z]+\s+[A-Z][a-z]+)',
        text
    )
    for match in pattern3:
        names.extend(match)

    # Validate and dedupe
    validated = []
    seen = set()
    for name in names:
        name = name.strip()
        name_lower = name.lower()
        if name_lower not in seen and is_valid_fighter_name(name):
            seen.add(name_lower)
            validated.append(name)

    return validated


def fetch_mmafighting_injuries() -> List[Dict]:
    """Fetch injuries from MMA Fighting RSS/news"""
    print("\n=== MMA FIGHTING INJURIES ===\n")

    injuries = []

    # Try RSS feed first (more structured)
    rss_url = 'https://www.mmafighting.com/rss/current'
    try:
        resp = requests.get(rss_url, headers=HEADERS, timeout=30)
        print(f"  MMA Fighting RSS: HTTP {resp.status_code}")

        if resp.status_code == 200:
            # Parse RSS items for injury-related articles
            items = re.findall(r'<item>(.*?)</item>', resp.text, re.DOTALL)

            for item in items:
                title_match = re.search(r'<title>(.*?)</title>', item)

                if title_match:
                    title = re.sub(r'<!\[CDATA\[(.*?)\]\]>', r'\1', title_match.group(1))
                    title = re.sub(r'<[^>]+>', '', title).strip()

                    # Check if injury-related
                    injury_keywords = ['injury', 'injured', 'out of', 'withdrew', 'pulls out',
                                      'surgery', 'sidelined', 'forced out', 'medical']
                    if any(kw in title.lower() for kw in injury_keywords):
                        fighters = extract_fighter_names_from_text(title)
                        for fighter in fighters:
                            injuries.append({
                                'playerName': fighter,
                                'league': 'mma',
                                'status': 'Out',
                                'injuryType': 'Unknown',
                                'description': title[:200],
                                'source': 'mmafighting',
                            })
    except Exception as e:
        print(f"  MMA Fighting error: {e}")

    print(f"    Found {len(injuries)} potential injuries")
    return injuries


def fetch_espn_mma_injuries() -> List[Dict]:
    """Fetch injuries from ESPN MMA section"""
    print("\n=== ESPN MMA INJURIES ===\n")

    injuries = []
    url = 'https://www.espn.com/mma/'

    try:
        resp = requests.get(url, headers=HEADERS, timeout=30)
        print(f"  ESPN MMA: HTTP {resp.status_code}")

        if resp.status_code == 200:
            # Look for news headlines
            headlines = re.findall(r'<h[123][^>]*>(.*?)</h[123]>', resp.text, re.DOTALL)

            for headline in headlines:
                text = re.sub(r'<[^>]+>', '', headline).strip()
                injury_keywords = ['injury', 'injured', 'out', 'withdrew', 'surgery', 'sidelined']

                # Whole-word match so 'out' does not also fire on 'about' or 'without'
                if any(re.search(r'\b' + re.escape(kw) + r'\b', text, re.IGNORECASE)
                       for kw in injury_keywords):
                    fighters = extract_fighter_names_from_text(text)
                    for fighter in fighters:
                        injuries.append({
                            'playerName': fighter,
                            'league': 'mma',
                            'status': 'Out',
                            'injuryType': 'Unknown',
                            'description': text[:200],
                            'source': 'espn',
                        })
    except Exception as e:
        print(f"  ESPN error: {e}")

    print(f"    Found {len(injuries)} potential injuries")
    return injuries


def fetch_ufc_news_injuries() -> List[Dict]:
    """Fetch injuries from UFC.com news API"""
    print("\n=== UFC.COM INJURIES ===\n")

    injuries = []

    # UFC has a news API
    api_url = 'https://www.ufc.com/views/ajax'
    params = {
        'view_name': 'all_news',
        'view_display_id': 'page_news_all',
    }
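    # UFC.com's /views/ajax endpoint is Drupal Views AJAX; it typically returns
    # a JSON list of "command" objects whose 'data' fields carry rendered HTML.
    # That is the structure the parser below assumes.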

    try:
        resp = requests.get(api_url, params=params, headers=HEADERS, timeout=30)
        print(f"  UFC.com API: HTTP {resp.status_code}")

        if resp.status_code == 200:
            # Parse JSON response if available
            try:
                data = resp.json()
                # Extract news items
                for item in data:
                    if isinstance(item, dict) and 'data' in item:
                        html = item.get('data', '')
                        titles = re.findall(r'<h[23][^>]*>(.*?)</h[23]>', html, re.DOTALL)

                        for title in titles:
                            text = re.sub(r'<[^>]+>', '', title).strip()
                            injury_keywords = ['injury', 'injured', 'out', 'medical', 'surgery']

                            # Whole-word match so 'out' does not also fire on 'about'
                            if any(re.search(r'\b' + re.escape(kw) + r'\b', text, re.IGNORECASE)
                                   for kw in injury_keywords):
                                fighters = extract_fighter_names_from_text(text)
                                for fighter in fighters:
                                    injuries.append({
                                        'playerName': fighter,
                                        'league': 'mma',
                                        'status': 'Out',
                                        'injuryType': 'Unknown',
                                        'description': text[:200],
                                        'source': 'ufc.com',
                                    })
            except ValueError:
                # Not JSON (requests raises a ValueError subclass here);
                # no HTML fallback is implemented for this source.
                pass
    except Exception as e:
        print(f"  UFC.com error: {e}")

    print(f"    Found {len(injuries)} potential injuries")
    return injuries


def fetch_sherdog_injuries() -> List[Dict]:
    """Fetch injuries from Sherdog news"""
    print("\n=== SHERDOG INJURIES ===\n")

    injuries = []
    url = 'https://www.sherdog.com/news/news'

    try:
        resp = requests.get(url, headers=HEADERS, timeout=30)
        print(f"  Sherdog: HTTP {resp.status_code}")

        if resp.status_code == 200:
            # Find news headlines
            headlines = re.findall(r'class="title"[^>]*>(.*?)</(?:a|span|div)', resp.text, re.DOTALL)

            for headline in headlines:
                text = re.sub(r'<[^>]+>', '', headline).strip()
                injury_keywords = ['injury', 'injured', 'out of', 'withdrew', 'surgery', 'sidelined']

                if any(kw in text.lower() for kw in injury_keywords):
                    fighters = extract_fighter_names_from_text(text)
                    for fighter in fighters:
                        injuries.append({
                            'playerName': fighter,
                            'league': 'mma',
                            'status': 'Out',
                            'injuryType': 'Unknown',
                            'description': text[:200],
                            'source': 'sherdog',
                        })
    except Exception as e:
        print(f"  Sherdog error: {e}")

    print(f"    Found {len(injuries)} potential injuries")
    return injuries


def load_balldontlie_api_key() -> Optional[str]:
    """Load BallDontLie API key from .env"""
    env_paths = [
        '/var/www/html/eventheodds/.env',
        os.path.join(os.path.dirname(__file__), '..', '.env'),
    ]
    for env_path in env_paths:
        try:
            with open(env_path, 'r') as f:
                for line in f:
                    if line.startswith('BALLDONTLIE_API_KEY='):
                        return line.split('=', 1)[1].strip()
        except FileNotFoundError:
            continue
    return os.environ.get('BALLDONTLIE_API_KEY')


def fetch_balldontlie_injuries() -> List[Dict]:
    """
    Fetch injury data from BallDontLie MMA API.
    Check for cancelled/postponed fights and fighters marked as inactive.
    """
    print("\n=== BALLDONTLIE MMA ===\n")

    injuries = []
    api_key = load_balldontlie_api_key()

    if not api_key:
        print("  No BallDontLie API key found")
        return injuries

    headers = {
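        # BallDontLie expects the raw API key here (no 'Bearer ' prefix)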
        'Authorization': api_key,
        'Accept': 'application/json',
    }

    try:
        # Get upcoming events
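        # (single page only: per_page=50, no pagination)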
        resp = requests.get(
            'https://api.balldontlie.io/mma/v1/events',
            params={'per_page': 50},
            headers=headers,
            timeout=30
        )
        print(f"  BallDontLie Events: HTTP {resp.status_code}")

        if resp.status_code == 200:
            data = resp.json()
            events = data.get('data', [])
            print(f"    Checking {len(events)} events for cancellations...")

            for event in events:
                event_id = event.get('id')
                status = event.get('status', '').lower()

                # Check if event has cancelled/postponed status
                if status in ['cancelled', 'postponed']:
                    print(f"    Event {event.get('name')} is {status}")

                # Get fights for this event to check for individual cancellations
                time.sleep(0.3)  # Rate limiting
                fights_resp = requests.get(
                    f'https://api.balldontlie.io/mma/v1/events/{event_id}/fights',
                    headers=headers,
                    timeout=30
                )

                if fights_resp.status_code == 200:
                    fights_data = fights_resp.json()
                    fights = fights_data.get('data', [])

                    for fight in fights:
                        fight_status = fight.get('status', '').lower()

                        # Check for cancelled fights (often due to injury)
                        if fight_status in ['cancelled', 'postponed', 'no contest']:
                            # `or {}` guards against explicit nulls in the payload
                            fighter1 = fight.get('fighter_1') or {}
                            fighter2 = fight.get('fighter_2') or {}

                            for fighter in [fighter1, fighter2]:
                                name = fighter.get('name', '')
                                if name and is_valid_fighter_name(name):
                                    injuries.append({
                                        'playerName': name,
                                        'league': 'mma',
                                        'status': 'Out',
                                        'injuryType': 'Fight Cancelled',
                                        'description': f"Fight cancelled/postponed at {event.get('name', 'Unknown Event')}",
                                        'source': 'balldontlie',
                                    })

    except Exception as e:
        print(f"  BallDontLie error: {e}")

    print(f"    Found {len(injuries)} potential injuries from cancelled fights")
    return injuries


def ingest_to_sportsdb(injuries: List[Dict]) -> int:
    """Write MMA injuries to SportsDB PlayerInjury table"""
    db_url = load_db_url()
    if not db_url:
        print('Warning: SPORTS_DATABASE_URL not found, skipping DB write')
        return 0

    try:
        conn = psycopg2.connect(db_url)
        cur = conn.cursor()
        added = 0

        for inj in injuries:
            player_name = inj.get('playerName', '')
            if not player_name:
                continue

            # Create stable ID from name
            player_id = f"mma_{player_name.replace(' ', '_').lower()}"

            try:
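                # Upsert keyed on (league, "playerExternalId", source); this
                # assumes PlayerInjury has a matching unique constraint,
                # otherwise ON CONFLICT will error.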
                cur.execute('''
                    INSERT INTO "PlayerInjury"
                    (league, "playerExternalId", "playerName", team, position, status,
                     "injuryType", description, source, "sourceUpdatedAt", raw, "updatedAt")
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, NOW())
                    ON CONFLICT (league, "playerExternalId", source)
                    DO UPDATE SET
                        status = EXCLUDED.status,
                        "injuryType" = EXCLUDED."injuryType",
                        description = EXCLUDED.description,
                        "sourceUpdatedAt" = EXCLUDED."sourceUpdatedAt",
                        raw = EXCLUDED.raw,
                        "updatedAt" = NOW()
                ''', (
                    'mma',
                    player_id,
                    player_name,
                    '',  # team N/A for MMA
                    '',  # position N/A for MMA
                    inj.get('status', 'Out'),
                    inj.get('injuryType', 'Unknown'),
                    inj.get('description', ''),
                    inj.get('source', 'mma_scraper'),
                    datetime.now(timezone.utc),
                    json.dumps(inj),
                ))
                conn.commit()  # per-row commit so one bad insert doesn't roll back earlier rows
                added += 1
            except Exception as e:
                conn.rollback()
                print(f'  Error inserting {player_name}: {e}')

        cur.close()
        conn.close()
        return added
    except Exception as e:
        print(f'  DB connection error: {e}')
        return 0


def cleanup_garbage_injuries() -> int:
    """Remove garbage injury records from previous bad scrapes"""
    db_url = load_db_url()
    if not db_url:
        return 0

    try:
        conn = psycopg2.connect(db_url)
        cur = conn.cursor()

        # Delete records with obvious garbage names
        cur.execute('''
            DELETE FROM "PlayerInjury"
            WHERE league = 'mma'
            AND (
                LENGTH("playerName") < 5
                OR "playerName" !~ '^[A-Z][a-z]+ [A-Z][a-z]+'
                OR LOWER("playerName") ~ '(calling|freaks|called him|spells|shut)'
            )
        ''')

        deleted = cur.rowcount
        conn.commit()
        cur.close()
        conn.close()

        if deleted > 0:
            print(f"  🧹 Cleaned up {deleted} garbage injury records")

        return deleted
    except Exception as e:
        print(f"  Cleanup error: {e}")
        return 0


def fetch_all_mma_injuries() -> List[Dict]:
    """Fetch MMA injuries from all sources"""
    print("╔════════════════════════════════════════════════════════════╗")
    print("║           MMA/UFC INJURIES SCRAPER v2.0                    ║")
    print("╚════════════════════════════════════════════════════════════╝\n")

    # First, clean up any garbage from previous runs
    cleanup_garbage_injuries()

    # Load known fighters for validation
    load_known_fighters()

    all_injuries = []

    # Try all sources
    sources = [
        ('BallDontLie', fetch_balldontlie_injuries),
        ('MMA Fighting', fetch_mmafighting_injuries),
        ('ESPN MMA', fetch_espn_mma_injuries),
        ('UFC.com', fetch_ufc_news_injuries),
        ('Sherdog', fetch_sherdog_injuries),
    ]

    for name, fetch_func in sources:
        try:
            injuries = fetch_func()
            all_injuries.extend(injuries)
        except Exception as e:
            print(f"    {name} error: {e}")

    # Final deduplication by player name
    seen = set()
    unique = []
    for inj in all_injuries:
        name = inj['playerName'].lower()
        if name not in seen:
            seen.add(name)
            unique.append(inj)

    print(f"\n  TOTAL UNIQUE VALIDATED: {len(unique)} MMA injuries")

    # Write to SportsDB
    if unique:
        db_added = ingest_to_sportsdb(unique)
        print(f"\n  ✅ Wrote {db_added} injuries to SportsDB")
    else:
        print("\n  ℹ️  No valid injuries found to write")

    # Also save JSON backup
    output_path = os.path.join(os.path.dirname(__file__), '..', 'mma_injuries.json')
    with open(output_path, 'w') as f:
        json.dump({
            'injuries': unique,
            'total': len(unique),
            'timestamp': datetime.now(timezone.utc).isoformat(),
            'sources': [s[0] for s in sources],
        }, f, indent=2)
    print(f"  Results saved to mma_injuries.json")

    return unique


if __name__ == '__main__':
    fetch_all_mma_injuries()
