#!/usr/bin/env python3
"""
Scrape NFL Play-by-Play Data from nflfastR
Source: nfl_data_py (Python wrapper for nflfastR)
Extracts: Situational rushing, turnovers by game state, pass/run tendencies
Answers: "Mahomes scrambling when protecting leads?" "Allen INT rate in comeback?"
"""
import psycopg2
import pandas as pd
import numpy as np
from datetime import datetime
import sys

# Use venv if available
sys.path.insert(0, '/opt/eventheodds-data/venv/lib/python3.12/site-packages')

import nfl_data_py as nfl

def load_db_url():
    with open('/var/www/html/eventheodds/.env', 'r') as f:
        for line in f:
            if line.startswith('SPORTS_DATABASE_URL='):
                return line.split('=', 1)[1].strip().split('?')[0]
    return ''

def scrape_nflfastr_pbp():
    conn = psycopg2.connect(load_db_url())
    cur = conn.cursor()

    # Create NFLPlayByPlay table for raw data
    cur.execute('''
        CREATE TABLE IF NOT EXISTS "NFLPlayByPlay" (
            id SERIAL PRIMARY KEY,
            season INTEGER NOT NULL,
            week INTEGER,
            "gameId" VARCHAR(50),
            "playId" INTEGER,
            "playType" VARCHAR(30),
            "passerId" VARCHAR(50),
            "rusherId" VARCHAR(50),
            "receiverId" VARCHAR(50),
            "passerName" VARCHAR(100),
            "rusherName" VARCHAR(100),
            "receiverName" VARCHAR(100),
            team VARCHAR(10),
            "defTeam" VARCHAR(10),
            "quarter" INTEGER,
            "gameSecondsRemaining" INTEGER,
            "scoreDifferential" INTEGER,
            "wpBefore" NUMERIC,
            "wpAfter" NUMERIC,
            "yardsGained" INTEGER,
            "touchdown" BOOLEAN,
            "interception" BOOLEAN,
            "fumble" BOOLEAN,
            "sack" BOOLEAN,
            "qbScramble" BOOLEAN,
            "rushAttempt" BOOLEAN,
            "passAttempt" BOOLEAN,
            "createdAt" TIMESTAMP WITH TIME ZONE DEFAULT NOW()
        )
    ''')

    # Create NFLSituationalStats table for aggregated data
    cur.execute('''
        CREATE TABLE IF NOT EXISTS "NFLSituationalStats" (
            id SERIAL PRIMARY KEY,
            season INTEGER NOT NULL,
            "playerId" VARCHAR(50),
            "playerName" VARCHAR(200),
            team VARCHAR(10),
            "position" VARCHAR(10),
            -- Overall stats
            "totalPlays" INTEGER DEFAULT 0,
            "totalRushAttempts" INTEGER DEFAULT 0,
            "totalPassAttempts" INTEGER DEFAULT 0,
            "totalYards" INTEGER DEFAULT 0,
            "totalTouchdowns" INTEGER DEFAULT 0,
            "totalInterceptions" INTEGER DEFAULT 0,
            -- Leading (score diff > 7)
            "playsWhenLeading" INTEGER DEFAULT 0,
            "rushAttemptsLeading" INTEGER DEFAULT 0,
            "passAttemptsLeading" INTEGER DEFAULT 0,
            "yardsLeading" INTEGER DEFAULT 0,
            "scrambleRateLeading" NUMERIC,
            "intRateLeading" NUMERIC,
            -- Trailing (score diff < -7)
            "playsWhenTrailing" INTEGER DEFAULT 0,
            "rushAttemptsTrailing" INTEGER DEFAULT 0,
            "passAttemptsTrailing" INTEGER DEFAULT 0,
            "yardsTrailing" INTEGER DEFAULT 0,
            "scrambleRateTrailing" NUMERIC,
            "intRateTrailing" NUMERIC,
            -- Close game (|score diff| <= 7)
            "playsWhenClose" INTEGER DEFAULT 0,
            "rushAttemptsClose" INTEGER DEFAULT 0,
            "passAttemptsClose" INTEGER DEFAULT 0,
            "yardsClose" INTEGER DEFAULT 0,
            -- Second half stats
            "secondHalfPlays" INTEGER DEFAULT 0,
            "secondHalfRushAttempts" INTEGER DEFAULT 0,
            "secondHalfPassAttempts" INTEGER DEFAULT 0,
            "secondHalfYards" INTEGER DEFAULT 0,
            -- Q4 / Clutch (Q4 + close game)
            "q4Plays" INTEGER DEFAULT 0,
            "q4RushAttempts" INTEGER DEFAULT 0,
            "q4PassAttempts" INTEGER DEFAULT 0,
            "q4Yards" INTEGER DEFAULT 0,
            "clutchPlays" INTEGER DEFAULT 0,
            "clutchYards" INTEGER DEFAULT 0,
            -- Classifications
            "gameScriptType" VARCHAR(30),
            "scrambleType" VARCHAR(30),
            "clutchType" VARCHAR(30),
            "updatedAt" TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
            UNIQUE(season, "playerId", team)
        )
    ''')

    # Create NFLTeamTendencies table
    cur.execute('''
        CREATE TABLE IF NOT EXISTS "NFLTeamTendencies" (
            id SERIAL PRIMARY KEY,
            season INTEGER NOT NULL,
            team VARCHAR(10) NOT NULL,
            -- Overall tendencies
            "totalPlays" INTEGER DEFAULT 0,
            "rushRate" NUMERIC,
            "passRate" NUMERIC,
            -- Leading tendencies
            "rushRateLeading" NUMERIC,
            "passRateLeading" NUMERIC,
            -- Trailing tendencies
            "rushRateTrailing" NUMERIC,
            "passRateTrailing" NUMERIC,
            -- Second half
            "rushRateSecondHalf" NUMERIC,
            "passRateSecondHalf" NUMERIC,
            -- Close game
            "rushRateClose" NUMERIC,
            "passRateClose" NUMERIC,
            -- Classification
            "runHeavyWhenLeading" BOOLEAN,
            "passHeavyWhenTrailing" BOOLEAN,
            "conservativeType" VARCHAR(30),
            "updatedAt" TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
            UNIQUE(season, team)
        )
    ''')
    conn.commit()
    print("NFL tables created/verified")

    # Check what seasons we have
    cur.execute('SELECT MAX(season) FROM "NFLPlayByPlay"')
    result = cur.fetchone()
    last_season = result[0] if result and result[0] else 2023

    # Get current season (2024 or 2025 based on date)
    current_year = datetime.now().year
    current_season = current_year if datetime.now().month >= 9 else current_year - 1

    seasons_to_fetch = [s for s in range(last_season, current_season + 1)]
    print(f"\nFetching NFL play-by-play for seasons: {seasons_to_fetch}")

    for season in seasons_to_fetch:
        print(f"\n  Loading {season} PBP data from nflfastR...", end=' ')

        try:
            # Load play-by-play data
            pbp = nfl.import_pbp_data([season])

            if pbp is None or len(pbp) == 0:
                print("no data")
                continue

            print(f"{len(pbp):,} plays")

            # Filter to meaningful plays (no penalties, timeouts, etc)
            pbp = pbp[pbp['play_type'].isin(['pass', 'run', 'qb_kneel', 'qb_spike'])]

            # Insert raw plays (sample to avoid too much data)
            print(f"    Processing plays...", end=' ')
            plays_inserted = 0

            for _, play in pbp.iterrows():
                if plays_inserted >= 50000:  # Limit per season
                    break

                try:
                    cur.execute('''
                        INSERT INTO "NFLPlayByPlay"
                        (season, week, "gameId", "playId", "playType",
                         "passerId", "rusherId", "receiverId",
                         "passerName", "rusherName", "receiverName",
                         team, "defTeam", "quarter", "gameSecondsRemaining",
                         "scoreDifferential", "wpBefore", "wpAfter",
                         "yardsGained", "touchdown", "interception", "fumble",
                         "sack", "qbScramble", "rushAttempt", "passAttempt")
                        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                        ON CONFLICT DO NOTHING
                    ''', (
                        season,
                        int(play.get('week', 0)) if pd.notna(play.get('week')) else None,
                        play.get('game_id'),
                        int(play.get('play_id', 0)) if pd.notna(play.get('play_id')) else None,
                        play.get('play_type'),
                        play.get('passer_player_id'),
                        play.get('rusher_player_id'),
                        play.get('receiver_player_id'),
                        play.get('passer_player_name'),
                        play.get('rusher_player_name'),
                        play.get('receiver_player_name'),
                        play.get('posteam'),
                        play.get('defteam'),
                        int(play.get('qtr', 0)) if pd.notna(play.get('qtr')) else None,
                        int(play.get('game_seconds_remaining', 0)) if pd.notna(play.get('game_seconds_remaining')) else None,
                        int(play.get('score_differential', 0)) if pd.notna(play.get('score_differential')) else 0,
                        float(play.get('wp', 0)) if pd.notna(play.get('wp')) else None,
                        float(play.get('wpa', 0)) if pd.notna(play.get('wpa')) else None,
                        int(play.get('yards_gained', 0)) if pd.notna(play.get('yards_gained')) else 0,
                        bool(play.get('touchdown', 0)),
                        bool(play.get('interception', 0)),
                        bool(play.get('fumble_lost', 0)),
                        bool(play.get('sack', 0)),
                        bool(play.get('qb_scramble', 0)),
                        bool(play.get('rush_attempt', 0)),
                        bool(play.get('pass_attempt', 0)),
                    ))
                    plays_inserted += 1
                except Exception as e:
                    continue

            conn.commit()
            print(f"{plays_inserted:,} inserted")

            # Compute player situational stats
            print(f"    Computing player situational stats...")

            # QB stats (passers)
            qb_stats = pbp[pbp['passer_player_id'].notna()].groupby(['passer_player_id', 'passer_player_name', 'posteam']).agg({
                'play_id': 'count',
                'pass_attempt': 'sum',
                'yards_gained': 'sum',
                'touchdown': 'sum',
                'interception': 'sum',
                'qb_scramble': 'sum',
                'score_differential': lambda x: (x > 7).sum(),  # Leading plays
            }).reset_index()

            # Leading/trailing splits for QBs
            for _, qb in qb_stats.iterrows():
                player_id = qb['passer_player_id']
                player_name = qb['passer_player_name']
                team = qb['posteam']

                qb_plays = pbp[pbp['passer_player_id'] == player_id]

                leading = qb_plays[qb_plays['score_differential'] > 7]
                trailing = qb_plays[qb_plays['score_differential'] < -7]
                close = qb_plays[abs(qb_plays['score_differential']) <= 7]
                second_half = qb_plays[qb_plays['qtr'] >= 3]
                q4 = qb_plays[qb_plays['qtr'] == 4]
                clutch = qb_plays[(qb_plays['qtr'] == 4) & (abs(qb_plays['score_differential']) <= 7)]

                # Scramble rates
                scramble_rate_leading = leading['qb_scramble'].sum() / max(1, len(leading))
                scramble_rate_trailing = trailing['qb_scramble'].sum() / max(1, len(trailing))

                # INT rates
                int_rate_leading = leading['interception'].sum() / max(1, leading['pass_attempt'].sum())
                int_rate_trailing = trailing['interception'].sum() / max(1, trailing['pass_attempt'].sum())

                # Classifications
                game_script_type = 'NEUTRAL'
                if scramble_rate_leading > scramble_rate_trailing + 0.05:
                    game_script_type = 'SCRAMBLES_WHEN_LEADING'
                elif scramble_rate_trailing > scramble_rate_leading + 0.05:
                    game_script_type = 'SCRAMBLES_WHEN_TRAILING'

                clutch_type = 'NEUTRAL'
                if len(clutch) >= 20:
                    clutch_yards_per_play = clutch['yards_gained'].sum() / len(clutch)
                    overall_yards_per_play = qb_plays['yards_gained'].sum() / max(1, len(qb_plays))
                    if clutch_yards_per_play > overall_yards_per_play + 1:
                        clutch_type = 'CLUTCH_PERFORMER'
                    elif clutch_yards_per_play < overall_yards_per_play - 1:
                        clutch_type = 'CLUTCH_STRUGGLES'

                try:
                    cur.execute('''
                        INSERT INTO "NFLSituationalStats"
                        (season, "playerId", "playerName", team, "position",
                         "totalPlays", "totalPassAttempts", "totalYards", "totalTouchdowns", "totalInterceptions",
                         "playsWhenLeading", "passAttemptsLeading", "yardsLeading", "scrambleRateLeading", "intRateLeading",
                         "playsWhenTrailing", "passAttemptsTrailing", "yardsTrailing", "scrambleRateTrailing", "intRateTrailing",
                         "playsWhenClose", "passAttemptsClose", "yardsClose",
                         "secondHalfPlays", "secondHalfPassAttempts", "secondHalfYards",
                         "q4Plays", "q4PassAttempts", "q4Yards",
                         "clutchPlays", "clutchYards",
                         "gameScriptType", "clutchType")
                        VALUES (%s, %s, %s, %s, 'QB',
                                %s, %s, %s, %s, %s,
                                %s, %s, %s, %s, %s,
                                %s, %s, %s, %s, %s,
                                %s, %s, %s,
                                %s, %s, %s,
                                %s, %s, %s,
                                %s, %s,
                                %s, %s)
                        ON CONFLICT (season, "playerId", team) DO UPDATE SET
                            "totalPlays" = EXCLUDED."totalPlays",
                            "totalPassAttempts" = EXCLUDED."totalPassAttempts",
                            "scrambleRateLeading" = EXCLUDED."scrambleRateLeading",
                            "intRateTrailing" = EXCLUDED."intRateTrailing",
                            "gameScriptType" = EXCLUDED."gameScriptType",
                            "clutchType" = EXCLUDED."clutchType",
                            "updatedAt" = NOW()
                    ''', (
                        season, player_id, player_name, team,
                        len(qb_plays), int(qb_plays['pass_attempt'].sum()), int(qb_plays['yards_gained'].sum()),
                        int(qb_plays['touchdown'].sum()), int(qb_plays['interception'].sum()),
                        len(leading), int(leading['pass_attempt'].sum()), int(leading['yards_gained'].sum()),
                        round(scramble_rate_leading, 4), round(int_rate_leading, 4),
                        len(trailing), int(trailing['pass_attempt'].sum()), int(trailing['yards_gained'].sum()),
                        round(scramble_rate_trailing, 4), round(int_rate_trailing, 4),
                        len(close), int(close['pass_attempt'].sum()), int(close['yards_gained'].sum()),
                        len(second_half), int(second_half['pass_attempt'].sum()), int(second_half['yards_gained'].sum()),
                        len(q4), int(q4['pass_attempt'].sum()), int(q4['yards_gained'].sum()),
                        len(clutch), int(clutch['yards_gained'].sum()),
                        game_script_type, clutch_type
                    ))
                except Exception as e:
                    continue

            conn.commit()

            # Compute team tendencies
            print(f"    Computing team tendencies...")
            teams = pbp['posteam'].dropna().unique()

            for team in teams:
                team_plays = pbp[pbp['posteam'] == team]
                if len(team_plays) < 100:
                    continue

                leading = team_plays[team_plays['score_differential'] > 7]
                trailing = team_plays[team_plays['score_differential'] < -7]
                close = team_plays[abs(team_plays['score_differential']) <= 7]
                second_half = team_plays[team_plays['qtr'] >= 3]

                rush_rate = team_plays['rush_attempt'].sum() / max(1, len(team_plays))
                pass_rate = team_plays['pass_attempt'].sum() / max(1, len(team_plays))

                rush_rate_leading = leading['rush_attempt'].sum() / max(1, len(leading)) if len(leading) > 0 else None
                pass_rate_leading = leading['pass_attempt'].sum() / max(1, len(leading)) if len(leading) > 0 else None
                rush_rate_trailing = trailing['rush_attempt'].sum() / max(1, len(trailing)) if len(trailing) > 0 else None
                pass_rate_trailing = trailing['pass_attempt'].sum() / max(1, len(trailing)) if len(trailing) > 0 else None
                rush_rate_2h = second_half['rush_attempt'].sum() / max(1, len(second_half)) if len(second_half) > 0 else None
                pass_rate_2h = second_half['pass_attempt'].sum() / max(1, len(second_half)) if len(second_half) > 0 else None
                rush_rate_close = close['rush_attempt'].sum() / max(1, len(close)) if len(close) > 0 else None

                run_heavy_leading = rush_rate_leading and rush_rate_leading > rush_rate + 0.1 if rush_rate_leading else False
                pass_heavy_trailing = pass_rate_trailing and pass_rate_trailing > pass_rate + 0.1 if pass_rate_trailing else False

                conservative_type = 'NEUTRAL'
                if run_heavy_leading and pass_heavy_trailing:
                    conservative_type = 'GAME_SCRIPT_DEPENDENT'
                elif run_heavy_leading:
                    conservative_type = 'CONSERVATIVE_WHEN_LEADING'
                elif pass_heavy_trailing:
                    conservative_type = 'AGGRESSIVE_WHEN_TRAILING'

                try:
                    cur.execute('''
                        INSERT INTO "NFLTeamTendencies"
                        (season, team, "totalPlays", "rushRate", "passRate",
                         "rushRateLeading", "passRateLeading",
                         "rushRateTrailing", "passRateTrailing",
                         "rushRateSecondHalf", "passRateSecondHalf",
                         "rushRateClose",
                         "runHeavyWhenLeading", "passHeavyWhenTrailing", "conservativeType")
                        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                        ON CONFLICT (season, team) DO UPDATE SET
                            "totalPlays" = EXCLUDED."totalPlays",
                            "rushRate" = EXCLUDED."rushRate",
                            "rushRateLeading" = EXCLUDED."rushRateLeading",
                            "runHeavyWhenLeading" = EXCLUDED."runHeavyWhenLeading",
                            "conservativeType" = EXCLUDED."conservativeType",
                            "updatedAt" = NOW()
                    ''', (
                        season, team, len(team_plays),
                        round(rush_rate, 4), round(pass_rate, 4),
                        round(rush_rate_leading, 4) if rush_rate_leading else None,
                        round(pass_rate_leading, 4) if pass_rate_leading else None,
                        round(rush_rate_trailing, 4) if rush_rate_trailing else None,
                        round(pass_rate_trailing, 4) if pass_rate_trailing else None,
                        round(rush_rate_2h, 4) if rush_rate_2h else None,
                        round(pass_rate_2h, 4) if pass_rate_2h else None,
                        round(rush_rate_close, 4) if rush_rate_close else None,
                        run_heavy_leading, pass_heavy_trailing, conservative_type
                    ))
                except Exception as e:
                    continue

            conn.commit()
            print(f"    {season} complete")

        except Exception as e:
            print(f"error: {e}")
            continue

    # Summary stats
    cur.execute('SELECT COUNT(*) FROM "NFLPlayByPlay"')
    pbp_count = cur.fetchone()[0]
    cur.execute('SELECT COUNT(*) FROM "NFLSituationalStats"')
    player_count = cur.fetchone()[0]
    cur.execute('SELECT COUNT(*) FROM "NFLTeamTendencies"')
    team_count = cur.fetchone()[0]

    # Show sample results
    cur.execute('''
        SELECT "playerName", team, "scrambleRateLeading", "scrambleRateTrailing", "gameScriptType"
        FROM "NFLSituationalStats"
        WHERE "totalPlays" >= 100 AND "position" = 'QB'
        ORDER BY "scrambleRateLeading" DESC NULLS LAST
        LIMIT 10
    ''')
    results = cur.fetchall()
    if results:
        print("\nQB Scramble Rates by Game State:")
        for row in results:
            leading = f"{row[2]:.1%}" if row[2] else "N/A"
            trailing = f"{row[3]:.1%}" if row[3] else "N/A"
            print(f"  {row[0]} ({row[1]}): Leading={leading}, Trailing={trailing} - {row[4]}")

    cur.execute('''
        SELECT team, "rushRateLeading", "rushRateTrailing", "conservativeType"
        FROM "NFLTeamTendencies"
        WHERE "runHeavyWhenLeading" = true
        ORDER BY "rushRateLeading" DESC NULLS LAST
        LIMIT 10
    ''')
    results = cur.fetchall()
    if results:
        print("\nRun-Heavy When Leading Teams:")
        for row in results:
            leading = f"{row[1]:.1%}" if row[1] else "N/A"
            trailing = f"{row[2]:.1%}" if row[2] else "N/A"
            print(f"  {row[0]}: Leading={leading}, Trailing={trailing} - {row[3]}")

    cur.close()
    conn.close()
    print(f"\nNFL PBP complete: {pbp_count:,} plays, {player_count} players, {team_count} teams")

if __name__ == '__main__':
    scrape_nflfastr_pbp()
