#!/usr/bin/env python3
"""
Sync Kaggle Datasets Daily
Downloads updates from tracked Kaggle datasets that update frequently.

Run: Daily at 4am UTC via cron
"""
import os
import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path

# Datasets to sync daily (only ones that update frequently).
# Each entry is passed verbatim as the --dataset argument of the backfill
# script; these are synced on every run.
DAILY_DATASETS = [
    'jerzyszocik/ufc-betting-odds-daily-dataset',  # UFC betting odds - daily updates
]

# Datasets to sync weekly (historical data, updates less often).
# These are synced in addition to DAILY_DATASETS only when --weekly is given.
WEEKLY_DATASETS = [
    'tobycrabtree/nfl-scores-and-betting-data',    # NFL scores and betting
    'jonathanncoletti/nhl-historical-game-data',   # NHL historical
    'zachht/wnba-odds-history',                     # WNBA odds
]

# Installation root: holds the .env file the token is read from, the
# scripts/ directory, and is the working directory of the backfill subprocess.
ROOT = Path('/var/www/html/eventheodds')
# Virtualenv whose bin/python3 interpreter runs the backfill script.
VENV = ROOT / '.venv_kaggle'


def load_kaggle_token():
    """Return the Kaggle API token, preferring ``ROOT/.env`` over the environment.

    The ``.env`` file is scanned for a ``KAGGLE_API_TOKEN=...`` line. Leading
    whitespace and an optional ``export `` prefix are ignored, and one pair of
    matching surrounding quotes is removed from the value. Blank values are
    skipped so that an empty placeholder line does not hide a usable token.

    Returns:
        The token string, or ``''`` when neither the ``.env`` file nor the
        ``KAGGLE_API_TOKEN`` environment variable provides one.
    """
    env_path = ROOT / '.env'
    key_prefix = 'KAGGLE_API_TOKEN='
    export_prefix = 'export '
    try:
        # Explicit encoding: the locale default under cron is not guaranteed to
        # be UTF-8, and unrelated non-UTF-8 bytes should not abort the lookup.
        with open(env_path, 'r', encoding='utf-8', errors='replace') as f:
            for raw_line in f:
                line = raw_line.strip()
                if line.startswith(export_prefix):
                    line = line[len(export_prefix):].lstrip()
                if not line.startswith(key_prefix):
                    continue
                value = line[len(key_prefix):].strip()
                # KEY="value" / KEY='value' are common in .env files; the
                # quotes are syntax, not part of the token.
                if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
                    value = value[1:-1].strip()
                if value:
                    return value
    except OSError:
        # Missing or unreadable .env (not found, permission denied, ...):
        # fall back to the process environment instead of crashing the run.
        pass
    return os.environ.get('KAGGLE_API_TOKEN', '').strip()


def sync_dataset(dataset: str) -> dict:
    """Download/update a single Kaggle dataset via the backfill script.

    Runs ``scripts/kaggle_backfill.py --dataset <dataset> --unzip`` with the
    virtualenv interpreter, using ROOT as the working directory and passing
    the Kaggle token through the child's environment.

    Args:
        dataset: Value passed as ``--dataset`` to the backfill script.

    Returns:
        ``{'dataset': ..., 'ok': True, 'output': <last 500 chars of stdout>}``
        when the script exits with code 0, otherwise
        ``{'dataset': ..., 'ok': False, 'error': <non-empty message>}``.
        Failures (missing token, timeout, launch errors, non-zero exit) are
        reported in the returned dict rather than raised.
    """
    token = load_kaggle_token()
    if not token:
        return {'dataset': dataset, 'ok': False, 'error': 'No KAGGLE_API_TOKEN'}

    env = os.environ.copy()
    env['KAGGLE_API_TOKEN'] = token

    # Use the backfill script which handles download + manifest
    python = str(VENV / 'bin' / 'python3')
    script = str(ROOT / 'scripts' / 'kaggle_backfill.py')

    cmd = [python, script, '--dataset', dataset, '--unzip']

    try:
        result = subprocess.run(
            cmd,
            cwd=str(ROOT),
            env=env,
            capture_output=True,
            text=True,
            # Undecodable bytes in the child's output must not turn a
            # successful sync into a reported failure.
            errors='replace',
            timeout=600,  # 10 min timeout
        )
    except subprocess.TimeoutExpired as e:
        # str(e) starts with the full command line, which pushes the actual
        # reason out of any short, truncated display of the error.
        return {'dataset': dataset, 'ok': False,
                'error': f'Timed out after {e.timeout} seconds'}
    except Exception as e:
        # Deliberate catch-all boundary: one dataset failing to launch must
        # not abort the remaining datasets in the batch.
        return {'dataset': dataset, 'ok': False, 'error': str(e) or type(e).__name__}

    if result.returncode == 0:
        return {'dataset': dataset, 'ok': True, 'output': result.stdout[-500:]}

    # Prefer stderr, fall back to stdout, and never report an empty error.
    detail = result.stderr.strip() or result.stdout.strip()
    error = detail[-500:] if detail else f'Exit code {result.returncode} with no output'
    return {'dataset': dataset, 'ok': False, 'error': error}


def main():
    """Sync the configured datasets and print a per-dataset report.

    The daily datasets are always synced; ``--weekly`` adds the weekly ones.

    Returns:
        0 when every dataset synced successfully, 1 otherwise.
    """
    import argparse

    cli = argparse.ArgumentParser()
    cli.add_argument('--weekly', action='store_true', help='Also sync weekly datasets')
    opts = cli.parse_args()

    rule = "=" * 60
    print(rule)
    print("KAGGLE DATASET SYNC")
    print(f"Time: {datetime.now(timezone.utc).isoformat()}")
    print(rule)

    if opts.weekly:
        datasets = DAILY_DATASETS + WEEKLY_DATASETS
        print("Mode: Weekly (all datasets)")
    else:
        datasets = list(DAILY_DATASETS)
        print("Mode: Daily (frequent datasets only)")

    total = len(datasets)
    print(f"Syncing {total} datasets...\n")

    succeeded = 0
    for name in datasets:
        print(f"Syncing: {name}...")
        outcome = sync_dataset(name)
        if outcome['ok']:
            succeeded += 1
            status = "OK"
        else:
            status = f"FAILED: {outcome.get('error', 'unknown')[:80]}"
        print(f"  {status}")

    # Summary
    print(f"\n{rule}")
    print(f"SUMMARY: {succeeded}/{total} successful")
    print(rule)

    if succeeded == total:
        return 0
    return 1


# Use main()'s return value (0 = all synced, 1 = at least one failure) as the
# process exit status so the caller can detect failed runs.
if __name__ == '__main__':
    sys.exit(main())
