#!/usr/bin/env python3
"""
Kaggle dataset backfill downloader (no normalization yet).

Downloads Kaggle datasets to disk and writes a small manifest JSON into:
  data/external_feeds/inbox/kaggle/
so SportsDB ExternalFeedRecord can track provenance + dedupe.

Auth options (auto-detected):
- KaggleHub token: set KAGGLE_API_TOKEN=KGAT_... (preferred if you have it)
- Kaggle CLI token file: ~/.kaggle/kaggle.json (classic)

Example:
  KAGGLE_API_TOKEN=KGAT_... python3 scripts/kaggle_backfill.py --dataset zachht/wnba-odds-history
  python3 scripts/kaggle_backfill.py --dataset jerzyszocik/ufc-betting-odds-daily-dataset --unzip
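
The inbox manifest has roughly this shape (illustrative values; the dataset
files themselves stay on disk and are never embedded in the JSON):

  {
    "ok": true,
    "source": "kaggle",
    "dataset": "zachht/wnba-odds-history",
    "downloadedAt": "2026-01-01T00:00:00Z",
    "engine": "kagglehub",
    "kaggleCliOutput": "...",
    "files": [
      {"fileName": "...", "filePath": "...", "bytes": 123, "sha256": "...", "mtime": "..."}
    ],
    "notes": "..."
  }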
"""

from __future__ import annotations

import argparse
import hashlib
import json
import os
import shutil
import subprocess
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional


ROOT = Path("/var/www/html/eventheodds")
DATA_DIR = ROOT / "data"
KAGGLE_DIR = DATA_DIR / "kaggle"
INBOX_DIR = DATA_DIR / "external_feeds" / "inbox" / "kaggle"


def now_iso() -> str:
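    """Current UTC time as an ISO-8601 string with a trailing "Z" (no microseconds)."""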
    return datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z")


def sha256_file(path: Path) -> str:
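    """Stream the file in 1 MiB chunks and return its hex SHA-256 digest."""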
    h = hashlib.sha256()
    with path.open("rb") as f:
        for chunk in iter(lambda: f.read(1024 * 1024), b""):
            h.update(chunk)
    return h.hexdigest()


def write_atomic(path: Path, payload: Any) -> None:
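    """Write JSON atomically: dump to a ".tmp" sibling, then rename over the target."""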
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp = path.with_suffix(path.suffix + ".tmp")
    tmp.write_text(json.dumps(payload, indent=2, default=str))
    tmp.replace(path)


def find_kaggle_json() -> Optional[Path]:
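    """Locate kaggle.json under KAGGLE_CONFIG_DIR (default: ~/.kaggle), if it exists."""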
    cfg_dir = Path(os.environ.get("KAGGLE_CONFIG_DIR", str(Path.home() / ".kaggle")))
    token = cfg_dir / "kaggle.json"
    return token if token.exists() else None


def has_kaggle_api_token() -> bool:
    # KaggleHub tokens typically start with "KGAT_"; we only check that a token is set and let kagglehub validate it on use.
    tok = (os.environ.get("KAGGLE_API_TOKEN") or "").strip()
    return bool(tok)


def run_cmd(cmd: List[str], cwd: Optional[Path] = None) -> str:
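    """Run a command with stderr merged into stdout; raise RuntimeError with the full output on failure."""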
    p = subprocess.run(cmd, cwd=str(cwd) if cwd else None, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
    if p.returncode != 0:
        raise RuntimeError(f"Command failed ({p.returncode}): {' '.join(cmd)}\n{p.stdout}")
    return p.stdout


def main() -> int:
    ap = argparse.ArgumentParser()
    ap.add_argument("--dataset", required=True, help="Kaggle dataset slug, e.g. zachht/wnba-odds-history")
    ap.add_argument("--unzip", action="store_true", help="Unzip the downloaded archive")
    ap.add_argument("--engine", default="auto", choices=["auto", "kagglehub", "kagglecli"], help="Download engine")
    args = ap.parse_args()

    dataset = args.dataset.strip()
    slug_safe = dataset.replace("/", "__")
    dest_dir = KAGGLE_DIR / slug_safe
    dest_dir.mkdir(parents=True, exist_ok=True)
    INBOX_DIR.mkdir(parents=True, exist_ok=True)

    out = ""
    engine = args.engine
    kaggle_json = find_kaggle_json()
    token_ok = has_kaggle_api_token()

    if engine == "auto":
        engine = "kagglehub" if token_ok else "kagglecli"

    if engine == "kagglehub":
        if not token_ok:
            raise RuntimeError("KAGGLE_API_TOKEN is not set (required for engine=kagglehub)")
        try:
            import kagglehub  # type: ignore
        except Exception as e:
            raise RuntimeError(f"kagglehub not installed. Install with: pip install kagglehub[pandas-datasets]. Details: {e}") from e

        # kagglehub will download files to its cache; we then copy into our dest_dir.
        # dataset_download returns a directory path for the dataset in cache.
        cache_path = kagglehub.dataset_download(dataset)  # type: ignore
        out = f"kagglehub.dataset_download -> {cache_path}"

        src = Path(str(cache_path))
        if not src.exists():
            raise RuntimeError(f"kagglehub returned non-existent path: {src}")

        # Copy cache contents into our storage folder, skipping files that already exist
        for p in src.rglob("*"):
            if p.is_dir():
                continue
            rel = p.relative_to(src)
            dest = dest_dir / rel
            dest.parent.mkdir(parents=True, exist_ok=True)
            if dest.exists():
                continue
            shutil.copy2(str(p), str(dest))
    else:
        # Kaggle CLI requires kaggle.json
        if not kaggle_json:
            raise RuntimeError(
                f"Kaggle credentials missing: expected {Path.home() / '.kaggle' / 'kaggle.json'} "
                "(or set KAGGLE_CONFIG_DIR)."
            )
        # Use the kaggle CLI to download; it writes a zip when the dataset has multiple files.
        cmd = ["kaggle", "datasets", "download", "-d", dataset, "-p", str(dest_dir)]
        out = run_cmd(cmd)

    # Find newest archive(s)
    files = sorted(dest_dir.glob("*.zip"), key=lambda p: p.stat().st_mtime, reverse=True)
    if not files:
        # Some datasets download as a single non-zip file; fall back to all files, newest first
        files = sorted(dest_dir.glob("*"), key=lambda p: p.stat().st_mtime, reverse=True)
    if not files:
        raise RuntimeError(f"No files downloaded for dataset={dataset} into {dest_dir}")

    if args.unzip:
        for z in [p for p in files if p.suffix.lower() == ".zip"]:
            shutil.unpack_archive(str(z), str(dest_dir))

    # Write a manifest file referencing what’s on disk (we do NOT embed the dataset itself in JSON).
    manifests: List[Dict[str, Any]] = []
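    # Only the ten newest files get manifest entries (files is already sorted newest-first).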
    for p in files[:10]:
        if not p.is_file():
            continue
        manifests.append(
            {
                "fileName": p.name,
                "filePath": str(p),
                "bytes": p.stat().st_size,
                "sha256": sha256_file(p),
                "mtime": datetime.utcfromtimestamp(p.stat().st_mtime).isoformat() + "Z",
            }
        )

    payload = {
        "ok": True,
        "source": "kaggle",
        "dataset": dataset,
        "downloadedAt": now_iso(),
        "engine": engine,
        "kaggleCliOutput": out[-4000:] if isinstance(out, str) else str(out),
        "files": manifests,
        "notes": "Raw dataset stored on disk under data/kaggle/. This manifest is what gets ingested/deduped into SportsDB.ExternalFeedRecord.",
    }

    out_path = INBOX_DIR / f"{slug_safe}_{int(time.time())}.json"
    write_atomic(out_path, payload)

    print(f"[kaggle_backfill] wrote manifest {out_path}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

