#!/usr/bin/env python3
import hashlib
import json
import os
import sys
from datetime import datetime
from pathlib import Path


# Dataset identifiers passed to ``kagglehub.dataset_download`` when the
# KAGGLE_DATASETS environment variable is unset or contains no entries.
DEFAULT_DATASETS = [
    "aminealibi/ufc-fights-fighters-and-events-dataset",
    "jerzyszocik/ufc-betting-odds-daily-dataset",
]


def ensure_kagglehub():
    """Verify that ``kagglehub`` can be imported.

    Returns:
        None when the import succeeds.

    Raises:
        RuntimeError: If importing ``kagglehub`` raises any ``Exception``;
            the original exception is chained as the cause.
    """
    failure_message = "kagglehub is not installed in the active Python environment"
    try:
        import kagglehub  # noqa: F401
    except Exception as import_error:
        raise RuntimeError(failure_message) from import_error


def hash_manifest(manifest):
    """Return the SHA-256 hex digest of *manifest* serialized as JSON.

    Keys are sorted during serialization, so two mappings with the same
    content produce the same digest regardless of insertion order.

    Args:
        manifest: A JSON-serializable object.

    Returns:
        The digest as a 64-character lowercase hexadecimal string.
    """
    canonical_json = json.dumps(manifest, sort_keys=True)
    hasher = hashlib.sha256()
    hasher.update(canonical_json.encode("utf-8"))
    return hasher.hexdigest()


def list_files(root):
    """Describe every regular file found recursively under *root*.

    Args:
        root: Directory to walk (anything accepted by ``pathlib.Path``).

    Returns:
        A list of dicts with the keys ``path`` (string form of the path),
        ``size`` (``st_size``) and ``mtime`` (``st_mtime`` truncated to an
        int), sorted by ``path``.
    """
    entries = []
    for candidate in Path(root).rglob("*"):
        # Directories and other non-file entries are not recorded.
        if not candidate.is_file():
            continue
        info = candidate.stat()
        entries.append(dict(
            path=str(candidate),
            size=info.st_size,
            mtime=int(info.st_mtime),
        ))
    entries.sort(key=lambda entry: entry["path"])
    return entries


def main():
    """Download each configured Kaggle dataset and write a manifest for it.

    Configuration is read from the environment:

    * ``KAGGLE_UFC_DATA_DIR`` -- directory that receives the manifests
      (created if missing); defaults to the path hard-coded below.
    * ``KAGGLE_DATASETS`` -- comma-separated dataset identifiers; when unset
      or empty, ``DEFAULT_DATASETS`` is used.

    For every dataset a ``manifest_YYYYMMDD.json`` file is written under
    ``<base_dir>/<dataset with "/" replaced by "_">``, so a second run on the
    same UTC day overwrites that day's manifest. A failure for one dataset is
    reported on stderr and recorded in the output, and the remaining datasets
    are still processed.

    A JSON list with one entry per dataset (``dataset``, ``hash``,
    ``manifest_path`` and, on failure, ``error``) is printed to stdout.

    Raises:
        RuntimeError: If ``kagglehub`` cannot be imported.
    """
    # Imported locally because the module-level import only brings in
    # ``datetime`` itself.
    from datetime import timezone

    ensure_kagglehub()
    import kagglehub

    base_dir = Path(os.environ.get("KAGGLE_UFC_DATA_DIR", "/var/www/html/eventheodds/data/kaggle/ufc"))
    base_dir.mkdir(parents=True, exist_ok=True)

    # Read the clock once so ``downloadedAt`` and the manifest file name can
    # never disagree when a run crosses midnight UTC. The tzinfo is dropped
    # before formatting to keep the established "<naive ISO time>Z" format.
    now = datetime.now(timezone.utc)
    timestamp = now.replace(tzinfo=None).isoformat() + "Z"
    date_stamp = now.strftime("%Y%m%d")

    datasets_env = os.environ.get("KAGGLE_DATASETS", "").strip()
    datasets = [d.strip() for d in datasets_env.split(",") if d.strip()] or DEFAULT_DATASETS

    outputs = []
    for dataset in datasets:
        # Deliberately broad best-effort boundary: any failure (download,
        # filesystem, serialization) is recorded for this dataset only.
        try:
            path = kagglehub.dataset_download(dataset)
            files = list_files(path)
            manifest = {
                "dataset": dataset,
                "downloadPath": str(path),
                "files": files,
                "downloadedAt": timestamp,
            }
            dataset_dir = base_dir / dataset.replace("/", "_")
            dataset_dir.mkdir(parents=True, exist_ok=True)
            manifest_path = dataset_dir / f"manifest_{date_stamp}.json"
            # Explicit encoding so the file does not depend on the locale.
            manifest_path.write_text(json.dumps(manifest, indent=2), encoding="utf-8")
            outputs.append({
                "dataset": dataset,
                # NOTE(review): the hash covers ``downloadedAt``, so it differs
                # whenever the download time differs -- confirm that is intended.
                "hash": hash_manifest(manifest),
                "manifest_path": str(manifest_path),
            })
        except Exception as e:
            print(f"[kaggle] Failed for {dataset}: {e}", file=sys.stderr)
            outputs.append({
                "dataset": dataset,
                "hash": None,
                "manifest_path": None,
                "error": str(e),
            })

    print(json.dumps(outputs))


# Run the download only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
