#!/usr/bin/env python3
"""
Dump NCAA data from a locally hosted ncaa-api instance (henrygd/ncaa-api) to JSON files.

Goal: collect *real* NCAA schedules + scoreboards + school index now; ingest into SportsDB later.

API (self-hosted):
  docker run -d --name eventheodds-ncaa-api -p 3099:3000 henrygd/ncaa-api:latest

Routes (from OpenAPI):
  /schools-index
  /schedule/{sport}/{division}/{path}
  /scoreboard/{sport}/{path}

We focus on:
  - Football (FBS) 2002..current
  - Basketball-men (D1) 2003..current
  - Basketball-women (D1) 2003..current

Outputs:
  data/ncaa_api/
    schools-index.json
    schedule/<sport>/<division>/<path>.json
    scoreboard/<sport>/<division>/<yyyy>/<mm>/<dd>/all-conf.json
    scoreboard/football/fbs/<yyyy>/<wk>/all-conf.json
"""

from __future__ import annotations

import argparse
import hashlib
import json
import os
import sys
import time
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Tuple

import requests


def _now_iso() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ`` (second precision)."""
    # time.gmtime() replaces datetime.utcnow(), which is deprecated since
    # Python 3.12; the emitted string format is unchanged.
    return time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())


def _safe_path(s: str) -> str:
    """Turn *s* into a relative path fragment with no ``..`` sequences.

    Surrounding whitespace and leading slashes are removed; interior slashes
    are kept so the directory structure survives. Every ``..`` occurrence is
    replaced by ``_``.
    """
    relative = s.strip().lstrip("/")
    # Splitting on ".." and re-joining with "_" substitutes each occurrence.
    return "_".join(relative.split(".."))


def _write_json_atomic(path: Path, payload: Any) -> None:
    """Serialize *payload* as indented JSON and move it into place at *path*.

    Missing parent directories are created. The text is first written to a
    sibling file named ``<path name>.tmp`` which is then renamed over *path*,
    so *path* itself is only ever replaced by a fully written file.
    Values the JSON encoder cannot handle are converted with ``str``.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    scratch = path.with_suffix(f"{path.suffix}.tmp")
    text = json.dumps(payload, indent=2, default=str)
    with scratch.open("w") as handle:
        handle.write(text)
    scratch.replace(path)


@dataclass
class Client:
    """Small HTTP helper that fetches ncaa-api routes and stores them as JSON files."""

    # Root URL of the API; a trailing slash is tolerated.
    base_url: str
    # Output directory handed in by the caller (not read by the methods below).
    out_dir: Path
    # Seconds to pause after each successful request.
    sleep: float
    # Per-request timeout in seconds, passed to ``requests.get``.
    timeout: float

    def get(self, rel: str) -> Any:
        """Fetch ``base_url/rel`` and return the decoded JSON body.

        Raises:
            RuntimeError: if the response status is not 200; the message
                carries the URL, the status and the first 200 characters of
                the body.
        """
        url = f"{self.base_url.rstrip('/')}/{rel.lstrip('/')}"
        response = requests.get(url, timeout=self.timeout)
        status = response.status_code
        if status == 200:
            # Throttle only after a successful request, then decode.
            time.sleep(self.sleep)
            return response.json()
        raise RuntimeError(f"GET {url} -> {status}: {response.text[:200]}")

    def dump(self, rel: str, out_path: Path) -> bool:
        """Fetch *rel* and write it to *out_path* unless that file already exists.

        Returns:
            True when a new file was written, False when *out_path* was
            already present (no request is made in that case).
        """
        if not out_path.exists():
            body = self.get(rel)
            _write_json_atomic(
                out_path,
                {"fetchedAt": _now_iso(), "source": "ncaa-api", "path": rel, "data": body},
            )
            return True
        return False


def _schedule_paths_for_basketball(year: int) -> List[str]:
    """Return the twelve ``YYYY/MM`` schedule path segments for *year*.

    The schedule route takes ``YYYY/MM`` for most sports. Every month
    (01-12) is listed rather than guessing which ones have games; results
    are cached on disk by the caller.
    """
    prefix = f"{year}/"
    return [prefix + format(month, "02d") for month in range(1, 13)]


def _schedule_paths_for_football(year: int) -> List[str]:
    """Return the schedule path segments for a football season.

    Football schedules are addressed by year alone, so the list holds a
    single ``YYYY`` entry.
    """
    season = f"{year}"
    return [season]


def _ymd_from_parts(year: Any, month: Any, day: Any) -> Optional[str]:
    """Return ``YYYY/MM/DD`` for plausible date components, else ``None``."""
    numbers = []
    for raw in (year, month, day):
        text = str(raw or "").strip()
        # isascii() keeps int() safe: str.isdigit() alone also accepts
        # characters (e.g. superscript digits) that int() rejects.
        if not (text.isascii() and text.isdigit()):
            return None
        numbers.append(int(text))
    y, m, d = numbers
    # Range check only (no calendar validation); guards against upstream
    # weirdness such as month=0.
    if 1900 <= y <= 2100 and 1 <= m <= 12 and 1 <= d <= 31:
        return f"{y}/{m:02d}/{d:02d}"
    return None


def _extract_game_dates_from_schedule(schedule_payload: Dict[str, Any]) -> List[str]:
    """Collect the game dates listed in a schedule payload.

    The payload is whatever ncaa-api returned, optionally wrapped in an
    envelope whose ``"data"`` key holds the actual response. Common shape:
      { ..., "gameDates": [ { "contest_date": "02-01-2023", "year":"2023", "month":"02", "day":"01" }, ... ] }

    For each entry the explicit ``year``/``month``/``day`` fields are
    preferred; when they are missing or implausible, ``contest_date`` /
    ``contestDate`` in ``MM-DD-YYYY`` form is used instead. Both sources go
    through the same range validation, and entries that are not dicts are
    skipped so one malformed item cannot discard the whole month.

    Args:
        schedule_payload: Raw schedule response or its stored envelope.

    Returns:
        Unique date strings formatted ``YYYY/MM/DD``, sorted ascending.
        Empty when the payload has no usable dates or is not a dict.
    """
    if not isinstance(schedule_payload, dict):
        return []
    inner = schedule_payload.get("data")
    data = inner if isinstance(inner, dict) else schedule_payload
    entries = data.get("gameDates") or data.get("game_dates") or []
    if not isinstance(entries, (list, tuple)):
        return []

    found = set()
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        # Prefer explicit year/month/day
        ymd = _ymd_from_parts(entry.get("year"), entry.get("month"), entry.get("day"))
        if ymd is None:
            # Fallback: contest_date "MM-DD-YYYY"
            cd = str(entry.get("contest_date") or entry.get("contestDate") or "").strip()
            if len(cd) >= 10 and cd[2] == "-" and cd[5] == "-":
                ymd = _ymd_from_parts(cd[6:10], cd[0:2], cd[3:5])
        if ymd is not None:
            found.add(ymd)
    # unique + sorted
    return sorted(found)


def _scoreboard_path_for_basketball(division: str, date_ymd: str) -> str:
    """Build the scoreboard path segment for one basketball game day.

    The documented example is ``d1/2024/01/all-conf`` (no day), but the route
    description says ``YYYY/MM/DD`` works, so the day is included:
    ``<division>/YYYY/MM/DD/all-conf``.
    """
    day_dir = f"{division}/{date_ymd}"
    return f"{day_dir}/all-conf"


def _scoreboard_paths_for_football(division: str, year: int, max_week: int = 20) -> List[str]:
    """Return the weekly scoreboard path segments for a football season.

    Football scoreboards are addressed as ``<division>/YYYY/WK/all-conf``.
    The real number of weeks per season is not known up front, so weeks
    ``01..max_week`` are all listed and callers are expected to tolerate
    misses.

    Args:
        division: Division path segment, e.g. ``"fbs"``.
        year: Season year.
        max_week: Highest week number to include (default 20, the limit
            that was previously hard-coded).

    Returns:
        One path per week, in ascending week order; empty if ``max_week < 1``.
    """
    return [f"{division}/{year}/{wk:02d}/all-conf" for wk in range(1, max_week + 1)]


def main() -> int:
    """Dump the school index, schedules and scoreboards to JSON files.

    Steps:
      1. Fetch ``schools-index``. This call is not guarded, so a failure
         here aborts the run.
      2. For men's and women's D1 basketball (2003 onward), fetch each
         monthly schedule, then one scoreboard per game date found in the
         stored schedule file.
      3. For FBS football, fetch each yearly schedule and the weekly
         scoreboards listed by ``_scoreboard_paths_for_football``.

    Files that already exist on disk are never re-fetched. Individual
    schedule/scoreboard failures are printed as warnings and skipped.

    NOTE(review): because existing files are skipped, a schedule or
    scoreboard fetched while its month/week is still in progress is never
    refreshed on later runs -- confirm this is intended.

    Returns:
        0, used as the process exit status.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument(
        "--base-url",
        default=os.environ.get("NCAA_API_BASE_URL", "http://127.0.0.1:3099"),
        help="base URL of the ncaa-api instance (env: NCAA_API_BASE_URL)",
    )
    ap.add_argument(
        "--out-dir",
        default=os.environ.get("NCAA_API_OUT_DIR", "/var/www/html/eventheodds/data/ncaa_api"),
        help="directory that receives the JSON files (env: NCAA_API_OUT_DIR)",
    )
    ap.add_argument(
        "--sleep",
        type=float,
        default=float(os.environ.get("NCAA_API_SLEEP", "0.25")),
        help="seconds to pause after each successful request (env: NCAA_API_SLEEP)",
    )
    ap.add_argument("--timeout", type=float, default=30.0, help="per-request timeout in seconds")
    ap.add_argument("--start-year", type=int, default=2002, help="first year to fetch")
    # time.gmtime() yields the current UTC year without the deprecated
    # datetime.utcnow().
    ap.add_argument(
        "--end-year",
        type=int,
        default=time.gmtime().tm_year,
        help="last year to fetch, inclusive (default: current UTC year)",
    )
    args = ap.parse_args()

    out_dir = Path(args.out_dir)
    client = Client(base_url=args.base_url, out_dir=out_dir, sleep=args.sleep, timeout=args.timeout)

    print(f"[ncaa_dump] base={client.base_url} out={out_dir} years={args.start_year}..{args.end_year}")

    # 1) School index (single file)
    client.dump("schools-index", out_dir / "schools-index.json")

    jobs_done = 0

    # 2) Basketball schedules -> scoreboards by day (MBB + WBB)
    for sport, division, start in [
        ("basketball-men", "d1", max(2003, args.start_year)),
        ("basketball-women", "d1", max(2003, args.start_year)),
    ]:
        for year in range(start, args.end_year + 1):
            for path in _schedule_paths_for_basketball(year):
                sched_rel = f"schedule/{sport}/{division}/{path}"
                sched_out = out_dir / "schedule" / sport / division / f"{_safe_path(path)}.json"
                # Same fetch-if-missing helper the football path uses.
                try:
                    if client.dump(sched_rel, sched_out):
                        jobs_done += 1
                except Exception as e:
                    print(f"[ncaa_dump] WARN schedule failed {sched_rel}: {e}")
                    continue

                # Use the stored schedule file for dates (avoid re-request)
                try:
                    sched_payload = json.loads(sched_out.read_text())
                    dates = _extract_game_dates_from_schedule(sched_payload)
                except Exception as e:
                    # Still best-effort, but say so: a corrupt cached file
                    # would otherwise silently yield no scoreboards.
                    print(f"[ncaa_dump] WARN schedule unreadable {sched_out}: {e}")
                    dates = []

                for ymd in dates:
                    sb_path = _scoreboard_path_for_basketball(division, ymd)
                    sb_rel = f"scoreboard/{sport}/{sb_path}"
                    sb_out = out_dir / "scoreboard" / sport / division / ymd / "all-conf.json"
                    try:
                        if client.dump(sb_rel, sb_out):
                            jobs_done += 1
                    except Exception as e:
                        # Some days/years may not have data; keep moving
                        print(f"[ncaa_dump] WARN scoreboard failed {sb_rel}: {e}")

    # 3) Football schedules + scoreboards by week (FBS only for now)
    sport = "football"
    division = "fbs"
    for year in range(args.start_year, args.end_year + 1):
        for path in _schedule_paths_for_football(year):
            sched_rel = f"schedule/{sport}/{division}/{path}"
            sched_out = out_dir / "schedule" / sport / division / f"{_safe_path(path)}.json"
            try:
                if client.dump(sched_rel, sched_out):
                    jobs_done += 1
            except Exception as e:
                print(f"[ncaa_dump] WARN schedule failed {sched_rel}: {e}")

        # Scoreboards by week
        for sb_path in _scoreboard_paths_for_football(division, year):
            sb_rel = f"scoreboard/{sport}/{sb_path}"
            # sb_path looks like fbs/YYYY/WK/all-conf
            parts = sb_path.split("/")
            yyyy, wk = parts[1], parts[2]
            sb_out = out_dir / "scoreboard" / sport / division / yyyy / wk / "all-conf.json"
            try:
                if client.dump(sb_rel, sb_out):
                    jobs_done += 1
            except Exception as e:
                print(f"[ncaa_dump] WARN scoreboard failed {sb_rel}: {e}")

    print(f"[ncaa_dump] done jobs={jobs_done}")
    return 0


if __name__ == "__main__":
    # Hand main()'s return value to the interpreter as the process exit status.
    sys.exit(main())

