#!/usr/bin/env python3
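"""Normalize UFC Kaggle datasets into PostgreSQL.

Reads the Events/Fighters/Fights CSVs from aminealibi's
ufc-fights-fighters-and-events-dataset and the odds CSV from jerzyszocik's
ufc-betting-odds-daily-dataset out of the local kagglehub cache, then upserts
the rows into the "UfcEvent", "UfcFighter", "UfcFight", and "UfcOdds" tables
via psql. The tables and their unique constraints are assumed to exist.
"""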
import csv
import json
import os
import subprocess
from datetime import datetime
from pathlib import Path
from typing import Optional


# kagglehub's default on-disk cache layout:
# ~/.cache/kagglehub/datasets/<owner>/<dataset>/versions/<n>
KAGGLE_CACHE = Path.home() / ".cache" / "kagglehub" / "datasets"

# Rows per multi-row INSERT statement sent to psql.
BATCH_SIZE = 500
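
# The two Kaggle datasets are assumed to already be in the local cache, e.g.
# fetched with kagglehub beforehand (a sketch; requires the kagglehub package):
#
#   import kagglehub
#   kagglehub.dataset_download("aminealibi/ufc-fights-fighters-and-events-dataset")
#   kagglehub.dataset_download("jerzyszocik/ufc-betting-odds-daily-dataset")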


def latest_version_path(dataset_owner: str, dataset_name: str) -> Optional[Path]:
    """Return the newest cached version directory for a dataset, or None."""
    base = KAGGLE_CACHE / dataset_owner / dataset_name / "versions"
    if not base.exists():
        return None
    versions = sorted(
        (int(v.name) for v in base.iterdir() if v.is_dir() and v.name.isdigit()),
        reverse=True,
    )
    return base / str(versions[0]) if versions else None


def parse_date(val: str) -> Optional[str]:
    """Parse a date string in any of several known formats; return ISO 8601 or None."""
    if not val:
        return None
    # Drop any time ("T...") or timezone offset ("+...") suffix before parsing.
    val = val.strip().split("T")[0].split("+")[0]
    for fmt in ("%Y-%m-%d", "%Y/%m/%d", "%d/%m/%Y"):
        try:
            return datetime.strptime(val, fmt).isoformat()
        except ValueError:
            continue
    return None


def sql_value(val) -> str:
    """Render a Python value as a SQL literal for inline VALUES lists."""
    if val is None or val == "":
        return "NULL"
    if isinstance(val, (int, float)):
        return str(val)
    # Standard SQL escaping: double any single quotes inside the string.
    s = str(val).replace("'", "''")
    return f"'{s}'"


def to_int(val) -> Optional[int]:
    """Coerce a CSV field to int (via float, so '3.0' parses); None on failure."""
    try:
        return int(float(val))
    except (TypeError, ValueError):
        return None


def to_float(val) -> Optional[float]:
    """Coerce a CSV field to float; None on failure."""
    try:
        return float(val)
    except (TypeError, ValueError):
        return None


def load_env_db_url() -> str:
    """Read SPORTS_DATABASE_URL from the environment, falling back to the app's .env file."""
    db_url = os.environ.get("SPORTS_DATABASE_URL", "")
    if db_url:
        return db_url
    env_path = "/var/www/html/eventheodds/.env"
    if os.path.exists(env_path):
        with open(env_path, "r", encoding="utf-8") as f:
            for line in f:
                if line.startswith("SPORTS_DATABASE_URL="):
                    return line.split("=", 1)[1].strip()
    return ""


def run_sql(sql: str) -> None:
    """Execute a SQL batch via psql, raising on any error."""
    db_url = load_env_db_url()
    # Strip any "?schema=..." query suffix, which psql does not understand.
    db_url = db_url.split("?schema=")[0] if "?schema=" in db_url else db_url
    if not db_url:
        raise RuntimeError("SPORTS_DATABASE_URL not set")
    tmp_path = "/tmp/ufc_kaggle_batch.sql"
    with open(tmp_path, "w", encoding="utf-8") as f:
        f.write(sql)
    # ON_ERROR_STOP makes psql exit non-zero on SQL errors; without it the
    # return code stays 0 and failed statements would go unnoticed.
    result = subprocess.run(
        ["psql", db_url, "-v", "ON_ERROR_STOP=1", "-f", tmp_path],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        timeout=300,
    )
    if result.returncode != 0:
        err = (result.stderr or "").strip()
        raise RuntimeError(f"psql failed: {err[:500]}")


def batch_insert(table, columns, batch_rows, conflict_clause):
    """Upsert a batch of row dicts into `table` with one multi-row INSERT.

    The ON CONFLICT target in `conflict_clause` must match an existing unique
    constraint (or unique index) on the table, or PostgreSQL rejects the
    statement.
    """
    if not batch_rows:
        return
    # One "(v1, v2, ...)" tuple per row, comma-separated.
    values_sql = ",\n".join(
        "(" + ", ".join(sql_value(row.get(col)) for col in columns) + ")"
        for row in batch_rows
    )
    cols_sql = ", ".join(f'"{c}"' for c in columns)
    sql = f'INSERT INTO "{table}" ({cols_sql}) VALUES\n{values_sql}\n{conflict_clause};'
    run_sql(sql)
    print(f"[ufc] Inserted {len(batch_rows)} rows into {table}")


def ingest_events():
    path = latest_version_path("aminealibi", "ufc-fights-fighters-and-events-dataset")
    if not path:
        print("[ufc] Events dataset not found")
        return
    csv_path = path / "data" / "Events.csv"
    if not csv_path.exists():
        print(f"[ufc] Events.csv not found")
        return
    
    print(f"[ufc] events_csv={csv_path}")
    rows = []
    with open(csv_path, "r", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for r in reader:
            event_id = r.get("Event_Id")
            if not event_id:
                continue
            rows.append({
                "eventId": event_id,
                "name": r.get("Name"),
                "date": parse_date(r.get("Date")),
                "location": r.get("Location"),
                "raw": json.dumps(r),
            })
    
    print(f"[ufc] events rows={len(rows)}")
    for i in range(0, len(rows), BATCH_SIZE):
        batch = rows[i:i + BATCH_SIZE]
        batch_insert(
            "UfcEvent",
            ["eventId", "name", "date", "location", "raw"],
            batch,
            'ON CONFLICT ("eventId") DO UPDATE SET name=EXCLUDED.name, date=EXCLUDED.date, location=EXCLUDED.location, raw=EXCLUDED.raw'
        )


def ingest_fighters():
    path = latest_version_path("aminealibi", "ufc-fights-fighters-and-events-dataset")
    if not path:
        print("[ufc] Fighters dataset not found")
        return
    csv_path = path / "data" / "Fighters.csv"
    if not csv_path.exists():
        print(f"[ufc] Fighters.csv not found")
        return
    
    print(f"[ufc] fighters_csv={csv_path}")
    rows = []
    with open(csv_path, "r", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for r in reader:
            fighter_id = r.get("Fighter_Id")
            if not fighter_id:
                continue
            
            height_str = r.get("Ht.", "")
            height_in = None
            if height_str and '.' in height_str:
                try:
                    parts = height_str.split('.')
                    height_in = int(parts[0]) * 12 + int(parts[1])
                except:
                    pass
            
            rows.append({
                "fighterId": fighter_id,
                "name": r.get("Full Name"),
                "nickname": r.get("Nickname") if r.get("Nickname") != "No Nickname" else None,
                "heightIn": height_in,
                "weightLb": to_float(r.get("Wt.")),
                "reachIn": to_float(r.get("Reach")),
                "stance": r.get("Stance"),
                "wins": to_int(r.get("W")),
                "losses": to_int(r.get("L")),
                "draws": to_int(r.get("D")),
                "belt": r.get("Belt") if r.get("Belt") == "True" else None,
                "raw": json.dumps(r),
            })
    
    print(f"[ufc] fighters rows={len(rows)}")
    for i in range(0, len(rows), BATCH_SIZE):
        batch = rows[i:i + BATCH_SIZE]
        batch_insert(
            "UfcFighter",
            ["fighterId", "name", "nickname", "heightIn", "weightLb", "reachIn", "stance", "wins", "losses", "draws", "belt", "raw"],
            batch,
            'ON CONFLICT ("fighterId") DO UPDATE SET name=EXCLUDED.name, nickname=EXCLUDED.nickname, "heightIn"=EXCLUDED."heightIn", "weightLb"=EXCLUDED."weightLb", "reachIn"=EXCLUDED."reachIn", stance=EXCLUDED.stance, wins=EXCLUDED.wins, losses=EXCLUDED.losses, draws=EXCLUDED.draws, belt=EXCLUDED.belt, raw=EXCLUDED.raw'
        )


def ingest_fights():
    path = latest_version_path("aminealibi", "ufc-fights-fighters-and-events-dataset")
    if not path:
        print("[ufc] Fights dataset not found")
        return
    csv_path = path / "data" / "Fights.csv"
    if not csv_path.exists():
        print(f"[ufc] Fights.csv not found")
        return
    
    print(f"[ufc] fights_csv={csv_path}")
    rows = []
    with open(csv_path, "r", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for r in reader:
            fight_id = r.get("Fight_Id")
            if not fight_id:
                continue
            rows.append({
                "fightId": fight_id,
                "eventId": r.get("Event_Id"),
                "fighter1Id": r.get("Fighter_Id_1"),
                "fighter2Id": r.get("Fighter_Id_2"),
                "fighter1Name": r.get("Fighter_1"),
                "fighter2Name": r.get("Fighter_2"),
                "weightClass": r.get("Weight_Class"),
                "method": r.get("Method"),
                "round": to_int(r.get("Round")),
                "fightTime": r.get("Fight_Time"),
                "result1": r.get("Result_1"),
                "result2": r.get("Result_2"),
                "referee": r.get("Referee"),
                "methodDetails": r.get("Method Details"),
                "raw": json.dumps(r),
            })
    
    print(f"[ufc] fights rows={len(rows)}")
    for i in range(0, len(rows), BATCH_SIZE):
        batch = rows[i:i + BATCH_SIZE]
        batch_insert(
            "UfcFight",
            ["fightId", "eventId", "fighter1Id", "fighter2Id", "fighter1Name", "fighter2Name", "weightClass", "method", "round", "fightTime", "result1", "result2", "referee", "methodDetails", "raw"],
            batch,
            'ON CONFLICT ("fightId") DO UPDATE SET "eventId"=EXCLUDED."eventId", "fighter1Id"=EXCLUDED."fighter1Id", "fighter2Id"=EXCLUDED."fighter2Id", "fighter1Name"=EXCLUDED."fighter1Name", "fighter2Name"=EXCLUDED."fighter2Name", "weightClass"=EXCLUDED."weightClass", method=EXCLUDED.method, round=EXCLUDED.round, "fightTime"=EXCLUDED."fightTime", result1=EXCLUDED.result1, result2=EXCLUDED.result2, referee=EXCLUDED.referee, "methodDetails"=EXCLUDED."methodDetails", raw=EXCLUDED.raw'
        )


def ingest_odds():
    path = latest_version_path("jerzyszocik", "ufc-betting-odds-daily-dataset")
    if not path:
        print("[ufc] Odds dataset not found")
        return
    csv_path = path / "UFC_betting_odds.csv"
    if not csv_path.exists():
        print(f"[ufc] UFC_betting_odds.csv not found")
        return
    
    print(f"[ufc] odds_csv={csv_path}")
    rows = []
    with open(csv_path, "r", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for r in reader:
            fighter1 = r.get("fighter_1")
            fighter2 = r.get("fighter_2")
            if not fighter1 or not fighter2:
                continue
            
            rows.append({
                "fightUrl": r.get("fight_url") or None,
                "fighter1": fighter1,
                "fighter2": fighter2,
                "odds1": to_float(r.get("odds_1")),
                "odds2": to_float(r.get("odds_2")),
                "f1KoOdds": to_float(r.get("f1_ko_odds")),
                "f2KoOdds": to_float(r.get("f2_ko_odds")),
                "f1SubOdds": to_float(r.get("f1_sub_odds")),
                "f2SubOdds": to_float(r.get("f2_sub_odds")),
                "f1DecOdds": to_float(r.get("f1_dec_odds")),
                "f2DecOdds": to_float(r.get("f2_dec_odds")),
                "eventDate": parse_date(r.get("event_date")),
                "addingDate": parse_date(r.get("adding_date")),
                "source": r.get("source"),
                "region": r.get("region") or "unknown",
                "raw": json.dumps(r),
            })
    
    print(f"[ufc] odds rows={len(rows)}")
    for i in range(0, len(rows), BATCH_SIZE):
        batch = rows[i:i + BATCH_SIZE]
        batch_insert(
            "UfcOdds",
            ["fightUrl", "fighter1", "fighter2", "odds1", "odds2", "f1KoOdds", "f2KoOdds", "f1SubOdds", "f2SubOdds", "f1DecOdds", "f2DecOdds", "eventDate", "addingDate", "source", "region", "raw"],
            batch,
            'ON CONFLICT (fighter1, fighter2, "eventDate", source, region) DO UPDATE SET "fightUrl"=EXCLUDED."fightUrl", odds1=EXCLUDED.odds1, odds2=EXCLUDED.odds2, "f1KoOdds"=EXCLUDED."f1KoOdds", "f2KoOdds"=EXCLUDED."f2KoOdds", "f1SubOdds"=EXCLUDED."f1SubOdds", "f2SubOdds"=EXCLUDED."f2SubOdds", "f1DecOdds"=EXCLUDED."f1DecOdds", "f2DecOdds"=EXCLUDED."f2DecOdds", "addingDate"=EXCLUDED."addingDate", raw=EXCLUDED.raw'
        )


def main():
    print("[ufc] Starting UFC Kaggle normalization...")
    ingest_events()
    ingest_fighters()
    ingest_fights()
    ingest_odds()
    print("[ufc] UFC normalization complete")


if __name__ == "__main__":
    main()
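
# Example invocation (a sketch; "ufc_kaggle_ingest.py" is a hypothetical
# filename, psql must be on PATH, and the Ufc* tables with unique constraints
# matching the ON CONFLICT targets above must already exist):
#
#   SPORTS_DATABASE_URL='postgresql://user:pass@host/db' python3 ufc_kaggle_ingest.py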
