#!/usr/bin/env python3
"""
KenPom Data Ingestion Pipeline for Rainmaker
---------------------------------------------
Fetches all KenPom endpoints and stores in normalized tables.
Designed for daily cron execution.

Usage:
    python3 ingest.py                    # Full daily pull (all endpoints)
    python3 ingest.py --endpoint ratings # Single endpoint
    python3 ingest.py --fanmatch-date 2026-03-08  # Specific fanmatch date
    python3 ingest.py --archive-date 2026-02-15   # Historical archive
    python3 ingest.py --backfill-archive 30        # Backfill N days of archive
"""

import argparse
import json
import logging
import os
import re
import sys
import time
import unicodedata
from datetime import datetime, date, timedelta
from urllib.error import HTTPError, URLError
from urllib.parse import urlencode
from urllib.request import Request, urlopen

import psycopg2
from psycopg2.extras import execute_values

# ── Config ──────────────────────────────────────────────────────

# API key comes from the environment, with a hardcoded fallback.
# SECURITY NOTE(review): the fallback looks like a live credential committed
# to source — it should be removed and the key supplied via KENPOM_API_KEY
# only (confirm with whoever owns the KenPom account before rotating).
KENPOM_API_KEY = os.environ.get(
    "KENPOM_API_KEY",
    "d330867ab2c7deae412e8fe1274021fba1e9a9af5c5b53a9e1c5e289b32a8edb",
)
KENPOM_BASE = "https://kenpom.com/api.php"
SEASON = 2026  # 2025-26 season

# Postgres connection settings for the Rainmaker sports database.
# SECURITY NOTE(review): plaintext DB credentials in source; prefer
# environment variables or a secrets manager.
DB_CONFIG = {
    "host": "127.0.0.1",
    "port": 5433,
    "dbname": "eventheodds_sports",
    "user": "eventheodds",
    "password": "eventheodds_dev_password",
}

# Rate limit: be respectful to KenPom's servers
REQUEST_DELAY_SECONDS = 1.5

# Log both to a file under /var/log/rainmaker and to the console.
# The directory is created at import time so cron runs don't fail on a
# missing path (requires write permission to /var/log).
LOG_DIR = "/var/log/rainmaker"
os.makedirs(LOG_DIR, exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler(f"{LOG_DIR}/kenpom-ingest.log"),
        logging.StreamHandler(),
    ],
)
log = logging.getLogger("kenpom")


# ── Name Normalization ──────────────────────────────────────────

def normalize_team_name(name: str) -> str:
    """Produce a canonical lowercase key from any team name variant.

    Strips diacritics, punctuation, extra whitespace, and common
    institutional suffixes ("univ", "university", "college") so that
    spelling variants of the same school collapse to one key.

    Args:
        name: Raw team name; None/empty is tolerated.

    Returns:
        Canonical key string, or "" for falsy input.
    """
    if not name:
        return ""
    # Unicode normalize → strip diacritics (e.g. "José" → "Jose").
    s = unicodedata.normalize("NFKD", name)
    s = "".join(c for c in s if not unicodedata.combining(c))
    # Lowercase
    s = s.lower().strip()
    # Remove apostrophes, periods, hyphens, commas and similar punctuation.
    s = re.sub(r"['\"\.\-,;:!?()]", "", s)
    # Drop common institutional suffixes. (The previous "st." → "st"
    # rewrite was a no-op: periods are already stripped above.)
    s = re.sub(r"\buniv\b", "", s)
    s = re.sub(r"\buniversity\b", "", s)
    s = re.sub(r"\bcollege\b", "", s)
    # Collapse whitespace AFTER suffix removal — removing a mid-string
    # word previously left an internal double space (e.g. "kansas
    # university jayhawks" → "kansas  jayhawks"), yielding keys that
    # failed exact-match lookups.
    s = re.sub(r"\s+", " ", s).strip()
    return s


# ── API Client ──────────────────────────────────────────────────

def kenpom_fetch(endpoint: str, params: dict = None, retries: int = 3) -> list:
    """Fetch a KenPom API endpoint with retry logic and rate limiting.

    Args:
        endpoint: KenPom endpoint name (e.g. "ratings", "fanmatch").
        params: Optional extra query parameters. The caller's dict is
            copied, not mutated (the old code wrote "endpoint" into it).
        retries: Maximum attempts before giving up.

    Returns:
        List of row dicts; [] on error. A single-object response is
        wrapped in a one-element list.
    """
    # Copy so we never mutate the caller's dict; urlencode() percent-escapes
    # values (dates, names with spaces) that a naive join would corrupt.
    query = dict(params or {})
    query["endpoint"] = endpoint
    url = f"{KENPOM_BASE}?{urlencode(query)}"

    for attempt in range(1, retries + 1):
        try:
            req = Request(url)
            req.add_header("Authorization", f"Bearer {KENPOM_API_KEY}")
            req.add_header("User-Agent", "Rainmaker/1.0")

            with urlopen(req, timeout=30) as resp:
                raw = resp.read().decode("utf-8")
                data = json.loads(raw)

            # API signals errors in-band via an "error" key.
            if isinstance(data, dict) and "error" in data:
                log.error(f"KenPom API error on {endpoint}: {data['error']}")
                return []

            # Courtesy delay so back-to-back calls don't hammer the API.
            time.sleep(REQUEST_DELAY_SECONDS)
            return data if isinstance(data, list) else [data]

        except HTTPError as e:
            log.warning(f"HTTP {e.code} on {endpoint} (attempt {attempt}/{retries})")
            if e.code == 429:  # rate limited: back off linearly, capped at 30s
                wait = min(30, 5 * attempt)
                log.info(f"Rate limited, waiting {wait}s...")
                time.sleep(wait)
            elif e.code >= 500:  # transient server error — retry after a pause
                time.sleep(3 * attempt)
            else:  # other 4xx won't succeed on retry; log body snippet and bail
                log.error(f"HTTP {e.code} on {endpoint}: {e.read().decode()[:200]}")
                return []
        except (URLError, OSError) as e:
            log.warning(f"Network error on {endpoint} (attempt {attempt}/{retries}): {e}")
            time.sleep(3 * attempt)

    log.error(f"Failed to fetch {endpoint} after {retries} attempts")
    return []


# ── Database Helpers ────────────────────────────────────────────

def get_conn():
    """Open a new psycopg2 connection using the module-level DB_CONFIG."""
    return psycopg2.connect(**DB_CONFIG)


def log_ingestion(conn, endpoint, season, pull_date, rows_fetched, rows_inserted,
                  rows_updated=0, status="success", error_msg=None, duration_ms=0):
    """Insert one audit row into kp_ingestion_log and commit immediately."""
    record = (endpoint, season, pull_date, rows_fetched, rows_inserted,
              rows_updated, status, error_msg, duration_ms)
    with conn.cursor() as cur:
        cur.execute("""
            INSERT INTO kp_ingestion_log
            (endpoint, season, pull_date, rows_fetched, rows_inserted, rows_updated,
             status, error_message, duration_ms)
            VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)
        """, record)
    conn.commit()


# ── Endpoint Ingestors ──────────────────────────────────────────

def ingest_ratings(conn, season=SEASON):
    """Ingest kp_ratings from the ratings endpoint.

    Fetches every team's efficiency/tempo/luck/SOS ratings for *season*,
    upserts them into kp_ratings (one snapshot row per team per pull
    date), and records the run in kp_ingestion_log.

    Returns:
        Number of team rows written (0 on fetch failure).
    """
    t0 = time.time()
    data = kenpom_fetch("ratings", {"y": season})
    if not data:
        log_ingestion(conn, "ratings", season, date.today(), 0, 0, status="error",
                      error_msg="No data returned")
        return 0

    today = date.today()
    rows = []
    for t in data:
        # Positional tuple — order must match the column list in the
        # INSERT statement below exactly.
        rows.append((
            season, t.get("TeamName"), normalize_team_name(t.get("TeamName", "")),
            t.get("ConfShort"), t.get("Coach"), t.get("Wins"), t.get("Losses"),
            t.get("Seed"), t.get("AdjEM"), t.get("RankAdjEM"),
            t.get("Pythag"), t.get("RankPythag"),
            t.get("AdjOE"), t.get("RankAdjOE"), t.get("OE"),
            t.get("AdjDE"), t.get("RankAdjDE"), t.get("DE"),
            t.get("Tempo"), t.get("AdjTempo"), t.get("RankAdjTempo"),
            t.get("Luck"), t.get("RankLuck"),
            t.get("SOS"), t.get("RankSOS"),
            t.get("SOSO"), t.get("RankSOSO"),
            t.get("SOSD"), t.get("RankSOSD"),
            t.get("NCSOS"), t.get("RankNCSOS"),
            t.get("APL_Off"), t.get("APL_Def"),
            t.get("Event"), t.get("DataThrough"),
            today,
        ))

    with conn.cursor() as cur:
        # Upsert keyed on (season, team_name, pull_date): re-running the
        # pull on the same day refreshes that day's snapshot in place.
        execute_values(cur, """
            INSERT INTO kp_ratings
            (season, team_name, team_name_canonical, conf_short, coach, wins, losses,
             seed, adj_em, rank_adj_em, pythag, rank_pythag,
             adj_oe, rank_adj_oe, oe, adj_de, rank_adj_de, de,
             tempo, adj_tempo, rank_adj_tempo, luck, rank_luck,
             sos, rank_sos, soso, rank_soso, sosd, rank_sosd,
             ncsos, rank_ncsos, apl_off, apl_def, event, data_through, pull_date)
            VALUES %s
            ON CONFLICT (season, team_name, pull_date) DO UPDATE SET
                adj_em = EXCLUDED.adj_em, rank_adj_em = EXCLUDED.rank_adj_em,
                adj_oe = EXCLUDED.adj_oe, rank_adj_oe = EXCLUDED.rank_adj_oe,
                adj_de = EXCLUDED.adj_de, rank_adj_de = EXCLUDED.rank_adj_de,
                adj_tempo = EXCLUDED.adj_tempo, wins = EXCLUDED.wins,
                losses = EXCLUDED.losses, seed = EXCLUDED.seed,
                luck = EXCLUDED.luck, sos = EXCLUDED.sos, rank_sos = EXCLUDED.rank_sos,
                data_through = EXCLUDED.data_through, pulled_at = NOW()
        """, rows)
    conn.commit()

    ms = int((time.time() - t0) * 1000)
    log_ingestion(conn, "ratings", season, today, len(data), len(rows), duration_ms=ms)
    log.info(f"Ratings: {len(rows)} teams ingested ({ms}ms)")
    return len(rows)


def ingest_four_factors(conn, season=SEASON, conf_only=False):
    """Ingest kp_four_factors (offensive/defensive four-factor stats).

    Args:
        conn: Open psycopg2 connection.
        season: KenPom season year.
        conf_only: When True, request conference-game-only splits; the
            success row is then logged under "four-factors-conf".
            NOTE(review): the error path always logs plain
            "four-factors" — confirm whether that is intentional.

    Returns:
        Number of team rows written (0 on fetch failure).
    """
    t0 = time.time()
    params = {"y": season}
    if conf_only:
        params["conf_only"] = "true"
    data = kenpom_fetch("four-factors", params)
    if not data:
        log_ingestion(conn, "four-factors", season, date.today(), 0, 0, status="error")
        return 0

    today = date.today()
    rows = []
    for t in data:
        # API reports ConfOnly as a string; coerce to bool.
        is_conf = str(t.get("ConfOnly", "false")).lower() == "true"
        # Positional tuple — order must match the INSERT column list below.
        rows.append((
            season, t.get("TeamName"), normalize_team_name(t.get("TeamName", "")),
            is_conf,
            t.get("eFG_Pct"), t.get("RankeFG_Pct"),
            t.get("TO_Pct"), t.get("RankTO_Pct"),
            t.get("OR_Pct"), t.get("RankOR_Pct"),
            t.get("FT_Rate"), t.get("RankFT_Rate"),
            t.get("DeFG_Pct"), t.get("RankDeFG_Pct"),
            t.get("DTO_Pct"), t.get("RankDTO_Pct"),
            t.get("DOR_Pct"), t.get("RankDOR_Pct"),
            t.get("DFT_Rate"), t.get("RankDFT_Rate"),
            t.get("AdjOE"), t.get("AdjDE"), t.get("AdjTempo"),
            t.get("DataThrough"), today,
        ))

    with conn.cursor() as cur:
        # Upsert keyed on (season, team_name, pull_date, conf_only) so the
        # all-games and conference-only snapshots coexist per day.
        execute_values(cur, """
            INSERT INTO kp_four_factors
            (season, team_name, team_name_canonical, conf_only,
             efg_pct, rank_efg_pct, to_pct, rank_to_pct,
             or_pct, rank_or_pct, ft_rate, rank_ft_rate,
             defg_pct, rank_defg_pct, dto_pct, rank_dto_pct,
             dor_pct, rank_dor_pct, dft_rate, rank_dft_rate,
             adj_oe, adj_de, adj_tempo, data_through, pull_date)
            VALUES %s
            ON CONFLICT (season, team_name, pull_date, conf_only) DO UPDATE SET
                efg_pct = EXCLUDED.efg_pct, to_pct = EXCLUDED.to_pct,
                or_pct = EXCLUDED.or_pct, ft_rate = EXCLUDED.ft_rate,
                defg_pct = EXCLUDED.defg_pct, dto_pct = EXCLUDED.dto_pct,
                dor_pct = EXCLUDED.dor_pct, dft_rate = EXCLUDED.dft_rate,
                adj_oe = EXCLUDED.adj_oe, adj_de = EXCLUDED.adj_de,
                pulled_at = NOW()
        """, rows)
    conn.commit()

    ms = int((time.time() - t0) * 1000)
    label = "four-factors-conf" if conf_only else "four-factors"
    log_ingestion(conn, label, season, today, len(data), len(rows), duration_ms=ms)
    log.info(f"Four Factors {'(conf)' if conf_only else ''}: {len(rows)} teams ({ms}ms)")
    return len(rows)


def ingest_height(conn, season=SEASON):
    """Pull the height/experience/continuity endpoint into kp_height."""
    started = time.time()
    data = kenpom_fetch("height", {"y": season})
    if not data:
        log_ingestion(conn, "height", season, date.today(), 0, 0, status="error")
        return 0

    today = date.today()
    # One positional tuple per team, matching the INSERT column list below.
    rows = [
        (
            season, team.get("TeamName"), normalize_team_name(team.get("TeamName", "")),
            team.get("ConfShort"),
            team.get("AvgHgt"), team.get("HgtEff"),
            team.get("Hgt5"), team.get("Hgt4"), team.get("Hgt3"), team.get("Hgt2"), team.get("Hgt1"),
            team.get("Exp"), team.get("ExpRank"),
            team.get("Bench"), team.get("BenchRank"),
            team.get("Continuity"), team.get("RankContinuity"),
            team.get("DataThrough"), today,
        )
        for team in data
    ]

    cur = conn.cursor()
    try:
        execute_values(cur, """
            INSERT INTO kp_height
            (season, team_name, team_name_canonical, conf_short,
             avg_hgt, hgt_eff, hgt_5, hgt_4, hgt_3, hgt_2, hgt_1,
             experience, rank_exp, bench, rank_bench, continuity, rank_continuity,
             data_through, pull_date)
            VALUES %s
            ON CONFLICT (season, team_name, pull_date) DO UPDATE SET
                avg_hgt = EXCLUDED.avg_hgt, hgt_eff = EXCLUDED.hgt_eff,
                experience = EXCLUDED.experience, bench = EXCLUDED.bench,
                continuity = EXCLUDED.continuity, pulled_at = NOW()
        """, rows)
    finally:
        cur.close()
    conn.commit()

    ms = int((time.time() - started) * 1000)
    log_ingestion(conn, "height", season, today, len(data), len(rows), duration_ms=ms)
    log.info(f"Height/Exp: {len(rows)} teams ({ms}ms)")
    return len(rows)


def ingest_misc_stats(conn, season=SEASON, conf_only=False):
    """Ingest kp_misc_stats (shooting percentages, block/steal/assist rates).

    Args:
        conn: Open psycopg2 connection.
        season: KenPom season year.
        conf_only: When True, request conference-game-only splits.

    Returns:
        Number of team rows written (0 on fetch failure).
    """
    t0 = time.time()
    params = {"y": season}
    if conf_only:
        params["conf_only"] = "true"
    data = kenpom_fetch("misc-stats", params)
    if not data:
        log_ingestion(conn, "misc-stats", season, date.today(), 0, 0, status="error")
        return 0

    today = date.today()
    rows = []
    for t in data:
        # API reports ConfOnly as a string; coerce to bool.
        is_conf = str(t.get("ConfOnly", "false")).lower() == "true"
        # Positional tuple — order must match the INSERT column list below.
        rows.append((
            season, t.get("TeamName"), normalize_team_name(t.get("TeamName", "")),
            t.get("ConfShort"), is_conf,
            t.get("FG3Pct"), t.get("RankFG3Pct"),
            t.get("FG2Pct"), t.get("RankFG2Pct"),
            t.get("FTPct"), t.get("RankFTPct"),
            t.get("BlockPct"), t.get("RankBlockPct"),
            t.get("StlRate"), t.get("RankStlRate"),
            t.get("NSTRate"), t.get("RankNSTRate"),
            t.get("ARate"), t.get("RankARate"),
            t.get("F3GRate"), t.get("RankF3GRate"),
            t.get("Avg2PADist"), t.get("RankAvg2PADist"),
            t.get("OppFG3Pct"), t.get("RankOppFG3Pct"),
            t.get("OppFG2Pct"), t.get("RankOppFG2Pct"),
            t.get("OppFTPct"),
            t.get("OppBlockPct"), t.get("RankOppBlockPct"),
            t.get("OppStlRate"), t.get("RankOppStlRate"),
            t.get("OppNSTRate"),
            t.get("OppARate"), t.get("RankOppARate"),
            t.get("OppF3GRate"), t.get("RankOppF3GRate"),
            t.get("OppAvg2PADist"),
            t.get("AdjOE"), t.get("AdjDE"),
            t.get("DataThrough"), today,
        ))

    with conn.cursor() as cur:
        execute_values(cur, """
            INSERT INTO kp_misc_stats
            (season, team_name, team_name_canonical, conf_short, conf_only,
             fg3_pct, rank_fg3_pct, fg2_pct, rank_fg2_pct,
             ft_pct, rank_ft_pct, block_pct, rank_block_pct,
             stl_rate, rank_stl_rate, nst_rate, rank_nst_rate,
             a_rate, rank_a_rate, f3g_rate, rank_f3g_rate,
             avg_2pa_dist, rank_avg_2pa_dist,
             opp_fg3_pct, rank_opp_fg3_pct, opp_fg2_pct, rank_opp_fg2_pct,
             opp_ft_pct, opp_block_pct, rank_opp_block_pct,
             opp_stl_rate, rank_opp_stl_rate, opp_nst_rate,
             opp_a_rate, rank_opp_a_rate, opp_f3g_rate, rank_opp_f3g_rate,
             opp_avg_2pa_dist, adj_oe, adj_de,
             data_through, pull_date)
            VALUES %s
            ON CONFLICT (season, team_name, pull_date, conf_only) DO UPDATE SET
                fg3_pct = EXCLUDED.fg3_pct, fg2_pct = EXCLUDED.fg2_pct,
                ft_pct = EXCLUDED.ft_pct, block_pct = EXCLUDED.block_pct,
                stl_rate = EXCLUDED.stl_rate, a_rate = EXCLUDED.a_rate,
                opp_fg3_pct = EXCLUDED.opp_fg3_pct, opp_fg2_pct = EXCLUDED.opp_fg2_pct,
                adj_oe = EXCLUDED.adj_oe, adj_de = EXCLUDED.adj_de,
                pulled_at = NOW()
        """, rows)
    conn.commit()

    ms = int((time.time() - t0) * 1000)
    # Consistency fix: distinguish conference-only pulls in the ingestion
    # log, mirroring ingest_four_factors ("four-factors-conf"); previously
    # both variants were logged under the same "misc-stats" label.
    label = "misc-stats-conf" if conf_only else "misc-stats"
    log_ingestion(conn, label, season, today, len(data), len(rows), duration_ms=ms)
    log.info(f"Misc Stats: {len(rows)} teams ({ms}ms)")
    return len(rows)


def ingest_point_dist(conn, season=SEASON, conf_only=False):
    """Ingest kp_point_dist (share of points from FT/2P/3P, both sides).

    Args:
        conn: Open psycopg2 connection.
        season: KenPom season year.
        conf_only: When True, request conference-game-only splits.

    Returns:
        Number of team rows written (0 on fetch failure).
    """
    t0 = time.time()
    params = {"y": season}
    if conf_only:
        params["conf_only"] = "true"
    data = kenpom_fetch("pointdist", params)
    if not data:
        log_ingestion(conn, "pointdist", season, date.today(), 0, 0, status="error")
        return 0

    today = date.today()
    rows = []
    for t in data:
        # API reports ConfOnly as a string; coerce to bool.
        is_conf = str(t.get("ConfOnly", "false")).lower() == "true"
        # Positional tuple — order must match the INSERT column list below.
        rows.append((
            season, t.get("TeamName"), normalize_team_name(t.get("TeamName", "")),
            t.get("ConfShort"), is_conf,
            t.get("OffFt"), t.get("RankOffFt"),
            t.get("OffFg2"), t.get("RankOffFg2"),
            t.get("OffFg3"), t.get("RankOffFg3"),
            t.get("DefFt"), t.get("RankDefFt"),
            t.get("DefFg2"), t.get("RankDefFg2"),
            t.get("DefFg3"), t.get("RankDefFg3"),
            t.get("DataThrough"), today,
        ))

    with conn.cursor() as cur:
        execute_values(cur, """
            INSERT INTO kp_point_dist
            (season, team_name, team_name_canonical, conf_short, conf_only,
             off_ft, rank_off_ft, off_fg2, rank_off_fg2, off_fg3, rank_off_fg3,
             def_ft, rank_def_ft, def_fg2, rank_def_fg2, def_fg3, rank_def_fg3,
             data_through, pull_date)
            VALUES %s
            ON CONFLICT (season, team_name, pull_date, conf_only) DO UPDATE SET
                off_ft = EXCLUDED.off_ft, off_fg2 = EXCLUDED.off_fg2,
                off_fg3 = EXCLUDED.off_fg3, def_ft = EXCLUDED.def_ft,
                def_fg2 = EXCLUDED.def_fg2, def_fg3 = EXCLUDED.def_fg3,
                pulled_at = NOW()
        """, rows)
    conn.commit()

    ms = int((time.time() - t0) * 1000)
    # Consistency fix: distinguish conference-only pulls in the ingestion
    # log, mirroring ingest_four_factors; previously both variants were
    # logged under the same "pointdist" label.
    label = "pointdist-conf" if conf_only else "pointdist"
    log_ingestion(conn, label, season, today, len(data), len(rows), duration_ms=ms)
    log.info(f"Point Dist: {len(rows)} teams ({ms}ms)")
    return len(rows)


def ingest_conf_ratings(conn, season=SEASON):
    """Pull conference-level ratings into kp_conf_ratings.

    The endpoint repeats each conference once per member team, so rows
    are de-duplicated on (season, conf_short), keeping the first
    occurrence of each.
    """
    t0 = time.time()
    data = kenpom_fetch("conf-ratings", {"y": season})
    if not data:
        log_ingestion(conn, "conf-ratings", season, date.today(), 0, 0, status="error")
        return 0

    today = date.today()
    # dict preserves insertion order, so "first occurrence wins" exactly
    # like a seen-set would.
    by_conf = {}
    for entry in data:
        dedup_key = (season, entry.get("ConfShort"))
        if dedup_key not in by_conf:
            by_conf[dedup_key] = (
                season, entry.get("ConfShort"), entry.get("ConfLong"), entry.get("ConfID"),
                entry.get("Rank"), entry.get("Rating"), today,
            )
    rows = list(by_conf.values())

    cur = conn.cursor()
    try:
        execute_values(cur, """
            INSERT INTO kp_conf_ratings
            (season, conf_short, conf_long, conf_id, rank, rating, pull_date)
            VALUES %s
            ON CONFLICT (season, conf_short, pull_date) DO UPDATE SET
                rank = EXCLUDED.rank, rating = EXCLUDED.rating, pulled_at = NOW()
        """, rows)
    finally:
        cur.close()
    conn.commit()

    ms = int((time.time() - t0) * 1000)
    log_ingestion(conn, "conf-ratings", season, today, len(data), len(rows), duration_ms=ms)
    log.info(f"Conf Ratings: {len(rows)} conferences ({ms}ms)")
    return len(rows)


def ingest_fanmatch(conn, game_date: str, season=SEASON):
    """Pull KenPom fanmatch game predictions for one date into kp_fanmatch."""
    t0 = time.time()
    data = kenpom_fetch("fanmatch", {"d": game_date})
    if not data:
        log_ingestion(conn, "fanmatch", season, game_date, 0, 0, status="error")
        return 0

    today = date.today()
    # One positional tuple per game, matching the INSERT column list below.
    rows = [
        (
            game.get("Season", season), game.get("GameID"),
            game.get("DateOfGame", game_date),
            game.get("Visitor"), normalize_team_name(game.get("Visitor", "")),
            game.get("Home"), normalize_team_name(game.get("Home", "")),
            game.get("HomeRank"), game.get("VisitorRank"),
            game.get("HomePred"), game.get("VisitorPred"),
            game.get("HomeWP"), game.get("PredTempo"), game.get("ThrillScore"),
            today,
        )
        for game in data
    ]

    cur = conn.cursor()
    try:
        execute_values(cur, """
            INSERT INTO kp_fanmatch
            (season, game_id, game_date, visitor, visitor_canonical,
             home, home_canonical, home_rank, visitor_rank,
             home_pred, visitor_pred, home_wp, pred_tempo, thrill_score, pull_date)
            VALUES %s
            ON CONFLICT (season, game_id, pull_date) DO UPDATE SET
                home_wp = EXCLUDED.home_wp, home_pred = EXCLUDED.home_pred,
                visitor_pred = EXCLUDED.visitor_pred, pulled_at = NOW()
        """, rows)
    finally:
        cur.close()
    conn.commit()

    ms = int((time.time() - t0) * 1000)
    log_ingestion(conn, "fanmatch", season, game_date, len(data), len(rows), duration_ms=ms)
    log.info(f"Fanmatch {game_date}: {len(rows)} games ({ms}ms)")
    return len(rows)


def ingest_teams(conn, season=SEASON):
    """Pull the team-ID lookup endpoint into kp_teams."""
    t0 = time.time()
    data = kenpom_fetch("teams", {"y": season})
    if not data:
        log_ingestion(conn, "teams", season, date.today(), 0, 0, status="error")
        return 0

    # One positional tuple per team, matching the INSERT column list below.
    rows = [
        (
            season, team.get("TeamName"), normalize_team_name(team.get("TeamName", "")),
            team.get("TeamID"), team.get("ConfShort"), team.get("Coach"),
            team.get("Arena"), team.get("ArenaCity"), team.get("ArenaState"),
        )
        for team in data
    ]

    cur = conn.cursor()
    try:
        execute_values(cur, """
            INSERT INTO kp_teams
            (season, team_name, team_name_canonical, team_id, conf_short, coach,
             arena, arena_city, arena_state)
            VALUES %s
            ON CONFLICT (season, team_id) DO UPDATE SET
                team_name = EXCLUDED.team_name, coach = EXCLUDED.coach,
                pulled_at = NOW()
        """, rows)
    finally:
        cur.close()
    conn.commit()

    ms = int((time.time() - t0) * 1000)
    log_ingestion(conn, "teams", season, date.today(), len(data), len(rows), duration_ms=ms)
    log.info(f"Teams: {len(rows)} teams ({ms}ms)")
    return len(rows)


def ingest_archive(conn, archive_date: str, season=SEASON):
    """Ingest kp_archive for a specific historical date.

    Unlike the daily-snapshot tables there is no pull_date column here:
    rows are keyed on (season, team_name, archive_date, is_preseason)
    and refreshed in place on conflict.

    Returns:
        Number of team rows written (0 on fetch failure).
    """
    t0 = time.time()
    data = kenpom_fetch("archive", {"d": archive_date})
    if not data:
        log_ingestion(conn, "archive", season, archive_date, 0, 0, status="error")
        return 0

    rows = []
    for t in data:
        # API reports Preseason as a string; coerce to bool.
        is_pre = str(t.get("Preseason", "false")).lower() == "true"
        # Positional tuple — order must match the INSERT column list below.
        # ArchiveDate/Season fall back to the requested values if absent.
        rows.append((
            t.get("ArchiveDate", archive_date), t.get("Season", season), is_pre,
            t.get("TeamName"), normalize_team_name(t.get("TeamName", "")),
            t.get("Seed"), t.get("Event"), t.get("ConfShort"),
            t.get("AdjEM"), t.get("RankAdjEM"),
            t.get("AdjOE"), t.get("RankAdjOE"),
            t.get("AdjDE"), t.get("RankAdjDE"),
            t.get("AdjTempo"), t.get("RankAdjTempo"),
            t.get("AdjEMFinal"), t.get("RankAdjEMFinal"),
            t.get("AdjOEFinal"), t.get("AdjDEFinal"), t.get("AdjTempoFinal"),
            t.get("RankChg"), t.get("AdjEMChg"), t.get("AdjTChg"),
        ))

    with conn.cursor() as cur:
        execute_values(cur, """
            INSERT INTO kp_archive
            (archive_date, season, is_preseason, team_name, team_name_canonical,
             seed, event, conf_short,
             adj_em, rank_adj_em, adj_oe, rank_adj_oe,
             adj_de, rank_adj_de, adj_tempo, rank_adj_tempo,
             adj_em_final, rank_adj_em_final,
             adj_oe_final, adj_de_final, adj_tempo_final,
             rank_chg, adj_em_chg, adj_t_chg)
            VALUES %s
            ON CONFLICT (season, team_name, archive_date, is_preseason) DO UPDATE SET
                adj_em = EXCLUDED.adj_em, rank_adj_em = EXCLUDED.rank_adj_em,
                adj_oe = EXCLUDED.adj_oe, adj_de = EXCLUDED.adj_de,
                pulled_at = NOW()
        """, rows)
    conn.commit()

    ms = int((time.time() - t0) * 1000)
    log_ingestion(conn, "archive", season, archive_date, len(data), len(rows), duration_ms=ms)
    log.info(f"Archive {archive_date}: {len(rows)} teams ({ms}ms)")
    return len(rows)


# ── Team Name Map Builder ──────────────────────────────────────

def build_team_map(conn, season=SEASON):
    """Cross-reference KenPom team names against existing Rainmaker team names.

    Matching strategy, in order:
      1. exact match on the normalized key,
      2. fuzzy match ("st"/"state" swaps, trailing-suffix strip, first word),
      3. otherwise insert an 'unmatched' row for manual review.

    Returns:
        (matched_count, unmatched_kp_names) tuple.
    """
    log.info("Building kp_team_map...")

    with conn.cursor() as cur:
        # Get all KenPom team names
        cur.execute("SELECT DISTINCT team_name FROM kp_teams WHERE season = %s", (season,))
        kp_names = [r[0] for r in cur.fetchall()]

        # Get all known team names from SGO / SportsGame / PGM
        cur.execute("""
            SELECT DISTINCT "homeTeam" FROM "SportsGame" WHERE league = 'ncaab'
            UNION
            SELECT DISTINCT "awayTeam" FROM "SportsGame" WHERE league = 'ncaab'
            UNION
            SELECT DISTINCT team FROM "PlayerGameMetric" WHERE league = 'ncaab'
        """)
        existing_names = {r[0] for r in cur.fetchall()}

    # Build normalized lookup: canonical key → original name variants.
    existing_norm = {}
    for name in existing_names:
        key = normalize_team_name(name)
        existing_norm.setdefault(key, []).append(name)

    mapped = 0
    unmatched = []

    with conn.cursor() as cur:
        for kp_name in kp_names:
            kp_norm = normalize_team_name(kp_name)

            # Exact normalized match
            if kp_norm in existing_norm:
                canonical = existing_norm[kp_norm][0]
                cur.execute("""
                    INSERT INTO kp_team_map (kp_name, canonical_name, match_method, match_confidence)
                    VALUES (%s, %s, 'exact', 1.0)
                    ON CONFLICT (kp_name) DO UPDATE SET
                        canonical_name = EXCLUDED.canonical_name,
                        match_method = 'exact', match_confidence = 1.0,
                        updated_at = NOW()
                """, (kp_name, canonical))
                mapped += 1
                continue

            # Fuzzy candidates. The "st"/"state" swap uses \b word-boundary
            # substitution — the previous " st " string replace required a
            # space on BOTH sides and so never matched names ending in
            # "st"/"state" (e.g. "michigan st").
            fuzzy_keys = [
                re.sub(r"\bst\b", "state", kp_norm),
                re.sub(r"\bstate\b", "st", kp_norm),
                re.sub(r"\s+(st|state)$", "", kp_norm),
                kp_norm.split()[0] if len(kp_norm.split()) > 1 else kp_norm,
            ]

            found = False
            for fk in fuzzy_keys:
                if fk in existing_norm:
                    canonical = existing_norm[fk][0]
                    cur.execute("""
                        INSERT INTO kp_team_map (kp_name, canonical_name, match_method, match_confidence)
                        VALUES (%s, %s, 'fuzzy', 0.75)
                        ON CONFLICT (kp_name) DO UPDATE SET
                            canonical_name = EXCLUDED.canonical_name,
                            match_method = 'fuzzy', match_confidence = 0.75,
                            updated_at = NOW()
                    """, (kp_name, canonical))
                    mapped += 1
                    found = True
                    break

            if not found:
                # Insert as unmatched for manual review
                cur.execute("""
                    INSERT INTO kp_team_map (kp_name, match_method, match_confidence)
                    VALUES (%s, 'unmatched', 0.0)
                    ON CONFLICT (kp_name) DO NOTHING
                """, (kp_name,))
                unmatched.append(kp_name)

    conn.commit()
    log.info(f"Team map: {mapped} matched, {len(unmatched)} unmatched")
    if unmatched:
        log.warning(f"Unmatched teams ({len(unmatched)}): {unmatched[:20]}...")
    return mapped, unmatched


# ── Full Daily Run ──────────────────────────────────────────────

def run_full_daily(conn, season=SEASON):
    """Execute the full daily pull: every endpoint, then the team-name map."""
    log.info(f"=== KenPom Daily Ingestion Start (season={season}) ===")
    start = time.time()

    # Teams run first because the ID lookup feeds the team map built at the
    # end. Dict literals evaluate left-to-right, preserving the run order.
    totals = {
        "teams": ingest_teams(conn, season),
        "ratings": ingest_ratings(conn, season),
        "four_factors": ingest_four_factors(conn, season),
        "height": ingest_height(conn, season),
        "misc_stats": ingest_misc_stats(conn, season),
        "point_dist": ingest_point_dist(conn, season),
        "conf_ratings": ingest_conf_ratings(conn, season),
        # Yesterday's slate (now settled) plus today's (available day-of).
        "fanmatch_yesterday": ingest_fanmatch(
            conn, (date.today() - timedelta(days=1)).isoformat(), season),
        "fanmatch_today": ingest_fanmatch(conn, date.today().isoformat(), season),
    }

    mapped, unmatched = build_team_map(conn, season)
    totals["team_map_matched"] = mapped
    totals["team_map_unmatched"] = len(unmatched)

    elapsed = time.time() - start
    log.info(f"=== KenPom Daily Ingestion Complete ({elapsed:.1f}s) ===")
    for k, v in totals.items():
        log.info(f"  {k}: {v}")

    return totals


# ── CLI ─────────────────────────────────────────────────────────

def main():
    """Parse CLI arguments and route to the requested ingestion mode."""
    parser = argparse.ArgumentParser(description="KenPom Data Ingestion for Rainmaker")
    parser.add_argument("--endpoint", help="Single endpoint to ingest")
    parser.add_argument("--season", type=int, default=SEASON)
    parser.add_argument("--fanmatch-date", help="Date for fanmatch (YYYY-MM-DD)")
    parser.add_argument("--archive-date", help="Date for archive (YYYY-MM-DD)")
    parser.add_argument("--backfill-archive", type=int, help="Backfill N days of archive")
    parser.add_argument("--build-map", action="store_true", help="Build team name map only")
    args = parser.parse_args()

    conn = get_conn()
    try:
        if args.build_map:
            build_team_map(conn, args.season)
        elif args.endpoint:
            # Per-endpoint handlers; every handler takes (conn, season).
            handlers = {
                "ratings": ingest_ratings,
                "four-factors": ingest_four_factors,
                "height": ingest_height,
                "misc-stats": ingest_misc_stats,
                "pointdist": ingest_point_dist,
                "conf-ratings": ingest_conf_ratings,
                "teams": ingest_teams,
            }
            handler = handlers.get(args.endpoint)
            if handler is not None:
                handler(conn, args.season)
            else:
                log.error(f"Unknown endpoint: {args.endpoint}")
        elif args.fanmatch_date:
            ingest_fanmatch(conn, args.fanmatch_date, args.season)
        elif args.archive_date:
            ingest_archive(conn, args.archive_date, args.season)
        elif args.backfill_archive:
            log.info(f"Backfilling {args.backfill_archive} days of archive...")
            for offset in range(args.backfill_archive):
                target_day = (date.today() - timedelta(days=offset)).isoformat()
                ingest_archive(conn, target_day, args.season)
        else:
            run_full_daily(conn, args.season)
    finally:
        conn.close()


if __name__ == "__main__":
    main()
