#!/usr/bin/env python3
"""
Build a canonical NCAAB team name mapping by matching
SportsGame variants to Odds API canonical names.

Output: prints Python dict that maps dirty → canonical.
"""

import os
import re
import sys
import unicodedata
from collections import defaultdict

import psycopg2

def get_db_url():
    """Return the sports database URL, or None when it is not configured.

    The value is taken from the first ``SPORTS_DATABASE_URL=`` line of the
    ``.env`` file one directory above this script. When that file is missing
    or has no such line, the ``SPORTS_DATABASE_URL`` environment variable is
    used instead.

    Surrounding quotes (``KEY="value"``) are removed, and everything from the
    first ``?`` onwards is dropped so only the bare URL is returned.
    """
    env_path = os.path.join(os.path.dirname(__file__), "..", ".env")
    raw = None
    if os.path.exists(env_path):
        with open(env_path, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line.startswith("SPORTS_DATABASE_URL="):
                    raw = line.split("=", 1)[1]
                    break
    if raw is None:
        raw = os.environ.get("SPORTS_DATABASE_URL")
    if raw is None:
        return None

    raw = raw.strip()
    # .env values are commonly quoted; left in place, the quotes would become
    # part of the URL handed to the database driver.
    if len(raw) >= 2 and raw[0] == raw[-1] and raw[0] in "\"'":
        raw = raw[1:-1]
    return raw.split("?")[0]

# Resolve the URL before connecting: get_db_url() returns None when nothing
# is configured, and handing that to the driver hides the real cause.
db_url = get_db_url()
if not db_url:
    sys.exit("error: get_db_url() found no SPORTS_DATABASE_URL; cannot connect")

conn = psycopg2.connect(db_url)
cur = conn.cursor()

# 1. Canonical side: every NCAAB team name that appears as a home or away
#    team in the odds tables (GameOdds, OddsSnapshot, LineMovement).
#    These use the consistent "Full Name Mascot" format from The Odds API.
cur.execute("""
    SELECT DISTINCT name FROM (
        SELECT "homeTeam" AS name FROM "GameOdds" WHERE league='ncaab'
        UNION SELECT "awayTeam" FROM "GameOdds" WHERE league='ncaab'
        UNION SELECT "homeTeam" FROM "OddsSnapshot" WHERE league='ncaab'
        UNION SELECT "awayTeam" FROM "OddsSnapshot" WHERE league='ncaab'
        UNION SELECT "homeTeam" FROM "LineMovement" WHERE league='ncaab'
        UNION SELECT "awayTeam" FROM "LineMovement" WHERE league='ncaab'
    ) x WHERE name IS NOT NULL ORDER BY name
""")
odds_names = {record[0] for record in cur.fetchall()}
print(f"# Odds API canonical names: {len(odds_names)}", file=sys.stderr)

# 2. Dirty side: every NCAAB team name that appears in SportsGame.
cur.execute("""
    SELECT DISTINCT name FROM (
        SELECT "homeTeam" AS name FROM "SportsGame" WHERE league='ncaab'
        UNION SELECT "awayTeam" FROM "SportsGame" WHERE league='ncaab'
    ) x WHERE name IS NOT NULL ORDER BY name
""")
sg_names = {record[0] for record in cur.fetchall()}
print(f"# SportsGame names: {len(sg_names)}", file=sys.stderr)

# 3. Split the SportsGame names: those spelled exactly like an odds name are
#    already canonical; the remainder still needs a mapping.
both = sg_names.intersection(odds_names)
sg_only = sg_names.difference(odds_names)
print(f"# Already canonical (in both): {len(both)}", file=sys.stderr)
print(f"# SportsGame-only (need mapping): {len(sg_only)}", file=sys.stderr)

# 4. Build matching helpers
# Normalize for matching: lowercase, remove punctuation, collapse spaces
# Single-pass punctuation handling for normalize(): apostrophes (straight,
# typographic and backtick) and periods are deleted, hyphens become spaces,
# and "&" is spelled out.
_NORMALIZE_TABLE = str.maketrans({
    "'": None,
    "\u2019": None,
    "\u2018": None,
    "`": None,
    ".": None,
    "-": " ",
    "&": "and",
})


def normalize(s):
    """Return a comparison key for a team name.

    The key is lowercase, has accents folded to their base letter
    ("José" -> "jose"), apostrophes and periods removed, hyphens turned into
    spaces, "&" replaced by "and" (so "A&M" becomes "aandm"), and whitespace
    collapsed to single spaces with none at either end.

    Args:
        s: Team name as stored in the database.

    Returns:
        The normalized string; "" for blank input.
    """
    # Decompose accented characters and drop the combining marks so accented
    # and unaccented spellings of the same school produce the same key.
    decomposed = unicodedata.normalize("NFKD", s)
    s = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    s = s.lower().translate(_NORMALIZE_TABLE)
    # Strip last: replacing a leading/trailing hyphen can introduce new
    # whitespace at the edges.
    return re.sub(r'\s+', ' ', s).strip()

# Build index of canonical names by various keys:
#   canonical_by_norm        normalized full name -> canonical name
#   canonical_by_first_word  first normalized word -> canonical names
#   canonical_by_words       any normalized word   -> canonical names
canonical_by_norm = {}
canonical_by_first_word = defaultdict(list)
canonical_by_words = defaultdict(list)

# Iterate in sorted order: set iteration order differs between runs, which
# would make the winner of a normalized-name collision (and the order of the
# per-word lists) unpredictable.
for name in sorted(odds_names):
    n = normalize(name)
    words = n.split()
    if not words:
        # The queries only filter NULL, so a blank name can get here; it has
        # nothing to index and words[0] would raise IndexError.
        continue
    # On a collision keep the alphabetically first canonical name.
    canonical_by_norm.setdefault(n, name)
    canonical_by_first_word[words[0]].append(name)
    for w in words:
        canonical_by_words[w].append(name)

# Known abbreviation map (SGO/kaggle abbreviations → school name prefix).
# A value of None marks an abbreviation that is known but deliberately left
# unmapped (the matching loop skips it). Keys are looked up with the
# whitespace-stripped team name, first as-is and then upper-cased, so keys
# must not carry leading/trailing spaces, and mixed-case keys only match the
# exact spelling. Each key appears once.
ABBREV_MAP = {
    "AFA": "Air Force", "AKR": "Akron", "ALBY": "Albany", "AMER": "American",
    "APP ST": "Appalachian St", "AR BAP": None, "ARK PB": "Arkansas-Pine Bluff",
    "ARK ST": "Arkansas St", "ASU": "Arizona St", "AU": "Auburn",
    "AUG U": None, "AZ ST": "Arizona St", "BALL": "Ball State",
    "BAY": "Baylor", "BC": "Boston College", "BGSU": "Bowling Green",
    "BL COL": None, "BOB JO": None, "BSU": "Boise State", "BUFF": "Buffalo",
    "C ARK": "Central Arkansas", "C CONN": "Central Connecticut",
    "CAMP": "Campbell", "CEN ST": None, "CHI ST": "Chicago St",
    "CLE ST": "Cleveland St", "C MICH": "Central Michigan", "CMU": "Central Michigan",
    "CO CAR": "Coastal Carolina", "CO CHR": None, "CO COL": None,
    "C OF C": "Charleston", "COFC": "Charleston", "CO ST": "Colorado St",
    "CS FULL": "CSU Fullerton", "CS NORTH": "CSU Northridge",
    "DEL ST": "Delaware St", "DUKE": "Duke", "E CAR": "East Carolina",
    "E ILL": "Eastern Illinois", "E KY": "Eastern Kentucky", "E MICH": "Eastern Michigan",
    "ELON": "Elon", "EMU": "Eastern Michigan", "E WASH": "Eastern Washington",
    "FL A&M": "Florida A&M", "FL CHR": None, "FRES": "Fresno St",
    "FSU": "Florida St", "FT VAL": None, "GA SOU": "Georgia Southern",
    "GA ST": "Georgia St", "GC": "Grand Canyon", "GMU": "George Mason",
    "GRAM": "Grambling", "GW": "George Washington", "G WASH": "George Washington",
    "G WEBB": "Gardner-Webb", "IDHO": "Idaho", "ID ST": "Idaho State",
    "IDST": "Idaho State", "ILL": "Illinois", "ILL ST": "Illinois St",
    "ILST": "Illinois St", "IND ST": "Indiana St", "IND NW": None,
    "IOWA": "Iowa", "IW": "Incarnate Word", "JAC": "Jacksonville",
    "JAX ST": "Jacksonville St", "JKST": "Jackson St", "JOES": "Saint Joseph's",
    "KAN ST": "Kansas St", "KENT": "Kent State", "KSU": "Kansas St",
    "KY CHR": None, "KY ST": None, "LA MON": "UL Monroe",
    "LA SI": None, "LAS": "La Salle", "LINW": "Lindenwood",
    "LMC": "Le Moyne", "LOY LA": "Loyola Marymount", "LOY MD": "Loyola (MD)",
    "LOU": "Louisville", "LSU": "LSU", "MASS": "Massachusetts",
    "MD BAL": None, "MEM": "Memphis", "MGA ST": None,
    "MHU": None, "MIA OH": "Miami (OH)", "MICH": "Michigan",
    "MINN": "Minnesota", "MIZZ": "Missouri", "MO SOU": None,
    "MO ST": "Missouri St", "MOSU": "Missouri St", "MSM NY": None,
    "MSU": "Michigan St", "MS VAL": "Miss Valley St", "MT STV": None,
    "MT ALO": None, "MT OLI": None, "N ALA": "North Alabama",
    "NAU": "Northern Arizona", "NC A&T": "North Carolina A&T",
    "NC CEN": "North Carolina Central", "NC ILL": "Northern Illinois",
    "NC MIN": None, "N COLO": "N Colorado", "NC ST": "NC State", "NCST": "NC State",
    "NC WES": None, "N DAME": "Notre Dame", "ND MD": None,
    "ND ST": "North Dakota St", "NEB": "Nebraska", "NE ST": None,
    "NEV": "Nevada", "NIU": "Northern Illinois", "N IOWA": "Northern Iowa",
    "N KY": "Northern Kentucky", "N MEX": "New Mexico", "NM ST": "New Mexico St",
    "NO DAK": "North Dakota", "NO FLA": "North Florida", "NO TEX": "North Texas",
    "NW ST": "Northwestern St", "ODU": "Old Dominion", "OHIO": "Ohio",
    "ORE": "Oregon", "ORE ST": "Oregon St", "OSU": "Ohio State",
    "A PEAY": "Austin Peay", "PEAY": "Austin Peay",
    "PFW": "Fort Wayne", "PSU": "Penn State", "PUR": "Purdue",
    "PUR NW": None, "PV A&M": "Prairie View",
    "REG MA": None, "RID": "Rider",
    "S ALA": "South Alabama", "S CAR": "South Carolina", "SCAR": "South Carolina",
    "S Carolina": "South Carolina", "SC ST": "South Carolina St",
    "SC UPS": "South Carolina Upstate", "SDSU": "San Diego St",
    "SE LA": "SE Louisiana", "SEMO": "SE Missouri St",
    "S FRAN": "San Francisco", "SF ST": None, "SIH": "Southern Illinois",
    "S IND": "Southern Indiana", "SIND": "Southern Indiana",
    "SIU": "Southern Illinois", "SJSU": "San José St", "SLU": "Saint Louis",
    "SMC": "Saint Mary's", "S MISS": "Southern Miss", "SMU": "SMU",
    "SO DAK": "South Dakota", "S ORE": None, "SOU NO": None,
    "S UTAH": "Southern Utah", "ST LOU": "Saint Louis", "STON": "Stonehill",
    "ST PTR": "Saint Peter's", "ST. A": None,
    "TA&M": "Texas A&M", "TAMC": "Texas A&M-CC", "TARL": "Tarleton State",
    "TCU": "TCU", "TN WES": None, "TNTC": None, "TOWS": "Towson",
    "TTU": "Texas Tech", "TX A&M": "Texas A&M", "TX ARL": "UT-Arlington",
    "TX COL": None, "TX DAL": None, "TX LUT": None, "TX PER": None,
    "TX SOU": "Texas Southern", "TX ST": "Texas State", "TX WES": None,
    "UCI": "UC Irvine", "UCR": "UC Riverside", "UGA": "Georgia",
    "UK": "Kentucky", "UMass": "Massachusetts", "UNC": "North Carolina",
    "UNC A": "UNC Asheville", "UNC G": "UNC Greensboro", "UNCO": "N Colorado",
    "UNF": "North Florida", "UNM": "New Mexico", "URI": "Rhode Island",
    "USU": "Utah State", "UTM": "Tenn-Martin", "UT MAR": "Tenn-Martin",
    "UT ST": "Utah State", "UT VAL": "Utah Valley", "UW ST": None,
    "VILL": "Villanova", "VT": "Virginia Tech", "VCU": "VCU",
    "W CAR": "Western Carolina", "WEB": "Weber State", "WEB ST": "Weber State",
    "W GA": "West Georgia", "WICH": "Wichita St", "W ILL": "Western Illinois",
    "W KY": "Western Kentucky", "WM": "William and Mary", "W&M": "William and Mary",
    "WM&MRY": "William and Mary", "W MICH": "Western Michigan", "WMU": "Western Michigan",
    "W NM": None, "W ORE": None, "WIU": "Western Illinois", "WIS": "Wisconsin",
    "WYO": "Wyoming", "XAV": "Xavier", "YSU": "Youngstown St",
    "AND IN": None, "MO S&T": None, "MS&T": None, "MNE FK": None,
    "PR BYM": None, "PR MAY": None, "ROG ST": None, "SO VA": None,
    "VA LYN": None, "VA WES": None, "S WESL": None, "SW TX": None,
    "PSU AL": None, "ED WAT": None, "MON IL": None,
    "PAC OR": None,
    "UC CS": None, "UC DAV": "UC Davis", "UC IRV": "UC Irvine", "UC RIV": "UC Riverside",
    "N Alabama": "North Alabama", "S Alabama": "South Alabama",
    "SE Louisiana": "SE Louisiana", "N Colorado": "N Colorado",
    "SBON": "St. Bonaventure", "WAG": "Wagner",
    "LAF": "Lafayette",
    "SAC ST": "Sacramento St", "CS Bakersfield": "CSU Bakersfield",
    "CS Fullerton": "CSU Fullerton", "CS Northridge": "CSU Northridge",
    "M-OH": "Miami (OH)", "L-MD": "Loyola (MD)", "L-IL": None,
    "SF(RF": None, "M(R": None, "L(G": None,
    # Previously "LYON " / "RUST " (trailing space), which the stripped
    # lookup could never hit.
    "LYON": None, "RUST": None,
}

# 5. Match SportsGame-only names to canonical
mapping = {}
unmatched = []
# (dirty name, chosen canonical, equally ranked canonicals): picks that were
# settled only by the alphabetical tie-break and should be reviewed by hand.
ambiguous = []

# Normalize every canonical name exactly once and keep the pairs sorted.
# Taking the first hit while iterating the odds_names set directly would
# depend on set iteration order, which changes between runs, so a prefix
# shared by several schools could map to a different team each time.
canonical_pairs = sorted((normalize(c), c) for c in odds_names)

# Returned by _resolve() for abbreviations ABBREV_MAP marks as unmappable.
_SKIP = object()


def _word_count(pair):
    """Rank a (normalized, canonical) pair by how many words it has."""
    return len(pair[0].split())


def _choose(dirty, candidates, rank=_word_count):
    """Pick one canonical name from candidates in a reproducible way.

    The lowest ``rank`` wins; by default that is the candidate with the
    fewest words, so a name that is just "<prefix> <mascot>" beats a longer
    name that merely begins with the same prefix. Remaining ties are broken
    alphabetically and recorded in ``ambiguous`` for manual review.

    Args:
        dirty: The SportsGame name being resolved (used for the tie report).
        candidates: List of (normalized, canonical) pairs.
        rank: Key function over a pair; smaller is better.

    Returns:
        The chosen canonical name, or None when candidates is empty.
    """
    if not candidates:
        return None
    best = min(candidates, key=lambda pair: (rank(pair), pair))
    tied = [pair[1] for pair in candidates if rank(pair) == rank(best)]
    if len(tied) > 1:
        ambiguous.append((dirty, best[1], tied))
    return best[1]


def _starting_with(prefix):
    """Return the canonical pairs whose normalized name starts with prefix."""
    return [pair for pair in canonical_pairs if pair[0].startswith(prefix)]


def _shared_leading_words(a, b):
    """Count how many leading words two word lists have in common."""
    count = 0
    for x, y in zip(a, b):
        if x != y:
            break
        count += 1
    return count


def _loose_two_word_match(nw, cw):
    """True when the first words are equal and one second word prefixes the other."""
    if len(nw) < 2 or len(cw) < 2 or nw[0] != cw[0]:
        return False
    return nw[1].startswith(cw[1]) or cw[1].startswith(nw[1])


def _resolve(name):
    """Resolve one SportsGame name.

    Returns:
        The canonical name, ``_SKIP`` when ABBREV_MAP marks the name as a
        known unmappable abbreviation, or None when no heuristic matched.
    """
    n = normalize(name)

    # a) Abbreviation map: exact spelling first, then upper-cased.
    stripped = name.strip()
    for key in dict.fromkeys((stripped, stripped.upper())):
        if key not in ABBREV_MAP:
            continue
        prefix = ABBREV_MAP[key]
        if prefix is None:
            # Known non-D1 or unmappable
            return _SKIP
        # Plain startswith (no word boundary) on purpose: a "... St" prefix
        # also reaches canonicals spelled "... State".
        hit = _choose(name, _starting_with(normalize(prefix)))
        if hit is not None:
            return hit

    # b) Exact normalized match
    if n in canonical_by_norm:
        return canonical_by_norm[n]

    # c) Name is a prefix of a canonical name (e.g., "Akron" matches
    #    "Akron Zips"). Equality was already handled by (b).
    hit = _choose(name, _starting_with(n + " "))
    if hit is not None:
        return hit

    # d) "State" → "St" normalization. The word boundary keeps words that
    #    merely begin with "state" intact.
    n2 = re.sub(r" state\b", " st", n)
    if n2 in canonical_by_norm:
        return canonical_by_norm[n2]
    hit = _choose(name, _starting_with(n2.rstrip() + " "))
    if hit is not None:
        return hit

    # e) Loose match on the first two words after State→St, e.g. a different
    #    mascot spelling after the same school name. Prefer the candidate that
    #    shares the longest run of leading words, then the shortest one.
    nw = n2.split()
    loose = [pair for pair in canonical_pairs
             if _loose_two_word_match(nw, pair[0].split())]
    hit = _choose(
        name,
        loose,
        rank=lambda pair: (-_shared_leading_words(nw, pair[0].split()),
                           _word_count(pair)),
    )
    if hit is not None:
        return hit

    # f) "University" removal
    n4 = n.replace("university", "").replace("  ", " ").strip()
    return _choose(
        name,
        [pair for pair in canonical_pairs
         if pair[0].startswith(n4 + " ") or pair[0] == n4],
    )


for name in sorted(sg_only):
    resolved = _resolve(name)
    if resolved is _SKIP:
        continue
    if resolved is None:
        unmatched.append(name)
    else:
        mapping[name] = resolved

# Diagnostics only (stderr): mappings whose winner was decided by the
# alphabetical tie-break.
print(f"# Ambiguous picks (review by hand): {len(ambiguous)}", file=sys.stderr)
for dirty, chosen, tied in ambiguous:
    others = ", ".join(repr(t) for t in tied if t != chosen)
    print(f"#   {dirty!r} -> {chosen!r}; equally ranked: {others}", file=sys.stderr)

# Summary counts go to stderr; stdout carries the generated mapping and the
# unmatched report.
print(f"# Matched: {len(mapping)}", file=sys.stderr)
print(f"# Unmatched: {len(unmatched)}", file=sys.stderr)

# Emit the mapping as a Python dict literal, sorted by dirty name.
print("\n# === MAPPING (dirty → canonical) ===")
print("NCAAB_NAME_MAP = {")
for dirty in sorted(mapping):
    print(f'    {dirty!r}: {mapping[dirty]!r},')
print("}")

# For each unmatched name, report how many SportsGame rows use it as home or
# away team, so frequently used names can be prioritised.
game_count_sql = """
        SELECT COUNT(*) FROM "SportsGame" WHERE league='ncaab' AND ("homeTeam"=%s OR "awayTeam"=%s)
    """
print(f"\n# === UNMATCHED ({len(unmatched)}) ===")
for name in sorted(unmatched):
    cur.execute(game_count_sql, (name, name))
    (count,) = cur.fetchone()
    print(f"# {name!r}  ({count} games)")

conn.close()
