#!/usr/bin/env python3
"""
NCAA Team & Player Name Normalization Utility
----------------------------------------------
Canonical normalization for Rainmaker entity resolution.
Handles punctuation, diacritics, suffixes, abbreviations, and fuzzy matching.

Usage:
    from normalize import normalize_team, normalize_player, fuzzy_match_team
"""

import re
import unicodedata
from difflib import SequenceMatcher


# ── Team Name Normalization ─────────────────────────────────────

# Alias table for NCAAB schools: maps a cleaned (lowercase) name variant to
# the canonical key returned by normalize_team(). It is written as
# (canonical, aliases) rows and expanded into a flat alias -> canonical dict;
# row order fixes the insertion order of the resulting dict.
SCHOOL_ALIASES = {
    alias: canonical
    for canonical, aliases in (
        # Full "school + mascot" names for schools with a same-named
        # "<school> st" / "<school> a&m" sibling
        ("alabama", ("alabama crimson tide",)),
        ("alabama am", ("alabama a&m bulldogs",)),
        ("arizona", ("arizona wildcats",)),
        ("arizona st", ("arizona st sun devils",)),
        ("arkansas", ("arkansas razorbacks",)),
        ("arkansas st", ("arkansas st red wolves",)),
        ("arkansas pine bluff", ("arkansas pine bluff",)),
        # Nicknames and initialisms
        ("connecticut", ("uconn", "uconn huskies")),
        ("mississippi", ("ole miss", "ole miss rebels")),
        ("massachusetts", ("umass",)),
        ("california", ("cal",)),
        ("pittsburgh", ("pitt", "pitt panthers")),
        ("lsu", ("lsu", "lsu tigers")),
        ("vcu", ("vcu",)),
        ("smu", ("smu",)),
        ("byu", ("byu",)),
        ("ucf", ("ucf",)),
        ("unlv", ("unlv",)),
        ("utep", ("utep",)),
        # "State" spelled out -> "st"
        ("fresno st", ("fresno state",)),
        ("colorado st", ("colorado state",)),
        ("portland st", ("portland state",)),
        ("kennesaw st", ("kennesaw state",)),
        ("jackson st", ("jackson state",)),
        ("alcorn st", ("alcorn state",)),
        ("san diego st", ("san diego state",)),
        ("michigan st", ("michigan state",)),
        ("ohio st", ("ohio state",)),
        ("penn st", ("penn state",)),
        ("iowa st", ("iowa state",)),
        ("texas tech", ("texas tech red raiders",)),
        # Abbreviated direction ("NC" -> "north carolina")
        ("north carolina st", ("nc state", "nc state wolfpack")),
        # Miami disambiguation (Florida vs Ohio)
        ("miami fl", ("miami fl",)),
        ("miami oh", ("miami oh",)),
        ("miami fl", ("miami (fl)",)),
        ("miami oh", ("miami (oh)",)),
        ("miami fl", ("miami hurricanes",)),
        ("miami oh", ("miami redhawks",)),
    )
    for alias in aliases
}

# Mascot / nickname words that normalize_team() deletes from a team name.
# Each alternative must match as a whole word (case-insensitively); two-word
# mascots use \s* so both "Blue Devils" and "BlueDevils" are recognized.
MASCOT_PATTERN = re.compile(
    r'\b(' + '|'.join((
        'wildcats', 'bulldogs', 'tigers', 'eagles', 'bears', 'hawks',
        'panthers', 'cougars',
        'lions', 'wolves', 'mustangs', 'hornets', 'owls', 'cardinals',
        'braves', 'knights',
        'spartans', 'huskies', 'aggies', 'rebels', 'warriors', 'raiders',
        'lancers', 'broncos',
        'cowboys', 'miners', 'rockets', 'dolphins', 'gators', 'wolverines',
        'boilermakers',
        'buckeyes', 'sooners', 'longhorns', r'crimson\s*tide',
        r'tar\s*heels', r'blue\s*devils',
        r'golden\s*bears', r'fighting\s*irish', 'commodores', 'volunteers',
        'razorbacks',
        'mountaineers', 'seminoles', 'cavaliers', 'hokies', 'jayhawks',
        'cyclones',
        r'red\s*raiders', r'horned\s*frogs', r'sun\s*devils', 'beavers',
        'ducks',
        r'golden\s*gophers', 'badgers', 'hawkeyes', 'hoosiers',
        'cornhuskers',
        r'nittany\s*lions', r'scarlet\s*knights', 'terrapins', 'illini',
        'orange', r'yellow\s*jackets', r'demon\s*deacons', 'wolfpack',
        r'red\s*wolves', 'rams', '49ers', 'bearcats', 'musketeers',
        'pirates',
        'friars', 'bluejays', r'red\s*storm', 'hoyas', 'johnnies',
        'ramblers',
    )) + r')\b',
    flags=re.IGNORECASE,
)


def normalize_team(name: str) -> str:
    """
    Produce a canonical lowercase key from any team name variant.

    Steps, in order:
      1. Strip diacritics (NFKD decomposition, combining marks dropped).
      2. Lowercase and trim.
      3. Turn word separators (hyphens/dashes, parentheses) into spaces so
         "Arkansas-Pine Bluff" and "Miami(FL)" keep their word boundaries.
      4. Delete the remaining punctuation (apostrophes incl. typographic
         ones, quotes, periods, commas, "&", ...), e.g. "A&M" -> "am".
      5. Collapse whitespace and look the result up in SCHOOL_ALIASES.
      6. Otherwise strip mascot words (MASCOT_PATTERN), rewrite the word
         "state" as "st", and try SCHOOL_ALIASES once more.

    Args:
        name: Raw team name; may be empty or None.

    Returns:
        The canonical key, or "" when ``name`` is falsy.
    """
    if not name:
        return ""

    # Unicode normalize → strip diacritics
    s = unicodedata.normalize("NFKD", name)
    s = "".join(c for c in s if not unicodedata.combining(c))

    # Lowercase + strip
    s = s.lower().strip()

    # Separators become spaces rather than vanishing; deleting a hyphen
    # would glue the neighbouring words together ("arkansaspine bluff")
    # and the alias lookup below would miss.
    s = re.sub("[-\u2010-\u2015()]", " ", s)

    # Remove the remaining punctuation (straight and typographic
    # apostrophes, quotes, periods, etc.) without inserting a space.
    s = re.sub("['\"`\u2018\u2019.,;:!?&]", "", s)

    # Collapse whitespace
    s = re.sub(r"\s+", " ", s).strip()

    # Check alias map first, so full "school + mascot" aliases win before
    # the mascot is stripped.
    if s in SCHOOL_ALIASES:
        return SCHOOL_ALIASES[s]

    # Strip mascot names
    s = MASCOT_PATTERN.sub("", s).strip()
    s = re.sub(r"\s+", " ", s).strip()

    # Normalize "state" → "st"
    s = re.sub(r"\bstate\b", "st", s)

    # Check alias again after cleanup (e.g. "cal golden bears" → "cal")
    if s in SCHOOL_ALIASES:
        return SCHOOL_ALIASES[s]

    return s


# ── Player Name Normalization ───────────────────────────────────

# Apostrophe-like characters deleted from player names: ASCII apostrophe and
# backtick plus the typographic quotes / modifier apostrophe / prime.
_PLAYER_APOSTROPHES = re.compile("['`\u2018\u2019\u02bc\u2032]")

# ASCII hyphen plus the Unicode hyphen/dash range U+2010..U+2015.
_PLAYER_HYPHENS = re.compile("[-\u2010-\u2015]")

# Generational suffix as a whole token that is NOT the first token. The
# leading space guarantees a preceding name part, so "J.R. Smith"
# ("jr smith") keeps its first name.
_PLAYER_SUFFIX = re.compile(r" (?:jr|sr|iii|ii|iv|v)\b")

# Leading lone "d"/"o" followed by a name part and at least one more token:
# the "D Angelo Russell" spelling of "D'Angelo Russell". Two-token inputs
# such as "D. Russell" are left alone (that "d" is an initial).
_PLAYER_SPLIT_PREFIX = re.compile(r"^([do]) (?=[a-z]{2,} \S)")


def normalize_player(name: str) -> str:
    """
    Produce a canonical lowercase key from any player name variant.

    Handles:
      * accents: "José García" -> "jose garcia"
      * apostrophes (straight or typographic), periods and hyphens, which
        are deleted: D'Angelo / D’Angelo / D-Angelo / DAngelo -> "dangelo"
      * a leading split prefix: "D Angelo Russell" -> "dangelo russell"
      * commas, treated as separators: "Gary Trent, Jr." -> "gary trent"
      * generational suffixes Jr/Sr/II/III/IV/V, removed when they are not
        the first token ("J.R. Smith" stays "jr smith")

    Args:
        name: Raw player name; may be empty or None.

    Returns:
        The canonical key, or "" when ``name`` is falsy.
    """
    if not name:
        return ""

    # Unicode normalize → strip diacritics
    s = unicodedata.normalize("NFKD", name)
    s = "".join(c for c in s if not unicodedata.combining(c))

    s = s.lower()

    # Delete in-word punctuation but keep the letters:
    # D'Angelo → dangelo, O'Brien → obrien, J.R. → jr, D-Angelo → dangelo
    s = _PLAYER_APOSTROPHES.sub("", s)
    s = s.replace(".", "")
    s = _PLAYER_HYPHENS.sub("", s)

    # A comma separates tokens ("Trent, Jr."); never let it stick to a name.
    s = s.replace(",", " ")

    # Collapse whitespace before the token-based rules below, which rely on
    # single spaces and no leading/trailing space.
    s = re.sub(r"\s+", " ", s).strip()

    # Remove suffixes: Jr, Sr, II, III, IV, V (each match eats its own
    # leading space, so no double spaces are left behind).
    s = _PLAYER_SUFFIX.sub("", s)

    # Re-join a split leading prefix: "d angelo russell" → "dangelo russell"
    s = _PLAYER_SPLIT_PREFIX.sub(r"\1", s)

    return s


def normalize_sgo_player_id(player_id: str) -> str:
    """
    Turn an SGO playerID (FIRST_LAST_N_LEAGUE) into a normalized player name.

    The ID is split on underscores; for IDs with three or more parts, every
    all-digit part and every known league code is discarded and the rest is
    passed through normalize_player(). Shorter IDs are normalized as-is with
    underscores turned into spaces.

    Example: JAYLEN_BROWN_2_NBA → jaylen brown
    """
    if not player_id:
        return ""

    league_codes = ("NBA", "NFL", "NHL", "MLB", "NCAAM", "NCAAB", "EPL", "WNBA")
    tokens = player_id.split("_")

    # Too short to carry the "<number>_<league>" tail: nothing to filter.
    if len(tokens) < 3:
        return normalize_player(player_id.replace("_", " "))

    # Keep only the name tokens (FIRST, optional MIDDLE, LAST).
    kept = [
        token.lower()
        for token in tokens
        if not (token.isdigit() or token.upper() in league_codes)
    ]
    return normalize_player(" ".join(kept))


# ── Fuzzy Matching ──────────────────────────────────────────────

def fuzzy_match_team(query: str, candidates: list, threshold: float = 0.70) -> list:
    """
    Find fuzzy matches for a team name against a list of candidates.

    Both the query and each candidate are passed through normalize_team()
    before comparison. Candidates whose normalized form equals the query's
    score 1.0; all others are scored with difflib.SequenceMatcher.ratio().

    Args:
        query: Raw team name to look up.
        candidates: Raw candidate team names.
        threshold: Minimum similarity ratio (inclusive) for a non-exact
            candidate to be reported.

    Returns:
        List of (candidate, score) tuples, with the original un-normalized
        candidate string, sorted by score descending. Non-exact scores are
        rounded to 3 decimals. Empty when the query normalizes to "".
    """
    query_norm = normalize_team(query)

    # An empty key carries no information. Without this guard a query such
    # as "" or a bare mascot ("Wildcats") would "exactly" match every
    # candidate that also normalizes to "" (e.g. "Tigers") with score 1.0.
    if not query_norm:
        return []

    results = []

    for candidate in candidates:
        cand_norm = normalize_team(candidate)

        # Exact normalized match
        if query_norm == cand_norm:
            results.append((candidate, 1.0))
            continue

        # Sequence matching
        score = SequenceMatcher(None, query_norm, cand_norm).ratio()
        if score >= threshold:
            results.append((candidate, round(score, 3)))

    # sorted() is stable, so equally scored candidates keep input order.
    return sorted(results, key=lambda x: -x[1])


def fuzzy_match_player(query: str, candidates: list, threshold: float = 0.75) -> list:
    """
    Find fuzzy matches for a player name against a list of candidates.

    Both the query and each candidate are passed through normalize_player()
    before comparison. Candidates whose normalized form equals the query's
    score 1.0; all others are scored with difflib.SequenceMatcher.ratio().

    Args:
        query: Raw player name to look up.
        candidates: Raw candidate player names.
        threshold: Minimum similarity ratio (inclusive) for a non-exact
            candidate to be reported.

    Returns:
        List of (candidate, score) tuples, with the original un-normalized
        candidate string, sorted by score descending. Non-exact scores are
        rounded to 3 decimals. Empty when the query normalizes to "".
    """
    query_norm = normalize_player(query)

    # An empty key carries no information. Without this guard an empty query
    # would "exactly" match every candidate that also normalizes to ""
    # with score 1.0.
    if not query_norm:
        return []

    results = []

    for candidate in candidates:
        cand_norm = normalize_player(candidate)

        # Exact normalized match
        if query_norm == cand_norm:
            results.append((candidate, 1.0))
            continue

        # Sequence matching
        score = SequenceMatcher(None, query_norm, cand_norm).ratio()
        if score >= threshold:
            results.append((candidate, round(score, 3)))

    # sorted() is stable, so equally scored candidates keep input order.
    return sorted(results, key=lambda x: -x[1])


# ── Tests ───────────────────────────────────────────────────────

if __name__ == "__main__":
    # Self-test: prints one PASS/FAIL line per case and exits non-zero when
    # any case fails, so the script can be used as a check in automation.
    import sys

    def _run_suite(title, func, cases):
        """Run func over (raw, expected) cases, print results, return the failure count."""
        print(title)
        failures = 0
        for raw, expected in cases:
            result = func(raw)
            if result == expected:
                status = "PASS"
            else:
                status = f"FAIL (got '{result}')"
                failures += 1
            print(f"  {raw:<30} → {result:<20} {status}")
        return failures

    # Team normalization tests
    team_tests = [
        ("Duke Blue Devils", "duke"),
        ("Michigan Wolverines", "michigan"),
        ("Michigan St.", "michigan st"),
        ("Michigan State Spartans", "michigan st"),
        ("Arizona St.", "arizona st"),
        ("UConn", "connecticut"),
        ("UConn Huskies", "connecticut"),
        ("Ole Miss", "mississippi"),
        ("Miami FL", "miami fl"),
        ("Miami (FL)", "miami fl"),
        ("Colorado St.", "colorado st"),
        ("Colorado State Rams", "colorado st"),
        ("St. John's", "st johns"),
        ("Saint Mary's", "saint marys"),
        # SCHOOL_ALIASES maps "nc state" to the spelled-out canonical key.
        ("N.C. State", "north carolina st"),
        ("Texas A&M", "texas am"),
    ]

    # Player normalization tests
    player_tests = [
        ("D'Angelo Russell", "dangelo russell"),
        ("DAngelo Russell", "dangelo russell"),
        ("D-Angelo Russell", "dangelo russell"),
        ("D Angelo Russell", "dangelo russell"),
        ("Marcus O'Brien Jr.", "marcus obrien"),
        ("José García", "jose garcia"),
        ("LeBron James III", "lebron james"),
        ("Shaquille O'Neal", "shaquille oneal"),
    ]

    # SGO player ID tests
    sgo_tests = [
        ("JAYLEN_BROWN_2_NBA", "jaylen brown"),
        ("DANGELO_RUSSELL_1_NBA", "dangelo russell"),
        ("LEBRON_JAMES_1_NBA", "lebron james"),
    ]

    failed = _run_suite("=== Team Normalization Tests ===", normalize_team, team_tests)
    failed += _run_suite("\n=== Player Normalization Tests ===", normalize_player, player_tests)
    failed += _run_suite("\n=== SGO Player ID Tests ===", normalize_sgo_player_id, sgo_tests)

    if failed:
        # A string argument is written to stderr and yields exit status 1.
        sys.exit(f"\n{failed} normalization test(s) failed")