
    4Ki!&                     .   d Z ddlZddlZddlmZ i dddddd	d
dddddddddddddddddddddddddddi d d d!d!d"d"d#d#d$d$d%d%d&d'd(d)d*d+d,d-d.d/d0d1d2d3d4d5d6d7d8d9d:d;d<d=d=d>d?d>d?d>d?d@	Z ej                  dAej                        ZdBe	dCe	fdDZ
dBe	dCe	fdEZdFe	dCe	fdGZd\dHe	dIedJedCefdKZd]dHe	dIedJedCefdLZedMk(  r edN       g dOZeD ]-  \  ZZ e
e      Zeek(  rdPndQe dRZ edSedTdUedVdWe        /  edX       g dYZeD ]-  \  ZZ ee      Zeek(  rdPndQe dRZ edSedTdUedVdWe        /  edZ       g d[ZeD ]-  \  ZZ ee      Zeek(  rdPndQe dRZ edSedTdUedVdWe        / yy)^a:  
NCAA Team & Player Name Normalization Utility
----------------------------------------------
Canonical normalization for Rainmaker entity resolution.
Handles punctuation, diacritics, suffixes, abbreviations, and fuzzy matching.

Usage:
    from normalize import normalize_team, normalize_player, fuzzy_match_team
    N)SequenceMatcherzalabama crimson tidealabamazalabama a&m bulldogsz
alabama amzarizona wildcatsarizonazarizona st sun devils
arizona stzarkansas razorbacksarkansaszarkansas st red wolveszarkansas stzarkansas pine bluffuconnconnecticutzuconn huskieszole missmississippizole miss rebelsumassmassachusettscal
californiapitt
pittsburghzpitt pantherslsuz
lsu tigersvcusmubyuucfunlvutepzfresno statez	fresno stzcolorado statecolorado stzportland statezportland stzkennesaw statezkennesaw stzjackson statez
jackson stzalcorn statez	alcorn stzsan diego statezsan diego stzmichigan statemichigan stz
ohio statezohio stz
penn statezpenn stz
iowa stateziowa stz
texas techznorth carolina stmiami flmiami oh)	ztexas tech red raidersznc stateznc state wolfpackr   r   z
miami (fl)z
miami (oh)zmiami hurricaneszmiami redhawksa  \b(wildcats|bulldogs|tigers|eagles|bears|hawks|panthers|cougars|lions|wolves|mustangs|hornets|owls|cardinals|braves|knights|spartans|huskies|aggies|rebels|warriors|raiders|lancers|broncos|cowboys|miners|rockets|dolphins|gators|wolverines|boilermakers|buckeyes|sooners|longhorns|crimson\s*tide|tar\s*heels|blue\s*devils|golden\s*bears|fighting\s*irish|commodores|volunteers|razorbacks|mountaineers|seminoles|cavaliers|hokies|jayhawks|cyclones|red\s*raiders|horned\s*frogs|sun\s*devils|beavers|ducks|golden\s*gophers|badgers|hawkeyes|hoosiers|cornhuskers|nittany\s*lions|scarlet\s*knights|terrapins|illini|orange|yellow\s*jackets|demon\s*deacons|wolfpack|red\s*wolves|rams|49ers|bearcats|musketeers|pirates|friars|bluejays|red\s*storm|hoyas|johnnies|ramblers)\bnamereturnc                     | syt        j                  d|       }dj                  d |D              }|j                         j	                         }t        j                  dd|      }t        j                  dd|      j	                         }|t        v r	t        |   S t        j                  d|      j	                         }t        j                  dd|      j	                         }t        j                  dd|      }|t        v r	t        |   S |S )	z
    Produce a canonical lowercase key from any team name variant.
    Strips diacritics, punctuation, mascots, extra whitespace.
     NFKDc              3   L   K   | ]  }t        j                  |      r|  y wNunicodedata	combining.0cs     7/var/www/html/rainmaker/backend/src/kenpom/normalize.py	<genexpr>z!normalize_team.<locals>.<genexpr>d        =aK$9$9!$<=   $$z['\"\.\-,;:!?()&]\s+ z	\bstate\bst)	r$   	normalizejoinlowerstripresubSCHOOL_ALIASESMASCOT_PATTERNr   ss     r)   normalize_teamr:   Z   s    
  	fd+A
=1==A 	
	A 	#R+A 	vsA$$&A 	Na   	2q!'')A
vsA$$&A 	|T1%A 	Na  H    c                    | syt        j                  d|       }dj                  d |D              }|j                         j	                         }t        j                  dd|      }t        j                  dd|      }t        j                  dd|      }t        j                  dd|      }t        j                  dd	|      j	                         }|S )
z
    Produce a canonical lowercase key from any player name variant.
    Handles: D'Angelo/DAngelo/D-Angelo/D Angelo, Jr./Jr/III, accents, etc.
    r   r    c              3   L   K   | ]  }t        j                  |      r|  y wr"   r#   r&   s     r)   r*   z#normalize_player.<locals>.<genexpr>   r+   r,   z['`']z\.-z\b(jr|sr|ii|iii|iv|v)\b\.?r-   r.   )r$   r0   r1   r2   r3   r4   r5   r8   s     r)   normalize_playerr?      s    
  	fd+A
=1==A 	
	A 	xQA
ub!A 	tRA 	,b!4A 	vsA$$&AHr;   	player_idc                 F   | sy| j                  d      }t        |      dk\  reg }|D ]D  }|j                         s|j                         dv r&|j	                  |j                                F t        dj                  |            S t        | j                  dd            S )u   
    Convert SGO playerID format (FIRST_LAST_N_LEAGUE) to normalized name.
    Example: JAYLEN_BROWN_2_NBA → jaylen brown
    r   _   )NBANFLNHLMLBNCAAMNCAABEPLWNBAr.   )	splitlenisdigitupperappendr2   r?   r1   replace)r@   parts
name_partsps       r)   normalize_sgo_player_idrU      s    
 OOC E 5zQ
 	)Ayy{aggi+hhaggi(	)   455I--c3788r;   query
candidates	thresholdc                 
   t        |       }g }|D ]d  }t        |      }||k(  r|j                  |df       't        d||      j                         }||k\  sH|j                  |t	        |d      f       f t        |d       S )z
    Find fuzzy matches for a team name against a list of candidates.
    Returns list of (candidate, score) tuples above threshold, sorted by score desc.
          ?NrC   c                     | d    S N    xs    r)   <lambda>z"fuzzy_match_team.<locals>.<lambda>       !A$ r;   key)r:   rP   r   ratioroundsortedrV   rW   rX   
query_normresults	candidate	cand_normscores           r)   fuzzy_match_teamrn      s    
  &JG 9	"9-	 "NNIs+,  j)<BBDINNIuUA789 '//r;   c                 
   t        |       }g }|D ]d  }t        |      }||k(  r|j                  |df       't        d||      j                         }||k\  sH|j                  |t	        |d      f       f t        |d       S )zL
    Find fuzzy matches for a player name against a list of candidates.
    rZ   NrC   c                     | d    S r\   r^   r_   s    r)   ra   z$fuzzy_match_player.<locals>.<lambda>   rb   r;   rc   )r?   rP   r   re   rf   rg   rh   s           r)   fuzzy_match_playerrq      s     "%(JG 	9	$Y/	"NNIs+,j)<BBDINNIuUA78	9 '//r;   __main__z === Team Normalization Tests ===))zDuke Blue Devilsduke)zMichigan Wolverinesmichigan)zMichigan St.r   )zMichigan State Spartansr   )zArizona St.r   )UConnr	   )zUConn Huskiesr	   )zOle Missr
   )zMiami FLr   )z
Miami (FL)r   )zColorado St.r   )zColorado State Ramsr   )z
St. John'szst johns)zSaint Mary'szsaint marys)z
N.C. Stateznc st)z	Texas A&Mztexas amPASSzFAIL (got 'z')z  z<30u    → z<20r.   z#
=== Player Normalization Tests ===))zD'Angelo Russelldangelo russell)zDAngelo Russellrw   )zD-Angelo Russellrw   )zD Angelo Russellrw   )zMarcus O'Brien Jr.zmarcus obrien)u   José Garcíazjose garcia)zLeBron James IIIlebron james)zShaquille O'Nealzshaquille onealz
=== SGO Player ID Tests ===))JAYLEN_BROWN_2_NBAzjaylen brown)DANGELO_RUSSELL_1_NBArw   )LEBRON_JAMES_1_NBArx   )gffffff?)g      ?)__doc__r4   r$   difflibr   r6   compile
IGNORECASEr7   strr:   r?   rU   listfloatrn   rq   __name__printtestsrawexpectedresultstatusplayer_tests	sgo_testsr^   r;   r)   <module>r      s   
  #1I1 L1 		1
 \1 :1 m1 01 ]1 ]1 1 }1 _1 
<1  L!1" \#1$ 
5%1& %'1( 
5)1* 
5+1, 
5-1. 
5/10 F112 F316 K718 m91: m;1< m=1> \?1@ KA1B ~C1D mE1F )G1H )I1J )K1L +#," a1h > MM$$ $ $R3 3 @9s 9s 920C 0T 0e 0t 000c 0t 0 0QU 0. z	
,-E$  8X$!X-[3K3s)5AfX678 

01	L & 8X!#&!X-[3K3s)5AfX678 

)*I
 # 8X(-!X-[3K3s)5AfX678e r;   