
    
ie                    v   S r SSKJr  SSKrSSKrSSKrSSKJrJr  SSKJ	r	  SSK
JrJrJr  SSKJr  \	" \5      R#                  5       R$                  r\S-  S	-  r\S-  S
-  r\R-                  SSS9  / SQrSS jrSS jrSS jrSS jrSS jrSS jrSSS jjr\S:X  a  \R>                  " \" 5       5        gg)z
Lightweight QA regression harness for the RAG system.

Usage:
    python -m tests.run_regression
    python -m tests.run_regression --categories history,finance --limit 5
    )annotationsN)datetimetimezone)Path)DictListAny)	RAGSystem	baselineszqa_cases.jsonresultsT)parentsexist_ok)z based on the available documentsz"i analyzed the available documentszno documents availablezno relevant documents foundz$according to the distilled documentsc                   [        [        SSS9 n[        R                  " U5      nS S S 5        U (       ae  U  Vs1 s H   oDR	                  5       R                  5       iM"     nnW Vs/ s H+  nUR                  SS5      R                  5       U;   d  M)  UPM-     nnUb  WS U nW$ ! , (       d  f       N= fs  snf s  snf )Nrutf-8encodingcategory )openBASELINE_PATHjsonloadstriplowerget)
categorieslimitfcasesccategories_lowercases          tests/run_regression.py
load_casesr%   #   s    	mS7	3q		! 
4 7ABz!GGIOO-zB"
"Txx
B'--/3CC U 	 

 fuL 
4	3 C
s   B%'B6'(B;B;%
B3c                V   ^ U R                  5       m[        U4S j[         5       5      $ )Nc              3  ,   >#    U  H	  oT;   v   M     g 7fN ).0patternanswer_lowers     r$   	<genexpr>$is_generic_answer.<locals>.<genexpr>6   s     G6F7,&6Fs   )r   anyGENERIC_PATTERNS)answerr,   s    @r$   is_generic_answerr2   4   s     <<>LG6FGGG    c           
       ^ US   nUR                  S/ 5      mUR                  S/ 5      nUR                  S/ 5      n U R                  U5      n[	        U[        5      (       a  US   UR                  S5      USUU/ 0 S.$ UR                  S	S
5      nUR                  S/ 5      =(       d    / nU V	s/ s H+  n	[	        U	[
        5      (       d  M  U	R                  S5      PM-     n
n	UR                  5       nU Vs/ s H  nUR                  5       U;  d  M  UPM     nnU Vs/ s H  nUR                  5       U;   d  M  UPM     nn[        U4S jU
 5       5      n[        U5      n[        UR                  5       5      nSnU(       d  U(       d  U(       d  SnUUUUUU
S S S.nUS   UR                  S5      UUUUUS.$ ! [         a0  nUS   UR                  S5      US[        U5      S / 0 S.s S nA$ S nAff = fs  sn	f s  snf s  snf )Nquestionpreferred_sourcesrequired_termsoptional_termsidr   error)r9   r   r5   statusr:   r1   sourcesmetricsr1   r   r<   sourcec              3  6   >#    U  H  oT;   d  M
  Uv   M     g 7fr(   r)   )r*   srcr6   s     r$   r-    evaluate_case.<locals>.<genexpr>g   s      #.?'?|s   		passfail   )answer_lengthmissing_required_termsoptional_hitspreferred_sources_hitgeneric_detectedtop_sources)r9   r   r5   r;   r1   r<   r=   )r   search_and_answer	Exceptionstr
isinstancedictr   r/   r2   lensplit)ragr#   r5   r7   r8   
raw_resultexcr1   r<   r@   source_namesr,   termmissing_requiredrG   preferred_hitgenericrE   r;   r=   r6   s                       @r$   evaluate_caserZ   9   s6   JH!4b9XX.3NXX.3N
**84
 *c""t*,  	
 		
 ^^Hb)FnnY+1rG18R#JsD<Q%CGGH%LR<<>L''::<|+ 	  
 ('::<<' 	  
  # M  'G'MF7- '"2&!.##BQ'G 4jHHZ( u  

t*, X	
 		


4 SsA   G  2G=G=:HH"H>H 
G:
%G5/G:5G:c                D   [        U 5      n[        S U  5       5      nX-
  n[        S U  5       5      n0 nU  H^  nUR                  S5      =(       d    SR                  5       nUR	                  USSSS.5      nUR                  US   S5      S-   XS   '   M`     UUUUUS	.$ )
Nc              3  :   #    U  H  oS    S:X  d  M  Sv   M     g7f)r;   rB      Nr)   r*   results     r$   r-   summarise.<locals>.<genexpr>   s     Gv8,<,F   	c              3  :   #    U  H  oS    S:X  d  M  Sv   M     g7f)r;   r:   r]   Nr)   r^   s     r$   r-   r`      s     Hv8,<,Gra   r   uncategorizedr   )rB   rC   r:   r;   r]   )total_casespassesfailureserrorsby_category)rP   sumr   r   
setdefault)	r   totalre   rf   rg   r   r_   catbuckets	            r$   	summarisern      s    LEGGGF~HHHHF,.Jzz*%8??A&&sQA,NO#)::fX.>#BQ#Fh   ! r3   c                   [         R                  " [        R                  5      R	                  S5      nUUU S.n[
        U S3-  n[        USSS9 n[        R                  " X5SS9  S S S 5        [
        S	-  n[        USSS9 n[        R                  " X5SS9  S S S 5        U$ ! , (       d  f       NB= f! , (       d  f       U$ = f)
Nz%Y%m%dT%H%M%SZ)generated_atsummaryr    z.jsonwr   r      )indentzlatest.json)	r   nowr   utcstrftimeRESULTS_DIRr   r   dump)r   rq   	timestamppayloadoutput_pathr   latest_paths          r$   save_resultsr~      s    X\\*334DEI!G
 9+U 33K	k3	1Q		'Q' 
2 -K	k3	1Q		'Q' 
2  
2	1 
2	1 s   B#B4#
B14
Cc                   U S   nU S   nU S   nU S   n[        S5        [        SU 35        [        SU 35        [        SU 35        [        S	U 35        [        S
5        [        U S   R                  5       5       H[  u  pV[        SUR                  5       S SUR	                  SS5       SUR	                  SS5       SUR	                  SS5       35        M]     g )Nrd   re   rf   rg   z
=== QA Regression Summary ===zTotal cases: zPasses    : zFailures  : zErrors    : z
By category:rh   z  z<20z pass=rB   r   z fail=rC   z error=r:   )printsorteditemstitler   )rq   rk   re   rf   rg   r   countss          r$   print_summaryr      s    M"EXFz"HXF	
+,	M%
!"	L
!"	L

#$	L
!"	
"7=#9#?#?#AB8>>#C(vzz&!/D.E Fjj+,GFJJw4J3KM 	N Cr3   c           
        [         R                  " SS9nUR                  SS S SS9  UR                  S[        S SS9  UR                  S	S
SS9  UR	                  U 5      n[        UR                  UR                  5      nU(       d  [        S[        R                  S9  g[        5       nUR                  R                  5         UR                  R                  (       d  [        S[        R                  S9  gUR                  5         / nU GH  n[!        XF5      nUR#                  U5        US   R%                  5       n[        SU SUS    SUS    35        US:w  d  MU  UR'                  S0 5      n	UR'                  S5      (       a  [        SUS    35        OrU	R'                  S5      =(       d    / n
U
(       a  [        SU
 35        U	R'                  SS5      (       d  [        S5        U	R'                  S 5      (       a  [        S!5        UR(                  (       d  GM    O   [+        U5      n[-        X[5      n[/        U5        [        S"U 35        US#   S$:X  a  US%   S$:X  a  S$$ S$ )&NzRun QA regression checks.)descriptionz--categoriesc                    U R                  S5       Vs/ s H)  oR                  5       (       d  M  UR                  5       PM+     sn$ s  snf )N,)rQ   r   )sparts     r$   <lambda>main.<locals>.<lambda>   s,    N

NNs
   AAz/Comma-separated list of categories to evaluate.)typedefaulthelpz--limitz"Optional limit on number of cases.z--fail-fast
store_truez&Stop after the first failure or error.)actionr   zNo cases matched the filters.)filer]   z3Vector store is empty. Run setup before regression.r;   [z] r9   z  :: r5   PASSr=   r:   z    error: rF   z    missing terms: rH   Fz.    preferred sources not found in top resultsrI   z    generic response detectedz
Detailed results saved to: rf   r   rg   )argparseArgumentParseradd_argumentint
parse_argsr%   r   r   r   sysstderrr
   vector_storeload_existing_index	documentsinitialize_modelrZ   appendupperr   	fail_fastrn   r~   r   )argvparserargsr    rR   r   r#   r_   r;   r=   missingrq   	save_paths                r$   mainr      sB   $$1LMF
N>	   1	   5  
 T"Dt

3E-CJJ?
+C((*%%C#**U$&Gs)v!'')&DJ<uT*-=,>?@VjjB/Gzz'""F7O#456!++&>?E2/y9:{{#:EBBJL;;1229:~~~% (  GW.I'	))
56
#q(WX->!-C1JJr3   __main__)r   List[str] | Noner   z
int | NonereturnList[Dict[str, Any]])r1   rM   r   bool)rR   r
   r#   Dict[str, Any]r   r   )r   r   r   r   )r   r   rq   r   r   r   )rq   r   r   Noner(   )r   r   r   r   ) __doc__
__future__r   r   r   r   r   r   pathlibr   typingr   r   r	   r   r
   __file__resolveparentROOT_DIRr   rx   mkdirr0   r%   r2   rZ   rn   r~   r   __name__exitr)   r3   r$   <module>r      s    #   
 '  " "  >!!#**;&8$y0   $  . "H
JZ*$N";K| zHHTV r3   