
    i                       d Z ddlZddlZddlZddlZddlZddlZddlZddlZddl	Z	ddl
Z
ddlmZ ddlmZmZ ddlmZ ddlmZ ddlmZ ddlmZmZmZmZmZ dd	lmZmZmZmZ dd
l m!Z! e G d d             Z"dddddddddddddddddddddddddddddddZ#g dZ$g dZ%g dZ&g dZ'g dZ(d d!d"d#d$d%d&d!d'd$i d(d)d*d+d,d-d%id.d"id"d!d/d0d1id2d2d3d+d4g d5Z)g d6g d7g d8d9d:gd;gd<gd=i i g d>Z*h d?Z+h d@Z,h dAZ-h dBZ. ej^                  dCej`                         ej^                  dDej`                         ej^                  dEej`                         ej^                  dFej`                         ej^                  dGej`                         ej^                  dHej`                         ej^                  dIej`                        gZ1dJZ2dKZ3	 ddl4Z4dLZ5	 ddNl7m8Z9 dLZ:	 ddOl;m<Z< dLZ=	 ddl>Z?dLZ@	 ddPlAmBZB dLZC	 ddlDZDdLZE	 ddlFZFdLZG G dQ dR      ZH G dS dT      ZI G dU dV      ZJ G dW dX      ZK G dY dZ      ZL eeM      ZNd[eNj                  d\<    e"       ZO eIeO      ZP eJeO      ZQ eKeO      ZR eLeO      ZSi ZTd]dd^dddddd eUej                  j                  d_d`            da
aX e	j                         ZZ e	j                         Z[ e
j                         Z]da^eOj                  dbz  Z`dc Zadd Zbej                  j                  dedf      Zcdg Zd eedh       eQj                           eb        diegdjegdkeUdlegfdmZhdiegdnedoegfdpZieNj                  dq      dr        ZkeNj                  dsdtgu      dv        ZleNj                  dwdtgu      dx        ZmeNj                  dydzgu      edd{               ZneNj                  d|dtgu      d}        ZoeNj                  d~dzgu      edd               ZpeNj                  ddzgu      edd               ZqeNj                  ddtgu      d        ZreNj                  ddtgu      d        ZseNj                  ddtgu      d        ZteNj                  ddtgu      d        ZueNj                  dddzgu      edd               ZveNj                  ddzgu      edd               ZweNj                  ddzgu      edd               ZxeNj                  ddtgu      d        ZyeNj                  ddzgu      edd               ZzeMdk(  rn eUej                  j                  dd            Z{ eede{         eedeE deC de5         eed e|eQj                                eNj                  de{dM       yy# e6$ r dMZ5Y w xY w# e6$ r dMZ:Y w xY w# e6$ r dMZ=Y w xY w# e6$ r dMZ@Y w xY w# e6$ r dMZCY w xY w# e6$ r dMZEY w xY w# e6$ r dMZGY w xY w)z
Advanced Sports Betting RAG Service
Incorporates sophisticated PDF processing, FAISS vector search, BM25 hybrid scoring,
and DeepSeek/Tesseract OCR fallback.

Port: 5001 (configurable via RAG_PORT)
    NCounter)datetimetimezone)Pathwraps)	dataclass)ListDictAnyOptionalTuple)Flaskrequestjsonifysend_from_directory)secure_filenamec                   d   e Zd ZU dZ ee      j                  j                  Zeed<   dZ	eed<   dZ
eed<   dZeed<   dZeed<   dZeed<   d	Zeed
<   dZeed<   dZeed<   dZeed<   dZeed<   dZeed<   dZeed<   dZeed<   dZeed<   dZeed<   dZeed<   dZeed<   dZeed <   d!Zeed"<   d# Zy)$	RAGConfigzRAG Service Configurationbase_dirNdata_dir	cache_dirpdf_dirsource_ranking_configquery_expansion_config  
chunk_size  chunk_min_size@  chunk_max_size   sentence_overlap   min_sentences      ?hybrid_vector_weighthybrid_keyword_weightg      ?hybrid_bm25_weightmax_chunks_per_source
   	default_kx   min_vector_candidates   vector_oversample_factorhttp://127.0.0.1:5003/ocrdeepseek_ocr_urlhttp://127.0.0.1:5002/ocrtesseract_ocr_urlc                    | j                   ;t        t        j                  j	                  d| j
                  dz              | _         | j                  | j                   dz  | _        | j                  | j                   dz  | _        | j                  | j
                  dz  dz  | _        | j                  | j
                  dz  dz  | _	        | j                  j                  dd	       | j                  j                  dd	       y )
NDATA_DIRdata	rag_cachecsvconfigzrag_source_ranking.jsonzrag_query_expansions.jsonT)parentsexist_ok)r   r   osenvirongetr   r   r   r   r   mkdirselfs    @/var/www/html/eventheodds/python_service/advanced_rag_service.py__post_init__zRAGConfig.__post_init__@   s    ==  
DMMF<R!STDM>>!!]][8DN<<==50DL%%-)-)AD])]D&&&.*.--(*BE`*`D' 	TD94$7    ) __name__
__module____qualname____doc__r   __file__parentr   __annotations__r   r   r   r   r   r   intr    r"   r$   r&   r(   floatr)   r*   r+   r-   r/   r1   r3   strr5   rE    rF   rD   r   r   !   s    #(^**11Hd1HdItGT"&4&#'D' JNCNCcM3 #'%&#'5' $$!"3"Is!$3$$&c& 8c78s88rF   r      r        r#   )r   r    r"   r$   r&   r     i  r%   i  ix  r!   )	technicalresearchlegalsportsdefault)\	moneylinespreadoverunderparlayteaserpropfuturesoddsjuicevigvigorishlinezpoint spreadtotalhandicapfavoriteunderdogpushcoveratszagainst the spreadbankrollunitroizreturn on investmentedgeevzexpected valuesharpsquarepublicsteamzreverse line movementrlmnbanflmlbnhlncaacollege
basketballfootballbaseballhockeybetwagerstakeriskpayoutwinlossprofit
sportsbook	bookmakerbookiebettinggamblinghandicappingmodelsystemstrategy	analytics
statisticsr8   trendsinjurylineuprotationrestzback to backtravelschedulehomeawayroadneutralvenuecourtfieldquarterhalfperiodinningovertimelivezin-playclosing linezopening linezline movementmarketprice)the_logic_of_sports_bettingsharp_sports_bettingsquares_n_sharpsthe_perfect_betthe_oddsgamblbetting_history)z\bmlb\br   	moneyballsaberm
bill_james
curve_ballmvp_machine	winshares)z_games_part\d+z_players_part\d+
_standings_teams_records)probabilityr   modelingr   research_methodsall_of_statisticsthe_art_of_statistics
ףp=
?{Gz?皙?g)\(?)r   r   propsgeneric_mlb_propsQ?gQ?333333?gg{GzgQ)r   r   r   )elitestrongr   penaltyr   r   )r   r   r   gQg
ףp=
ǿg333333ӿ)r   r   r   	reference	data_dump)tier_weightstag_weightssource_rules)zsports bettingrc   r   zmarket pricer   zedge expected value)player propszprop betzplayer prop linesz	alt lines)zmlb baseballzpitcher batterzlineup bullpenzweather park factor)betting_core
props_corebaseball_corer   r   r   )r   r   r   )named_groupsintent_default_groupssport_aliasessport_default_groupsquery_rules>   rs   r   rq   rr   rg   rc   ra   linesr   r   r   r   r   r   pricesr   marketsri   r   r   player propr   >   home runstolen basesr{   rbihitswindwalksbatterhitterr   r   battersbullpenhittersinningslineupspitcherweatherr   pitchers	strikeout
strikeouts	home runstotal bases>   r   r   ra   r   r   r   r   r   r   r   r   >   r:   logsarchivedatasethistoryresultsarchivesbacktestdatasetstracking
historicalbacktestingz\bcopyright\bz\btable of contents\bz\ball rights reserved\bz\bintentionally left blank\bz\blimit of liability\bz\bdisclaimer of warranty\bz\bisbn(?:-1[03])?\b      ?g      ?TFextract_text)	PdfReader)SentenceTransformerc            
       T    e Zd ZdZd Zd
dedededee   fdZ	dedededee   fdZ
y	)
LLMServicez0LLM service using Grok Fast Reasoning (Enforced)c                 B   t         j                  j                  d      xs  t         j                  j                  dd      j                         j                  d      j                  d      | _        d| _        d| _        d| _        | j                  st        d	       y y )
NGROK_API_KEYXAI_API_KEY "'z$https://api.x.ai/v1/chat/completionszgrok-4-fast-reasoningFz<[RAG] WARNING: GROK_API_KEY not set. LLM features will fail.)	r>   r?   r@   stripgrok_api_keygrok_url
grok_modeldeepseek_availableprintrB   s    rD   __init__zLLMService.__init__!  sz    ZZ^^N;`rzz~~m]_?`ggioopstzz{~>1"'  PQ !rF   prompt
max_tokenstemperaturereturnc                 (    | j                  |||      S )zGenerate text using Grok API)
_call_grok)rC   r  r  r  s       rD   generatezLLMService.generate+  s    vz;??rF   c           
      :   t         r| j                  st        d       y	 t        d| j                   d       t	        j
                  | j                  d| j                   dd| j                  d|d	g||d
dd      }|j                  dk(  r3|j                         }d|v rUt        |d         dkD  rD|d   d   d   d   S t        d|j                   d|j                  r|j                  dd nd        y# t        $ r}t        d|        Y d}~yd}~ww xY w)zCall Grok APIz*[RAG] Grok API not configured (no API key)Nz[RAG] Calling Grok (z)...zBearer zapplication/json)AuthorizationzContent-Typeuser)rolecontentF)r   messagesr  r  stream-   )headersjsontimeout   choicesr   messager  z[RAG] Grok API error: z - rU   z(empty)z[RAG] Grok API exception: )HAS_REQUESTSr  r  r  requestspostr  status_coder!  lentext	Exception)rC   r  r  r  responser8   es          rD   r  zLLMService._call_grok/  s:   4#4#4>?	4((9>?}}'.t/@/@.A%B$6
 "__*0V!D E",#.# H  ##s*}}$T)_)=)A	?1-i8CC.x/C/C.DC_g_l_lVZWZH[r{G|}~   	4.qc233	4s   B!C9 6C9 9	DDDN)rR   r'   )rG   rH   rI   rJ   r  rP   rN   rO   r   r  r  rQ   rF   rD   r  r    s^    :R@s @ @ @X`adXe @! !# !E !hWZm !rF   r  c            
          e Zd ZdZdefdZdedefdZdedefdZ	ded	edefd
Z
dedee   fdZdedeeeeef      fdZddededeeeef      fdZdedee   dedee   fdZdededee   fdZdededefdZddededee   fdZy)PDFProcessorz4Advanced PDF processor with structure-aware chunkingr;   c                 $    || _         g d| _        y )N)u   copyright\s+©?\s*\d{4}zall rights reservedzpage\s+\d+\s+of\s+\d+z^\s*$z^\d+$z[a-f0-9]{32,}z	^[\s\W]*$)r;   boilerplate_patternsrC   r;   s     rD   r  zPDFProcessor.__init__V  s    %
!rF   	file_pathr  c                    |j                   j                         dk(  rht        r	 | j                  |      S t        r	 t        t        |            S t        r.	 t        |      }dj                  d |j                  D              S y|j                   j                         d	k(  r)	 t        |d
d      5 }|j                         cddd       S y# t        $ r}t        d|        Y d}~d}~ww xY w# t        $ r}t        d|        Y d}~d}~ww xY w# t        $ r}t        d|        Y d}~yd}~ww xY w# 1 sw Y   yxY w# t        $ r8 t        |d
d      5 }|j                         cddd       cY S # 1 sw Y   Y yxY ww xY w)z(Extract text using best available method.pdfzPyMuPDF failed: Nzpdfminer failed: 
c              3   D   K   | ]  }|j                         xs d   yw)r  Nr   ).0pages     rD   	<genexpr>z,PDFProcessor.extract_text.<locals>.<genexpr>w  s!     $X4T%6%6%8%>B%>$Xs    zPyPDF2 failed: r  .txtrutf-8encodingzlatin-1)suffixlowerHAS_PYMUPDF_extract_with_pymupdfr,  r  HAS_PDFMINERpdfminer_extractrP   
HAS_PYPDF2r   joinpagesopenreadUnicodeDecodeError)rC   r4  r.  readerfs        rD   r   zPDFProcessor.extract_textb  s   !!#v-255i@@
 3+C	N;;
 1&y1F99$X6<<$XXX ##%/$)S7; $q668$ $ 9 ! 2,QC0112 ! 3-aS1223 ! 1OA3/001$ 	 & $)S9= $668$ $ $ 	$s   C C1 ,D $E 2D9	E 	C.C))C.1	D:DD	D6D11D69E>E E FE8,
F8F	=FFc                 4   t        j                  |      }g }	 |D ]E  }|j                  dt         j                  t         j                  z        }|j                  |       G 	 |j                          dj                  |      S # |j                          w xY w)z&High-fidelity extraction using PyMuPDFr+  flagsr7  )fitzrJ  get_textTEXT_DEHYPHENATETEXT_PRESERVE_WHITESPACEappendcloserH  )rC   r4  docrI  r:  r+  s         rD   rD  z"PDFProcessor._extract_with_pymupdf  s    ii	"	 #}}V43H3H4KhKh3h}iT"# IIKyy IIKs   A
B Br+  filenamec                     |j                         |j                         }g d}t        fd|D              dk\  ryg d}t        fd|D              dk\  ryg d}t        fd	|D              dk\  ry
y)z)Detect document type for optimal chunking)r   rc   r\   r[   r_   r   r   c              3   ,   K   | ]  }|v sd   ywrT   NrQ   r9  ind
text_lowers     rD   r;  z4PDFProcessor.detect_document_type.<locals>.<genexpr>  s     CS
1BqC   	r#   rY   )specificationrV   apiprotocol	algorithmc              3   ,   K   | ]  }|v sd   ywr\  rQ   r]  s     rD   r;  z4PDFProcessor.detect_document_type.<locals>.<genexpr>  s     ASsj/@qAr`  rV   )abstractmethodologyr   
conclusion
referencesc              3   ,   K   | ]  }|v sd   ywr\  rQ   r]  s     rD   r;  z4PDFProcessor.detect_document_type.<locals>.<genexpr>  s     ES3*3DqEr`  rW   rZ   )rB  sum)rC   r+  rY  filename_lowersports_indicatorstech_indicatorsresearch_indicatorsr_  s          @rD   detect_document_typez!PDFProcessor.detect_document_type  s{    ZZ\
!) oC-CCqH YAOAAQF aE/EEJrF   c                     |sg S d}t        j                  ||      }|D cg c]/  }t        |j                               dkD  s |j                         1 c}S c c}w )zSplit text into sentencesz?(?<=[.!?])\s+(?=[A-Z])|(?<=[.!?])\s*$|(?<=[.!?"\'])\s+(?=[A-Z])r,   )resplitr*  r
  )rC   r+  sentence_endings	sentencesss        rD   split_into_sentencesz!PDFProcessor.split_into_sentences  sO    I ^HH-t4	#,DaAGGI0C	DDDs   !AAc           	      
   g }|j                  d      }t        |      D ]  \  }}|j                         }|rt        |      dk  r'|j	                         r1t        |      dk\  r#t        |      dk  r|j                  ||df       ht        j                  d|      }|st        d |j                         dd	 D              }|j                  d      }	t        |	      dkD  s|j                  |	|t        |d      f        |S )
zDetect headings in documentr7  r%      P   rT   z^\s*(\d+)(?:\.(\d+))?\s+(.+)$c              3   &   K   | ]	  }|sd   ywr\  rQ   )r9  gs     rD   r;  z/PDFProcessor.detect_headings.<locals>.<genexpr>  s     H!aAHs   Nr#   )rs  	enumerater
  r*  isupperrV  rr  matchrk  groupsgroupmin)
rC   r+  headingsr   irg   line_strippednumbered_matchdepthheading_texts
             rD   detect_headingszPDFProcessor.detect_headings  s    

4  ' 	FGAt JJLM C$6$: $$&3}+=+Bs=GY\^G^1 56  XX&FVNH~'<'<'>r'BHH-33A6|$q(OO\1c%m$DE!	F$ rF   doc_typec                 &   t         j                  |t         d         }|d   }|d   }|d   }|d   }|d   }|rt        |j                               |k  rg S | j	                  |      }	|	r| j                  ||	|      }
|
r|
S | j                  ||      S )zStructure-aware chunkingrZ   r   r    r"   r$   r&   )DOCUMENT_TYPE_CONFIGSr@   r*  r
  r  _chunk_by_sections_sentence_based_chunking)rC   r+  r  r;   r   	chunk_min	chunk_maxoverlapr&   r  chunkss              rD   
chunk_textzPDFProcessor.chunk_text  s    &**85J95UVL)
+,	+,	+,/s4::<(94I ''-,,T8VDF ,,T6::rF   r  c                    g }|j                  d      }t        |      D ]  \  }\  }}}	|dz   t        |      k  r||dz      d   n
t        |      }
dj                  ||dz   |
       j	                         }|sYt        |      |d   k\  sk| j                  ||      }|D ]  }||d<   |	|d<    |j                  |        |S )zChunk text by detected sectionsr7  rT   r    sectionsection_level)rs  r}  r*  rH  r
  r  extend)rC   r+  r  r;   r  r   r  r  heading_poslevelend_possection_contentsection_chunkschunks                 rD   r  zPDFProcessor._chunk_by_sections  s    

4  6?x5H 		.1A1k5,-ECM,Ahq1uoa(s5zG"iikAog(FGMMOO3#76BR;S#S!%!>!>PV!W+ 3E'3E)$-2E/*3 n-		. rF   c                    | j                  |      }t        |      |d   k  rg S g }g }d}|D ]k  }t        |      dz   }||z   |d   kD  r|rdj                  |      j                         }	t        |	      |d   k\  r'|j	                  |	t        |      t        |	      d       t        |d   t        |            }
|
dkD  r||
 d	 ng }t        d
 |D              }|j	                  |       ||z  }||d   k\  st        |      |d   k\  sdj                  |      j                         }	t        |	      |d   k\  r'|j	                  |	t        |      t        |	      d       t        |d   t        |            }
|
dkD  r||
 d	 ng }t        d |D              }n |rWdj                  |      j                         }	t        |	      |d   k\  r'|j	                  |	t        |      t        |	      d       |S )z$Sentence-based chunking with overlapr&   r   rT   r"    r    )r  sentence_count
char_countr$   Nc              3   8   K   | ]  }t        |      d z     ywr\  r*  r9  rv  s     rD   r;  z8PDFProcessor._sentence_based_chunking.<locals>.<genexpr>       $GASVaZ$G   r   c              3   8   K   | ]  }t        |      d z     ywr\  r  r  s     rD   r;  z8PDFProcessor._sentence_based_chunking.<locals>.<genexpr>  r  r  )rw  r*  rH  r
  rV  r  rk  )rC   r+  r;   ru  r  current_chunkcurrent_lengthsentencesentence_lengthr  overlap_counts              rD   r  z%PDFProcessor._sentence_based_chunking  s   --d3	y>F?33I! 	HH!(ma/O/&9I2JJ} XXm4::<
z?f-=&>>MM#-*-m*<&)*o#  !$F+=$>M@R SBORSBS}no >Y[!$$G$G!G  *o-N!55#m:LPVWfPg:g XXm4::<
z?f-=&>>MM#-*-m*<&)*o#  !$F+=$>M@R SBORSBS}no >Y[!$$G$G!G?	HD -0668J:&)9"::)&)-&8"%j/  rF   r  c                    |j                  dd      }t        |      |j                  dd      k  ry|j                         }| j                  D ])  }t	        j
                  ||t        j                        s) y t	        j                  d|      }t        |      dk  ry|r t        t        |            t        |      z  nd}|d	k  ryy
)zValidate chunk qualityr  r  r    r   F\b\w+\b   r   333333?T)	r@   r*  rB  r2  rr  search
IGNORECASEfindallset)rC   r  r;   r  content_lowerpatternwordsunique_ratios           rD   is_valid_chunkzPDFProcessor.is_valid_chunk-  s    ))Ir*w<&**%5s;;  00 	Gyy-?	
 

:}5u:> 8=s3u:U3!#rF   Nsource_namec                    ||j                   }| j                  |      }|rt        |j                               dk  rg S | j	                  ||      }t
        j                  |t
        d         }| j                  ||      }g }t        |      D ]F  \  }}	| j                  |	|      s||	d<   ||	d<   t        |      |	d<   ||	d<   |j                  |	       H |S )z.Process a file and return chunks with metadatad   rZ   sourcechunk_idtotal_chunksdocument_type)namer   r*  r
  rp  r  r@   r  r}  r  rV  )
rC   r4  r  r+  r  r;   
raw_chunksr  r  r  s
             rD   process_filezPDFProcessor.process_fileE  s    #..K   +s4::<(3.I ,,T;?&**85J95UV __T84
 !*- 	%HAu""5&1"-h$%j!(+Jn%)1o&e$	% rF   )rZ   N)rG   rH   rI   rJ   r   r  r   rP   r   rD  rp  r   rw  r   rN   r  r   r   r  dictr  r  boolr  r  rQ   rF   rD   r0  r0  S  s:   >

y 

#d #s #J
 t 
  
    ,E Ec EC DsC}1E,F 2;s ;c ;$tCQTH~BV ;.s d5k 4 TXY]T^ &5S 5$ 54: 5nD $ 4 0d  T
 rF   r0  c                      e Zd ZdZdefdZdedeee	f   fdZ
dedeee	f   fdZd Zd	edej                  fd
Zdee   dej                  fdZdefdZd Zd ZdedefdZd	edee   defdZdededeee	f   fdZdeeeee	f   f   deee	f   fdZdeeeee	f   f   deeeee	f   f   fdZdedeee	f   fdZdedeeee	f      fdZdeee	f   deee	f   fdZd Zd ee   fd!Z d"ed#ee   fd$Z!d;d%ed&e"dee   fd'Z#d(ed)ee   deeef   fd*Z$d(ed+eee	f   defd,Z%d(edee   fd-Z&d(ed.eeef   dee   fd/Z'd%ed.eeef   defd0Z(d.eeef   dee   fd1Z)d2eee	f   d.eeef   de*fd3Z+d4e"d.eeef   dede*fd5Z,d2eee	f   d.eeef   d(edefd6Z-dee   deee*f   fd7Z.d4e"dee   d8ede*fd9Z/y:)<VectorStorez0FAISS-based vector store with BM25 hybrid searchr;   c                    || _         g | _        g | _        d | _        d | _        g | _        g | _        d| _        t               | _	        i | _
        g | _        i | _        | j                  |j                        | _        | j!                  |j"                        | _        |j&                  dz  | _        |j&                  dz  | _        |j&                  dz  | _        | j/                          y )Nr   zfaiss_index.indexzdocuments_metadata.jsonzembeddings.npy)r;   	documents
embeddingsindexr   searchable_textsdoc_lengthsavg_doc_lengthr   word_doc_freqsterm_df_cachedoc_profilessource_profiles_load_source_ranking_configr   source_ranking_load_query_expansion_configr   query_expansionsr   
index_filemetadata_fileembeddings_file_init_modelr3  s     rD   r  zVectorStore.__init__i  s    

 "%i!">>v?[?[\ $ A A&B_B_ ` !**-@@#--0II%//2BB 	rF   config_pathr  c                 D   t        j                  t        j                  t                    }	 |r|j	                         rt        |dd      5 }t        j                  |      }ddd       t        t              rsdD ]9  }t        |j                  |      t              s#||   j                  ||          ; t        |j                  d      t              r|d   |d<   t        d|        |S # 1 sw Y   xY w# t        $ r}t        d|        Y d}~|S d}~ww xY w)	z6Load curated source ranking config with sane defaults.r=  r>  r?  N)r   r   r   z$[RAG] Loaded source ranking config: z,[RAG] Failed to load source ranking config: )r!  loadsdumpsDEFAULT_SOURCE_RANKING_CONFIGexistsrJ  load
isinstancer  r@   updatelistr  r,  rC   r  config_datarN  loadedkeyr.  s          rD   r  z'VectorStore._load_source_ranking_config  s
   jj,I!JK	F{113+sW= *!YYq\F*fd+> A%fjjot<',33F3K@A "&**^"<dC6<^6LN3@NO * *  	F@DEE	F6    C= C1$<C= !AC= 1C:6C= =	DDDc                 D   t        j                  t        j                  t                    }	 |r|j	                         rt        |dd      5 }t        j                  |      }ddd       t        t              rsdD ]9  }t        |j                  |      t              s#||   j                  ||          ; t        |j                  d      t              r|d   |d<   t        d|        |S # 1 sw Y   xY w# t        $ r}t        d|        Y d}~|S d}~ww xY w)	z;Load configurable query expansion rules with sane defaults.r=  r>  r?  N)r   r   r   r   r   z%[RAG] Loaded query expansion config: z-[RAG] Failed to load query expansion config: )r!  r  r  DEFAULT_QUERY_EXPANSION_CONFIGr  rJ  r  r  r  r@   r  r  r  r,  r  s          rD   r  z(VectorStore._load_query_expansion_config  s
   jj,J!KL	G{113+sW= *!YYq\F*fd+q A%fjjot<',33F3K@A "&**]";TB5;M5JM2A+OP * *  	GA!EFF	Gr  c                     t         r#	 t        d      | _        d| _        t	        d       | j                  d| _        t	        d       yy# t
        $ r}t	        d|        d| _        Y d}~Cd}~ww xY w)zInitialize embedding modelzall-MiniLM-L6-v2i  z&[RAG] Loaded SentenceTransformer modelz"[RAG] SentenceTransformer failed: NzT[RAG] Using hash-based embeddings (install sentence-transformers for better results))HAS_SENTENCE_TRANSFORMERSr  r   embedding_dimr  r,  )rC   r.  s     rD   r  zVectorStore._init_model  su    $"01CD
%(">@
 ::!$Dhi 	  ":1#>?!

"s   "A
 
	A2A--A2r+  c                    | j                   r| j                   j                  |d      S t        j                  |j                               j	                         }t        j                  |t
        j                        j                  t
        j                        }|t
        j                  j                  |      dz   z  S )zGet embedding for textT)normalize_embeddings)dtypeg&.>)r   encodehashlibsha384digestnp
frombufferuint8astypefloat32linalgnorm)rC   r+  harrs       rD   _get_embeddingzVectorStore._get_embedding  s    ::::$$T$EE t{{}-446A--299"**EC"))..-455rF   textsc                     | j                   r| j                   j                  |dd      S t        j                  |D cg c]  }| j	                  |       c}      S c c}w )z!Get embeddings for multiple textsT)r  show_progress_bar)r   r  r  arrayr   )rC   r  ts      rD   _get_embeddings_batchz!VectorStore._get_embeddings_batch  sN    ::::$$UY]$^^88UCT003CDDCs   Ac                    | j                   j                         sy	 t        | j                   dd      5 }t        j                  |      | _        ddd       t        rH| j                  j                         r.t        j                  t        | j                              | _        nG| j                  j                         r-t        j                  t        | j                              | _        | j                          t!        dt#        | j
                         d       y# 1 sw Y   xY w# t$        $ r}t!        d	|        Y d}~yd}~ww xY w)
z!Load existing index and documentsFr=  r>  r?  Nz[RAG] Loaded 
 documentsTz[RAG] Failed to load index: )r  r  rJ  r!  r  r  	HAS_FAISSr  faiss
read_indexrP   r  r  r  r  _prepare_searchable_textsr  r*  r,  rC   rN  r.  s      rD   r  zVectorStore.load  s    !!((*	d((#@ .A!%1. T__335"--c$//.BC
%%,,."$''#d.B.B*C"D**,M#dnn"5!6jAB. .  	045	s/   D, D CD,  D)%D, ,	E5EEc                 z   	 t        | j                  dd      5 }t        j                  | j                  |dd       ddd       t
        r@| j                  4t        j                  | j                  t        | j                               n^t        | j                        dkD  rFt        j                  t        | j                        t        j                   | j                               t#        d	t        | j                         d
       y# 1 sw Y   xY w# t$        $ r}t#        d|        Y d}~yd}~ww xY w)zSave index and documentswr>  r?  Fr#   ensure_asciiindentNr   z[RAG] Saved r  z[RAG] Failed to save index: )rJ  r  r!  dumpr  r	  r  r
  write_indexrP   r  r*  r  r  saver  r  r  r,  r  s      rD   r  zVectorStore.save  s    	6d((#@ KA		$..!%JK TZZ3!!$**c$//.BCT__%)D001288DOO3LMLT^^!4 5Z@AK K  	60455	6s.   D $DCD DD 	D:"D55D:c           	      N   g | _         g | _        t               | _        i | _        g | _        i | _        g }i }| j                  D ]  }| j                  |      }|j                         }| j                   j                  |       | j                  j                  t        |j                                      | j                  j                  t        t        j                   d|                   | j#                  ||      }|j                  |       | j%                  ||        | j'                  |      | _        |D cg c]  }| j)                  |       c}| _        | j                  r1t+        | j                        t        | j                        z  | _        yd| _        yc c}w )zBuild searchable text cacher  r   N)r  r  r   r  r  r  r  r  _build_searchable_textrB  rV  r*  rs  r  r  rr  r  _build_preliminary_profile_update_source_aggregate_finalize_source_profiles_build_doc_profilerk  r  )rC   prelim_docssource_aggregatesrX  r+  searchable_textprelims          rD   r  z%VectorStore._prepare_searchable_texts  sg    "%i!>> 	EC..s3D"jjlO!!((9##C(=(=(?$@A&&s2::j/+R'ST44S/JFv&))*;VD	E  $==>OPKVWT44V<WOSO_O_c$"2"23c$:J:J6KKef Xs   =F"rX  c           
         g }|j                  dd      }|rXt        j                  dd|t        j                        }|j	                  |j                  dd      j                  dd             |j                  d      r|j	                  |d          |j                  d	      r)|j	                  t        |j                  d	                   |j                  d
      rH	 |j	                  dj                  |j                  d
g       D cg c]  }t        |       c}             |j                  d      rH	 |j	                  dj                  |j                  dg       D cg c]  }t        |       c}             |j	                  |j                  dd             dj                  |      S c c}w # t        $ r Y w xY wc c}w # t        $ r Y Vw xY w)z#Build searchable text from documentr  r  z\.(pdf|txt)$rP  _r  -r  summary
key_pointsthemesr  )	r@   rr  subr  rV  replacerP   rH  r,  )rC   rX  partsr  clean_sourcepr  s          rD   r  z"VectorStore._build_searchable_text  sx    2&66/2vR]]SLLL--c37??SIJ 779LLY( 779LLSWWY/0177< SXXsww|R7P&Q!s1v&QRS 778SXXswwx7L&M!s1v&MNO
 	SWWY+,xx 'R  'N sH    *F7 
F2F7 9*G #G5G 2F7 7	GGG 	GGpatternsc                 ,    t        fd|D              S )z9Check whether text matches any regex-like source pattern.c              3   h   K   | ])  }t        j                  |t         j                         + y wr  )rr  r  r  )r9  r  r+  s     rD   r;  z3VectorStore._matches_any_pattern.<locals>.<genexpr>+  s"     Sw299WdBMM:Ss   /2)any)rC   r+  r+  s    ` rD   _matches_any_patternz VectorStore._matches_any_pattern)  s    S(SSSrF   r  c                 |  
 t        |j                  dd            }|j                         }t        |j                  dd            j                         
t        |j                  dd            j                         }t        fdt        D              }t        fdt
        D              }t        fdt        D              }t        
fd	t        D              }	||||||	| j                  |t              | j                  |t              | j                  |t              | j                  |t              d

S )z7Build document-level signals before source aggregation.r  unknownr  r  r  c              3   ,   K   | ]  }|v sd   ywr\  rQ   r9  termr  s     rD   r;  z9VectorStore._build_preliminary_profile.<locals>.<genexpr>4  s     YAX1Yr`  c              3   ,   K   | ]  }|v sd   ywr\  rQ   r3  s     rD   r;  z9VectorStore._build_preliminary_profile.<locals>.<genexpr>5  s     [$4?CZA[r`  c              3   ,   K   | ]  }|v sd   ywr\  rQ   r3  s     rD   r;  z9VectorStore._build_preliminary_profile.<locals>.<genexpr>6  s     Sd4?;RSr`  c              3   l   K   | ]+  }|j                        xs |j                  d d        - y w)NrS   r  )r9  r  r  section_lowers     rD   r;  z9VectorStore._build_preliminary_profile.<locals>.<genexpr>8  s:      
 NN=)SW^^OET<R-SS
s   14)
r  r  betting_hitsbaseball_hits	prop_hitsis_front_mattersource_name_bettingsource_name_baseballsource_name_data_dumpsource_name_reference)rP   r@   rB  rk  BETTING_INTENT_TERMSBASEBALL_INTENT_TERMSPROP_INTENT_TERMSr.  FRONT_MATTER_PATTERNSr/  BETTING_SOURCE_PATTERNSBASEBALL_SOURCE_PATTERNSDATA_DUMP_SOURCE_PATTERNSREFERENCE_SOURCE_PATTERNS)rC   rX  r  r  source_lowerr  r:  r;  r<  r=  r9  s     `       @rD   r  z&VectorStore._build_preliminary_profile-  s   SWWXy12||~CGGIr2399;sww34::<Y)=YY[*?[[S&7SS	 
0
 
  (*".#'#<#<\Kb#c$($=$=lLd$e%)%>%>|Mf%g%)%>%>|Mf%g
 	
rF   
aggregatesr  c                 0   |d   }||vr$dt               dddd|d   |d   |d   |d   d
||<   ||   }|dxx   d	z  cc<   |d
   |d   xx   d	z  cc<   |dxx   |d   z  cc<   |dxx   |d   z  cc<   |dxx   |d   z  cc<   |dxx   |d   rd	ndz  cc<   y)z@Accumulate source-level stats used to tag corpus quality/topics.r  r   r>  r?  r@  rA  )
	doc_count	doc_typesbetting_hits_totalbaseball_hits_totalprop_hits_totalfront_matter_countr>  r?  r@  rA  rM  rT   rN  r  rO  r:  rP  r;  rQ  r<  rR  r=  Nr   )rC   rK  r  r  aggs        rD   r  z$VectorStore._update_source_aggregateJ  s    !#$Y&''(#$&''-.C'D(./E(F)/0G)H)/0G)H"Jv  KAK
+,1, !VN%;;!!"f_&=="&"55 !&1B*CQJ!rF   c                    i }|j                         D ]  \  }}t        |d   d      }|d   r|d   j                  d      d   d   nd}|d   |z  }|d   |z  }|d   |z  }	|d	   |z  }
|d
   }|d   xs |dk\  xs |dk(  xr |dk\  }|d   xs |dk\  }|	dk\  }|d   xs |dv xr | }|xr	 |xs |dk\  }||||	|
||||||d||<   | j                  |||           |S )z7Turn source-level stats into stable quality/topic tags.rM  rT   rN  r   r  rO  rP  rQ  rR  r@  r>  g      @rY   g      @r?  g      @r   rA  >   rW   rV   )dominant_doc_typeavg_betting_hitsavg_baseball_hitsavg_prop_hitsfront_matter_ratiois_data_dump_sourceis_betting_sourceis_baseball_sourceis_baseball_betting_sourceis_reference_sourceprop_focus_source)itemsmaxmost_common_apply_curated_source_rule)rC   rK  profilesr  rS  rM  rU  rV  rW  rX  rY  rZ  r[  r\  r_  r^  r]  s                    rD   r  z%VectorStore._finalize_source_profilesc  s   %++- '	FKFCC,a0IIL[IYK 0 < <Q ? B1 E_a"#789D #$9 :Y F 12Y>M!$%9!:Y!F"%&=">)* O#s*O%1M6F#6M  *+ ,$+  !. 4+, B$(AA#( ('   *;)o@R@nVgknVn& &7$4%6!.&8':%6&8.H':%6 HV ++FHV4DEO'	FR rF   r  profilec                 >   | j                  |      }|sg |d<   d|d<   yt        |j                  dg             }||d<   t        |j                  d      xs d      |d<   d|v rd|d	<   d
|v rd|d<   d|v rd|d<   d|v rd|d<   d|v rd|d<   |d	   r|d   rd|d<   yyy)zAApply curated source tags/tiers from config on top of heuristics.curated_tagsr   curated_tierNtagstierr   Tr[  r   r\  r   r_  r   r^  r   rZ  r]  )_find_curated_source_ruler  r@   rP   )rC   r  re  ruleri  s        rD   rc  z&VectorStore._apply_curated_source_rule  s    --f5&(GN#&/GN#DHHVR()"&"%dhhv&6&C)"D+/G'(,0G()d?+/G'($-1G)*$-1G)*&'G4H,I48G01 -J'rF   c                 d   | j                   j                  dg       D ]x  }t        |j                  dd            j                         }t        |j                  dd            }|sJ|dk(  r	||k(  r|c S |dk(  s^	 t	        j
                  ||      r|c S z y# t        j                  $ r Y w xY w)z5Find the first curated source rule matching a source.r   
match_typeexactr  r  regexN)r  r@   rP   rB  rr  r  error)rC   r  rl  rn  match_values        rD   rk  z%VectorStore._find_curated_source_rule  s    ''++NB? 	DTXXlG<=CCEJdhhw34KW$;)>W$yyf5# 6	  xx s   <BB/.B/c           	          | j                   j                  |d   i       }|d   |d   |d   |d   |d   dk(  xs |d   dk\  |d   xs |d   d	k\  |d
   |dS )z9Build lightweight relevance features used at search time.r  r  r:  r;  r<  rY   r%   r?  r#   r=  )r  r:  r;  r<  is_sports_docis_baseball_docr=  source_profile)r  r@   )rC   r  rv  s      rD   r  zVectorStore._build_doc_profile  s    --11&2BBGz*">2#O4,#J/8;Zvn?UYZ?Z%&<=]AX\]A]%&78,	
 		
rF   c                 ~   	 | j                   j                  | j                   j                  dz         }t        |dd      5 }t	        j
                  | j                  |dd       ddd       t        j                  || j                          y# 1 sw Y   *xY w# t        $ r}t        d	|        Y d}~yd}~ww xY w)
zOPersist documents metadata safely without rewriting the FAISS index/embeddings..tmpr  r>  r?  Fr#   r  Nz,[RAG] Failed to save metadata (checkpoint): )r  with_suffixrA  rJ  r!  r  r  r>   r'  r,  r  )rC   tmp_pathrN  r.  s       rD   save_metadata_onlyzVectorStore.save_metadata_only  s    	F))55d6H6H6O6ORX6XYHhg6 K!		$..!%JKJJx!3!34K K  	F@DEE	Fs0   A B $B&(B BB 	B<$B77B<r  c                    |sy|D cg c]  }|d   	 }}| j                  |      }t        r| j                  $t        j                  | j
                        | _        t        j                  |      j                  d      }t        j                  |       | j                  j                  |       njt        | j                  t        j                        r?t        | j                        dkD  r't        j                  | j                  |g      | _        n|| _        | j                   j#                  |       | j%                          t'        dt        |       dt        | j                                 yc c}w )zAdd documents to the storeNr  r  r   z[RAG] Added z documents. Total: )r  r	  r  r
  IndexFlatIPr  r  r  r  normalize_L2addr  r  ndarrayr*  vstackr  r  r  r  )rC   r  ccontentsnew_embeddingsembeddings_nps         rD   add_documentszVectorStore.add_documents  s   *01QAiL1133H=zz!"..t/A/AB
HH^4;;IFM}-JJNN=)$//2::63t;ORS;S"$))T__n,M"N"0f%&&(S[M)<S=P<QRS' 2s   E5r  
new_chunksc                    g }g }t        | j                        D ]i  \  }}|j                  d      |k7  s|j                  |       t        r3t        | j                        |kD  sL|j                  | j                  |          k || _        t        rt        j                  | j                        | _
        |r|D cg c]  }|d   	 }}| j                  |      j                  d      }	t        j                  |	       | j                  j                  |	       n1|rt        j                   |      nt        j                   g       | _        | j#                          | j%                  |       yc c}w )z"Replace all documents for a sourcer  r  r  N)r}  r  r@   rV  r	  r*  r  r
  r}  r  r  r  r  r~  r  r  r  r  r  )
rC   r  r  	keep_docskeep_embeddingsr  rX  dr  r  s
             rD   replace_sourcezVectorStore.replace_source  s.    	/ 	?FAswwx K/  % S%9A%=#**4??1+=>		? # **4+=+=>DJ2;<QAiL<<!77AHHS
"":.

z*;Jbhh7PRPXPXY[P\DO&&( 	:& =s   =E/querykc                    | j                   sg S |j                         }t        j                  d|      }| j	                  ||      }| j                  ||      }g }|D ]g  }t        |      dkD  s|j                  |t        j                  dt        j                  |      z   dz   t        j                        |t        v d       i t        D ]m  }	|	|v s|	|D 
cg c]  }
|
d   	 c}
vs|j                  |	t        j                  dt        j                  |	      z   dz   t        j                        dd       o | j                  |      }| j                  |      }t        r| j                  t!        j"                  |g      j%                  d      }t'        j(                  |       t+        t-        || j.                  j0                  z  | j.                  j2                        t        | j                               }| j                  j5                  ||      \  }}t7        t9        |d	   |d	               }nt;        | j<                  t         j>                        rt        | j<                        d	kD  rt!        j@                  | j<                  |      }t+        t-        || j.                  j0                  z  | j.                  j2                        t        | j                               }t!        jB                  |      | d ddd
   }|D cg c]	  }||   |f }}ng }g }g }|D ]  \  }}|t        | j                         k\  r | j                   |   }|t        | jD                        k  r| jD                  |   ni }|t        | jF                        k  r| jF                  |   nd}d}g }|D ]7  }
|
d   j5                  |      s|j                  |
d          ||
d   rdndz  }9 t+        d|      }| jI                  |||      }| jK                  |||      }| jM                  |jO                  di       ||      }| j.                  jP                  tS        |      z  | j.                  jT                  |z  z   | j.                  jV                  |z  z   } | |z  } |j                  |d   |jY                         D !ci c]  \  }}!|dk7  s||! c}!}| tS        |      ||||d       |s|j                  |j[                                 |j]                  d d       |j]                  d d       g }"i }#|D ]z  }$|$d   jO                  dd      }%|#jO                  |%d	      | j.                  j^                  k  sB|"j                  |$       |#jO                  |%d	      dz   |#|%<   t        |"      k\  sz n t        |"      k  r|D ]{  }$|$d   jO                  dd      }%|#jO                  |%d	      | j.                  j^                  k  sB|"j                  |$       |#jO                  |%d	      dz   |#|%<   t        |"      |k\  sz |"S  |"S c c}
w c c}w c c}!}w )zHybrid vector + keyword searchr  r#   z\b)r4  r  	is_sportsr4  TNr  r   r          r  r  r'   r        ?rv  r  )r  metadatascorevector_scorekeyword_score
bm25_scoreintent_boostmatched_termsc                     | d   S Nr  rQ   xs    rD   <lambda>z$VectorStore.search.<locals>.<lambda>x  s
    aj rF   r  reversec                     | d   S r  rQ   r  s    rD   r  z$VectorStore.search.<locals>.<lambda>y  s
    7 rF   r  r  r1  rT   )0r  rB  rr  r  _build_query_profile_expand_query_for_retrievalr*  rV  compileescaper  SPORTS_TERMS_compute_idfr   r	  r  r  r  r  r
  r~  r  ra  r;   r1   r/   r  r  zipr  r  r  dotargsortr  r  _compute_bm25_compute_intent_boost_should_suppress_candidater@   r(   rO   r)   r*   r`  popsortr+   )&rC   r  r  query_lowerquery_wordsquery_profileexpanded_querykeyword_patternswordr4  r*  	idf_tablequery_embedding
oversamplescoresindicesvector_resultssimilaritiestop_indicesr  
candidatessuppressed_candidatesr  idxrX  re  r  r  r  r  r  suppress_candidatecombinedvr   source_countsr  r  s&                                         rD   r  zVectorStore.search  s   ~~Ikkmjj[911+{K99%O  	D4y1} '' !zz%"))D/*AE*I2==Y!%!5) 	 ! 	D{"tGW3X!AfI3X'X '' !zz%"))D/*AE*I2==Y!%) 	 %%&67	 --n=/ hh'89@@KO/A<<<dkk>_>_`DNN#J #jj//LOFG!#fQi"<=N $//2::63t;ORS;S!vvdooG DKK@@@$++BcBcd'
 !jj6
{|DTrTJ@K!L1<?A"6!L!L!# 
 "!/ .	?L#c$..))..%C03c$:K:K6L0Ld'',RTG<?#dF[F[B\<\d33C8bdO  MM% DY<&&7!((63!AkNSCMD
  ]3M ++C1A9MJ55c=/ZL!%!@!@,b1" 0053FF11MAB..;< 
 $Hy>.1iikLdaQ)^QTL! %l 3!.( ,!.	 	 "%,,Z^^-=>].	?b 	0$?""';T"J 	Az]&&x;F  +dkk.O.OOq!(5(9(9&!(Dq(Hf%w<1$	 w<!* :**8Y? $$VQ/$++2S2SSNN1%,9,=,=fa,H1,LM&)7|q( [ 4Y> "MX Ms   YY9YYr  r  c                     t        |      }t        fdt        D              t        fdt        D              t        fdt        D              t        |      dk  dS )zCClassify the query so ranking can prefer the right document family.c              3   &   K   | ]  }|v  
 y wr  rQ   r9  r4  r  s     rD   r;  z3VectorStore._build_query_profile.<locals>.<genexpr>  s      V!4 V   c              3   &   K   | ]  }|v  
 y wr  rQ   r  s     rD   r;  z3VectorStore._build_query_profile.<locals>.<genexpr>  s     !X$$+"5!Xr  c              3   &   K   | ]  }|v  
 y wr  rQ   r  s     rD   r;  z3VectorStore._build_query_profile.<locals>.<genexpr>  s     Qtt{2Qr  r%   )wants_bettingwants_baseballwants_props
is_generic)r  r.  rB  rC  rD  r*  )rC   r  r  query_termss    `  rD   r  z VectorStore._build_query_profile  sQ    +&  VAU VV!!XBW!XXQ?PQQk*a/	
 	
rF   rl  c                    t        |j                  dd            j                         }t        |j                  dd            j                         }|sy|dk(  r||j                         k(  S |dk(  r|j                         |v S |dk(  r(	 t	        j
                  ||t        j                        duS y# t        j                  $ r Y yw xY w)	z.Check whether a configured query rule matches.rn  containsr  r  Fro  rp  N)rP   r@   rB  r
  rr  r  r  rq  )rC   r  rl  rn  rr  s        rD   _query_rule_matcheszVectorStore._query_rule_matches  s    ,
;<BBD
$((7B/0668 +"3"3"555#$$&+55 yyk2==IQUUU  88 s   &B3 3C	C	c                 2   g }| j                   j                  di       j                         D ]g  \  }}t        |t              s|D ]L  }t        |      j                         j                         }|s-||v s2|j                  t        |              g i |S )z&Detect sports from configured aliases.r   )	r  r@   r`  r  r  rP   r
  rB  rV  )rC   r  rY   sportaliasesalias
alias_texts          rD   _detect_query_sportsz VectorStore._detect_query_sports  s    "3377LRRT 	NE7gt,   Z--/557
*";MM#e*-		 rF   r  c                    g }| j                   j                  di       }|j                  d      r!|j                  |j                  dg              |j                  d      r!|j                  |j                  dg              |j                  d      r!|j                  |j                  dg              | j                  |      }| j                   j                  di       }|D ]#  }|j                  |j                  |g              % | j                   j                  d	g       D ]G  }t	        |t
              s| j                  ||      s'|j                  |j                  d
g              I g }	t               }
|D ]E  }t        |      j                         }|s||
vs$|	j                  |       |
j                  |       G |	S )zLResolve named expansion groups from intent, sport, and explicit query rules.r   r  r   r  r   r  r   r   r   
add_groups)r  r@   r  r  r  r  r  r  rP   r
  rV  r  )rC   r  r  r  intent_defaultsdetected_sportsr   r  rl  dedupedseenr  
group_names                rD   _resolve_expansion_groupsz%VectorStore._resolve_expansion_groups  s   //334KRP_-MM/--i<=]+MM/--gr:;-.MM/--j"=>33K@#44889OQST$ 	?EMM.225"=>	? ))--mR@ 	:D$%$*B*B;PT*Udhh|R89	: u 	%EU))+Jj4z*$		%
 rF   c                    |j                         }| j                  ||      }| j                  j                  di       }g }|D ]Y  }|j                  |g       }t	        |t
              s&|D ]/  }	t        |	      j                         }
|
s|j                  |
       1 [ |s|S g }t               }|D ])  }||vs|j                  |       |j                  |       + | ddj                  |       S )zGAdd light retrieval hints so short queries pull the right corpus slice.r   r  )rB  r  r  r@   r  r  rP   r
  rV  r  r  rH  )rC   r  r  r  r  r   
expansionsr  valuesvaluer+  r  r  items                 rD   r  z'VectorStore._expand_query_for_retrieval  s    kkm//]K,,00D
 	,E!%%eR0Ffd+ ,5z'')%%d+,		, Lu 	D4t$	
 #((7+,--rF   c                    g }|j                  d      r|j                  d       |j                  d      r|j                  d       |j                  d      r|j                  d       |j                  d      rD|j                  d      r3|j                  d      r"|j                  d      r|j                  d       |S )	zCMap query intent to weight keys used by the curated ranking config.r  r   r  r   r  r   r  r   )r@   rV  )rC   r  keyss      rD   _query_weight_keyszVectorStore._query_weight_keys  s    _-KK	"-.KK
#]+KK l+!!/2!!"23!!-0KK+,rF   rv  c           	         d}| j                  |      }t        |j                  d      xs d      }| j                  j                  di       }| j                  j                  di       }|D ]u  }|t	        |j                  |i       j                  |d            z  }|j                  dg       D ]0  }	|t	        |j                  |	i       j                  |d            z  }2 w |S )z2Apply curated source tier/tag weights from config.r  rh  r   r   r   rg  )r  rP   r@   r  rO   )
rC   rv  r  bonusweight_keysrj  r   r   r  tags
             rD   _curated_source_bonusz!VectorStore._curated_source_bonus  s    --m<>%%n5BC**..~rB))--mR@ 	GCU<++D"599#sCDDE%))."= G{sB7;;CEFFG	G
 rF   r  c                 n   |t        | j                        k\  ry| j                  |   }|j                  di       }| j                  ||      }|j                  d      rT|j                  d      r|dz  }n
|d   r|dz  }|t	        |d   d	      d
z  z  }|j                  d      r|d   dk(  r|dz  }|j                  d      r6|j                  d      r|dz  }n
|d   r|dz  }|t	        |d   d	      d
z  z  }|j                  d      rS|t	        |d   d      dz  z  }|j                  d      r|dz  }|j                  d      s
|d   s|dz  }|d   dk(  r|dz  }|j                  d      r8|j                  d      r'|j                  d      r|j                  d      r|dz  }|j                  d      r_|j                  d      rN|j                  d      r=|j                  d      r|dz  }|j                  d      r|j                  d      s|dz  }|j                  d      r)||j                  d      s|j                  d      rdndz  }|d    r|dz  }|j                  d!d      dkD  r|dz  }|j                  d"      d#k\  s|j                  d$      d%k\  r|dz  }t        d&t	        d'|            S )(zPBoost or penalize candidates based on query intent and obvious low-value chunks.r  rv  r  r[  gQ?rt  r   r:  ry  gQ?r  r   r   r  r\  g{Gz?ru  r;  r  r<  r%   g{Gz?r_  r   r   r]  gQ?r^  rZ  gffffff?r   r=  rY  	copyrightr#   ztable of contentsrT   g      r'   )r*  r  r@   r  r  countra  )rC   r  r  r  re  rv  boosts          rD   r  z!VectorStore._compute_intent_boost  s   #d''((##C( %5r:**>=I_-!!"56)S0!4t;;E  .7>3Ja3O-.!!"67*+S115<<E]+S-q1D88E!!"56!%%&9:7?C[{#q( l+!!/2!!"23""#?@TME l+!!"23!!-0!!"56!!"78ASASTgAh34m//@MDUDUVcDdTkooE$%TME2C84?TME   -2o6K6KL_6`de6eTME4S%))rF   c                     |j                  d      sy|j                  d      }|j                  d      }|j                  d      }t        fdt        D              }|ry|r|ry|r|r|syy)zEHard-suppress low-value dump sources for short generic prop searches.rZ  Fr  r  r  c              3   &   K   | ]  }|v  
 y wr  rQ   r  s     rD   r;  z9VectorStore._should_suppress_candidate.<locals>.<genexpr>e  s     QDDK/Qr  T)r@   r.  HISTORY_INTENT_TERMS)rC   rv  r  r  r  r  r  wants_historys      `    rD   r  z&VectorStore._should_suppress_candidateX  ss     !!"78#''6"&&|4
%))/:Q<PQQ:=rF   c                    i }t        | j                        }|dk(  r|S |D ]  d   }||v r|| j                  v r| j                  |   }not        j                  d|      r,| j
                  j                  |d      }|| j                  |<   n-t        fd| j                  D              }|| j                  |<   t        j                  ||z
  dz   |dz   z  dz         }t        |d      ||<    |S )zCompute IDF for termsr   r4  z\w+c              3   L   K   | ]  }d    j                  |      sd  yw)r  rT   Nr8  )r9  r+  r*  s     rD   r;  z+VectorStore._compute_idf.<locals>.<genexpr>  s#     Zt)@S@STX@YZs   $$r'   r  r  )r*  r  r  rr  	fullmatchr  r@   rk  r  mathlogra  )rC   r+  r  
total_docsr4  dfidfr*  s          @rD   r  zVectorStore._compute_idfr  s    	(
? 	,AV9Dy t)))''-fd+((,,T15+-""4(Zt'<'<ZZ+-""4(((JOc1b3h?#EFC!#smIdO	," rF   r  c                    |r|t        | j                        k\  ry| j                  |   }|t        | j                        k  r| j                  |   nt        |j                               }| j                  xs |xs d}d}|D ]q  }t        |d   j                  |            }	|	dk(  r&|j                  |d   d      }
|	t        dz   z  }|	t        dt        z
  t        ||z  z  z   z  z   }||
||z  z  z  }s |S )z!Compute BM25 score for a documentr  rT   r  r   r4  r  )	r*  r  r  rs  r  r  r@   BM25_K1BM25_B)rC   r  r+  r  doc_textdoc_lenavg_lenbm25r*  tfr  	numeratordenominators                rD   r  zVectorStore._compute_bm25  s   3#d&;&;"<<((-+.T5E5E1F+F$""3'CPXP^P^P`La%%55A 	4AQy\))(34BQw--&	3/Cgm,Iw#,7WCT9U*UVVKC9{233D	4 rF   N)r,   )0rG   rH   rI   rJ   r   r  r   r   rP   r   r  r  r  r  r  r   r   r  r  r  r  r  r  r/  r  r  r  rc  r   rk  r  r{  r  r  rN   r  r  r  r  r  r  r  rO   r  r  r  r  r  rQ   rF   rD   r  r  f  s   :y 0t S#X ( c3h (j63 62:: 6E49 E Ed *6g6$ 3 BT TS	 Td T
d 
S 
TRUWZRZ^ 
:K4T#s(^8K3L KVZ[^`c[cVd K2,Dd38n9L4M ,RVWZ\`adfiai\jWjRk ,\9 9tCH~ 92 c3h8P "
c3h 
DcN 
FTDJ T4'# '4: '<CC CC Cd CJ
 
$s) 
PTUXZ^U^P_ 
s $sCx. T & S	 S cSWi ]abe]f <. .T#t)_ .Y\ .8S$Y DI $DcN SWX[]aXaSb gl A* A*T#t)_ A*_b A*gl A*FS#X CI 	
 
4T$Z De4D 4 T
 t PU rF   r  c            	       P    e Zd ZdZdefdZdedee   fdZ	dedededee   fd	Z
y
)
OCRServicezVOCR fallback for scanned PDFs using DeepSeek OCR (port 5003) and Tesseract (port 5002)r;   c                     || _         t        j                  j                  dd      | _        t        j                  j                  dd      | _        y )NDEEPSEEK_OCR_URLr2   TESSERACT_OCR_URLr4   )r;   r>   r?   r@   r3   r5   r3  s     rD   r  zOCRService.__init__  s;     "

/AC^ _!#0CE`!arF   r4  r  c                     t         sy| j                  | j                  |d      }|r|S | j                  | j                  |d      }|S )zCTry OCR extraction with DeepSeek (primary) and Tesseract (fallback)NDeepSeek	Tesseract)r&  _try_ocrr3   r5   )rC   r4  r+  s      rD   r   zOCRService.extract_text  sI     }}T22IzJK }}T33YLrF   urlservice_namec           	      <   	 t        d| d| d       t        |d      5 }d|j                  |dfi}t        j                  ||d      }d	d	d	       j
                  d
k(  r|j                         }|j                  d      r4|j                  d      r#t        d| dt        |d          d       |d   S |j                  d      r#t        d| dt        |d          d       |d   S y	# 1 sw Y   xY w# t        $ r}t        d| d|        Y d	}~y	d	}~ww xY w)zTry OCR servicez[RAG] Trying z OCR at ...rbfilezapplication/pdfiX  )filesr"  Nr#  successr+  z[RAG] z OCR extracted z charsz OCR failed: )
r  rJ  r  r'  r(  r)  r!  r@   r*  r,  )	rC   r  r4  r  rN  r  r-  r8   r.  s	            rD   r  zOCRService._try_ocr  s2   	;M,xuC@Ai& H!)..!5F!GH#==E3GH ##s*}}88I&488F+;F<.DL@Q?RRXYZ<'XXf%F<.DL@Q?RRXYZ<' H H  	;F<.aS9::	;s5   C7 *C+
A+C7 63C7 +C40C7 7	D DDN)rG   rH   rI   rJ   r   r  r   r   rP   r   r  rQ   rF   rD   r
  r
    sP    `by bd x} C D  QT rF   r
  c                       e Zd ZdZdefdZdedefdZdededefdZ	dd	ed
e
dee   fdZdededee   fdZd	edee   fdZdededefdZddee   dee   fdZy)EnrichmentServicez&Service for LLM-based chunk enrichmentr;   c                     || _         t               | _        t        t        j
                  j                  dd            | _        d| _        y )NENRICHMENT_BUDGETi?B r   )	r;   r  llmrN   r>   r?   r@   enrichment_budgetenrichment_usedr3  s     rD   r  zEnrichmentService.__init__  s7    <!$RZZ^^4G%P!Q rF   r  r  c                    |j                  dd      }|rt        |      dk  r|S | j                  ||      }| j                  | j                  k  r:| j                  ||      }|r&|j                  |       | xj                  dz  c_        |j                         }|j                  dd      |d<   |j                  dg       |d<   |j                  dg       |d<   d|d	<   |S )
z1Enrich a single chunk with LLM-generated metadatar  r  2   rT   r#  r$  r%  Tenriched)r@   r*  _generate_basic_cardr   r  _enhance_with_llmr  copy)rC   r  r  cardenhancedenriched_chunks         rD   enrich_chunkzEnrichmentService.enrich_chunk  s    ))Ir*#g,+L ((%8 $"8"88--gt<HH%$$)$ $(HHY$;y!'+xxb'A|$#'88Hb#9x %)z" rF   r  c                    t        j                  d|      }|D cg c]/  }t        |j                               dkD  s |j                         1 }}|rdj	                  |dd       dd n|dd }|dd D cg c]  }|dd 	 }}| j                  |      }||||j                  d	d
      |j                  dd      dS c c}w c c}w )z)Generate basic knowledge card without LLMz(?<=[.!?])\s+r,   r  Nr#   r   ry     r  r1  r  r   )r#  r$  r%  r  r  )rr  rs  r*  r
  rH  _extract_themesr@   )rC   r  r  ru  rv  r#  r$  r%  s           rD   r$  z&EnrichmentService._generate_basic_card  s     HH-w7	(1I1S^b5HQWWYI	I 4=#((9Ra=)$3/'$3- (1!}5!ag5
5 %%g. $ii)4		*a0
 	
 J 6s   !B=B=6Cr+  
max_themesc                    t        j                  d|j                               }|sg S h d}i }|D ].  }||vst        |      dk\  s|j	                  |d      dz   ||<   0 t        |j                         d d      }|d	| D cg c]  \  }}|d
k\  s| }	}}t        D ];  }
|
|j                         v s|
|	vs|	j                  |
       t        |	      |k\  s; n |	d	| S c c}}w )z'Extract themes using frequency analysisz\b[a-zA-Z]{4,}\b>%   alsobeeneachevenfromhaveintojustlikemoremostonlysomethanthatthemtheythisverywerewhatwhenwillwithyouraboutbeingcouldothertheirtherewherewhichwouldshouldbecausethroughry  r   rT   c                     | d   S )NrT   rQ   r  s    rD   r  z3EnrichmentService._extract_themes.<locals>.<lambda>+  s
    1 rF   Tr  Nr#   )	rr  r  rB  r*  r@   sortedr`  r  rV  )rC   r+  r.  r  
stop_wordsword_countsr  sorted_wordsr  r%  r4  s              rD   r-  z!EnrichmentService._extract_themes  s    

.

=I

  	AD:%#d)q.$/OOD!$<q$@D!	A
 k//1~tT*6{
*CR;4uPQz$RR ! 	Dtzz|#F(:d#v;*,		 kz"" Ss   C%C%	base_cardc                     d|dd  d|d    d}	 | j                   j                  |dd	      }|r| j                  |      S 	 y# t        $ r}t	        d
|        Y d}~yd}~ww xY w)zEnhance card with LLMzsYou are refining a knowledge card for a sports betting document. Extract the most important information.

EXCERPT:
NrR   z

CURRENT SUMMARY:
r#  a   

Provide an improved knowledge card in this exact format:
Summary: <1-2 sentences capturing the main point about sports betting>
Key Points:
- <concise bullet 1>
- <concise bullet 2>
- <concise bullet 3>
Themes: theme1, theme2, theme3

Begin the card now:r#  r  r  r  z[RAG] LLM enhancement error: )r  r  _parse_llm_responser,  r  )rC   r  rZ  r  r-  r.  s         rD   r%  z#EnrichmentService._enhance_with_llm7  s     	#  9  
$	7xx((CS(QH//99 
   	71!566	7s   0A 	A%A  A%c                     |syi }t        j                  d|t         j                        }|r%|j                  d      j	                         dd |d<   g }t        j                  d|t         j                        }|r|j                  d      j	                         j                  d      }|D ]\  }|j	                         }|j                  d      r|dd j	                         }|s:t        |      d	kD  sI|j                  |dd
        ^ |r|dd	 |d<   t        j                  d|      }|rT|j                  d      j                  d      D 	cg c]  }	|	j	                          }
}	|
D 	cg c]  }	|	s|		 c}	dd	 |d<   |r|S dS c c}	w c c}	w )z%Parse LLM output into structured cardNz)Summary:\s*(.+?)(?=Key Points:|Themes:|$)rT   r   r#  z Key Points:\s*(.*?)(?=Themes:|$)r7  r"  r  r,  r$  zThemes:\s*(.+),r%  )	rr  r  DOTALLr  r
  rs  
startswithr*  rV  )rC   r+  resultsummary_matchr$  kp_matchr   rg   themes_matchr  r%  s              rD   r]  z%EnrichmentService._parse_llm_responseT  s|    		"NPTVXV_V_` - 3 3A 6 < < >t DF9 
99@$		RNN1%++-33D9E 2zz|??3'8>>+DCIM%%d4Cj12 #-bq>F<  yy!2D9)5););A)>)D)DS)IJAaggiJFJ+17aQ7;F8v)T) K7s   F+F3Foriginalr'  c                    d|j                  dd       d|j                  dd       dd|j                  d	d
       g}|j                  d      r/|j                  d       |d   D ]  }|j                  d|         |j                  d      r&|j                  ddj                  |d                 |j                  d|        dj                  |      S )z$Compose enriched content for storagez	[Source: r  Unknownz	 | Chunk r  ?]z	Summary: r#  r  r$  zKey Points:z- r%  zThemes: z, z
Original Content:
r7  )r@   rV  rH  )rC   rf  r'  r   points        rD   _compose_enriched_contentz+EnrichmentService._compose_enriched_contentv  s     956iUX@Y?ZZ[\B/01

 88L!LL'l+ +r%\*+ 88HLL8DIId8n$=#>?@,XJ78yyrF   Nr  c           
         g }t        |      }t        |      D ]c  \  }}| j                  |      }|j                  |       |r ||dz   |       |dz   dz  dk(  sBt	        d|dz    d| d| j
                   d       e |S )z(Enrich all chunks with progress trackingrT   r,   r   z[RAG] Enriched /z chunks (LLM used: ))r*  r}  r*  rV  r  r   )rC   r  progress_callbackr#  rh   r  r  r)  s           rD   enrich_all_chunksz#EnrichmentService.enrich_all_chunks  s    F!&) 	cHAu!..u5NOON+ !!a%/A|q Awaw6I$J^J^I__`ab	c rF   )r  r  )rG   rH   rI   rJ   r   r  r   r*  rP   r$  rN   r   r-  r   r%  r]  rl  rq  rQ   rF   rD   r  r    s    0!y !$ 4 <
C 
 
 
. #C  #S  #c  #D  (4. : *  *  *D #  T  c  &T
 tTXz rF   r  i  @MAX_CONTENT_LENGTHidler  ENRICHMENT_CHECKPOINT_EVERYr,   )
statusprogressr%  llm_used
updated_at
started_atr  total_targetdone_targetcheckpoint_everyzenrichment_status.jsonc                  N   	 t         j                  t         j                  dz         } t        | dd      5 }t	        j
                  t        |dd       ddd       t        j                  | t                y# 1 sw Y   $xY w# t        $ r}t        d	|        Y d}~yd}~ww xY w)
z4Persist enrichment status for resume after restarts.rx  r  r>  r?  Fr#   r  Nz+[RAG] Failed to persist enrichment status: )ENRICHMENT_STATUS_FILEry  rA  rJ  r!  r  enrichment_statusr>   r'  r,  r  )rz  rN  r.  s      rD   _save_enrichment_status_to_diskr    s    A)556L6S6SV\6\](C'2 	JaII'qI	J


834	J 	J  A;A3?@@As.   4B A7"B 7B <B 	B$BB$c            	      P   	 t         j                         rUt        t         dd      5 } t        j                  |       }ddd       t        t              rt        j                  |       t        j                  dd       t        j                  dd	       t        j                  d
d       t        j                  dd	       t        j                  dd       t        j                  dd       t        j                  dd       t        j                  dd	       t        j                  dd	       t        j                  dt        t        j                  j                  dd                   t        j                  d      dk(  rodt        d<   t        j                  d
      xs dt        d
<   t!        j"                  t$        j&                        j)                         dz   t        d<   t+                yy# 1 sw Y   xY w# t        $ r}t        d|        Y d}~d}~ww xY w)zRRestore enrichment status on startup (and mark stale running jobs as interrupted).r=  r>  r?  Nz([RAG] Failed to load enrichment status: ru  rs  rv  r   r%  r  rw  rx  ry  r  rz  r{  r|  rt  r,   runninginterrupted4Previous enrichment was interrupted. You can resume.Z)r~  r  rJ  r!  r  r  r  r  r  r,  r  
setdefaultrN   r>   r?   r@   r   nowr   utc	isoformatr  )rN  r8   r.  s      rD   !_load_enrichment_status_from_diskr    s   >!((*,cGD $yy|$$%!((.
   62  Q/  B/  Q/  t4  t4  40  3  2  !3SHegi9j5kl X&)3&3(#!!), FE 	)$ +3,,x||*D*N*N*PSV*V,'') 4)$ $  >8<==>s-   &H G6-H 6H ;H 	H%H  H%FLASK_API_KEYzeventheodds-flask-api-key-2025c                 .     t                fd       }|S )Nc                      t         j                  j                  dd      }|t        k7  rt	        ddi      dfS  | i |S )Nz	X-API-Keyr  rq  zInvalid API keyi  )r   r   r@   r  r   )argskwargsapi_keyrN  s      rD   	decoratedz"require_api_key.<locals>.decorated  sG    //%%k26m#G%678#==$!&!!rF   r   )rN  r  s   ` rD   require_api_keyr    s!    
1X" "
 rF   z9[RAG] Initializing Advanced Sports Betting RAG Service...job_idru  rv  r%  c                 ~    t         5  |||t        j                         d|t        | <   ddd       y# 1 sw Y   yxY w)zUpdate upload progress)ru  rv  r%  updatedN)progress_locktimeupload_progress)r  ru  rv  r%  extras        rD   update_progressr    sB    	 
 yy{	#

 #

 
 
s   #3<r4  rY  c                     	 t        | ddd       t        j                  |      }|rt        |j	                               dk  r#t        | ddd       t
        j                  |      }|rt        |j	                               dk  rt        | ddd	       y
t        | dddt        |       d       t        j                  ||      }t        j                  ||      }|st        | ddd       y
t        |      D ]"  \  }}||d<   ||d<   t        |      |d<   ||d<   $ t        | dddt        |       d       t        j                  ||       t        | ddd       t        j                          t        | dddt        |       dt        |      |t        t        j                               y
# t        $ rF}t        d|        dd
l}	|	j!                          t        | dddt#        |              Y d
}~y
d
}~ww xY w)z#Process uploaded file in background
processingr,   z Extracting text from document...r     z%Text extraction failed, trying OCR...failedr   z$Failed to extract text from documentN(   z
Extracted z characters. Chunking...z'No valid chunks extracted from documentr  r  r  r  <   zCreated z! chunks. Generating embeddings...Z   Saving index...	completedzSuccessfully processed! Added z chunks.)chunks_addedr  total_documentsz[RAG] Error processing file: Error: )r  pdf_processorr   r*  r
  ocr_servicerp  r  r}  vector_storer  r  r  r,  r  	traceback	print_excrP   )
r  r4  rY  r+  r  r  r  r  r.  r  s
             rD   process_file_backgroundr    s   6Ab2TU )))4 s4::<(3.FL"6]^++I6Ds4::<(3.FHa1WXbJs4ykIa2bc !55dHE ))$9FHa1Z[ "&) 	.HAu&E(O !E*$'KE.!%-E/"		. 	bHS[MIj2kl 	##Hf5b2CDK,S[MBV" 6 67	
  A-aS12!ws1vh-?@@	As&   BF. AF. +CF. .	G=7<G88G=rn  c                       y)z@Serve upload interface with chunk viewer and enrichment controlsug  <!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Sports Betting RAG - Document Upload</title>
    <style>
        * { box-sizing: border-box; margin: 0; padding: 0; }
        body { font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif; background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%); min-height: 100vh; padding: 20px; color: #e4e4e7; }
        .container { max-width: 1000px; margin: 0 auto; }
        h1 { color: #fbbf24; margin-bottom: 10px; font-size: 2rem; }
        .subtitle { color: #9ca3af; margin-bottom: 30px; }
        .card { background: rgba(255,255,255,0.05); border-radius: 12px; padding: 24px; margin-bottom: 20px; border: 1px solid rgba(255,255,255,0.1); }
        .card h2 { color: #fbbf24; font-size: 1.25rem; margin-bottom: 16px; }
        .stats { display: grid; grid-template-columns: repeat(auto-fit, minmax(120px, 1fr)); gap: 16px; }
        .stat { background: rgba(251,191,36,0.1); padding: 16px; border-radius: 8px; text-align: center; }
        .stat-value { font-size: 1.75rem; font-weight: bold; color: #fbbf24; }
        .stat-label { color: #9ca3af; font-size: 0.75rem; }
        form { display: flex; flex-direction: column; gap: 16px; }
        input[type="text"], input[type="file"], select { padding: 12px; border-radius: 8px; border: 1px solid rgba(255,255,255,0.2); background: rgba(0,0,0,0.2); color: white; font-size: 1rem; }
        input[type="file"] { cursor: pointer; }
        button { background: #fbbf24; color: #1a1a2e; padding: 12px 20px; border: none; border-radius: 8px; font-weight: 600; font-size: 0.9rem; cursor: pointer; transition: all 0.2s; }
        button:hover { background: #f59e0b; transform: translateY(-1px); }
        button:disabled { opacity: 0.5; cursor: not-allowed; transform: none; }
        button.secondary { background: #6366f1; color: white; }
        button.secondary:hover { background: #4f46e5; }
        button.danger { background: #ef4444; color: white; }
        .btn-group { display: flex; gap: 10px; flex-wrap: wrap; }
        .progress-container { margin-top: 20px; }
        .progress-bar { height: 8px; background: rgba(255,255,255,0.1); border-radius: 4px; overflow: hidden; }
        .progress-fill { height: 100%; background: linear-gradient(90deg, #fbbf24, #f59e0b); transition: width 0.3s; }
        .progress-text { margin-top: 8px; color: #9ca3af; font-size: 0.875rem; }
        .result { padding: 16px; border-radius: 8px; margin-top: 16px; }
        .result.success { background: rgba(34,197,94,0.2); border: 1px solid rgba(34,197,94,0.3); }
        .result.error { background: rgba(239,68,68,0.2); border: 1px solid rgba(239,68,68,0.3); }
        .result.info { background: rgba(59,130,246,0.2); border: 1px solid rgba(59,130,246,0.3); }
        .tabs { display: flex; gap: 10px; margin-bottom: 20px; border-bottom: 1px solid rgba(255,255,255,0.1); padding-bottom: 10px; }
        .tab { padding: 8px 16px; cursor: pointer; border-radius: 6px; color: #9ca3af; }
        .tab.active { background: rgba(251,191,36,0.2); color: #fbbf24; }
        .tab:hover { background: rgba(255,255,255,0.05); }
        .chunk-list { max-height: 500px; overflow-y: auto; }
        .chunk-item { background: rgba(0,0,0,0.2); padding: 16px; border-radius: 8px; margin-bottom: 12px; border-left: 4px solid #6366f1; }
        .chunk-item.enriched { border-left-color: #22c55e; }
        .chunk-header { display: flex; justify-content: space-between; align-items: center; margin-bottom: 8px; }
        .chunk-source { color: #fbbf24; font-weight: 600; font-size: 0.875rem; }
        .chunk-badge { padding: 2px 8px; border-radius: 4px; font-size: 0.75rem; }
        .chunk-badge.enriched { background: rgba(34,197,94,0.3); color: #22c55e; }
        .chunk-badge.raw { background: rgba(156,163,175,0.3); color: #9ca3af; }
        .chunk-summary { color: #e4e4e7; margin-bottom: 8px; font-size: 0.9rem; }
        .chunk-themes { display: flex; gap: 6px; flex-wrap: wrap; margin-bottom: 8px; }
        .theme-tag { background: rgba(99,102,241,0.3); color: #a5b4fc; padding: 2px 8px; border-radius: 4px; font-size: 0.75rem; }
        .chunk-content { color: #9ca3af; font-size: 0.8rem; background: rgba(0,0,0,0.2); padding: 10px; border-radius: 4px; white-space: pre-wrap; max-height: 150px; overflow-y: auto; }
        .pagination { display: flex; justify-content: center; gap: 10px; margin-top: 16px; }
        .hidden { display: none !important; }
    </style>
</head>
<body>
    <div class="container">
        <h1>🏆 Sports Betting RAG</h1>
        <p class="subtitle">Advanced Document Processing with FAISS + BM25 + DeepSeek LLM Enrichment</p>
        
        <div class="tabs">
            <div class="tab active" onclick="showTab('status')">📊 Status</div>
            <div class="tab" onclick="showTab('upload')">📤 Upload</div>
            <div class="tab" onclick="showTab('chunks')">📄 View Chunks</div>
            <div class="tab" onclick="showTab('enrich')">✨ Enrichment</div>
        </div>
        
        <!-- Status Tab -->
        <div id="tab-status" class="card">
            <h2>📊 System Status</h2>
            <div class="stats" id="stats">
                <div class="stat"><div class="stat-value" id="docCount">-</div><div class="stat-label">Total Chunks</div></div>
                <div class="stat"><div class="stat-value" id="enrichedCount">-</div><div class="stat-label">Enriched</div></div>
                <div class="stat"><div class="stat-value" id="rawCount">-</div><div class="stat-label">Raw</div></div>
                <div class="stat"><div class="stat-value" id="fileCount">-</div><div class="stat-label">Files</div></div>
                <div class="stat"><div class="stat-value" id="searchType">-</div><div class="stat-label">Search</div></div>
            </div>
        </div>
        
        <!-- Upload Tab -->
        <div id="tab-upload" class="card hidden">
            <h2>📤 Upload Documents</h2>
            <form id="uploadForm">
                <input type="text" id="apiKey" placeholder="API Key" value="eventheodds-flask-api-key-2025">
                <div id="dropZone" style="border:2px dashed #4a5568; border-radius:12px; padding:40px 20px; text-align:center; cursor:pointer; margin-bottom:12px; transition:border-color 0.2s, background 0.2s;">
                    <div style="font-size:2em; margin-bottom:8px;">📂</div>
                    <p style="color:#9ca3af; margin:0;">Drag & drop PDF/TXT files here or click to browse</p>
                    <p style="color:#6b7280; font-size:0.85em; margin:4px 0 0;">Up to 20 files at once — 100MB max each</p>
                    <input type="file" id="fileInput" accept=".pdf,.txt" multiple style="display:none;">
                </div>
                <div id="fileList" style="margin-bottom:12px;"></div>
                <div class="btn-group">
                    <button type="submit" id="submitBtn">Upload & Process All</button>
                    <button type="button" class="secondary" onclick="uploadWithEnrich()">Upload + Enrich All</button>
                </div>
            </form>
            <div id="progressArea" style="margin-top:12px;"></div>
            <div id="result"></div>
        </div>
        
        <!-- Chunks Tab -->
        <div id="tab-chunks" class="card hidden">
            <h2>📄 View Chunks</h2>
            <div style="display:flex; gap:10px; margin-bottom:16px; flex-wrap:wrap;">
                <select id="chunkFilter" onchange="loadChunks()">
                    <option value="all">All Chunks</option>
                    <option value="enriched">Enriched Only</option>
                    <option value="raw">Raw Only</option>
                </select>
                <select id="sourceFilter" onchange="loadChunks()">
                    <option value="">All Sources</option>
                </select>
                <button class="secondary" onclick="loadChunks()">🔄 Refresh</button>
            </div>
            <div class="chunk-list" id="chunkList">Loading...</div>
            <div class="pagination" id="pagination"></div>
        </div>
        
        <!-- Enrichment Tab -->
        <div id="tab-enrich" class="card hidden">
            <h2>✨ LLM Enrichment</h2>
            <p style="color:#9ca3af; margin-bottom:16px;">Use DeepSeek LLM to generate summaries, key points, and themes for all chunks.</p>
            <div class="btn-group">
                <button onclick="startEnrichment()">🚀 Enrich All Raw Chunks</button>
                <button class="secondary" onclick="resumeEnrichment()">▶️ Resume</button>
                <button class="secondary" onclick="checkEnrichmentStatus()">🔄 Check Status</button>
            </div>
            <div class="progress-container" id="enrichProgress" style="display:none;">
                <div class="progress-bar"><div class="progress-fill" id="enrichProgressFill" style="width:0%"></div></div>
                <div class="progress-text" id="enrichProgressText">Starting...</div>
            </div>
            <div id="enrichResult"></div>
        </div>
    </div>
    
    <script>
        const BASE_URL = window.location.pathname.endsWith('/') ? window.location.pathname.slice(0,-1) : window.location.pathname;
        let currentPage = 1;
        
        function showTab(tabName) {
            document.querySelectorAll('.card').forEach(c => c.classList.add('hidden'));
            document.querySelectorAll('.tab').forEach(t => t.classList.remove('active'));
            document.getElementById('tab-' + tabName).classList.remove('hidden');
            event.target.classList.add('active');
            
            if (tabName === 'chunks') loadChunks();
            if (tabName === 'status') loadStatus();
            if (tabName === 'enrich') checkEnrichmentStatus();
        }
        
        async function loadStatus() {
            try {
                const resp = await fetch(BASE_URL + '/documents', {cache:'no-store'});
                const data = await resp.json();
                document.getElementById('docCount').textContent = data.total_chunks || 0;
                document.getElementById('enrichedCount').textContent = data.total_enriched || 0;
                document.getElementById('rawCount').textContent = (data.total_chunks || 0) - (data.total_enriched || 0);
                document.getElementById('fileCount').textContent = data.total_sources || 0;
                
                // Update source filter
                const sourceSelect = document.getElementById('sourceFilter');
                sourceSelect.innerHTML = '<option value="">All Sources</option>';
                (data.documents || []).forEach(d => {
                    sourceSelect.innerHTML += `<option value="${d.source}">${d.source} (${d.chunk_count})</option>`;
                });
                
                // Get search type
                const statusResp = await fetch(BASE_URL + '/status', {cache:'no-store'});
                const statusData = await statusResp.json();
                document.getElementById('searchType').textContent = statusData.has_faiss ? 'FAISS' : 'Cosine';
            } catch(e) {
                console.error('Status load failed:', e);
            }
        }
        
        async function loadChunks() {
            const filter = document.getElementById('chunkFilter').value;
            const source = document.getElementById('sourceFilter').value;
            const chunkList = document.getElementById('chunkList');
            chunkList.innerHTML = 'Loading...';
            
            let url = BASE_URL + '/chunks?page=' + currentPage + '&per_page=10';
            if (filter === 'enriched') url += '&enriched=true';
            if (filter === 'raw') url += '&raw=true';
            if (source) url += '&source=' + encodeURIComponent(source);
            
            try {
                const resp = await fetch(url, {cache:'no-store'});
                const data = await resp.json();
                
                if (!data.chunks || data.chunks.length === 0) {
                    chunkList.innerHTML = '<p style="color:#9ca3af;">No chunks found</p>';
                    return;
                }
                
                chunkList.innerHTML = data.chunks.map(c => `
                    <div class="chunk-item ${c.enriched ? 'enriched' : ''}">
                        <div class="chunk-header">
                            <span class="chunk-source">${c.source} #${c.chunk_id}</span>
                            <span class="chunk-badge ${c.enriched ? 'enriched' : 'raw'}">${c.enriched ? '✓ Enriched' : 'Raw'}</span>
                        </div>
                        ${c.summary ? `<div class="chunk-summary"><strong>Summary:</strong> ${c.summary}</div>` : ''}
                        ${c.themes && c.themes.length ? `<div class="chunk-themes">${c.themes.map(t => `<span class="theme-tag">${t}</span>`).join('')}</div>` : ''}
                        <div class="chunk-content">${c.content_preview}</div>
                    </div>
                `).join('');
                
                // Pagination
                const pag = data.pagination;
                document.getElementById('pagination').innerHTML = `
                    <button ${pag.page <= 1 ? 'disabled' : ''} onclick="currentPage=${pag.page-1};loadChunks()">← Prev</button>
                    <span style="color:#9ca3af;">Page ${pag.page} of ${pag.pages} (${pag.total} chunks)</span>
                    <button ${pag.page >= pag.pages ? 'disabled' : ''} onclick="currentPage=${pag.page+1};loadChunks()">Next →</button>
                `;
            } catch(e) {
                chunkList.innerHTML = '<p style="color:#ef4444;">Error loading chunks</p>';
            }
        }
        
        async function startEnrichment() {
            const apiKey = document.getElementById('apiKey').value.trim() || 'eventheodds-flask-api-key-2025';
            document.getElementById('enrichProgress').style.display = 'block';
            document.getElementById('enrichResult').innerHTML = '';
            
            try {
                const resp = await fetch(BASE_URL + '/enrich', {
                    method: 'POST',
                    headers: {'X-API-Key': apiKey, 'Content-Type': 'application/json'}
                });
                const data = await resp.json();
                
                if (data.error) {
                    document.getElementById('enrichResult').innerHTML = `<div class="result error">❌ ${data.error}</div>`;
                } else {
                    pollEnrichment();
                }
            } catch(e) {
                document.getElementById('enrichResult').innerHTML = `<div class="result error">❌ Error: ${e.message}</div>`;
            }
        }

        async function resumeEnrichment() {
            // Resume is the same API call as start; the server will skip already-enriched chunks.
            await startEnrichment();
        }
        
        async function pollEnrichment() {
            const poll = async () => {
                const resp = await fetch(BASE_URL + '/enrichment-status', {cache:'no-store'});
                const data = await resp.json();
                
                document.getElementById('enrichProgressFill').style.width = data.progress + '%';
                document.getElementById('enrichProgressText').textContent = data.message || 'Processing...';
                
                if (data.status === 'completed') {
                    document.getElementById('enrichResult').innerHTML = `<div class="result success">✅ ${data.message}</div>`;
                    loadStatus();
                    return;
                } else if (data.status === 'failed') {
                    document.getElementById('enrichResult').innerHTML = `<div class="result error">❌ ${data.message}</div>`;
                    return;
                } else if (data.status === 'interrupted') {
                    document.getElementById('enrichResult').innerHTML =
                      `<div class="result info">⏸️ Interrupted. Progress saved. Click <strong>Resume</strong> to continue.</div>`;
                    return;
                } else if (data.status === 'idle') {
                    document.getElementById('enrichProgress').style.display = 'none';
                    return;
                }
                
                setTimeout(poll, 1000);
            };
            poll();
        }
        
        async function checkEnrichmentStatus() {
            const resp = await fetch(BASE_URL + '/enrichment-status', {cache:'no-store'});
            const data = await resp.json();
            
            if (data.status === 'running' || data.status === 'interrupted') {
                document.getElementById('enrichProgress').style.display = 'block';
                document.getElementById('enrichProgressFill').style.width = data.progress + '%';
                document.getElementById('enrichProgressText').textContent = data.message;
                if (data.status === 'running') {
                  pollEnrichment();
                } else {
                  document.getElementById('enrichResult').innerHTML =
                    `<div class="result info">⏸️ Interrupted. Progress saved. Click <strong>Resume</strong> to continue. (LLM calls: ${data.llm_used || 0})</div>`;
                }
            } else {
                document.getElementById('enrichProgress').style.display = 'none';
                document.getElementById('enrichResult').innerHTML = `<div class="result info">Status: ${data.status} | LLM calls: ${data.llm_used || 0}</div>`;
            }
        }
        
        // --- Multi-file upload system ---
        let selectedFiles = [];
        const dropZone = document.getElementById('dropZone');
        const fileInput = document.getElementById('fileInput');

        dropZone.addEventListener('click', () => fileInput.click());
        dropZone.addEventListener('dragover', e => { e.preventDefault(); dropZone.style.borderColor = '#6366f1'; dropZone.style.background = 'rgba(99,102,241,0.05)'; });
        dropZone.addEventListener('dragleave', () => { dropZone.style.borderColor = '#4a5568'; dropZone.style.background = 'none'; });
        dropZone.addEventListener('drop', e => {
            e.preventDefault();
            dropZone.style.borderColor = '#4a5568'; dropZone.style.background = 'none';
            addFiles(e.dataTransfer.files);
        });
        fileInput.addEventListener('change', e => addFiles(e.target.files));

        /**
         * Add files to the pending upload selection and re-render the list.
         *
         * A file is skipped when its extension is not .pdf/.txt (case-insensitive)
         * or when a file with the same name is already selected. Adding stops once
         * the selection holds 20 files.
         *
         * @param {Iterable<File>} fileList - files from a drop event or file input;
         *     a null/undefined value is treated as an empty list.
         */
        function addFiles(fileList) {
            const validExts = ['.pdf', '.txt'];
            const MAX_FILES = 20;
            for (const f of fileList || []) {
                // Take the extension from the last dot. A name without a dot, or
                // with only a leading dot, has no extension; the previous
                // split('.').pop() approach wrongly accepted a file named "pdf".
                const dot = f.name.lastIndexOf('.');
                const ext = dot > 0 ? f.name.slice(dot).toLowerCase() : '';
                if (!validExts.includes(ext)) continue;
                if (selectedFiles.some(s => s.name === f.name)) continue;
                if (selectedFiles.length >= MAX_FILES) break;
                selectedFiles.push(f);
            }
            renderFileList();
        }

        /**
         * Drop one entry from the pending selection and refresh the list.
         *
         * @param {number} idx - position of the file in selectedFiles.
         */
        function removeFile(idx) {
            const removeCount = 1;
            selectedFiles.splice(idx, removeCount);
            renderFileList();
        }

        /**
         * Re-render the #fileList element from selectedFiles.
         *
         * Shows a count header followed by one row per file (name, size in MB and
         * a remove button). An empty selection clears the element.
         *
         * Rows are built with DOM nodes and textContent rather than an HTML
         * string, so a file name containing markup characters is displayed
         * literally instead of being parsed as HTML.
         */
        function renderFileList() {
            const el = document.getElementById('fileList');
            el.innerHTML = '';
            if (!selectedFiles.length) return;

            const header = document.createElement('div');
            header.style.cssText = 'font-size:0.9em; color:#9ca3af; margin-bottom:6px;';
            const count = document.createElement('strong');
            count.textContent = selectedFiles.length + ' file(s) selected';
            header.appendChild(count);
            el.appendChild(header);

            selectedFiles.forEach((f, i) => {
                const row = document.createElement('div');
                row.style.cssText = 'display:flex; justify-content:space-between; align-items:center; padding:4px 8px; background:#1e293b; border-radius:6px; margin-bottom:4px; font-size:0.85em;';

                const label = document.createElement('span');
                label.appendChild(document.createTextNode('📄 ' + f.name + ' '));
                const size = document.createElement('span');
                size.style.color = '#6b7280';
                size.textContent = '(' + (f.size / 1024 / 1024).toFixed(1) + ' MB)';
                label.appendChild(size);

                const removeBtn = document.createElement('button');
                // Explicit type: a bare <button> defaults to "submit", which would
                // submit an enclosing form if this list is rendered inside one.
                removeBtn.type = 'button';
                removeBtn.style.cssText = 'background:none; border:none; color:#ef4444; cursor:pointer; font-size:1.1em;';
                removeBtn.textContent = '✕';
                removeBtn.addEventListener('click', () => removeFile(i));

                row.appendChild(label);
                row.appendChild(removeBtn);
                el.appendChild(row);
            });
        }

        /**
         * Upload every selected file to `endpoint`, at most four at a time, and
         * show one progress row per file in #progressArea.
         *
         * Each file is POSTed as multipart form data with an X-API-Key header.
         * When the response carries a job_id, pollJobProgress() takes over that
         * file's row; otherwise the row is marked as an error. Once every upload
         * request has settled, the submit button is re-enabled, the selection is
         * cleared and loadStatus() is called.
         *
         * @param {string} endpoint - path appended to BASE_URL, e.g. '/upload'.
         * @returns {Promise<void>}
         */
        async function uploadBatch(endpoint) {
            const resultEl = document.getElementById('result');
            if (!selectedFiles.length) {
                resultEl.innerHTML = '<div class="result error">Please select at least one file</div>';
                return;
            }
            // NOTE(security): this fallback key is delivered to every browser that
            // loads the page, so it is not a secret. Kept so uploads still work when
            // the field is empty; consider requiring the user to supply a key.
            const apiKey = document.getElementById('apiKey').value.trim() || 'eventheodds-flask-api-key-2025';
            const submitBtn = document.getElementById('submitBtn');
            submitBtn.disabled = true;
            resultEl.innerHTML = '';
            const progressArea = document.getElementById('progressArea');
            progressArea.innerHTML = '';

            const MAX_PARALLEL = 4;

            // Pair every file with a DOM id. The index is part of the id because the
            // sanitised name alone is not unique ("a b.pdf" and "a_b.pdf" would
            // otherwise share one progress row).
            const queue = selectedFiles.map((file, index) => {
                const id = 'prog_' + index + '_' + file.name.replace(/[^a-zA-Z0-9]/g, '_');
                return {file: file, id: id};
            });

            // Build one progress row. `id` only contains [a-zA-Z0-9_], so it is safe
            // inside the template; the file name is set through textContent so that
            // markup characters in it are shown literally.
            function makeProgressBar(filename, id) {
                const row = document.createElement('div');
                row.id = id;
                row.style.cssText = 'margin-bottom:8px;';
                row.innerHTML = `<div style="display:flex; justify-content:space-between; font-size:0.85em; margin-bottom:2px;">
                        <span data-role="name"></span><span id="${id}_pct">uploading...</span>
                    </div>
                    <div style="background:#1e293b; border-radius:6px; height:20px; overflow:hidden;">
                        <div id="${id}_bar" style="height:100%; background:linear-gradient(90deg,#6366f1,#818cf8); width:0%; transition:width 0.3s; border-radius:6px;"></div>
                    </div>
                    <div id="${id}_msg" style="font-size:0.8em; color:#9ca3af; margin-top:2px;">Queued...</div>`;
                row.querySelector('[data-role="name"]').textContent = '📄 ' + filename;
                return row;
            }

            // Create all progress bars upfront
            for (const item of queue) {
                progressArea.appendChild(makeProgressBar(item.file.name, item.id));
            }

            // Upload one file. Never rejects: every failure is shown in the row.
            async function uploadOne(file, id) {
                const pctEl = document.getElementById(id + '_pct');
                const barEl = document.getElementById(id + '_bar');
                const msgEl = document.getElementById(id + '_msg');

                const showError = (message) => {
                    if (pctEl) pctEl.textContent = 'ERROR';
                    if (barEl) { barEl.style.width = '100%'; barEl.style.background = '#ef4444'; }
                    if (msgEl) msgEl.textContent = message;
                };

                try {
                    const formData = new FormData();
                    formData.append('file', file);
                    if (pctEl) pctEl.textContent = 'uploading...';
                    if (msgEl) msgEl.textContent = 'Uploading file...';

                    const resp = await fetch(BASE_URL + endpoint, {
                        method: 'POST',
                        headers: {'X-API-Key': apiKey},
                        body: formData
                    });
                    const data = await resp.json();

                    if (data.job_id) {
                        // Start polling this job
                        pollJobProgress(data.job_id, id);
                    } else {
                        // Covers explicit errors and unexpected payloads, so the row
                        // never stays on "uploading..." forever.
                        showError(data.error || ('Upload failed (HTTP ' + resp.status + ')'));
                    }
                } catch (e) {
                    showError(e.message);
                }
            }

            // Process the queue with a fixed pool of workers; each worker pulls the
            // next file as soon as its previous upload request has settled.
            async function processQueue() {
                // Fix the pool size before starting: workers shift the queue
                // synchronously as they are created.
                const workerCount = Math.min(MAX_PARALLEL, queue.length);
                const worker = async () => {
                    while (queue.length > 0) {
                        const next = queue.shift();
                        await uploadOne(next.file, next.id);
                    }
                };
                const workers = [];
                for (let i = 0; i < workerCount; i++) workers.push(worker());
                await Promise.all(workers);
            }

            try {
                await processQueue();
            } finally {
                // Always give the button back, even if something unexpected threw.
                submitBtn.disabled = false;
            }
            selectedFiles = [];
            renderFileList();
            loadStatus();
        }

        /**
         * Poll BASE_URL + '/progress/<jobId>' every 800 ms and mirror the result
         * into the progress row whose elements are `<elemId>_pct`, `<elemId>_bar`
         * and `<elemId>_msg`.
         *
         * Polling stops when the job reports status 'completed' (loadStatus() is
         * then called) or 'failed'. It also stops after a run of consecutive
         * failed polls (network error, non-OK HTTP status or unparsable body), so
         * a job that can no longer be reached does not keep the page polling
         * forever.
         *
         * @param {string} jobId - job identifier returned by the upload endpoint.
         * @param {string} elemId - id prefix of the progress row elements.
         */
        function pollJobProgress(jobId, elemId) {
            const pctEl = document.getElementById(elemId + '_pct');
            const barEl = document.getElementById(elemId + '_bar');
            const msgEl = document.getElementById(elemId + '_msg');

            const POLL_INTERVAL_MS = 800;
            // Roughly 20 seconds of uninterrupted failures before giving up.
            const MAX_CONSECUTIVE_FAILURES = 25;
            let failures = 0;

            const showFailure = (label, message) => {
                if (barEl) { barEl.style.width = '100%'; barEl.style.background = '#ef4444'; }
                if (pctEl) pctEl.textContent = label;
                if (msgEl) msgEl.textContent = message;
            };

            const poll = async () => {
                try {
                    const resp = await fetch(BASE_URL + '/progress/' + jobId, {cache:'no-store'});
                    const data = await resp.json();
                    if (!resp.ok) {
                        throw new Error(data.message || ('HTTP ' + resp.status));
                    }
                    failures = 0;

                    // Clamp so a malformed value cannot produce a nonsensical width.
                    const pct = Math.max(0, Math.min(100, Number(data.progress) || 0));

                    if (barEl) barEl.style.width = pct + '%';
                    if (pctEl) pctEl.textContent = pct + '%';
                    if (msgEl) msgEl.textContent = data.message || 'Processing...';

                    if (data.status === 'completed') {
                        if (barEl) {
                            // Keep the bar in step with the "100%" label below.
                            barEl.style.width = '100%';
                            barEl.style.background = 'linear-gradient(90deg,#22c55e,#4ade80)';
                        }
                        if (pctEl) pctEl.textContent = '100% ✅';
                        if (msgEl) msgEl.textContent = data.message || 'Completed';
                        loadStatus();
                        return;
                    } else if (data.status === 'failed') {
                        showFailure('FAILED', data.message || 'Processing failed');
                        return;
                    }
                } catch (e) {
                    // Transient poll errors are tolerated, but not indefinitely.
                    failures += 1;
                    if (failures >= MAX_CONSECUTIVE_FAILURES) {
                        showFailure('ERROR', 'Lost track of job progress: ' + e.message);
                        return;
                    }
                }
                setTimeout(poll, POLL_INTERVAL_MS);
            };
            poll();
        }

        /**
         * Upload the current selection through the enrichment endpoint.
         * Thin wrapper around uploadBatch().
         */
        async function uploadWithEnrich() {
            const enrichEndpoint = '/enrich-upload';
            await uploadBatch(enrichEndpoint);
        }

        // Submitting the form uploads the selection through the plain /upload
        // endpoint; the default submission (a full page navigation) is cancelled.
        document.getElementById('uploadForm').onsubmit = async function (event) {
            event.preventDefault();
            await uploadBatch('/upload');
        };

        // Call loadStatus() once when this script first runs.
        loadStatus();
    </script>
</body>
</html>rQ   rQ   rF   rD   r  r  J  s    YrF   z/healthGET)methodsc                  j    t        dt        t        j                        t        t
        t        d      S )zHealth checkhealthy)ru  r  	has_faisshas_sentence_transformershas_pymupdf)r   r*  r  r  r	  r  rC  rQ   rF   rD   healthr  )	  s0     |556%>"  rF   z/statusc            	          t        d t        j                  D              } t        t	        t        j                        t	        |       t
        t        t        rdndt
        rdd      S dd      S )zGet system statusc              3   @   K   | ]  }|j                  d d        yw)r  r  Nr@   r9  r  s     rD   r;  zstatus.<locals>.<genexpr>9	  s     F!!%%"%Fs   r  z
hash-basedzFAISS + BM25 HybridzCosine + BM25 Hybrid)r  total_filesr  r  embedding_modelsearch_type)r  r  r  r   r*  r	  r  )sourcess    rD   ru  ru  5	  sl     F|/E/EFFG|5567|%>4M0S_09,   @V  rF   z/uploadPOSTc                     dt         j                  vrt        ddi      dfS t         j                  d   } | j                  st        ddi      dfS t	        | j                        j
                  j                         }|dvrt        ddi      dfS t        | j                        }t        j                  |z  }| j                  t        |             t        t        j                               dd	 }t        |d
dd       t        j                   t"        |||fd      }|j%                          t        d|d|d      dfS )zUpload and process a documentr  rq  No file providedr   Empty filenamer6  r<  z$Only PDF and TXT files are supportedN   startingr   z%File uploaded, starting processing...T)targetr  daemonz"File uploaded. Processing started.r  r  r%  rY     )r   r  r   rY  r   rA  rB  r   r;   r   r  rP   uuiduuid4r  	threadingThreadr  start)r  extsafe_filenamer4  r  threads         rD   upload_filer  E	  s8    W]]"!345s::== D==!123S88 t}}

$
$
*
*
,C
""!GHI3NN $DMM2M.IIIc)n r"FFJ+RS&i/F
 LLN7!	 
 	 rF   z/progress/<job_id>c                     t         5  | t        v rt        t        |          cddd       S 	 ddd       t        dddd      dfS # 1 sw Y   xY w)zGet upload progressNr1  r   zJob not found)ru  rv  r%    )r  r  r   )r  s    rD   get_progressr  m	  sX     
 4_$?6234 4$4 iQ?STVYYY4 4s   AAz/askc                  z   t        j                  d      xs i } | j                  dd      j                         }| j                  dd      }|st	        ddi      dfS | j                  d	d
      }t
        j                  ||      }|s't	        dg t        t
        j                        dd      S g }d}t               }g }|D ]  }	|	j                  di       }
|
j                  dd      }|r|r|
j                  d      r|dz  }d|
j                  dd       }|
j                  dg       }|r.|j                  |dd        |ddj                  |dd        z  }|
j                  dg       }|r|j                  |       |j                  |       |j                  |	d   dd         dj                  |      }d}	 t        j                  j!                  t        j                  j!                  t        j                  j#                  t$                          }t        j                  j                  |dd       }t        j                  j'                  |      rNt)        |d!d"#      5 }|j+                         j                         }t-        d$t        |       d%       ddd       nt-        d&|        d(| d)|dd*  d+| d,}	 t1               }|j3                  |d-d./      }|r|j                         }nd0|dd1 z   }g }|D ]  }	|	j                  di       }
|j                  |
j                  d4d5      t5        |	d6   d      |	d   dd7 |
j                  dd      |
j                  d      r|
j                  dd      dd8 nd|
j                  dg       d9        t	        ||t        t
        j                        t        |      |t7        |      dd: |dd
 d;      S # 1 sw Y   1xY w# t.        $ r}t-        d'|        Y d}~Qd}~ww xY w# t.        $ r!}t-        d2|        d3|dd1 z   }Y d}~5d}~ww xY w)<z2Query the RAG system with enriched context supportTsilentquestionr  use_enrichedrq  zQuestion is requiredr   r  r  r  z8I don't have enough information to answer that question.r   )answerr  chunks_searchedenriched_countr  r#  Fr#  rT   z**Summary**: r$  Nr#   z
**Key Points**: z; r%   r%  r  rU   z

---


airagagentznfl_context_2025.txtr=  r>  r?  z[RAG] Injected z chars of live NFL contextz [RAG] Context file not found at z[RAG] Context injection error: zYou are the AI Guru, an expert sports betting assistant.
Answer the user's question using the provided Context and Live Data.

=== LIVE REAL-TIME DATA (2025-2026 NFL SEASON) ===
z1

=== RETRIEVED KNOWLEDGE (STRATEGY & BOOKS) ===
i  z

=== QUESTION ===
a  

INSTRUCTIONS:
1. Prioritize 'LIVE REAL-TIME DATA' for specific 2025-26 standings, records, and current stats.
2. Use 'RETRIEVED KNOWLEDGE' for betting theory, concepts, and general strategy.
3. If the question asks about a specific team's CURRENT record (e.g. Patriots), use the Live Data.
4. Synthesize everything into a natural, helpful response.
5. If you cannot answer, say so, but provide relevant context.

ANSWER:r   r'   r\  z_I could not generate a specific answer (Grok API return empty). Here is the relevant context:

i	  z[RAG] LLM generation error: z/Error generating answer. Here is the context:

r  r1  r     r#  )r  r  previewr#  r#  r%  r,   )r  r  r  chunks_returnedr  r%  r$  )r   get_jsonr@   r
  r   r  r  r*  r  r  r  rH  r  rV  r>   pathdirnameabspathrK   r  rJ  rK  r  r,  r  r  roundr  )r8   r  r  r  r   context_partsr  
all_themesall_key_pointsr=  metais_enrichedpartr$  r%  contextlive_contextr   context_pathrN  r.  r  llm_servicegenerated_answerr  r  s                             rD   askr  v	  sn    4(.BDxx
B'--/H88ND1L!7893>>aA !!(a!0GP"<#9#9:	
  	 MNJN 5uuZ$hhz51KDHHY,?aN"488Ir#:";<D,3J%%j!n5,TYYz"1~-F,GHHXXh+F!!&)  &   9ds!34+5.   /G L577??277??277??83L#MNww||Hl<RS77>>,'lC': Wa vvx~~/L(9'::TUVW W 5l^DE
   	$   

 	F,	V l&//4UX/Y%++-Fx  |C  DI  EI  |J  JF G 	uuZ$hhx31W:q)|DS)U38<8Ktxx	2.t4QUhhx,
 		 |556w<(z"3B'$Ra(  oW W
  5/s3445B  V,QC01DwuPT~UVsI   >B+O+ )7O O+ 9P O(#O+ +	P4PP	P:P55P:z/searchc                  6   t        j                  d      xs i } | j                  dd      j                         }| j                  dd      }|st	        ddi      d	fS t
        j                  ||
      }t	        ||t        t
        j                        d      S )zSearch documentsTr  r  r  r  r,   rq  zQuery is requiredr   r  )r   r  r  )	r   r  r@   r
  r   r  r  r*  r  )r8   r  r  r   s       rD   r  r  	  s     4(.BDHHWb!'')EbA!456;;!!%1!-G|556  rF   z
/documentsc            
         i } t         j                  D ]c  }|j                  dd      }|| vrdd|j                  dd      d| |<   | |   dxx   dz  cc<   |j                  d      sT| |   d	xx   dz  cc<   e t        t         j                        }t	        d
 t         j                  D              }t        | j                         D cg c]  \  }}d|i| c}}t        |       ||t        |dkD  r||z  dz  ndd      d      S c c}}w )z)List all documents with enrichment statusr  r1  r   r  )chunk_countr  r  r  rT   r#  r  c              3   D   K   | ]  }|j                  d       sd  ywr#  rT   Nr  r  s     rD   r;  z!list_documents.<locals>.<genexpr>
  s     PqaeeJ>OP     r  )r  total_sourcesr  total_enrichedenrichment_percentage)r  r  r@   r*  rk  r   r`  r  )r  rX  r  r  r  r  r  s          rD   list_documentsr  
  s    G%% 
39-  "#!$)!DGFO
 	&!+&77:FO,-2-
3 |--.LPL$:$:PPN5<]]_ETQx(a(EW$(!&P\_`P`(E(Kfgij!k  Es   
D
z/chunksc                  P   t         j                  j                  d      } t         j                  j                  dd      j                         dk(  }t         j                  j                  dd      j                         dk(  }t	        t         j                  j                  dd            }t	        t         j                  j                  dd	            }t
        j                  }| r$|D cg c]  }|j                  d      | k(  s| }}|r"|D cg c]  }|j                  d      s| }}n#|r!|D cg c]  }|j                  d      r| }}t        |      }|dz
  |z  }||z   }	|||	 }
g }t        |
|
      D ]  \  }}|j                  ||j                  dd      |j                  d|      |j                  dd      |j                  d      r|j                  dd      dd nd|j                  dg       dd |j                  dg       t        |j                  dd            dkD  r|j                  dd      dd dz   n|j                  dd      t        |j                  dd            d	        t        ||||||z   dz
  |z  d| ||dd      S c c}w c c}w c c}w )z)List chunks with pagination and filteringr  r#  r  truerawr:  rT   per_pager  )r  r1  r  Fr#  Nr#  r$  r%   r%  r  i,  r  )	r  r  r  r#  r#  r$  r%  content_previewcontent_length)r:  r   rh   rI  )r  enriched_onlyraw_only)r  
paginationfilters)r   r  r@   rB  rN   r  r  r*  r}  rV  r   )r  r  r  r:  r   filteredr  rh   r  end	paginatedr  r  rX  s                 rD   list_chunksr
  %
  s    \\h'FLL$$Z4::<FM||r*002f<Hw||*+D7<<##J34H %%H'E!155?f+DAEE'=!155+<A==	'A!quuZ/@AAA MEAX!E
(
Cs#I FIU3 3ggh	2
A.
E27:wwy7Iswwy"-ds3t'',3BQ7ggh+GJ377S\^`KaGbehGhswwy"5ds;eCnqnunuv  BD  oE!#'')R"89

 
	  h&*x7	
 * 
  9 F >As$   *JJJ'J4J#J#z/chunks/<int:chunk_index>c                     | dk  s| t        t        j                        k\  rt        ddi      dfS t        j                  |    }t        | |j	                  dd      |j	                  d      |j	                  d      |j	                  d	d
      |j	                  d      |j	                  dg       |j	                  dg       |j	                  dd      t        |j	                  dd            |j	                  d      |j	                  d      d      S )z$Get a single chunk with full detailsr   rq  zChunk not foundr  r  r1  r  r  r#  Fr#  r$  r%  r  r  r  r  )r  r  r  r  r#  r#  r$  r%  r  r  r  r  )r*  r  r  r   r@   )chunk_indexrX  s     rD   	get_chunkr  _
  s     Q+\-C-C)DD!234c99

 
 
-C''(I.GGJ'1GGJ.779%gglB/''(B'779b)cggi45779%/  rF   z$/chunks/by-source/<path:source_name>c                     t        |       }t        j                  D cg c]  }|j                  d      |k(  s| }}|st	        ddi      dfS t        d |D              }t	        |t        |      ||D cg c]r  }|j                  d      |j                  dd      |j                  d	      r|j                  d	d
      dd nd|j                  dg       |j                  dd
      dd dt c}d      S c c}w c c}w )z$Get all chunks for a specific sourcer  rq  zSource not foundr  c              3   D   K   | ]  }|j                  d       sd  ywr  r  )r9  r  s     rD   r;  z'get_chunks_by_source.<locals>.<genexpr>
  s     @qaeeJ.?@r  r  r#  Fr#  r  Nr#  r%  r  )r  r#  r#  r%  r  )r  r  enriched_chunksr  )r   r  r  r@   r   rk  r*  )r  safe_sourcer  r  r  r  s         rD   get_chunks_by_sourcer  w
  s    "+.K%//RA155?k3QaRFR!345s::@F@@NF)   j)j%056UU95EquuY+DS14eeHb) uuY3DS9
 	   Ss   C6C66A7C;z/delete/<path:filename>DELETEc                 N   t        |       }t        t        j                        }t        j                  D cg c]  }|j	                  d      |k7  s| c}t        _        |t        t        j                        z
  }|dkD  rt        j                  rt        j                  D cg c]  }|d   	 }}t        j                  |      }t        rrt        j                  t        j                        t        _
        |j                  d      }t        j                  |       t        j                  j                  |       n]|t        _        nQt        r-t        j                  t        j                        t        _
        nt        j                   g       t        _        t        j#                          t        j%                          t&        j(                  |z  }|j+                         r|j-                          t/        dd| d| d|d	      S t/        d
di      dfS c c}w c c}w )zDelete a documentr  r   r  r  TzDeleted z (z chunks))r  r%  chunks_removedrq  zDocument not foundr  )r   r*  r  r  r@   r  r	  r
  r}  r  r  r  r~  r  r  r  r  r  r  r;   r   r  unlinkr   )	rY  r  original_countr  removed_countr  r  r  r4  s	            rD   delete_documentr  
  s    $H-M //0N)5)?)?dA155?VcCcadL"S)?)?%@@Mq!!.:.D.DE)EHE%;;HEJ%*%6%6|7Q7Q%R" * 1 1) <""=1""&&}5*4'%*%6%6|7Q7Q%R"*,((2,'..0 NN]2	!-=/J+
  	 G123S88I e Fs   HH%H"z/reloadc                  t    t         j                          t        dt        t         j                        d      S )zReload the indexT)r  r  )r  r  r   r*  r  rQ   rF   rD   reloadr  
  s3     |556  rF   z/enrichc                     t         5  t        j                  d      dk(  r7t        1t        j	                         rt        dt        d      dfcddd       S t        j                  d      dk(  rndt        d<   t        j                  d      xs d	t        d<   t        j                  t        j                        j                         d
z   t        d<   t                ddd       t        j                  d      xs i } | j                  d      fd}t        j                  |d      at        j!                          t        dddd      dfS # 1 sw Y   uxY w)z,Enrich documents with LLM-generated metadataru  r  NzEnrichment already running)rq  ru  i  r  r%  r  r  rx  Tr  r  c                  4   	 t        t        j                  j                  dt        j                  dd      xs d            } 	 t
        j                          rAt        t
        j                        D cg c]  \  }}|j                  d      k(  s| }}}n+t        t        t        t
        j                                    }t        |      }|dk(  rt        5  dt        d<   dt        d	<   d
t        d<   t        j                  t        d<   t        d<   dt        d<   dt        d<   t!        j"                  t$        j&                        j)                         dz   t        d<   | t        d<   t+                d d d        y t-        d |D              }||z
  }t!        j"                  t$        j&                        j)                         dz   }t        5  dt        d<   |rt        ||z  dz        ndt        d	<   d| d| d| dt        d<   t        j                  t        d<   t        j                  d      xs |t        d<   |t        d<   t        d<   |t        d<   |t        d<   | t        d<   t+                d d d        |dk  rt        5  dt        d<   dt        d	<   dt        d<   t        j                  t        d<   t!        j"                  t$        j&                        j)                         dz   t        d<   t+                d d d        y d}|D ]  }	t
        j                  |	   }
|
j                  d      r)t        j/                  |
      }|
j1                  |       t!        j"                  t$        j&                        j)                         dz   |
d<   	 t
        j3                  |
      }t        t
        j4                        t        t
        j                        k(  r!|j7                         t
        j4                  |	<   t        t
        j8                        t        t
        j                        k(  rt
        j8                  |	   }t        |j;                               }|t
        j8                  |	<   t        t
        j8                        }|dkD  r%t
        j<                  |z  |z
  |z   |z  t
        _        |dz  }||z   }|rt        ||z  dz        nd}t        5  dt        d<   |t        d	<   d| d| d| d| d	t        d<   t        j                  t        d<   t!        j"                  t$        j&                        j)                         dz   t        d<   |t        d<   | t        d<   d d d        || z  dk(  sit
        j?                          t        5  t+                d d d         t
        j?                          t        5  dt        d<   dt        d	<   d| d | dt        d<   t        j                  t        d<   t!        j"                  t$        j&                        j)                         dz   t        d<   |t        d<   t+                d d d        y # t        $ r Y 	w xY wc c}}w # 1 sw Y   y xY w# 1 sw Y    xY w# 1 sw Y   y xY w# t        $ r Y w xY w# 1 sw Y   :xY w# 1 sw Y   xY w# 1 sw Y   y xY w# t        $ r}tA        d!|        dd l!}|jE                          t        5  d"t        d<   d#tG        |       t        d<   t        j                  t        d<   t!        j"                  t$        j&                        j)                         dz   t        d<   t+                d d d        n# 1 sw Y   nxY wY d }~y Y d }~y d }~ww xY w)$Nrt  r|  r,   r  r   r  ru  r  rv  zNo chunks found to enrich.r%  rw  rz  r{  r  rx  c              3   f   K   | ])  }t         j                  |   j                  d       s&d + ywr  )r  r  r@   )r9  r  s     rD   r;  z;enrich_documents.<locals>.run_enrichment.<locals>.<genexpr>   s*     "j@V@VWX@Y@]@]^h@i1"js   '11r  zResuming enrichment: rn  z' already enriched. 
Enriching remaining r  ry  z0No chunks need enrichment (all already enriched)r#  enriched_atrT   z	Enriched z chunks (this run: ro  zSuccessfully enriched z chunks (target: z[RAG] Enrichment error: r  r  )$rN   r>   r?   r@   r  r  r  r,  r}  r  r  ranger*  enrichment_lockenrichment_servicer   r   r  r   r  r  r  rk  r*  r  r  r  rB  r  rs  r  r{  r  r  r  rP   )r|  r  r  target_indicesrz  already_enriched	remainingry  enriched_this_run	doc_indexrX  enriched_docnew_textold_lennew_lenndonerv  r.  r  source_filters                       rD   run_enrichmentz(enrich_documents.<locals>.run_enrichment
  s   z	2"2::>>2OQbQfQfgy{}Q~  RE  CE  $F   G668
 09,:P:P0Q!v1UVUZUZ[cUdhuUu!!v!v!%eC0F0F,G&H!I~.Lq $ 
62=%h/47%j13O%i04F4V4V%j12?%h/89%n578%m46>ll8<<6P6Z6Z6\_b6b%l3<L%&8935
6 ""jn"jj$'77I!hll3==?#EJ  2.7!(+`l5E5TX[4[0\rs!*-1FGWFXXYZfYg  hO  PY  OZ  Z]  0^!),0B0R0R!*-2C2G2G2U2cYc!,/2<!,/.;!(+4@!.13C!-08H!"45/12 A~$ 62=%h/47%j13e%i04F4V4V%j16>ll8<<6P6Z6Z6\_b6b%l3356  !+ ):	",,Y777:&1>>sC

<(%-\\(,,%?%I%I%Kc%QM"+BB3GH<889SAWAW=XXCK>>CS55i@<334L<R<R8SS".":":9"E"%hnn&6"7>E00; 8 89q5;G;V;VYZ;Z]d;dgn;nrs:sL7 "Q&!'*;;?K3| 3s:;QR$ M2;%h/4<%j15>tfAl^Sfgxfyyz  |E  {F  FG  4H%i04F4V4V%j16>ll8<<6P6Z6Z6\_b6b%l37;%m4<L%&89M %'771< 335( :79: :Q):X ++-  2.9!(+03!*-1GHYGZZklxkyyz/{!),0B0R0R!*-2:,,x||2L2V2V2X[^2^!,/3?!-0/12 2E  
 "w
6 2 26 2 ! M M: :
2 2  		2,QC01!  2.6!(+18Q/A!),0B0R0R!*-2:,,x||2L2V2V2X[^2^!,//12 2 2 2 2			2sH  A[ Y [ 7Y&Y&A[ BY,9[ A[ B'Y8:[ A8Z[ B
[ DZ7%[ BZ!)[ ;[ Z. %[ BZ;[ 	Y#[ "Y##	[ ,Y51[ 5[ 8Z=[ Z
[ [ 	Z[ Z[ !Z+	&[ .Z8	3[ ;[ [ [ 	^(^8A;]<3	^<^	^^r  r  zEnrichment started)r  r%  ru  r  )r!  r  r@   enrichment_threadis_aliver   r   r  r   r  r  r  r   r  r  r  r  )r8   r/  r.  s     @rD   enrich_documentsr3  
  sL    
 .  *i7<M<Y^o^x^x^z5+  . .   *i7*7h'+<+@+@+K  ,F  PFi(.6ll8<<.H.R.R.TWZ.Zl++-. 4(.BDHHX&M|2~ "((tL'  		 e. .s   AEBEEz/enrichment-statusc                  X    t         5  t        t              cddd       S # 1 sw Y   yxY w)zGet current enrichment statusN)r!  r   r  rQ   rF   rD   get_enrichment_statusr5  i  s%     
 *()* * *s    )z/enrich-uploadc                     dt         j                  vrt        ddi      dfS t         j                  d   } | j                  st        ddi      dfS t	        | j                        j
                  j                         }|dvrt        ddi      dfS t        | j                        t        j                  z  | j                  t                     t        t        j                               dd	 t        d
dd       fd}t        j                   |d      }|j#                          t        ddd      dfS )z,Upload and process with immediate enrichmentr  rq  r  r   r  r  z Only PDF and TXT files supportedNr  r  r   z5File uploaded, starting processing with enrichment...c            
      &   	 t        
ddd       t        j                  	      } | rt        | j	                               dk  r#t        
ddd       t
        j                  	      } | rt        | j	                               dk  rt        
ddd	       y t        
dd
d       t        j                  |       }t        j                  | |      }|st        
ddd       y t        |      D ]"  \  }}|d<   ||d<   t        |      |d<   ||d<   $ t        
dddt        |       d       
fd}t        j                  ||      }t        
ddd       t        j                  |       t        
ddd       t        j                          t        
dddt        |       dt        |      t        j                         y # t        $ rF}t!        d|        dd l}|j%                          t        
dddt'        |              Y d }~y d }~ww xY w)Nr  r,   zExtracting text...r     zTrying OCR...r  r   zFailed to extract text   zChunking document...zNo valid chunks extractedr  r  r  r  r"  z
Enriching z chunks with LLM...c           
      X    dt        | |z  dz        z   }t        d|d|  d| d       y )Nr"  r9  r  zEnriching chunk rn  r  )rN   r  )currentrh   pctr  s      rD   enrich_progresszEenrich_on_upload.<locals>.process_and_enrich.<locals>.enrich_progress  s=    3%2566c=MgYVWX]W^^a;bcrF   U   zAdding to vector store..._   r  r  z$Successfully processed and enriched z chunks!)r  rw  z[RAG] Error: r  )r  r  r   r*  r
  r  rp  r  r}  r"  rq  r  r  r  r   r,  r  r  r  rP   )r+  r  r  r  r  r=  r  r.  r  r4  r  r  s            rD   process_and_enrichz,enrich_on_upload.<locals>.process_and_enrich  s   6	EFL"6JK --i8D3tzz|,s2b/J"//	:3tzz|,s2!5MNFL"6LM$99$NH"--dH=F!5PQ%f- 25"/h$%j!(+Fn%)1o&	2 FL"
3v;-Ob6cdd 1BB6?[OFL"6QR''GFL"6GHS6s?7K6LHU 1+;;	  	EM!%&!FHa73q6(1CDD		Es&   BG A
G C!G 	H
<HHTr0  z2File uploaded. Processing with enrichment started.r  r  )r   r  r   rY  r   rA  rB  r   r;   r   r  rP   r  r  r  r  r  r  )r  r  r@  r  r4  r  r  s       @@@rD   enrich_on_uploadrA  p  s-    W]]"!345s::== D==!123S88
t}}

$
$
*
*
,C
""!CDEsJJ#DMM2M.IIIc)nr"FFJ+bc7Er %7EF
LLNG!	 
 	 rF   __main__RAG_PORTi  z;[RAG] Starting Advanced Sports Betting RAG Service on port z[RAG] FAISS: z, SentenceTransformers: z, PyMuPDF: z[RAG] Documents loaded: z0.0.0.0)hostportdebug)rJ   r>   sysr!  r  r  r  rr  r  r  queuecollectionsr   r   r   pathlibr   	functoolsr	   dataclassesr
   typingr   r   r   r   r   flaskr   r   r   r   werkzeug.utilsr   r   r  r  rF  rG  rH  rI  r  r  rB  rC  rD  r  r  r  rE  r   r  rR  rC  ImportErrorpdfminer.high_levelr   rF  rE  PyPDF2r   rG  numpyr  	HAS_NUMPYsentence_transformersr  r  r
  r	  r'  r&  r  r0  r  r
  r  rG   appr;   r  r  r  r"  r  rN   r?   r@   r  Lockr  r!  Queueprocessing_queuer1  r   r~  r  r  r  r  r  r  rP   r  r  router  r  ru  r  r  r  r  r  r
  r  r  r  r  r3  r5  rA  rE  r*  r  runrQ   rF   rD   <module>r\     s   
 
     	     '   ! 3 3 > > *
 ,8 ,8 ,8` !$sdhi|}~#sdhi|}~3$dexyz!SDfgz{|"cTgh{|}   	    "tdY]^"tZ^_$u5Q	 t$&d;)51!&US ! &


. ##$%
 =" B   
  BJJ/BJJ'7BJJ)2==9BJJ.>BJJ("--8BJJ,bmm<BJJ%r}}5  	KDL J
I
&9 $
I
L2 2jM Mfz z@+ +bA AN Ho#4

   
V$6" &v.  BJJNN+H"MN  	  ).."5;;=   )),DD A *F 

0PQ A B     ! #	
C 	
 	
 	
c 	
8AC 8AD 8AC 8A~ 3[ [| 9ug& ' 9ug& ' 9vh'#  (#L %1Z 2Z 6F8${  %{x 9vh'  ($ <%) *8 9ug&6 '6r &8 9. 1E7C D0 $x.@A*9  B*9Z 9vh'  ( 9vh'Z  (Zz %1* 2* fX.U  /Uv zrzz~~j$/0D	Gv
NO	M)$<=V<WWbcnbo
pq	$S)?)?%@$A
BCGGUG3 }V  K  L  J  I  & %&  I  Ls~   U< V
 $V -V& 4V4 =W W <VV
VVV#"V#&V10V14V?>V?WWWW