
    OiJ              	       2   d dl mZ 	 d dlmZmZ dZd dlZd dlZ	d dl
Z
d dlZd dlZd dlmZ d dlmZ d dlmZmZmZmZmZmZmZmZmZ d d	lmZmZmZmZm Z m!Z! h d
Z"h dZ#dZ$dZ%g dg dg dg dg dg dg dg ddZ& G d d      Z'y# e$ r dZdZY w xY w)    )SentenceTransformer)rerank_candidatesRERANKER_ENABLEDTFN)Path)fuzz)	EMBEDDINGS_DIRHYBRID_SEARCH_OVERSAMPLEHYBRID_VECTOR_WEIGHTHYBRID_KEYWORD_WEIGHTHYBRID_MIN_KEYWORD_SCOREHYBRID_BM25_WEIGHTMAX_CHUNKS_PER_SOURCETECHNICAL_TERMSSYNONYMS)VectorStoreErrorIndexLoadErrorIndexSaveErrorEmbeddingErrorValidationErrorResourceError>N   alsobeenbotheachevenfromgivehavehelpintolikemakemanymoremostmuchneedonlyontoshowsomesuchtellthanthatthemtheythisuponverywantwerewhenwillwithyouraboutafterbasedbeingcouldgimmeguidemightothershallstepstheirtherethesethosewherewhichwhilewouldanswerbeforedetailguidespleaseshouldwithinbecausebetweendetailsexplainlookingproviderelatedsummarythroughdescribequestion	according	regarding	summaries	summarizeinformation>   gbudpotbookbudsgramgrowhempweedbooksganjagramsrA   plantflowerrQ   manualplantsflowersgrowingharvestmanualscannabisdocument	cultivate	documents	marijuanacultivationg      ?g      ?)
hydroponichydroponicshydro	soil-less	soil lesssoillesszwater cultureznutrient solutionznutrient tankzebb and flow	reservoirzdeep water culturedwczdrip systemzflood and drainsoilless mixsoilless mixture)	r   r   r   r   zperlite mixz	coco coirrockwoolzexpanded clayz	grow cube)medicalmedicine	medicinalpatientdoctor	physiciantherapeutictherapy	treatmentdosagedoseclinicalsymptom	condition	diagnosisprescriptionwellnesshealth
healthcarezpain reliefzanxiety reliefzanti-inflammatory)finance	financialfundingcapitalloancreditdebt
investmentinvestorroizreturn on investmentz	cash flowprofitprofitabilityrevenueincomemarginbudget)legallawlawsstatute
regulation
regulatory
compliancelicense	licensingpermit
permittingdecriminalizelegalizationcourtpolicy	ordinance)businessentrepreneurentrepreneurshipstartupcompanycorporationmarket	marketingsalesbranding	franchisezsupply chain	wholesaleretailcustomerclientstrategyscaling
operations
management)zmental health
psychologyr   
counseling	wellbeingstressanxietyptsd
depressionmindfulness
meditation)history
historical	chronicletimelineeracenturyancientzmodern history	historiancivilizationheritagelegacyanthropologyarchaeologyculturezcultural historyrecords)r   zsoilless growingr   r   r   r   r   r   c                   z    e Zd ZddZd Zd Zd Zd Zd Zd Z	d	 Z
dd
Zd Zd Zd Zd Zd ZddZddefdZy)VectorStoreNc                     ddl m} ||}t        |      | _        d | _        g | _        g | _        g | _        g | _        d| _	        t        dz  | _        t        dz  | _        y )Nr   )EMBEDDING_MODEL        zfaiss_index.indexzdocuments_metadata.json)configr   r   modelindexr|   searchable_texts_rawsearchable_texts_lowerdoc_lengthsavg_doc_lengthr   
index_filemetadata_file)self
model_namer   s      0/var/www/html/leadgen/airagagent/vector_store.py__init__zVectorStore.__init__j   se    *(J(4

$&!&(#!(+>>+.GG    c                    | j                   j                         sy| j                  j                         r	 t        j                  t        | j                               | _        t        | j                  dd      5 }t        j                  |      | _
        ddd       | j                          t        dt        | j                         d       yddlm} |dz  }|j                         r	 t        d       t        j                  t        | j                               | _        ddl}t        |d      5 }|j                  |      | _
        ddd       | j)                          | j                          |j+                          t        dt        | j                         d       yy# 1 sw Y   xY w# t        j                  $ r0}t        t        | j                        d	t        |      d
      d}~wt         $ r0}t        t        | j                         dt        |      d
      d}~ww xY w# 1 sw Y   xY w# t         $ r&}t        t        |      dt        |      d
      d}~ww xY w)z&Load existing FAISS index and metadataFrutf-8encodingNu   ✓ Loaded existing index with z
 documentsTjson_decode
error_typeoriginal_error
faiss_loadr   )r   zdocuments_metadata.pklz'Migrating from pickle to JSON format...rbu   ✓ Migrated index with z documents to JSON formatpickle_migration)r   existsr   faiss
read_indexstrr   openjsonloadr|   _prepare_searchable_textsprintlenJSONDecodeErrorr   	Exceptionr   r   picklesave_metadataunlink)r   fer   old_pickle_filer  s         r   load_existing_indexzVectorStore.load_existing_indexx   s   %%'$$&"--c$//.BC
$,,cGD 2%)YYq\DN2..07DNN8K7LJWX 	*(+CC!!#?@"--c$//.BC
/40 4A%+[[^DN4 ""$..0&&(0T^^1D0EE^_` O2 2
 '' $**+#0CFK   $(#/3q6J 4 4  $(#5QP sn   AF6 <F):F6 .AH? 6H3AH? )F3.F6 6H0	+G44H0 +H++H03H<8H? ?	I.!I))I.c                 .   |syt        |t              st        ddt        |      i      t	        |      D ]G  \  }}t        |t
              st        d| d|t        |      d      d|vs7t        d| dd	|i       |D cg c]  }|d   	 }}	 | j                  j                  |d
ddd      }| j                  .t        j                  |j                  d         | _        g | _        t#        j$                  |      j'                  d      }t        j(                  |       | j                  j+                  |       | j                   j-                  |       |D ]W  }| j/                  |      }	| j0                  j3                  |	       | j4                  j3                  |	j7                                Y t9        dt        |       d       yc c}w # t        $ r5}|r|d   dd dz   nd}t        |t        |      t        |      d      d}~ww xY w)z%Add new documents to the vector storeNzChunks must be a listreceived_typezChunk z must be a dictionary)chunk_index
chunk_typecontentz missing 'content' keyr      TF
batch_sizeshow_progress_barconvert_to_numpynormalize_embeddingsr      z... )r  r!     float32u
   ✓ Added z chunks to vector store)
isinstancelistr   type	enumeratedictr   encoder  r   r
  r  r   r  IndexFlatIPshaper|   nparrayastypenormalize_L2addextend_build_searchable_textr   appendr   lowerr  )
r   chunksichunkcontents
embeddingsr  problematic_contentembeddings_npsearchable_texts
             r   add_documentszVectorStore.add_documents   s   &$'!"9OTRX\;Z[[ "&) 	^HAueT*%qc1F&GYZjnotjuIvww%%qc1G&H=Z[J\]]		^ 399E)$99	o**"&!%%* + J ::**:+;+;A+>?DJDN ,33I>=)

}% 	f% 	HE"99%@O%%,,_=''../D/D/FG	H
 	
3v;-'>?@C :  	o?G(1+ds"3e";R !4Q_bck_l6mnn	os   G G 	H0HHc                 L   | j                   t        dddi      	 t        j                  | j                   t	        | j
                               | j                          t        d       y# t        $ r/}t        t	        | j
                        dt	        |      i      d}~ww xY w)z!Save the FAISS index and metadataNz No index to save - index is Noneindex_statenoneu   ✓ Saved vector store indexr  )	r   r   r  write_indexr
  r   r  r  r  )r   r  s     r   
save_indexzVectorStore.save_index   s    :: !CmU[E\]]	Sdjj#doo*>? 01 	S T__!58H#a&7QRR	Ss   AA+ +	B#4*BB#c                    	 t        | j                  dd      5 }t        j                  | j                  |dd       ddd       y# 1 sw Y   yxY w# t
        $ r0}t        t        | j                        dt        |      d	      d}~ww xY w)
z#Save document metadata to JSON filewr   r   F   )ensure_asciiindentN	json_saver  )r  r   r  dumpr|   r  r   r
  )r   r  r  s      r   r  zVectorStore.save_metadata   s    	qd((#@ KA		$..!%JK K K 	q T%7%7!8hklmhn:opp	qs3   A $AA AA A 	B+BBc                 Z   |s| j                  |       yt        d| dt        |       d       | j                  D cg c](  }|j	                  di       j	                  d      |k7  r|* }}t        dt        |       d       ||z   }t        d	t        |       d
       | j
                  | j
                  j                  }n+| j                  j                  dg      }|j                  d   }t        j                  |      | _        g | _        g | _        |D cg c]  }|d   	 }}d}	t        dt        |      |	      D ]  }
||
|
|	z    }	 | j                  j                  ||	ddd      }t        j                   |      j#                  d      }t        j$                  |       | j
                  j'                  |       t)        ||
|
|	z          D ]Z  \  }}| j+                  |      }| j                  j-                  |       | j                  j-                  |j/                                \  || _        t        d| dt        |       d       t        d| j
                  j6                          yc c}w c c}w # t0        $ r }t3        d|
 t5        |      |
d      d}~ww xY w)aB  
        Safely replace all documents for a source with new chunks.
        This rebuilds the index to ensure consistency (prevents index corruption).
        
        Args:
            source_name: Name of the source file to replace
            new_chunks: List of new chunk dictionaries to replace old ones with
        Nz Replacing documents for source: z (z new chunks)metadatasourcez
  Keeping z documents from other sourcesz  Rebuilding index with z total documents...testr'  r  r  r   FTr   r(  z"Error embedding batch starting at )r  batch_startu"   ✓ Successfully replaced source 'z': z chunksz  Total documents in index: )_remove_source_from_indexr  r  r|   getr   dr   r.  r0  r  r/  r   r   ranger1  r2  r3  r4  r5  r,  r7  r8  r9  r  r   r
  ntotal)r   source_name
new_chunksdockept_documentsupdated_documents	dimensionsample_embeddingr=  r!  r;  batchr>  r@  jrA  r  s                    r   replace_source_documentsz$VectorStore.replace_source_documents   s    **;70RJ?PP\]^  >>
wwz2&**84C 
 

 	
3~.//LMN +Z7 	(->)?(@@STU ::!

I  $zz00&:(..q1I &&y1
 %'!&(# /@@sC	N@@
q#h-4 	AQZ0E!ZZ..)&+%)). / 
 !# 4 ; ;I F""=1

}- ((9!a*n(MN PFAs&*&A&A#&FO--44_E//667L7L7NOP	6 +2;-s3z?BSSZ[\,TZZ->->,?@A
< A.  $8<'*1va@ s$   -I7I<C"J	J*
J%%J*c                    | j                   D cg c](  }|j                  di       j                  d      |k7  r|* }}t        |      t        | j                         k(  rt        d|        yt        dt        | j                         t        |      z
   d|        | j                  | j                  j
                  }n+| j                  j                  dg      }|j                  d   }t        j                  |      | _        g | _        g | _        |D cg c]  }|d	   	 }}d
}t        dt        |      |      D ]  }||||z    }	| j                  j                  |	|ddd      }
t        j                  |
      j!                  d      }t        j"                  |       | j                  j%                  |       t'        ||||z          D ]Z  \  }}| j)                  |      }| j                  j+                  |       | j                  j+                  |j-                                \  || _         t        d| dt        | j                                 yc c}w c c}w )zCRemove all documents for a source by rebuilding index without them.rP  rQ  zNo documents found for source: Nz	Removing z documents for source: rR  r'  r  r  r   FTr   r(  u   ✓ Removed source 'z'. Total documents: )r|   rU  r  r  r   rV  r   r.  r0  r  r/  r   r   rW  r1  r2  r3  r4  r5  r,  r7  r8  r9  )r   rY  r[  r\  r^  r_  r=  r!  r;  r`  r>  r@  ra  rA  s                 r   rT  z%VectorStore._remove_source_from_indexA  sB     >>
wwz2&**84C 
 

 ~#dnn"553K=AB	#dnn-N0CCDD[\g[hij ::!

I#zz00&:(..q1I&&y1
$&!&(#.<=sC	N==
q#h-4 	LAQZ0E**%"'!%%* + J HHZ077	BM}-JJNN=)#N1q:~$FG L3"&"="=c"B))00A++22?3H3H3JKL	L$ ($[M1Ec$..FYEZ[\Y
, >s   -IIc                     d}| j                   D ]A  }|j                  di       j                  d      |k(  s(d|j                  di       d<   |dz  }C |dkD  r"| j                          t	        d| d| d	       |S )
z
        DEPRECATED: Use replace_source_documents instead.
        This method only marks documents as deleted but doesn't remove vectors from index,
        which can cause index corruption. Kept for backward compatibility but not recommended.
        r   rP  rQ  Tdeletedr'  u   ⚠ Marked z documents from z: as deleted (deprecated method - may cause index mismatch))r|   rU  
setdefaultr  r  )r   rY  countr[  s       r   mark_source_as_deletedz"VectorStore.mark_source_as_deletedq  s     >> 	Cwwz2&**84C<@z2.y9
	
 19 Kw&6{mC}~r   c                   : | j                   t        dddi      t        | j                        dk(  rg S |st	        dddi      t        |t              s!t	        dd	t        |      j                  i      t        |j                               dk(  rt	        d
dt        |      i      |j                         }t        j                  d|      }g }g }g }t               }|D ]#  }	|	t        v s|j!                  t        |	          % t"        D ]7  }
|
|v s|
|vs|j%                  |
       |
t&        vs'|j%                  |
       9 |D ]2  }
|
|vs|j%                  |
       |
t&        vs"|j%                  |
       4 |D ]4  }	t        |	      dk  r|	t(        v r|	|v s|	|v r$|j%                  |	       6 g }t               }||z   D ]m  }
|
|v r|j%                  |
t        j*                  dt        j,                  |
      z   dz   t        j.                        |
|v |
|v d       |j1                  |
       o | j3                          |r| j5                  |      ni :|r_t7        d |D              sMt9        |:fdd      }d}|D ]5  }:j;                  |d   d      }|dk\  r|dk  r nd|d<   |dz  }|dk\  s5 n 	 | j<                  j?                  |g      }tA        jB                  |      jE                  d      }tG        jH                  |       tK        |tL        z  |      }| j                   jO                  ||      \  }}g }g }tU        |d   |d         D ]  \  }}|t        | j                        k\  r | j                  |   }|j;                  di       j;                  d      rQ|j%                  |       | jV                  |   }| jX                  |   }d}|rt[        j\                  ||      dz  }g }d }d} d }!d }"t7        d! |D              }#t7        d" |D              }$|r|D ]X  }|d#   }%|%jO                  |      s|j%                  |d          |d$   rd}!|j;                  d      rd}"t_        d%|d&z         }d}Z t        |      dkD  r(t_        d%|d't_        t        |      dz
  d      z  z         }| ja                  ||      } | s|r|dd( } |r|!s|#st_        |d)      }|su|D ]p  }	t        |	      dkD  st        j*                  dt        j,                  |	      z   dz   t        j.                        }&|&jO                  |      sat_        d%|d*z         } n |tb        k  rd}|r|$r|"s|$s|#r|!s|#s|s%| je                  ||:      }'tf        ti        |      z  tj        |z  z   tl        |'z  z   }(|d+   |d   |(ti        |      ||'|| |d,	})|j%                  |)        to        d- |D              }*|D +cg c]  }+|+j;                  d      s|+ },}+|,s|D +cg c]
  }+|+d$   s	|+ },}+|,s|},|,r9|*|k  r4||*z
  }-| jq                  |,|-t        |      :.      }.|js                  |.       |ju                  d/ d       g }/i }0t               }1|D ]  })|)j;                  di       j;                  d0d1      }2|0j;                  |2d      }3|3tv        k\  rA|/j%                  |)       |3dz   |0|2<   |)j;                  d2      }4|4|1j1                  |4       t        |/      |k\  s n |,r|/D 5ch c]$  }5|5j;                  di       j;                  d0d1      & }6}5tK        d|t        |6      z
        }7|7dkD  r| jq                  |,|7|1|6:3      }8|8D ]  }9|9j;                  di       j;                  d0d1      }2|2|6v r*|/j%                  |9       |6j1                  |2       |0j;                  |2d      dz   |0|2<   |9j;                  d2      }4|4|1j1                  |4       t        |/      |k\  s n t        |/      |k  r-|D ](  })|)|/v r|/j%                  |)       t        |/      |k\  s( n tx        r%tz        rt        |/      dkD  r	 t}        ||/|4      }/|/S |/S # tP        $ r}tS        d|        g cY d}~S d}~ww xY wc c}+w c c}+w c c}5w # tP        $ r}tS        d5|        Y d}~|/S d}~ww xY w)6zFSearch for similar documents using hybrid vector + keyword re-ranking.NzIndex not loadedrD  rE  r   zQuery cannot be emptyquery_lengthzQuery must be a string
query_typezQuery cannot be only whitespacequery_contentz\b\w+\b   z\b)termpattern	is_domain
is_primaryc              3   >   K   | ]  }|j                  d         ywrq  NrU  .0ps     r   	<genexpr>z%VectorStore.search.<locals>.<genexpr>  s     'Vl(;'Vs   c                 .    j                  | d   d      S )Nrn  r   rt  )rw  	idf_tables    r   <lambda>z$VectorStore.search.<locals>.<lambda>  s    immAfIs; r   T)keyreversern  r   g       @g      @rq  r'  rJ  r(  zError during search: rP  re  g      Y@Fc              3   &   K   | ]	  }|d      yw)rp  N ru  s     r   rx  z%VectorStore.search.<locals>.<genexpr>  s     &P!q~&P   c              3   &   K   | ]	  }|d      ywrs  r  ru  s     r   rx  z%VectorStore.search.<locals>.<genexpr>  s     'RA,'Rr  ro  rp        ?      ?g?   g?g333333?r  	r  rP  scorevector_scorekeyword_score
bm25_scorehas_keyword_matchkeyword_snippet	doc_indexc              3   B   K   | ]  }|j                  d       rd  yw)r  r'  Nrt  )rv  	candidates     r   rx  z%VectorStore.search.<locals>.<genexpr>@  s%       
}}01  
s   )exclude_indicesrz  c                     | d   S )Nr  r  )items    r   r{  z$VectorStore.search.<locals>.<lambda>W  s
    g r   rQ  Unknownr  )r  exclude_sourcesrz  )top_kz(Reranking failed, using original order: )?r   r   r  r|   r   r)  r
  r+  __name__stripreprr9  refindallsetr   updater   r8  GENERIC_DOMAIN_TERMSQUERY_STOPWORDScompileescape
IGNORECASEr5  _ensure_searchable_texts_compute_idf_tableanysortedrU  r   r.  r1  r2  r3  r  r4  maxr	   searchr  r  zipr   r   r   partial_ratiomin_extract_keyword_snippetr   _compute_bm25_scorer
   floatr   r   sum_keyword_fallback_searchr6  sortr   RERANKER_AVAILABLEr   r   );r   querykquery_lowerquery_wordsdomain_termsprimary_domain_termsgeneral_termsexpanded_termswordrn  keyword_patternsadded_termssorted_patternspromotedpattern_infoterm_idfquery_embedding
oversamplescoresindicesr  
candidatesretrieved_indicesr  idxr[  searchable_text_lowersearchable_text_rawr  matched_termsr  r  matched_domainmatched_primaryrequire_domain_matchrequire_primary_matchterm_patternword_patternr  combined_scorer  keyword_hit_countrw  fallback_patternsneededfallback_candidatesbalanced_candidatessource_countsused_indicesrQ  rg  doc_idxcexisting_sourcesneeded_sourcesextra_candidatesextra_candidaterz  s;                                                             @r   r  zVectorStore.search  s	   ::"#5v7NOOt~~!#I!"9NA;NOO%%!":\4PU;K_K_<`aau{{}"!"CoW[\aWbEcddkkmjj[9! 	6Dx%%htn5	6 $ 	6D{"t<'?##D)33(//5		6 # 	6D<'##D)33(//5		6   	'D4yA~&|#t}'<  &	' e =0 		"D{"##::ebiio&=&Er}}U!\1"&::	%  OOD!		" 	%%'AQD++,<=WY	
 C'VEU'V$V$ ;O
 H / 	$==f)=sCs?c>-1\*Aq=			"jj//8O hh7>>yIO/Q!991=J"jj//LOFG
 
fQi4 [	)JE3c$..))..%Cwwz2&**95$$S)$($?$?$D!"&";";C"@  M$ $ 2 2;@U VY^ ^ M %"O"N#O#&&P?O&P#P $''RAQ'R$R!$4 	1L#/	#:L#**+>?%,,\&-AB'4-1N+//=26(+C1D(E,0)	1 }%)$']S3s=GY\]G]_`Ca=a-a$bM"&"?"?@SUe"f&+<&9$3&?O$^DX$'s$;M !' "D4y1}')zz%"))D/2IE2QSUS`S`'a'../BC,/]S5H,IM!" 77 #(,1En+4E11#7GSJ %uU|3%56"Z/0  y>
O' %e!.(%6#2 
I i(w[	)|    
) 
 
 )9P1AEE,<OQPP ,< Oq+ O O  0!2Q!6**F"&"?"?! #$5 6#	 #@ # 126E u# 	I]]:r266xKF!%%fa0E--&&y1$)AIM&!mmK0G"  )&'1,	 XklSTj" 5 9 9(I Nll A,<(=$=>N!#'#@#@%"$0$4' $A $  (8 O,00R@DDXyYF!11 '..?$((0,9,=,=fa,H1,LM&)-11+>G*$((1./14 "#a'' 	 33#**95*+q0 "2s;N7ORS7SF&7?RZ[&\# #"""i  	)!-.I	V Q O@  mH  F@DEE""FsU    Bb  b5b5#
b:.b:-)b?=c 	b2b-'b2-b2	c&c!!c&c                 J   |j                  di       }g }|j                  d      }|rct        j                  ddt        |      t        j                        }|j                  dd      j                  dd      }|j                  |       |j                  d	      xs |j                  d
      }|r|j                  t        |             |j                  d      r)|j                  t        |j                  d                   |j                  d      r8|j                  |j                  dg       D cg c]  }t        |       c}       |j                  d      r8|j                  |j                  dg       D cg c]  }t        |       c}       |j                  d      r)|j                  t        |j                  d                   |j                  t        |j                  dd                   dj                  |      }	|	j                         |j                  dg       }
t        j                         D ]A  \  }}||
v rt        fd|D              s |
j                  |       |j                  |       C |
rt        t        |
            }dj                  |      |d<   |j                  d      sH|j                  dd      }d|d    d}|j!                  |      s| | j#                         |d<   d|d<   dj                  |      S c c}w c c}w )z>Compose searchable text using enriched metadata + raw content.rP  rQ  z\.(pdf|txt)$r&  )flags_ -document_titletitler\   
key_pointsthemesclean_excerptr  topicsc              3   &   K   | ]  }|v  
 y wNr  )rv  variant
lower_texts     r   rx  z5VectorStore._build_searchable_text.<locals>.<genexpr>  s     AW7j(As   z, topic_label_topic_taggedz	[Topics: z] T)rU  r  subr
  r  replacer8  r6  joinr9  rf  CANONICAL_TOPIC_KEYWORDSitemsr  r  r  
startswithr  )r   r[  rP  searchable_text_partsrY  cleaned_sourcer  pointtheme	base_textr  	canonicalvariantsdistinct_topicsr\   topic_prefixr  s                   @r   r7  z"VectorStore._build_searchable_text  s   77:r* " ll8,VVOR[9IQSQ^Q^_N+33C=EEc3ON!((8!&67P8<<;P!((^)<=<<	"!((X\\)-D)EF<<%!(((,,|]_B`)a#e*)ab<<!!(((,,xY[B\)]#e*)]^<<(!((X\\/-J)KL$$SB)?%@AHH23	__&
$$Xr2#;#A#A#C 	8IxF"AAAi(%,,Y7	8 $S[1O&*ii&@H]#<<0",,y"5!*8M+B*C2F)),7-9N7)*D*J*J*LHY',0)xx-..9 *b)]s   6L?L c                    | j                   D cg c]  }| j                  |       c}| _        | j                  D cg c]  }|j                          c}| _        | j                  D cg c]  }t        |j                                c}| _        | j                  r1t        | j                        t        | j                        z  | _	        yd| _	        yc c}w c c}w c c}w )z/Build cached searchable text for all documents.r   N)
r|   r7  r   r9  r   r  splitr   r  r   )r   r[  texts      r   r  z%VectorStore._prepare_searchable_texts  s     ~~%
 '',%
!
 &*%>%>'
!DJJL'
# +/*C*C
"&C


 "%d&6&6"7#d>N>N:O"OD"%D%
'

s   CC* C!c                     t        | j                        t        | j                        k7  s+t        | j                        t        | j                        k7  r| j	                          yy)z8Ensure searchable texts cache is in sync with documents.N)r  r   r|   r   r  )r   s    r   r  z$VectorStore._ensure_searchable_texts  sN     ))*c$...AA++,DNN0CC**, Dr   c                 (   i }t        | j                        }|dk(  r|S |D ]n  }|d   }||v r|d   }d}| j                  D ]  }|j                  |      s|dz  } t        j                  ||z
  dz   |dz   z  dz         }	t        |	d      ||<   p |S )z6Compute IDF values for query terms using BM25 formula.r   rn  ro  r'  r  r  r   )r  r   r  mathlogr  )
r   r  rz  
total_docsinforn  ro  dfr   idfs
             r   r  zVectorStore._compute_idf_table  s    	445
?$ 
	,D<Dy 9oGB33 >>$'!GB ((JOc1b3h?#EFC!#smIdO
	, r   c                    |r|sy|t        | j                        k\  ry| j                  |   }|t        | j                        k  r| j                  |   nt        |j                               }| j                  dkD  r| j                  n|xs d}d}|D ]u  }|d   }	|d   }
t        |
j                  |            }|dk(  r-|j                  |	d      }|t        dz   z  }|t        dt        z
  t        ||z  z  z   z  z   }||||z  z  z  }w |S )z9Compute BM25 score for a document given keyword patterns.r   r   r'  rn  ro  r  )	r  r   r   r  r   r  rU  BM25_K1BM25_B)r   r  r  rz  doc_textdoc_lenavg_lenbm25r  rn  ro  tfr  	numeratordenominators                  r   r  zVectorStore._compute_bm25_score  s!   yD7788..y91:SAQAQ=R1R$""9-X[\d\j\j\lXm)-)<)<q)@$%%glQR$ 		4D<D9oGW__X./BQw--c*Cgm,Iw#,7WCT9U*UVVKC9{233D		4 r   c                    g }|xs
 t               }|rt        |      n	t               }t        | j                        D ]  \  }||v r| j                  |   }	|	j	                  di       j	                  d      r=t        fd|D              r|	j	                  di       j	                  dd      }
|
|v rx| j                  |      }|sd}| j                  |||xs i       }t        |z  t        |z  z   }|	d   |	d   |d||d	||d
	}|j                  |       |j                  |       |j                  |
       t        |      |k\  s |S  |S )zFScan all documents for keyword matches when vector search misses them.rP  re  c              3   F   K   | ]  }|d    j                          yw)ro  N)r  )rv  r  r  s     r   rx  z7VectorStore._keyword_fallback_search.<locals>.<genexpr>  s!     \44	?))*=>\s   !rQ  r  r  r  r   Tr  )r  r,  r   r|   rU  r  r  r  r   r   r8  r5  r  )r   r  r  r  r  rz  r  local_exclude_sourcesr  r[  rQ  snippetr  r  combinedr  r  s                   @r   r  z$VectorStore._keyword_fallback_search  s    )2SU8GO 4SU(1$2K2K(L %	$C$o%..%Cwwz2&**95\K[\\R044XyI22778KM]^ #//5EyTVW)M9&-. 
  #9~ #J%$'%2"&)-'.!$
	 $**95##C(%))&1&'61""O%	N #"r   windowc                 6   |r|sy|D ]  }|d   j                  |      }|st        d|j                         |dz  z
        }t        t	        |      |j                         |dz  z         }||| }t        j                  dd|      j                         }|c S  y)z9Extract a concise snippet around the first keyword match.Nro  r   rJ  z\s+r  )	r  r  startr  r  endr  r  r  )	r   raw_textr  r  r  matchr  r  r  s	            r   r  z$VectorStore._extract_keyword_snippet4  s    /$ 	DO**84EAu{{}v{:;#h-v{)BC"5-&&g6<<>	 r   r  )   )NNN)r  )r  
__module____qualname__r   r  rB  rG  r  rb  rT  rh  r  r7  r  r  r  r  r  intr  r  r   r   r   r   i   sh    H/b0Ad
SqPBd.]`"N#`//b&"-&,-#^3 r   r   )(sentence_transformersr   rerankerr   r   r  ImportErrorr  numpyr1  r  r  r  pathlibr   	rapidfuzzr   r   r   r	   r
   r   r   r   r   r   r   
exceptionsr   r   r   r   r   r   r  r  r
  r  r  r   r  r   r   <module>r)     s    5<    	   
 
 
   	

M+ ZX XG  s   
B
 
	BB